Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

589 lines
8.8 KiB

  1. #include "ksia64.h"
  2. //++
  3. //
  4. // VOID
  5. // run_fms (
  6. // IN OUT ULONGLONG *fpsr,
  7. // OUT FLOAT128 *fr1,
  8. // IN FLOAT128 *fr2,
  9. // IN FLOAT128 *fr3,
  10. // IN FLOAT128 *fr4
  11. // )
  12. //
  13. // Routine Description:
  14. //
  15. // This function runs one FMS operation, *fr1 = *fr2 * *fr3 - *fr4, under the FPSR value read from *fpsr.
  16. // The FPSR read back after the operation is stored to *fpsr; the FPSR that was live on entry is restored before returning.
  17. //--
  18. LEAF_ENTRY(run_fms)
  19. alloc r31=ar.pfs,5,2,0,0 // 5 inputs (r32-r36) + 2 locals (r37, r38)
  20. ARGPTR (r32)
  21. ARGPTR (r33)
  22. ARGPTR (r34)
  23. ARGPTR (r35)
  24. ARGPTR (r36)
  25. // &fpsr is in r32 (read on entry, written before return)
  26. // &fr1 (output) is in r33
  27. // &fr2 (input) is in r34
  28. // &fr3 (input) is in r35
  29. // &fr4 (input) is in r36
  30. // save the caller's FPSR (ar40) in r37 so it can be put back at the end
  31. mov r37 = ar40
  32. nop.i 0;;
  33. // load the requested FPSR value from *fpsr into r38
  34. ld8 r38 = [r32];;
  35. // install the requested FPSR before any FP work is done
  36. mov ar40 = r38
  37. nop.i 0;;
  38. // load first input argument (*fr2) into f8
  39. ldf.fill f8 = [r34]
  40. // load second input argument (*fr3) into f9
  41. ldf.fill f9 = [r35]
  42. nop.i 0;;
  43. // load third input argument (*fr4) into f10
  44. ldf.fill f10 = [r36]
  45. nop.m 0
  46. nop.i 0;;
  47. nop.m 0
  48. (p0) fms.s0 f11 = f8, f9, f10 // f11 = f8 * f9 - f10
  49. nop.i 0;;
  50. // store result to *fr1
  51. stf.spill [r33] = f11
  52. // read the FPSR back, as left by the fms, into r38
  53. mov r38 = ar40
  54. nop.i 0;;
  55. // hand the resulting FPSR back to the caller through *fpsr
  56. st8 [r32] = r38
  57. // restore the caller's FPSR saved in r37
  58. mov ar40 = r37
  59. nop.i 0;;
  60. nop.m 0
  61. nop.i 0
  62. // return
  63. LEAF_RETURN
  64. LEAF_EXIT(run_fms)
  65. //++
  66. //
  67. // VOID
  68. // thmF (
  69. // IN OUT ULONGLONG *fpsr,
  70. // IN FLOAT128 *fr1,
  71. // IN FLOAT128 *fr2,
  72. // OUT FLOAT128 *fr3
  73. // )
  74. //
  75. // Routine Description:
  76. // Divides a = *fr1 by b = *fr2 under the FPSR read from *fpsr: frcpa seed, then predicated fma/fnma refinement (Steps 1-14). The quotient is stored to *fr3 and the resulting FPSR to *fpsr; the entry FPSR and predicates are restored.
  77. //--
  78. LEAF_ENTRY(thmF)
  79. alloc r31=ar.pfs,4,4,0,0 // 4 inputs (r32-r35) + 4 locals (r36-r39)
  80. ARGPTR (r32)
  81. ARGPTR (r33)
  82. ARGPTR (r34)
  83. ARGPTR (r35)
  84. // &fpsr is in r32 (read on entry, written before return)
  85. // &a is in r33 (dividend, *fr1)
  86. // &b is in r34 (divisor, *fr2)
  87. // &div is in r35 (the address of the divide result, *fr3)
  88. // save old FPSR in r36
  89. mov r36 = ar40
  90. // save predicates in r37 (p2 is used below and pr is overwritten)
  91. mov r37 = pr;;
  92. // load new fpsr in r39
  93. ld8 r39 = [r32];;
  94. // set new value of FPSR
  95. mov ar40 = r39
  96. nop.i 0;;
  97. nop.m 0
  98. // build the "cleared predicates" image: only bit 0 set
  99. movl r38 = 0x0000000000000001;;
  100. nop.m 0
  101. // write the cleared image from r38 into pr under mask 0x1ffff
  102. mov pr = r38,0x1ffff
  103. nop.i 0;;
  104. // load a (*fr1) in f6
  105. ldf.fill f6 = [r33]
  106. // load b (*fr2) in f7
  107. ldf.fill f7 = [r34]
  108. nop.i 0;;
  109. nop.m 0
  110. // Step (1)
  111. // y0 = 1 / b in f8; p2 gates every refinement step below
  112. frcpa.s0 f8,p2=f6,f7
  113. nop.i 0;;
  114. nop.m 0
  115. // Step (2)
  116. // e0 = 1 - b * y0 in f9
  117. (p2) fnma.s1 f9=f7,f8,f1
  118. nop.i 0
  119. nop.m 0
  120. // Step (10) - no stop since Step (2): same instruction group
  121. // q0 = a * y0 in f10
  122. (p2) fma.s1 f10=f6,f8,f0
  123. nop.i 0;;
  124. nop.m 0
  125. // Step (3)
  126. // y1 = y0 + e0 * y0 in f8
  127. (p2) fma.s1 f8=f9,f8,f8
  128. nop.i 0
  129. nop.m 0
  130. // Step (4) - same instruction group as Step (3)
  131. // e1 = e0 * e0 in f9
  132. (p2) fma.s1 f9=f9,f9,f0
  133. nop.i 0
  134. nop.m 0
  135. // Step (11) - same instruction group as Steps (3) and (4)
  136. // r0 = a - b * q0 in f11
  137. (p2) fnma.s1 f11=f7,f10,f6
  138. nop.i 0;;
  139. nop.m 0
  140. // Step (5)
  141. // y2 = y1 + e1 * y1 in f8
  142. (p2) fma.s1 f8=f8,f9,f8
  143. nop.i 0;;
  144. nop.m 0
  145. // Step (6)
  146. // e2 = 1 - b * y2 in f9
  147. (p2) fnma.s1 f9=f7,f8,f1
  148. nop.i 0;;
  149. nop.m 0
  150. // Step (7)
  151. // y3 = y2 + e2 * y2 in f8
  152. (p2) fma.s1 f8=f8,f9,f8
  153. nop.i 0;;
  154. nop.m 0
  155. // Step (8)
  156. // e3 = 1 - b * y3 in f9
  157. (p2) fnma.s1 f9=f7,f8,f1
  158. nop.i 0
  159. nop.m 0
  160. // Step (12) - same instruction group as Step (8)
  161. // q1 = q0 + r0 * y3 in f10
  162. (p2) fma.s1 f10=f11,f8,f10
  163. nop.i 0;;
  164. nop.m 0
  165. // Step (9)
  166. // y4 = y3 + e3 * y3 in f8
  167. (p2) fma.s1 f8=f8,f9,f8
  168. nop.i 0
  169. nop.m 0
  170. // Step (13) - same instruction group as Step (9)
  171. // r1 = a - b * q1 in f11
  172. (p2) fnma.s1 f11=f7,f10,f6
  173. nop.i 0;;
  174. nop.m 0
  175. // Step (14) - final step, the only refinement step issued with .s0
  176. // q2 = q1 + r1 * y4 in f8
  177. (p2) fma.s0 f8=f11,f8,f10
  178. nop.i 0;;
  179. // save new FPSR in r39
  180. mov r39 = ar40;;
  181. // store new fpsr from r39 back to *fpsr
  182. st8 [r32] = r39
  183. // restore predicates from r37
  184. mov pr = r37,0x1ffff;;
  185. // store result; if frcpa left p2 clear, Steps 2-14 were skipped and f8 is frcpa's own output
  186. stf.spill [r35]=f8
  187. // restore the FPSR saved on entry
  188. mov ar40 = r36
  189. // return
  190. LEAF_RETURN
  191. LEAF_EXIT(thmF)
  192. //++
  193. //
  194. // VOID
  195. // thmL (
  196. // IN OUT ULONGLONG *fpsr,
  197. // IN FLOAT128 *fr1,
  198. // OUT FLOAT128 *fr2
  199. // )
  200. //
  201. // Routine Description:
  202. // Takes the square root of a = *fr1 under the FPSR read from *fpsr: frsqrta seed, then predicated fma/fnma refinement (Steps 1-17). The root is stored to *fr2 and the resulting FPSR to *fpsr; the entry FPSR and predicates are restored.
  203. //--
  204. LEAF_ENTRY(thmL)
  205. alloc r31=ar.pfs,3,5,0,0 // 3 inputs (r32-r34) + 5 locals (r35-r39)
  206. ARGPTR (r32)
  207. ARGPTR (r33)
  208. ARGPTR (r34)
  209. // &fpsr is in r32 (read on entry, written before return)
  210. // &a is in r33 (operand, *fr1)
  211. // &sqrt is in r34 (the address of the sqrt result, *fr2)
  212. // save old FPSR in r35
  213. mov r35 = ar40
  214. // save predicates in r36 (p2 is used below and pr is overwritten)
  215. mov r36 = pr;;
  216. // load new fpsr in r38
  217. ld8 r38 = [r32];;
  218. // set new value of FPSR
  219. mov ar40 = r38
  220. nop.i 0;;
  221. nop.m 0
  222. // build the "cleared predicates" image: only bit 0 set
  223. movl r37 = 0x0000000000000001;;
  224. nop.m 0
  225. // write the cleared image from r37 into pr under mask 0x1ffff
  226. mov pr = r37,0x1ffff
  227. nop.i 0;;
  228. // load the argument a (*fr1) in f6
  229. ldf.fill f6 = [r33]
  230. nop.m 0
  231. nop.i 0;;
  232. nop.m 0
  233. // Step (1)
  234. // y0 = 1/sqrt(a) in f8; p2 gates every refinement step below
  235. frsqrta.s0 f8,p2=f6
  236. nop.i 0;;
  237. nop.m 0
  238. // Step (2)
  239. // load 1/2 in f7 (setf.exp with exponent image 0x0fffe); h = 1/2 * a in f9
  240. (p2) movl r39 = 0x0fffe;;
  241. (p2) setf.exp f7 = r39
  242. nop.i 0;;
  243. nop.m 0
  244. (p2) fma.s1 f9=f7,f6,f0
  245. nop.i 0;;
  246. nop.m 0
  247. // Step (3)
  248. // t1 = y0 * y0 in f10
  249. (p2) fma.s1 f10=f8,f8,f0
  250. nop.i 0;;
  251. nop.m 0
  252. // Step (4)
  253. // t2 = 1/2 - t1 * h in f10
  254. (p2) fnma.s1 f10=f10,f9,f7
  255. nop.i 0;;
  256. nop.m 0
  257. // Step (5)
  258. // y1 = y0 + t2 * y0 in f8
  259. (p2) fma.s1 f8=f10,f8,f8
  260. nop.i 0;;
  261. nop.m 0
  262. // Step (6)
  263. // t3 = y1 * h in f10
  264. (p2) fma.s1 f10=f8,f9,f0
  265. nop.i 0;;
  266. nop.m 0
  267. // Step (7)
  268. // t4 = 1/2 - t3 * y1 in f10
  269. (p2) fnma.s1 f10=f10,f8,f7
  270. nop.i 0;;
  271. nop.m 0
  272. // Step (8)
  273. // y2 = y1 + t4 * y1 in f8
  274. (p2) fma.s1 f8=f10,f8,f8
  275. nop.i 0;;
  276. nop.m 0
  277. // Step (9)
  278. // S = a * y2 in f10
  279. (p2) fma.s1 f10=f6,f8,f0
  280. nop.i 0;;
  281. nop.m 0
  282. // Step (10)
  283. // t5 = y2 * h in f9 (overwrites h)
  284. (p2) fma.s1 f9=f8,f9,f0
  285. nop.i 0;;
  286. nop.m 0
  287. // Step (11)
  288. // H = 1/2 * y2 in f11
  289. (p2) fma.s1 f11=f7,f8,f0
  290. nop.i 0;;
  291. nop.m 0
  292. // Step (13) - issued before Step (12)
  293. // t6 = 1/2 - t5 * y2 in f7 (overwrites the 1/2 constant)
  294. (p2) fnma.s1 f7=f9,f8,f7
  295. nop.i 0;;
  296. nop.m 0
  297. // Step (12)
  298. // d = a - S * S in f8 (overwrites y2)
  299. (p2) fnma.s1 f8=f10,f10,f6
  300. nop.i 0;;
  301. nop.m 0
  302. // Step (14)
  303. // S1 = S + d * H in f8
  304. (p2) fma.s1 f8=f8,f11,f10
  305. nop.i 0;;
  306. nop.m 0
  307. // Step (15)
  308. // H1 = H + t6 * H in f7 (both H operands are f11)
  309. (p2) fma.s1 f7=f11,f7,f11
  310. nop.i 0;;
  311. nop.m 0
  312. // Step (16)
  313. // d1 = a - S1 * S1 in f6 (overwrites a)
  314. (p2) fnma.s1 f6=f8,f8,f6
  315. nop.i 0;;
  316. nop.m 0
  317. // Step (17) - final step, the only refinement step issued with .s0
  318. // R = S1 + d1 * H1 in f8
  319. (p2) fma.s0 f8=f6,f7,f8
  320. nop.i 0;;
  321. // save new FPSR in r38
  322. mov r38 = ar40;;
  323. // store new fpsr from r38 back to *fpsr
  324. st8 [r32] = r38
  325. // restore predicates from r36
  326. mov pr = r36,0x1ffff;;
  327. // store result; if frsqrta left p2 clear, Steps 2-17 were skipped and f8 is frsqrta's own output
  328. stf.spill [r34]=f8
  329. // restore the FPSR saved on entry
  330. mov ar40 = r35
  331. // return
  332. LEAF_RETURN
  333. LEAF_EXIT(thmL)
  334. //++
  335. //
  336. // VOID
  337. // KiEmulateLoadFloat80(
  338. // IN PVOID UnalignedAddress,
  339. // OUT PVOID FloatData
  340. // );
  341. // Loads the value at UnalignedAddress with ldfe and writes its stf.spill image to FloatData.
  342. //--
  343. LEAF_ENTRY(KiEmulateLoadFloat80)
  344. ARGPTR(a1) // a1: FloatData, where the spill image goes
  345. ARGPTR(a0) // a0: UnalignedAddress, where the value is read from
  346. ldfe ft0 = [a0] ;; // ft0 = source value; stop so the store below sees it
  347. // ft0 is complete past the stop: dump it in spill format
  348. stf.spill [a1] = ft0
  349. LEAF_RETURN
  350. LEAF_EXIT(KiEmulateLoadFloat80)
  351. //++
  352. //
  353. // VOID
  354. // KiEmulateLoadFloatInt(
  355. // IN PVOID UnalignedAddress,
  356. // OUT PVOID FloatData
  357. // );
  358. // Loads the value at UnalignedAddress with ldf8 and writes its stf.spill image to FloatData.
  359. //--
  360. LEAF_ENTRY(KiEmulateLoadFloatInt)
  361. ARGPTR(a1) // a1: FloatData, where the spill image goes
  362. ARGPTR(a0) // a0: UnalignedAddress, where the value is read from
  363. ldf8 ft0 = [a0] ;; // ft0 = source value; stop so the store below sees it
  364. // ft0 is complete past the stop: dump it in spill format
  365. stf.spill [a1] = ft0
  366. LEAF_RETURN
  367. LEAF_EXIT(KiEmulateLoadFloatInt)
  368. //++
  369. //
  370. // VOID
  371. // KiEmulateLoadFloat32(
  372. // IN PVOID UnalignedAddress,
  373. // OUT PVOID FloatData
  374. // );
  375. // Loads the value at UnalignedAddress with ldfs and writes its stf.spill image to FloatData.
  376. //--
  377. LEAF_ENTRY(KiEmulateLoadFloat32)
  378. ARGPTR(a1) // a1: FloatData, where the spill image goes
  379. ARGPTR(a0) // a0: UnalignedAddress, where the value is read from
  380. ldfs ft0 = [a0] ;; // ft0 = source value; stop so the store below sees it
  381. // ft0 is complete past the stop: dump it in spill format
  382. stf.spill [a1] = ft0
  383. LEAF_RETURN
  384. LEAF_EXIT(KiEmulateLoadFloat32)
  385. //++
  386. //
  387. // VOID
  388. // KiEmulateLoadFloat64(
  389. // IN PVOID UnalignedAddress,
  390. // OUT PVOID FloatData
  391. // );
  392. // Loads the value at UnalignedAddress with ldfd and writes its stf.spill image to FloatData.
  393. //--
  394. LEAF_ENTRY(KiEmulateLoadFloat64)
  395. ARGPTR(a1) // a1: FloatData, where the spill image goes
  396. ARGPTR(a0) // a0: UnalignedAddress, where the value is read from
  397. ldfd ft0 = [a0] ;; // ft0 = source value; stop so the store below sees it
  398. // ft0 is complete past the stop: dump it in spill format
  399. stf.spill [a1] = ft0
  400. LEAF_RETURN
  401. LEAF_EXIT(KiEmulateLoadFloat64)
  402. //++
  403. //
  404. // VOID
  405. // KiEmulateStoreFloat80(
  406. // OUT PVOID UnalignedAddress,
  407. // IN PVOID FloatData
  408. // );
  409. // Reloads the spill image at FloatData with ldf.fill and stores it to UnalignedAddress with stfe.
  410. //--
  411. LEAF_ENTRY(KiEmulateStoreFloat80)
  412. ARGPTR(a1) // a1: FloatData, the spill image to read
  413. ARGPTR(a0) // a0: UnalignedAddress, the store destination
  414. ldf.fill ft0 = [a1] ;; // ft0 = spill image; stop so the store below sees it
  415. // ft0 is complete past the stop: write it out
  416. stfe [a0] = ft0
  417. LEAF_RETURN
  418. LEAF_EXIT(KiEmulateStoreFloat80)
  419. //++
  420. //
  421. // VOID
  422. // KiEmulateStoreFloatInt(
  423. // OUT PVOID UnalignedAddress,
  424. // IN PVOID FloatData
  425. // );
  426. // Reloads the spill image at FloatData with ldf.fill and stores it to UnalignedAddress with stf8, the store counterpart of the ldf8 used by KiEmulateLoadFloatInt.
  427. //--
  428. LEAF_ENTRY(KiEmulateStoreFloatInt)
  429. ARGPTR(a0) // a0: UnalignedAddress, the store destination
  430. ARGPTR(a1) // a1: FloatData, the spill image to read
  431. ldf.fill ft0 = [a1]
  432. ;;
  433. stf8 [a0] = ft0 // integer form; stfd here would write a double, as KiEmulateStoreFloat64 does
  434. LEAF_RETURN
  435. LEAF_EXIT(KiEmulateStoreFloatInt)
  436. //++
  437. //
  438. // VOID
  439. // KiEmulateStoreFloat32(
  440. // OUT PVOID UnalignedAddress,
  441. // IN PVOID FloatData
  442. // );
  443. // Reloads the spill image at FloatData with ldf.fill and stores it to UnalignedAddress with stfs.
  444. //--
  445. LEAF_ENTRY(KiEmulateStoreFloat32)
  446. ARGPTR(a1) // a1: FloatData, the spill image to read
  447. ARGPTR(a0) // a0: UnalignedAddress, the store destination
  448. ldf.fill ft0 = [a1] ;; // ft0 = spill image; stop so the store below sees it
  449. // ft0 is complete past the stop: write it out
  450. stfs [a0] = ft0
  451. LEAF_RETURN
  452. LEAF_EXIT(KiEmulateStoreFloat32)
  453. //++
  454. //
  455. // VOID
  456. // KiEmulateStoreFloat64(
  457. // OUT PVOID UnalignedAddress,
  458. // IN PVOID FloatData
  459. // );
  460. // Reloads the spill image at FloatData with ldf.fill and stores it to UnalignedAddress with stfd.
  461. //--
  462. LEAF_ENTRY(KiEmulateStoreFloat64)
  463. ARGPTR(a1) // a1: FloatData, the spill image to read
  464. ARGPTR(a0) // a0: UnalignedAddress, the store destination
  465. ldf.fill ft0 = [a1] ;; // ft0 = spill image; stop so the store below sees it
  466. // ft0 is complete past the stop: write it out
  467. stfd [a0] = ft0
  468. LEAF_RETURN
  469. LEAF_EXIT(KiEmulateStoreFloat64)