Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1118 lines
21 KiB

  1. // SAC MMx utilities
  2. #include <memory.h>
  3. #include "mmxutil.h"
  4. #include "opt.h"
  5. #define I2FTEST 0
  6. #if I2FTEST
  7. #include "stdio.h"
  8. #endif
  9. //------------------------------------------------------
  // Does the processor I'm running on have MMX(tm) technology?
  //
  // Returns 1 when CPUID function 1 reports feature bit 23 (mask 0x00800000)
  // in EDX, and 0 otherwise. 0 is also returned when the EFLAGS ID bit
  // (0x200000) cannot be toggled, i.e. CPUID is unavailable, and on every
  // build that is not _X86_ (previously only _ALPHA_ had an explicit return,
  // so other targets fell off the end of the function).
  int IsMMX()
  {
  #if defined(_X86_)
      int retu;
      __asm
      {
          push ebx                // saved here, restored before leaving the block
          pushfd
          pop edx
          mov eax,edx             // eax = EFLAGS as found
          xor edx,200000h         // try to flip the ID flag
          push edx
          popfd
          pushfd
          pop edx                 // edx = EFLAGS after the attempted flip
          //
          // Keep the operands in this order. "xor eax,edx" would leave the
          // difference in EAX, but the bail-out path below stores EDX and
          // relies on it being 0 when the flag did not change. (-mikeg)
          //
          xor edx,eax
          je no_cpuid
          mov eax,1
          _emit 0x0f              // CPUID, emitted as raw opcode bytes
          _emit 0xa2
          and edx,000800000h      // isolate bit 23
          shr edx,23              // ... and turn it into 0 or 1
      no_cpuid:
          mov retu,edx
          pop ebx
      }
      return(retu);
  #else
      return 0;
  #endif
  }
  51. //------------------------------------------------------
  52. /* The following 4 routines make an 8-byte-aligned 'output' array
  53. from an 'input' array with various alignments. MakeAlignedN assumes
  54. that 'input' starts on an address equal to N mod 8. For now we
  55. only handle even N.
  56. */
  57. //------------------------------------------------------
  // Copies 'numbytes' bytes from 'input' to 'output' with memcpy.
  // Nothing is copied when 'numbytes' is zero or negative; a negative count
  // used to be converted to a huge unsigned length inside memcpy.
  void MakeAligned0(void *input, void *output, int numbytes)
  {
      if (numbytes > 0)
          memcpy(output, input, (size_t)numbytes);
  }
  62. //------------------------------------------------------
  // Copies 'numbytes' bytes from 'input' to 'output' with memcpy.
  // Nothing is copied when 'numbytes' is zero or negative; a negative count
  // used to be converted to a huge unsigned length inside memcpy.
  void MakeAligned2(void *input, void *output, int numbytes)
  {
      if (numbytes > 0)
          memcpy(output, input, (size_t)numbytes);
  }
  67. //------------------------------------------------------
  // Copies 'numbytes' bytes from 'input' to 'output' with memcpy.
  // Nothing is copied when 'numbytes' is zero or negative; a negative count
  // used to be converted to a huge unsigned length inside memcpy.
  void MakeAligned4(void *input, void *output, int numbytes)
  {
      if (numbytes > 0)
          memcpy(output, input, (size_t)numbytes);
  }
  72. //------------------------------------------------------
  // Copies 'numbytes' bytes from 'input' to 'output' with memcpy.
  // Nothing is copied when 'numbytes' is zero or negative; a negative count
  // used to be converted to a huge unsigned length inside memcpy.
  void MakeAligned6(void *input, void *output, int numbytes)
  {
      if (numbytes > 0)
          memcpy(output, input, (size_t)numbytes);
  }
  77. //------------------------------------------------------
  // Block-scaled float -> short conversion.
  //
  // The largest exponent in 'input' is found with FloatMaxExp and the whole
  // array is then converted by ScaleFloatToShort. With guard == 0 the scale
  // is chosen so that the largest power of 2 contained in the input comes
  // out as 16384, so every value fits in 16 bits without overflow. With
  // guard > 0 the outputs are shifted a further 'guard' bits to the right.
  //
  // Returns the value FloatMaxExp reported (guard not included).
  int FloatToShortScaled(float *input, short *output, int len, int guard)
  {
      int blockExp = FloatMaxExp(input, len);

      ScaleFloatToShort(input, output, len, blockExp + guard);
      return blockExp;
  }
  // Block-scaled float -> int conversion.
  //
  // The largest exponent in 'input' is found with FloatMaxExp and the whole
  // array is then converted by ScaleFloatToInt. With guard == 0 the scale is
  // chosen so that the largest power of 2 contained in the input comes out
  // as 2^30, so every value fits in 32 bits without overflow. With guard > 0
  // the outputs are shifted a further 'guard' bits to the right.
  //
  // Returns the value FloatMaxExp reported (guard not included).
  int FloatToIntScaled(float *input, int *output, int len, int guard)
  {
      int blockExp = FloatMaxExp(input, len);

      ScaleFloatToInt(input, output, len, blockExp + guard);
      return blockExp;
  }
  // Returns the largest exponent field found in input[0..len-1].
  //
  // Each float is read as a 32-bit pattern and masked with 0x7f800000 (the
  // sign bit is dropped, so this is the exponent of the magnitude). The
  // result is that field shifted down by 23 bits, i.e. still carrying the
  // format's exponent bias. Returns 0 when len <= 0.
  //
  // Assumes int and float are both 32 bits wide, as the rest of this file does.
  int FloatMaxExp(float *input, int len)
  {
      int max = 0;    // largest masked exponent so far, still at bit 23
      int i = 0;      // first element the C loop below still has to look at
      int bits;
  #if ASM_FTOSS
      // The assembly loop takes two floats per pass and always runs at least
      // once. Hand it only the even-sized prefix, and skip it when that
      // prefix is empty, so it never reads past input[len-1]. A trailing odd
      // element is picked up by the C loop.
      int evenLen = len & ~1;
      if (evenLen > 0)
      {
          ASM
          {
              mov esi,input;
              xor eax,eax;
              mov ebx,evenLen;
              xor edi,edi; // max
          loop2:
              mov ecx,DP[esi+4*eax];
              mov edx,DP[esi+4*eax+4];
              and ecx,07f800000h;
              and edx,07f800000h;
              cmp edi,ecx;
              jge skip1;
              mov edi,ecx;
          skip1:
              cmp edi,edx;
              jge skip2;
              mov edi,edx;
          skip2:
              add eax,2;
              cmp eax,ebx;
              jl loop2;
              mov max,edi;
          }
          i = evenLen;
      }
  #endif
      for (; i < len; i++)
      {
          // memcpy instead of *(int *)ptr: same bits, no aliasing violation.
          memcpy(&bits, input + i, sizeof(bits));
          bits &= 0x7f800000;
          if (bits > max)
              max = bits;
      }
      return max >> 23;
  }
  // Multiplies input[0..len-1] by a power-of-two scale derived from 'newmax'
  // and stores the results as shorts in 'output'. Nothing is read or written
  // when len <= 0.
  //
  // If the max exponent is 14 we want a scale factor of 1, since values will
  // then be at most +/- 32767. So the multiplier should be
  // 2^(14 - max - guard). But 'max' has the exponent bias built in, so BIAS
  // must be added once to get a "real" exponent, and once more because the
  // float we build needs a biased exponent: 2^(2*BIAS+14 - max - guard).
  // 2*BIAS+14 is 254 + 14 = 268 = 0x10C, which placed at bit 23 is 0x86000000.
  //
  // Note: the assembly path converts with fistp (current FPU rounding mode),
  // the C path with a cast (truncation).
  void ScaleFloatToShort(float *input, short *output, int len, int newmax)
  {
      float scale;
      unsigned int scaleBits = 0x86000000u - ((unsigned int)newmax << 23);
  #if ASM_FTOSS
      int evenLen = len & ~1;
  #else
      int i;
  #endif
      // Build the float from its bit pattern without a pointer-cast pun.
      memcpy(&scale, &scaleBits, sizeof(scale));
  #if ASM_FTOSS
      // The paired loop always runs at least once and handles two elements
      // per pass, so it only gets the even-sized prefix.
      if (evenLen > 0)
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              xor eax,eax;
              mov ebx,evenLen;
          loop1:
              fld DP[esi+4*eax];
              fmul scale;
              fld DP[esi+4*eax+4];
              fmul scale;
              fxch(1);
              fistp WP[edi+2*eax];
              fistp WP[edi+2*eax+2];
              add eax,2;
              cmp eax,ebx;
              jl loop1;
          }
      }
      // Trailing element of an odd-length array, converted the same way.
      if (len > 0 && (len & 1))
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              mov eax,len;
              fld DP[esi+4*eax-4];
              fmul scale;
              fistp WP[edi+2*eax-2];
          }
      }
  #else
      for (i = 0; i < len; i++)
          output[i] = (short)(input[i]*scale);
  #endif
      return;
  }
  // Stores input[i]*scale as a short in output[i] for i in [0, len).
  // Nothing is read or written when len <= 0.
  //
  // Note: the assembly path converts with fistp (current FPU rounding mode),
  // the C path with a cast (truncation).
  void ConstFloatToShort(float *input, short *output, int len, float scale)
  {
  #if ASM_FTOSS
      // The paired loop always runs at least once and handles two elements
      // per pass, so it only gets the even-sized prefix.
      int evenLen = len & ~1;
      if (evenLen > 0)
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              xor eax,eax;
              mov ebx,evenLen;
          loop1:
              fld DP[esi+4*eax];
              fmul scale;
              fld DP[esi+4*eax+4];
              fmul scale;
              fxch(1);
              fistp WP[edi+2*eax];
              fistp WP[edi+2*eax+2];
              add eax,2;
              cmp eax,ebx;
              jl loop1;
          }
      }
      // Trailing element of an odd-length array, converted the same way.
      if (len > 0 && (len & 1))
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              mov eax,len;
              fld DP[esi+4*eax-4];
              fmul scale;
              fistp WP[edi+2*eax-2];
          }
      }
  #else
      int i;
      for (i = 0; i < len; i++)
          output[i] = (short)(input[i]*scale);
  #endif
      return;
  }
  212. //------------------------------------------------------
  // Multiplies input[0..len-1] by a power-of-two scale derived from 'newmax'
  // and stores the results as ints in 'output'. Nothing is read or written
  // when len <= 0.
  //
  // The scale is the float whose bit pattern is 0x8E000000 - (newmax << 23).
  // 0x8E000000 is 284 (= 2*127 + 30) placed in the exponent field, so this is
  // the 32-bit counterpart of ScaleFloatToShort's 2*BIAS+14 constant.
  //
  // Note: the assembly path converts with fistp (current FPU rounding mode),
  // the C path with a cast (truncation).
  void ScaleFloatToInt(float *input, int *output, int len, int newmax)
  {
      float scale;
      unsigned int scaleBits = 0x8E000000u - ((unsigned int)newmax << 23);
  #if ASM_FTOSS
      int evenLen = len & ~1;
  #else
      int i;
  #endif
      // Build the float from its bit pattern without a pointer-cast pun.
      memcpy(&scale, &scaleBits, sizeof(scale));
  #if ASM_FTOSS
      // The paired loop always runs at least once and handles two elements
      // per pass, so it only gets the even-sized prefix.
      if (evenLen > 0)
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              xor eax,eax;
              mov ebx,evenLen;
          loop1:
              fld DP[esi+4*eax];
              fmul scale;
              fld DP[esi+4*eax+4];
              fmul scale;
              fxch(1);
              fistp DP[edi+4*eax];
              fistp DP[edi+4*eax+4];
              add eax,2;
              cmp eax,ebx;
              jl loop1;
          }
      }
      // Trailing element of an odd-length array, converted the same way.
      if (len > 0 && (len & 1))
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              mov eax,len;
              fld DP[esi+4*eax-4];
              fmul scale;
              fistp DP[edi+4*eax-4];
          }
      }
  #else
      for (i = 0; i < len; i++)
          output[i] = (int)(input[i]*scale);
  #endif
      return;
  }
  // Stores input[i]*scale as an int in output[i] for i in [0, len).
  // Nothing is read or written when len <= 0.
  //
  // Note: the assembly path converts with fistp (current FPU rounding mode),
  // the C path with a cast (truncation).
  void ConstFloatToInt(float *input, int *output, int len, float scale)
  {
  #if ASM_FTOSS
      // The paired loop always runs at least once and handles two elements
      // per pass, so it only gets the even-sized prefix.
      int evenLen = len & ~1;
      if (evenLen > 0)
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              xor eax,eax;
              mov ebx,evenLen;
          loop1:
              fld DP[esi+4*eax];
              fmul scale;
              fld DP[esi+4*eax+4];
              fmul scale;
              fxch(1);
              fistp DP[edi+4*eax];
              fistp DP[edi+4*eax+4];
              add eax,2;
              cmp eax,ebx;
              jl loop1;
          }
      }
      // Trailing element of an odd-length array, converted the same way.
      if (len > 0 && (len & 1))
      {
          ASM
          {
              mov esi,input;
              mov edi,output;
              mov eax,len;
              fld DP[esi+4*eax-4];
              fmul scale;
              fistp DP[edi+4*eax-4];
          }
      }
  #else
      int i;
      for (i = 0; i < len; i++)
          output[i] = (int)(input[i]*scale);
  #endif
      return;
  }
  272. //------------------------------------------------------
  // Plain C correlation: for every lag in [0, num)
  //     corr[lag] = sum over t in [0, len) of taps[t] * array[lag + t]
  // with the products formed and summed as ints.
  void CorrelateInt(short *taps, short *array, int *corr, int len, int num)
  {
      int lag;

      for (lag = 0; lag < num; lag++)
      {
          const short *window = array + lag;  // samples lined up with taps[0]
          int sum = 0;
          int t;

          for (t = 0; t < len; t++)
              sum += (int)taps[t] * (int)window[t];
          corr[lag] = sum;
      }
  }
  283. #if ASM_CORR
  284. //------------------------------------------------------
  // MMX version of CorrelateInt4 (this copy is compiled when ASM_CORR is non-zero).
  //
  // Results are produced two at a time. For each pair the code accumulates
  // pmaddwd products of 'taps' against 'array' in acc0 and acc1, adds the two
  // 32-bit halves of each accumulator, and stores the sums at byte offsets 0
  // and 16 from the current 'corr' position. 'corr' then advances 32 bytes,
  // 'array' advances 16 bytes and 'ncor' drops by 2; the outer loop repeats
  // while ncor is still > 0.
  //
  // The parameters ntaps, taps, array and ncor are modified in place by the
  // assembly (shr / sub / add on the argument slots).
  //
  // NOTE(review): the inner loop consumes three quadwords (12 taps) per pass
  // and always runs at least once, so ntaps is presumably expected to be a
  // positive multiple of 12 -- confirm against callers.
  // NOTE(review): the loads for the following pass are issued before the loop
  // test, so 'array' and 'taps' are read beyond the quadwords that end up in
  // the sums -- callers presumably pad both buffers; confirm.
  285. void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor)
  286. {
  // Register roles: regaN holds a quadword of 'array', regbN a quadword of
  // 'taps', for pipeline column N; acc0/acc1 are the two running sums.
  287. #define rega0 mm0
  288. #define regb0 mm1
  289. #define rega1 mm2
  290. #define regb1 mm3
  291. #define rega2 mm4
  292. #define regb2 mm5
  293. #define acc0 mm6
  294. #define acc1 mm7
  295. #define arr esi
  296. #define tap edi
  297. #define cor eax
  298. #define icnt ebx
  299. // In the following macros, 'n' is the column number and 'i' is the
  300. // iteration number.
  // la: load array quadword i.   lb: load taps quadword at tap+8*i+8.
  // m0: regb = pmaddwd(regb, rega)          (taps loaded by lb)
  // m1: rega = pmaddwd(rega, [tap+8*i])     (the taps quadword one earlier)
  // a0/a1: add the m0 / m1 products into acc0 / acc1.
  301. #define la(n,i) ASM movq rega##n,QP[arr+8*i]
  302. #define lb(n,i) ASM movq regb##n,QP[tap+8*i+8]
  303. #define m0(n,i) ASM pmaddwd regb##n,rega##n
  304. #define m1(n,i) ASM pmaddwd rega##n,QP[tap+8*i]
  305. #define a0(n,i) ASM paddd acc0,regb##n
  306. #define a1(n,i) ASM paddd acc1,rega##n
  307. ASM
  308. {
  // ntaps becomes a count of 4-tap quadwords; taps is moved back 8 bytes so
  // that lb's "+8" addresses the first real quadword.
  309. shr ntaps,2;
  310. sub taps,8; // point to 1 before start of taps array
  311. mov cor,corr;
  312. ForEachCorrPair:
  313. mov icnt,ntaps;
  314. pxor acc0,acc0;
  315. pxor acc1,acc1;
  316. mov tap,taps;
  317. mov arr,array;
  318. }
  319. // prime the pump
  320. la(0,0);
  321. lb(0,0);
  322. m0(0,0);
  323. ASM pxor rega0,rega0; // to make first a1(0,0) a nop
  324. la(1,1);
  325. lb(1,1);
  // Three pipeline stages per pass; each stage loads the next column while
  // multiplying and accumulating the earlier ones.
  326. inner:
  327. la(2,2);
  328. m0(1,1);
  329. m1(1,1);
  330. a0(0,0);
  331. lb(2,2);
  332. a1(0,0);
  333. la(0,3);
  334. m0(2,2);
  335. m1(2,2);
  336. a0(1,1);
  337. lb(0,3);
  338. a1(1,1);
  339. la(1,4);
  340. m0(0,3);
  341. m1(0,3);
  342. a0(2,2);
  343. lb(1,4);
  344. a1(2,2);
  345. ASM add arr,24;
  346. ASM add tap,24;
  347. ASM sub icnt,3;
  348. ASM jg inner;
  // Fold the last pending m1 product into acc1.
  349. a1(0,0);
  350. // Done with one correlation pair. First need to add halves of
  351. // acc0 and acc1 together and then store 2 results in corr array
  352. ASM
  353. {
  354. movq mm0,acc0;
  355. psrlq acc0,32;
  356. paddd acc0,mm0;
  357. movq mm1,acc1;
  358. psrlq acc1,32;
  359. movd DP[cor],acc0;
  360. paddd acc1,mm1;
  361. movd DP[cor+16],acc1;
  362. add cor,32;
  363. add array,16;
  364. sub ncor,2;
  365. jg ForEachCorrPair;
  // Leave MMX state so x87 code can run afterwards.
  366. emms;
  367. }
  368. }
  369. #undef rega0
  370. #undef regb0
  371. #undef rega1
  372. #undef regb1
  373. #undef rega2
  374. #undef regb2
  375. #undef acc0
  376. #undef acc1
  377. #undef arr
  378. #undef tap
  379. #undef cor
  380. #undef icnt
  381. #undef la
  382. #undef lb
  383. #undef m0
  384. #undef m1
  385. #undef a0
  386. #undef a1
  387. #else
  388. //------------------------------------------------------
  // C version of CorrelateInt4, used when ASM_CORR is zero.
  //
  // Computes 'ncor' correlations spaced four samples apart. Result number r
  // uses the window starting at array[4*r] and is stored at corr[4*r]:
  //     corr[4*r] = sum over t in [0, ntaps) of taps[t] * array[4*r + t]
  // The corr entries in between are not written.
  void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor)
  {
      int r;          // result counter
      int offset;     // 4*r: window start in 'array' and slot in 'corr'

      for (r = 0, offset = 0; r < ncor; r++, offset += 4)
      {
          int sum = 0;
          int t;

          for (t = 0; t < ntaps; t++)
              sum += (int)taps[t] * (int)array[offset + t];
          corr[offset] = sum;
      }
  }
  401. #endif
  402. #if COMPILE_MMX
  403. #undef icnt
  // MMX expansion of 'n' input words into twice as many output words.
  //
  // Following the data flow below, each input quadword (w0 w1 w2 w3) is
  // written out as two quadwords (p w0 w0 w1) and (w1 w2 w2 w3), where p is
  // the last word of the previous input quadword (zero for the first one).
  // In other words every input word is emitted as the pair
  // (previous word, this word).
  //
  // Flow: a priming section, an 'inner' loop handling 8 input words per pass
  // (in += 16 bytes, out += 32 bytes), a 'cleanup' section that drains the
  // pipeline, and 'odd_ends', which handles one further group of 4 words.
  //
  // NOTE(review): when n < 8 the first "jl odd_ends" is taken with 'in' and
  // 'out' still at the start of the buffers, yet odd_ends loads from in+16
  // and stores at out+32/out+40 -- looks wrong for 4 <= n < 8; confirm.
  // NOTE(review): after priming, the loop is entered when icnt == 0
  // ("jl cleanup" not taken) but is left when icnt == 0 ("jg inner" not
  // taken); confirm that the remaining-word accounting is intended.
  404. void ab2abbcw(const short *input, short *output, int n)
  405. {
  406. #define in edi
  407. #define out esi
  408. #define icnt ecx
  // L : load the quadword at in+8*(i/2) (integer division, so L(m,2k) and
  //     L(m,2k+1) fetch the same quadword)
  // PL: duplicate the two low words  -> (w0 w0 w1 w1)
  // PH: duplicate the two high words -> (w2 w2 w3 w3)
  // SL: shift the quadword up by one word (lowest word becomes 0)
  // SR: shift down by three words (only the former top word remains)
  // O : OR register n into register m
  // S : store register m at out+8*i
  409. #define L(m,i) ASM movq mm##m,QP[in+8*(i/2)]
  410. #define PL(m) ASM punpcklwd mm##m,mm##m
  411. #define PH(m) ASM punpckhwd mm##m,mm##m
  412. #define SL(m) ASM psllq mm##m,16
  413. #define SR(m) ASM psrlq mm##m,48
  414. #define O(m,n) ASM por mm##m,mm##n
  415. #define S(m,i) ASM movq QP[out+8*i],mm##m
  416. ASM {
  417. mov in, input;
  418. mov out, output;
  419. mov icnt, n;
  // mm3 starts as zero: it supplies the "previous word" for the first output.
  420. ASM pxor mm3,mm3;
  421. sub icnt, 8;
  422. jl odd_ends;
  423. }
  424. //prime pump
  425. L(0,0);
  426. PL(0);
  427. L(1,1);
  428. SL(0);
  429. PH(1);
  430. SL(1);
  431. O(3,0);
  432. L(2,2);
  433. SR(0);
  434. S(3,0);
  435. PL(2);
  436. ASM sub icnt, 8;
  437. ASM jl cleanup;
  438. inner:
  439. SL(2);
  440. O(0,1);
  // No ';' after the next macro call; the ASM statement ends at the line break.
  441. L(3,3)
  442. SR(1);
  443. S(0,1);
  444. PH(3);
  445. SL(3);
  446. O(1,2);
  447. L(0,4);
  448. SR(2);
  449. S(1,2);
  450. PL(0);
  451. SL(0);
  452. O(2,3);
  453. L(1,5);
  454. SR(3);
  455. S(2,3);
  456. PH(1);
  457. SL(1);
  458. O(3,0);
  459. L(2,6);
  460. SR(0);
  461. S(3,4);
  462. PL(2);
  463. ASM add in, 16;
  464. ASM add out, 32;
  465. ASM sub icnt, 8;
  466. ASM jg inner;
  // Drain: finish the three output quadwords still in flight.
  467. cleanup:
  468. SL(2);
  469. O(0,1);
  470. L(3,2);
  471. SR(1);
  472. S(0,1);
  473. PH(3);
  474. SL(3);
  475. O(1,2);
  476. SR(2);
  477. S(1,2);
  478. O(2,3);
  479. S(2,3);
  // icnt + 4 < 0 means fewer than 4 words are left, so skip to the end.
  480. odd_ends:
  481. ASM add icnt, 8-4;
  482. ASM jl end; // jump if no sign change
  483. L(0,4);
  484. SR(3);
  485. PL(0);
  486. L(1,5);
  487. SL(0);
  488. PH(1);
  489. O(3,0);
  490. SL(1);
  491. SR(0);
  492. S(3,4);
  493. O(0,1);
  494. S(0,5);
  495. end:
  // Leave MMX state so x87 code can run afterwards.
  496. ASM emms;
  497. #undef in
  498. #undef out
  499. #undef icnt
  500. #undef L
  501. #undef PL
  502. #undef PH
  503. #undef SL
  504. #undef SR
  505. #undef O
  506. #undef S
  507. return;
  508. }
  // MMX expansion that writes every 32-bit input unit (two adjacent words
  // "a b") twice in a row, giving "a b a b" in the output.
  //
  // Flow: a priming section and an 'inner' loop that each handle 8 input
  // words (in += 16 bytes, out += 32 bytes), a 'cleanup' section that stores
  // the two quadwords still held in mm2/mm3, and an 'inner_by2' tail loop
  // that handles the remaining input two words at a time.
  509. void ab2ababw(const short *input, short *output, int n)
  510. {
  511. #define in edi
  512. #define out esi
  513. #define icnt ecx
  // L : load the quadword at in+4*i      C : copy register n into register m
  // PL: duplicate the low dword  -> (d0 d0)
  // PH: duplicate the high dword -> (d1 d1)
  // S : store register m at out+8*i (i may be negative once out has advanced)
  514. #define L(m,i) ASM movq mm##m,QP[in+4*i]
  515. #define C(m,n) ASM movq mm##m,mm##n
  516. #define PL(m) ASM punpckldq mm##m,mm##m
  517. #define PH(m) ASM punpckhdq mm##m,mm##m
  518. #define S(m,i) ASM movq [out+8*i],mm##m
  519. ASM {
  520. mov in, input;
  521. mov out, output;
  522. mov icnt, n;
  // Fewer than 8 words in total: go straight to the two-at-a-time tail.
  523. sub icnt, 8;
  524. jl odd_ends;
  525. }
  526. //prime pump
  527. L(0,0);
  528. C(1,0);
  529. PL(0);
  530. L(2,2);
  531. PH(1);
  532. S(0,0);
  533. C(3,2);
  534. S(1,1);
  535. PL(2);
  536. ASM add in, 16;
  537. ASM add out, 32;
  538. ASM sub icnt, 8;
  539. ASM jl cleanup;
  // Each pass first stores the two quadwords left over from the previous
  // group (at out-16 and out-8), then starts the next group.
  540. inner:
  541. L(0,0);
  542. PH(3);
  543. S(2,-2);
  544. C(1,0);
  545. S(3,-1);
  546. PL(0);
  547. L(2,2);
  548. PH(1);
  549. S(0,0);
  550. C(3,2);
  551. S(1,1);
  552. PL(2);
  553. ASM add in, 16;
  554. ASM add out, 32;
  555. ASM sub icnt, 8;
  556. ASM jg inner;
  557. cleanup:
  558. PH(3);
  559. S(2,-2);
  560. S(3,-1);
  // icnt + 6 < 0 means fewer than 2 words are left, so skip to the end.
  561. odd_ends:
  562. ASM add icnt, 8-2;
  563. ASM jl end; // jump if no sign change
  564. inner_by2:
  565. ASM movd mm0, DP[in];
  566. PL(0);
  567. S(0,0);
  568. ASM add in, 4;
  569. ASM add out, 8;
  570. ASM sub icnt, 2;
  571. ASM jge inner_by2;
  572. end:
  // Leave MMX state so x87 code can run afterwards.
  573. ASM emms;
  574. return;
  575. }
  576. #undef in
  577. #undef out
  578. #undef icnt
  579. #undef L
  580. #undef C
  581. #undef PL
  582. #undef PH
  583. #undef S
  // MMX multiply-accumulate of 'input1' against successively advanced
  // windows of 'input2', producing two 64-bit accumulator values per outer
  // pass.
  //
  // What the code visibly does per outer pass (ForEachCorrPair):
  //   - accumulates pmaddwd products in acc0 and acc1 (acc1 pairs each
  //     input2 quadword with the input1 quadword at the same index, acc0
  //     with the input1 quadword one earlier);
  //   - moves 'out' back 16 bytes and stores acc0 at out-16 and acc1 at
  //     out-8, each as a whole quadword (the two 32-bit halves are not added
  //     together here);
  //   - advances the 'input2' argument by 16 bytes, reloads in1 from
  //     'input1', and lowers the halved count kept in 'ncor' by 2.
  // 'out' starts at output + 4*ncor + 16 bytes, so the results are laid down
  // from the high end of 'output' towards its start.
  //
  // The parameters ncor and input2 are modified in place.
  // NOTE(review): the exact relationship between 'ncor' and the buffer
  // lengths is not visible here -- see the read-ahead note below and confirm
  // against callers.
  584. void ConvMMX(short *input1, short *input2, int *output, int ncor)
  585. {
  586. #define rega0 mm0
  587. #define regb0 mm1
  588. #define rega1 mm2
  589. #define regb1 mm3
  590. #define rega2 mm4
  591. #define regb2 mm5
  592. #define acc0 mm6
  593. #define acc1 mm7
  594. #define in2 esi
  595. #define in1 edi
  596. #define out eax
  597. #define icnt ecx
  598. #define tmp ebx
  599. // In the following macros, 'n' is the column number and 'i' is the
  600. // iteration number.
  601. // we use "the convolution trick" or using la twice so that one
  602. // of the pmadd's is reg,reg and thus can be in the V-slot.
  603. // NOTE: we have read ahead up to 2 quadwords
  604. // so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
  605. // and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal
  // la: load in2 quadword i.   lb: load in1 quadword i-1.
  // m0: regb = pmaddwd(regb, rega).   m1: rega = pmaddwd(rega, in1 quadword i).
  // a0/a1: add the m0 / m1 products into acc0 / acc1.
  606. #define la(n,i) ASM movq rega##n,QP[in2+8*i]
  607. #define lb(n,i) ASM movq regb##n,QP[in1+8*i-8]
  608. #define m0(n,i) ASM pmaddwd regb##n,rega##n
  609. #define m1(n,i) ASM pmaddwd rega##n,QP[in1+8*i]
  610. #define a0(n,i) ASM paddd acc0,regb##n
  611. #define a1(n,i) ASM paddd acc1,rega##n
  612. ASM
  613. {
  // tmp = 4*ncor (a byte offset); ncor itself is then halved in place and
  // used as the inner-loop count.
  614. mov tmp,ncor;
  615. shl tmp,2;
  616. shr ncor,1;
  617. mov out,output;
  618. add out,tmp;
  619. add out,16;
  620. mov in1,input1;
  621. mov in2,input2;
  622. mov icnt,ncor;
  623. }
  624. ForEachCorrPair:
  625. // prime the pump
  626. la(0,0);
  627. ASM pxor regb0,regb0; // to avoid lb(0,0) reading taps[-1]
  628. la(1,1);
  629. ASM pxor acc0,acc0; // clear accumulator
  630. m1(0,0);
  631. ASM pxor acc1,acc1; // clear accumulator
  632. lb(1,1);
  633. ASM sub icnt, 1; // account for pump priming
  634. ASM jle cleanup; // bypass if only one to do
  // Three pipeline stages per pass (icnt drops by 3, pointers advance 24 bytes).
  635. inner:
  636. la(2,2);
  637. m0(1,1);
  638. m1(1,1);
  639. a0(0,0);
  640. lb(2,2);
  641. a1(0,0);
  642. la(0,3);
  643. m0(2,2);
  644. m1(2,2);
  645. a0(1,1);
  646. lb(0,3);
  647. a1(1,1);
  648. la(1,4);
  649. m0(0,3);
  650. m1(0,3);
  651. a0(2,2);
  652. lb(1,4);
  653. a1(2,2);
  654. ASM add in2,24;
  655. ASM add in1,24;
  656. ASM sub icnt,3;
  657. ASM jg inner;
  658. cleanup: // last two adds
  659. a0(0,0);
  660. a1(0,0);
  661. // Done with one correlation pair. Pack and store 2 results in corr array
  662. ASM
  663. {
  664. sub out,16;
  665. mov in2, input2;
  666. mov in1,input1;
  667. add in2,16;
  668. mov icnt, ncor;
  669. mov input2, in2;
  // The flags set by this sub are the ones tested by the jg below; the movq
  // and mov instructions in between leave them untouched.
  670. sub icnt,2; //set flags for jump
  671. movq QP[out-16],acc0;
  672. movq QP[out-8],acc1;
  673. mov ncor, icnt;
  674. jg ForEachCorrPair;
  // Leave MMX state so x87 code can run afterwards.
  675. emms;
  676. }
  677. }
  678. #undef rega0
  679. #undef regb0
  680. #undef rega1
  681. #undef regb1
  682. #undef rega2
  683. #undef regb2
  684. #undef acc0
  685. #undef acc1
  686. #undef in2
  687. #undef in1
  688. #undef out
  689. #undef icnt
  690. #undef tmp
  691. #undef la
  692. #undef lb
  693. #undef m0
  694. #undef m1
  695. #undef a0
  696. #undef a1
  697. // 16 bit output
  698. // psrad acc0,16;//this could be less in some cases
  699. // psrad acc1,16;
  700. // packssdw acc1,acc0;
  701. // movq QP[cor-8],acc0;
  702. //#else
  703. //------------------------------------------------------
  704. /*
  705. void ConvMMX(short *in1, short *in2, int *out, int ncor)
  706. {
  707. int i,j;
  708. for (i=0; i < 2*ncor; i+=4) {
  709. int acc0 = 0, acc1 = 0;
  710. for (j=0; j < 2*ncor - i; j+=4) {
  711. acc0 += (int)taps[j]*array[i+j] + (int)taps[j+1]*array[i+j+1];
  712. acc1 += (int)taps[j+2]*array[i+j+2] + (int)taps[j+3]*array[i+j+3];
  713. }
  714. corr[i/2] = acc0 ;
  715. corr[i/2+1] = acc1 ;
  716. }
  717. return;
  718. }*/
  // Expands 'n' input shorts into 2*n output shorts, treating the data as
  // 32-bit words (two shorts at a time, as raw bits).
  //
  // For every input word w[k], k in [0, n/2):
  //     out word 2k   = w[k]
  //     out word 2k+1 = (w[k] << 16) | (w[k-1] >> 16)     with w[-1] taken as 0
  // Words are processed from the last one down to the first, reading the
  // predecessor before the current word, exactly as the unrolled original did.
  void ab2abzaw(const short *input, short *output, int n)
  {
      unsigned *src = (unsigned *)input;
      unsigned *dst = (unsigned *)output;
      unsigned prev, cur;
      int k;

      for (k = n/2 - 1; k >= 0; k--)
      {
          prev = (k > 0) ? src[k-1] : 0;
          cur = src[k];
          dst[2*k] = cur;
          dst[2*k+1] = ((cur << 16) | (prev >> 16));
      }
      return;
  }
  746. void ShortToFloatScale(short *x, float scale, int N, float *y)
  747. {
  748. /*
  749. short i;
  750. float yy[100];
  751. for (i=0; i<N; i++)
  752. { yy[i]=x[i]*scale; }
  753. ASM
  754. {
  755. mov esi,x;
  756. mov edi,y;
  757. lea ecx,scale;
  758. mov eax, N
  759. sub eax, 2
  760. loop1:
  761. fild WORD PTR [esi+eax*2]
  762. fmul DWORD PTR [ecx]
  763. fstp DWORD PTR [edi+eax*4]
  764. fild WORD PTR [esi+eax*2+2]
  765. fmul DWORD PTR [ecx]
  766. fstp DWORD PTR [edi+eax*4+4]
  767. sub eax, 2
  768. jge loop1;
  769. }
  770. */
  771. ASM
  772. {
  773. mov esi,x;
  774. mov edi,y;
  775. lea ecx,scale;
  776. mov eax, N
  777. sub eax, 6
  778. fld DP [ecx] ; c
  779. fild WORD PTR [esi+eax*2+8] ; L0 c
  780. fild WORD PTR [esi+eax*2+10] ; L1 L0 c
  781. fxch ST(1) ; L0 L1 c
  782. fmul ST(0), ST(2) ; M0 L1 c
  783. fxch ST(1) ; L1 M0 c
  784. fmul ST(0),ST(2) ; M1 M0 c
  785. fild WORD PTR [esi+eax*2+4] ; L0 M1 M0 c
  786. fild WORD PTR [esi+eax*2+6]; L1 L0 M1 M0 c
  787. fxch ST(3) ; M0 L0 M1 L1 c
  788. fstp DWORD PTR [edi+eax*4+16]; L0 M1 L1 c
  789. loop1: ; L0 M1 L1 c
  790. fmul ST(0),ST(3) ; M0 M1 L1 c
  791. fxch ST(1) ; M1 M0 L1 c
  792. fstp DWORD PTR [edi+eax*4+20]; M0 L1 c
  793. fxch ST(1) ; L1 M0 c
  794. fmul ST(0),ST(2) ; M1 M0 c
  795. fild WORD PTR [esi+eax*2] ; L0 M1 M0 c
  796. fild WORD PTR [esi+eax*2+2] ; L1 L0 M1 M0 c
  797. fxch ST(3) ; M0 L0 M1 L1 c
  798. fstp DWORD PTR [edi+eax*4+8]; L0 M1 L1 c
  799. sub eax, 2
  800. jge loop1;
  801. fmul ST(0),ST(3) ;eax==-2 M0 M1 L1 c
  802. fxch ST(1) ; M1 M0 L1 c
  803. fstp DWORD PTR [edi+eax*4+20] ; M0 L1 c
  804. fxch ST(1) ; L1 M0 c
  805. fmulp ST(2), st(0) ; M0 M1
  806. fstp DWORD PTR [edi+eax*4+8] ; M1
  807. fstp DWORD PTR [edi+eax*4+12] ;
  808. }
  809. /*
  810. for (i=0; i<N; i++)
  811. {
  812. if (y[i]!=yy[i])
  813. {
  814. fprintf(stdout,"\nfloat problem\n");
  815. break;
  816. }
  817. }
  818. */
  819. }
  820. //assumes N is even
  821. void IntToFloatScale(int *x, float scale, int N, float *y)
  822. {
  823. #if I2FTEST //test code
  824. int i;
  825. float yy[1000];
  826. for (i=0; i<N; i++)
  827. { yy[i]=(float)x[i]*scale; }
  828. #endif //test code
  829. #if 0 //simple code
  830. //simple assembly version
  831. ASM
  832. {
  833. mov esi,x;
  834. mov edi,y;
  835. lea ecx,scale;
  836. mov eax, N
  837. sub eax, 2
  838. loop1:
  839. fild DWORD PTR [esi+eax*4]
  840. fmul DWORD PTR [ecx]
  841. fstp DWORD PTR [edi+eax*4]
  842. fild DWORD PTR [esi+eax*4+4]
  843. fmul DWORD PTR [ecx]
  844. fstp DWORD PTR [edi+eax*4+4]
  845. sub eax, 2
  846. jge loop1;
  847. }
  848. #endif //test code
  849. ASM
  850. {
  851. mov esi,x;
  852. mov edi,y;
  853. lea ecx,scale;
  854. mov eax, N
  855. sub eax, 6
  856. fld DP [ecx] ; c
  857. fild DWORD PTR [esi+eax*4+16] ; L0 c
  858. fild DWORD PTR [esi+eax*4+20] ; L1 L0 c
  859. fxch ST(1) ; L0 L1 c
  860. fmul ST(0), ST(2) ; M0 L1 c
  861. fxch ST(1) ; L1 M0 c
  862. fmul ST(0),ST(2) ; M1 M0 c
  863. fild DWORD PTR [esi+eax*4+8] ; L0 M1 M0 c
  864. fild DWORD PTR [esi+eax*4+12];L1 L0 M1 M0 c
  865. fxch ST(3) ; M0 L0 M1 L1 c
  866. fstp DWORD PTR [edi+eax*4+16]; L0 M1 L1 c
  867. loop1: ; L0 M1 L1 c
  868. fmul ST(0),ST(3) ; M0 M1 L1 c
  869. fxch ST(1) ; M1 M0 L1 c
  870. fstp DWORD PTR [edi+eax*4+20]; M0 L1 c
  871. fxch ST(1) ; L1 M0 c
  872. fmul ST(0),ST(2) ; M1 M0 c
  873. fild DWORD PTR [esi+eax*4] ; L0 M1 M0 c
  874. fild DWORD PTR [esi+eax*4+4] ;L1 L0 M1 M0 c
  875. fxch ST(3) ; M0 L0 M1 L1 c
  876. fstp DWORD PTR [edi+eax*4+8]; L0 M1 L1 c
  877. sub eax, 2
  878. jge loop1;
  879. fmul ST(0),ST(3) ;eax==-2 M0 M1 L1 c
  880. fxch ST(1) ; M1 M0 L1 c
  881. fstp DWORD PTR [edi+eax*4+20] ; M0 L1 c
  882. fxch ST(1) ; L1 M0 c
  883. fmulp ST(2), st(0) ; M0 M1
  884. fstp DWORD PTR [edi+eax*4+8] ; M1
  885. fstp DWORD PTR [edi+eax*4+12] ;
  886. }
  887. #if I2FTEST
  888. for (i=0; i<N; i++)
  889. {
  890. if (y[i]!=yy[i])
  891. {
  892. printf("F2I %3d %8f %8f\n", i, y[i], yy[i]);
  893. }
  894. }
  895. #endif //test code
  896. }
  897. //assumes N is even
  898. void IntToFloat(int *x, int N, float *y)
  899. {
  900. #if I2FTEST //test code
  901. int i;
  902. float yy[1000];
  903. for (i=0; i<N; i++)
  904. { yy[i]=(float)x[i]; }
  905. #endif //test code
  906. //simple assembly version
  907. ASM
  908. {
  909. mov esi,x;
  910. mov edi,y;
  911. mov eax, N
  912. sub eax, 2
  913. loop1:
  914. fild DWORD PTR [esi+eax*4]
  915. fild DWORD PTR [esi+eax*4+4]
  916. fxch ST(1) ;
  917. fstp DWORD PTR [edi+eax*4]
  918. fstp DWORD PTR [edi+eax*4+4]
  919. sub eax, 2
  920. jge loop1;
  921. }
  922. #if I2FTEST
  923. for (i=0; i<N; i++)
  924. {
  925. if (y[i]!=yy[i])
  926. {
  927. printf("F2I %3d %8f %8f\n", i, y[i], yy[i]);
  928. }
  929. }
  930. #endif //test code
  931. }
  932. #endif