Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1020 lines
21 KiB

  1. // cb53mmx.c
  2. #include "cst_lbc.h"
  3. #include "mmxutil.h"
  4. #include "opt.h"
  5. #include "exc_lbc.h"
  6. #include "timer.h"
  7. #include <math.h>
  8. #include <stdlib.h>
  9. #include <stdio.h>
  10. #include "util_lbc.h"
// Build-time switches for this file.
// ASM_CORHPL / ASM_CORHDL: nonzero selects the inline-MMX body of
// cor_h_prodloop() / cor_h_diag(); zero selects the plain-C body that
// follows the matching #else in each function.
#define ASM_CORHPL 1
#define ASM_CORHDL 1
// TESTME / CHTEST: nonzero compiles in the printf-based self-checks guarded
// by "#if TESTME" and "#if CHTEST" further down in this file.
#define TESTME 0
#define CHTEST 0
// Everything from here to the "#endif //COMPILE_MMX" line is compiled only
// when COMPILE_MMX is nonzero.
#if COMPILE_MMX
// Forward declarations of routines that are defined later in this file.
void CorrelateIntTri(short *taps, short *array, int *corr, int ncor);
void CorrelateInt22(short *taps, short *array, int *corr, int ncor);
void Cor_h_Xint(short h[],short X[],int D[]);
void Cor_hint0(short *H, int *rr);
void Cor_hint1(short *H, int *rr);
void cor_h_prodloop(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0);
void cor_h_diag(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0);
  23. //------------------------------------------------------------
  24. int ACELP_LBC_code_int(float X[], float h[], int T0, float code[],
  25. int *ind_gain, int *shift, int *sign, float gain_T0, int flags)
  26. {
  27. int i, index;
  28. float gain_q;
  29. float Dn[SubFrLen2], tmp_code[SubFrLen2];
  30. float rr[DIM_RR];
  31. DECLARE_INT(rrint, DIM_RR);
  32. DECLARE_SHORT(hint, SubFrLen2);
  33. DECLARE_INT(Dnint, SubFrLen2);
  34. DECLARE_SHORT(Xint, SubFrLen2);
  35. int XScale;
  36. float hScale;
  37. int m;
  38. #if 0//TESTME
  39. float htest[SubFrLen], Xtest[SubFrLen];
  40. for (i = 0; i<SubFrLen; i++)
  41. {
  42. htest[i] = i; //(float)(i<30?i:60-i);
  43. Xtest[i] = (float)(i<30?i:60-i);
  44. }
  45. h = htest;
  46. X = Xtest;
  47. #endif //TESTME
  48. // Include fixed-gain pitch contribution into impulse resp. h[]
  49. if (T0 < SubFrLen-2)
  50. for (i = T0; i < SubFrLen; i++)
  51. h[i] += gain_T0*h[i-T0];
  52. ALIGN_ARRAY(rrint);
  53. ALIGN_ARRAY(hint);
  54. ALIGN_ARRAY(Dnint);
  55. ALIGN_ARRAY(Xint);
  56. //hScale = FloatToShortScaled(h, hint, SubFrLen, 3);
  57. hScale = (float)sqrt(DotProd(h,h,SubFrLen)/(double)SubFrLen);
  58. m = (asint(hScale) & 0x7f800000) >> 23;
  59. ScaleFloatToShort(h, hint, SubFrLen, m+3);
  60. XScale = FloatToShortScaled(X, Xint, SubFrLen, 3); //would be better to normalize based on engery, not max
  61. #if 0
  62. for (i = 0; i<SubFrLen; i++)
  63. {
  64. hint[i] = i;
  65. }
  66. #endif
  67. // Compute correlations of h[] needed for the codebook search
  68. //TIMER_STAMP(a);
  69. Cor_hint1(hint, rrint);
  70. IntToFloat(rrint, DIM_RR, rr);
  71. //TIMER_STAMP(b);
  72. // Cor_h(h, rr);
  73. ////TIMER_STAMP(c);
  74. #if CHTEST
  75. {
  76. DECLARE_INT(rrint2, DIM_RR);
  77. ALIGN_ARRAY(rrint2);//debug
  78. Cor_hint0(hint, rrint2);
  79. for(i = 0; i<DIM_RR; i++) //debug
  80. if(rrint[i] != rrint2[i])
  81. printf("%3d: %8d %8d %8d\n",i, rrint[i], rrint2[i], rrint[i] - rrint2[i]);
  82. }
  83. #endif //CHTEST
  84. // Compute correlation of target vector with impulse response.
  85. //TIMER_STAMP(c);
  86. Cor_h_Xint(hint, Xint, Dnint);
  87. //TIMER_STAMP(d);
  88. IntToFloat(Dnint, SubFrLen, Dn);
  89. //TIMER_STAMP(a);
  90. #if TESTME //test
  91. {
  92. int fpDnint[SubFrLen2];
  93. // float scale;
  94. // scale =
  95. Cor_h_X(h,X,Dn);
  96. FloatToIntScaled(Dn, fpDnint, SubFrLen, 7);
  97. for (i = 0; i<SubFrLen; i++)
  98. if(fpDnint[i] != Dnint[i])
  99. printf("%3d: %8x %8x %8x\n", i, Dnint[i] - fpDnint[i],Dnint[i], fpDnint[i]);
  100. }
  101. #endif //test
  102. // Find codebook index
  103. //TIMER_STAMP(c);
  104. index = D4i64_LBC(Dn, rr, h, tmp_code, rr, shift, sign, flags);
  105. //TIMER_STAMP(f);
  106. // Compute innovation vector gain.
  107. // Include fixed-gain pitch contribution into code[].
  108. *ind_gain = G_code(X, rr, &gain_q);
  109. for (i=0; i < SubFrLen; i++)
  110. code[i] = tmp_code[i]*gain_q;
  111. if(T0 < SubFrLen-2)
  112. for (i=T0; i < SubFrLen; i++)
  113. code[i] += code[i-T0]*gain_T0;
  114. return index;
  115. }
  116. //---------------------------------------------------------------
  117. //---------------------------------------------------------------
  118. void Cor_hint0(short *H, int *rr)
  119. {
  120. // Compute correlations of h[] needed for the codebook search.
  121. // h[] :Impulse response.
  122. // rr[] :Correlations.
  123. int *rri0i0, *rri1i1, *rri2i2, *rri3i3;
  124. int *rri0i1, *rri0i2, *rri0i3;
  125. int *rri1i2, *rri1i3, *rri2i3;
  126. int *p0, *p1, *p2, *p3;
  127. int cor;
  128. int i, k, m, t;
  129. DECLARE_SHORT(h,SubFrLen2);
  130. DECLARE_SHORT(h2,SubFrLen2);
  131. ALIGN_ARRAY(h);
  132. ALIGN_ARRAY(h2);
  133. for(i=0; i<4; i++)
  134. h[i] = (short)0;
  135. for(i=0; i<SubFrLen; i++)
  136. h2[i+2] = h[i+4] = H[i];
  137. // Init pointers
  138. rri0i0 = rr;
  139. rri1i1 = rri0i0 + NB_POS;
  140. rri2i2 = rri1i1 + NB_POS;
  141. rri3i3 = rri2i2 + NB_POS;
  142. rri0i1 = rri3i3 + NB_POS;
  143. rri0i2 = rri0i1 + MSIZE;
  144. rri0i3 = rri0i2 + MSIZE;
  145. rri1i2 = rri0i3 + MSIZE;
  146. rri1i3 = rri1i2 + MSIZE;
  147. rri2i3 = rri1i3 + MSIZE;
  148. // Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]
  149. cor = 0;
  150. m = 0;
  151. for(i=NB_POS-1; i>=0; i--)
  152. {
  153. cor += h[m+0]*h[m+0] + h[m+1]*h[m+1]; rri3i3[i] = cor;
  154. cor += h[m+2]*h[m+2] + h[m+3]*h[m+3]; rri2i2[i] = cor;
  155. cor += h[m+4]*h[m+4] + h[m+5]*h[m+5]; rri1i1[i] = cor;
  156. cor += h[m+6]*h[m+6] + h[m+7]*h[m+7]; rri0i0[i] = cor;
  157. m += 8;
  158. }
  159. // Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
  160. h2 = h+2;
  161. p3 = rri2i3 + MSIZE-1;
  162. p2 = rri1i2 + MSIZE-1;
  163. p1 = rri0i1 + MSIZE-1;
  164. p0 = rri0i3 + MSIZE-2;
  165. for (k=0; k<NB_POS; k++)
  166. {
  167. cor = 0;
  168. m = 0;
  169. t = 0;
  170. for(i=k+1; i<NB_POS; i++)
  171. {
  172. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  173. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
  174. cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
  175. cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
  176. t -= (NB_POS+1);
  177. m += 8;
  178. }
  179. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  180. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
  181. cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
  182. h2 += STEP;
  183. p3 -= NB_POS;
  184. p2 -= NB_POS;
  185. p1 -= NB_POS;
  186. p0 -= 1;
  187. }
  188. // Compute elements of: rri0i2[], rri1i3[]
  189. h2 = h+4;
  190. p3 = rri1i3 + MSIZE-1;
  191. p2 = rri0i2 + MSIZE-1;
  192. p1 = rri1i3 + MSIZE-2;
  193. p0 = rri0i2 + MSIZE-2;
  194. for (k=0; k<NB_POS; k++)
  195. {
  196. cor = 0;
  197. m = 0;
  198. t = 0;
  199. for(i=k+1; i<NB_POS; i++)
  200. {
  201. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  202. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
  203. cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
  204. cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
  205. t -= (NB_POS+1);
  206. m += 8;
  207. }
  208. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  209. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
  210. h2 += STEP;
  211. p3 -= NB_POS;
  212. p2 -= NB_POS;
  213. p1 -= 1;
  214. p0 -= 1;
  215. }
  216. // Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
  217. h2 = h+6;
  218. p3 = rri0i3 + MSIZE-1;
  219. p2 = rri2i3 + MSIZE-2;
  220. p1 = rri1i2 + MSIZE-2;
  221. p0 = rri0i1 + MSIZE-2;
  222. for (k=0; k<NB_POS; k++)
  223. {
  224. cor = 0;
  225. m = 0;
  226. t = 0;
  227. for(i=k+1; i<NB_POS; i++)
  228. {
  229. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  230. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
  231. cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
  232. cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
  233. t -= (NB_POS+1);
  234. m += 8;
  235. }
  236. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  237. h2 += STEP;
  238. p3 -= NB_POS;
  239. p2 -= 1;
  240. p1 -= 1;
  241. p0 -= 1;
  242. }
  243. return;
  244. }
  245. //---------------------------------------------------------------
  246. void cor_h_prods(int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0,int dp3,int dp2,int dp1,int dp0){
  247. int k;
  248. for (k=0; k<NB_POS; k++)
  249. {
  250. cor_h_prodloop(NB_POS-(k+1),oddn,h,h2,p3,p2,p1,p0);
  251. h2 += STEP;
  252. p3 -= dp3;
  253. p2 -= dp2;
  254. p1 -= dp1;
  255. p0 -= dp0;
  256. }
  257. return;
  258. }
// The MMX bodies below load EBP inside inline assembly; silence the MSVC
// warning about that on the compiler versions/targets that report it.
#if _MSC_FULL_VER >= 13008827 && defined(_M_IX86)
#pragma warning(disable:4731) // EBP modified with inline asm
#endif
// cor_h_prodloop
// Running sum of products h[i]*h2[i], taken two products at a time. After
// each pair the running sum is stored, cycling through the destinations
// p3, p2, p1, p0; after every p0 store all four destinations step back by
// NB_POS+1 ints and the inputs advance by 8 shorts.
//
//   n      : number of complete groups of four stores.
//   oddn   : 0..3 extra stores after the last complete group
//            (p3 first, then p2, then p1).
//   h, h2  : the two input sequences.
//   p3..p0 : addresses of the first element written in each destination.
//
// The MMX path is software pipelined: quadwords are loaded and multiplied
// ahead of the point where their sums are stored, so both inputs are read
// beyond the last sample that contributes to a stored value.
void cor_h_prodloop(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0)
{
#if ASM_CORHPL
// From here on n is the total number of stores to perform.
n = n * 4 + oddn;
// Register roles: in = h, inoff = byte distance from h to h2, out = p0,
// and eax/ebx/ebp = byte distances from p0 to p3/p2/p1.
#define in edi
#define inoff edx
#define out esi
#define out3 out+eax
#define out2 out+ebx
#define out1 out+ebp
#define out0 out
// L: load quadword n of h.            M: multiply-add with quadword n of h2.
// S: move the high dword sum into the low dword.
// AH/AL: add one register's sums into another (same instruction, two names).
// WH/WL: store the low dword to destination o (same instruction, two names).
#define L(m,n) ASM movq mm##m, QP[in+8*n]
#define M(m,n) ASM pmaddwd mm##m, QP[in+inoff+8*n]
#define S(m) ASM psrlq mm##m, 32
#define AH(m,n) ASM paddd mm##m, mm##n
#define WH(m,o) ASM movd DP[out##o], mm##m
#define AL(m,n) ASM paddd mm##m, mm##n
#define WL(m,o) ASM movd DP[out##o], mm##m
// EBP is saved here and restored at innerdone. p1 is the last argument
// fetched, and it is fetched by the same instruction that overwrites EBP.
ASM {
push ebp;
mov ecx, n;
mov in, h;
mov inoff, h2;
sub inoff, in;
mov out, p0;
mov eax, p3;
mov ebx, p2;
mov ebp, p1;
sub eax, out;
sub ebx, out;
sub ebp, out;
}
// Prime the pipeline: mm3 holds the first value to be stored, mm0 the
// second pair sum (moved to the low dword), mm1 the products of quadword 1.
L(0,0);
ASM pxor mm3,mm3;
M(0,0);
L(1,1);
AL(3,0); //really a copy
M(1,1);
S(0);
// Fewer than 8 stores requested: skip the main loop.
ASM sub ecx,8;
ASM jl oddends;
// Main loop: 8 stores per iteration (two groups of four), consuming four
// quadwords of each input.
inner:
L(2,2);
AH(0,3);
WL(3,3);
WH(0,2);
AL(0,1);
M(2,2);
S(1);
L(3,3);
AH(1,0);
WL(0,1);
WH(1,0);
AL(1,2);
M(3,3);
S(2);
// First group of four stored: step the destinations back by NB_POS+1 ints.
ASM sub out, 4*(NB_POS+1);
L(0,4);
AH(2,1);
WL(1,3);
WH(2,2);
AL(2,3);
M(0,4);
S(3);
L(1,5);
AH(3,2);
WL(2,1);
WH(3,0);
AL(3,0);
M(1,5);
S(0);
// Second group of four stored: step back again and advance the inputs.
ASM sub out, 4*(NB_POS+1);
ASM add in, 16*2;
ASM sub ecx, 8;
ASM jge inner;
// Fewer than 8 stores remain. If at least 4 remain, do one more group of
// four and then up to three single stores; otherwise go to cleanup.
oddends:
ASM add ecx, 4;
ASM jl cleanup;
//four more
L(2,2);
AH(0,3);
WL(3,3);
WH(0,2);
AL(0,1);
M(2,2);
S(1);
L(3,3);
AH(1,0);
WL(0,1);
WH(1,0);
AL(1,2);
M(3,3);
S(2);
ASM sub out, 4*(NB_POS+1);
AH(2,1);
// ecx now counts the stores still owed (0..3); each dec/jl pair below
// gates exactly one store.
ASM dec ecx;
ASM jl innerdone;
WL(1,3);
ASM dec ecx;
ASM jl innerdone;
WH(2,2);
AL(2,3);
ASM dec ecx;
ASM jl innerdone;
WL(2,1);
ASM jmp innerdone;
// Fewer than 4 stores remain: restore the count (0..3) and emit them one
// at a time.
cleanup:
ASM add ecx, 4;
ASM dec ecx;
ASM jl innerdone;
AH(0,3);
WL(3,3);
ASM dec ecx;
ASM jl innerdone;
WH(0,2);
AL(0,1);
ASM dec ecx;
ASM jl innerdone;
WL(0,1);
// Leave MMX state and restore the EBP saved at entry.
innerdone:
ASM emms;
ASM pop ebp;
#undef in
#undef inoff
#undef out
#undef out3
#undef out2
#undef out1
#undef out0
#undef L
#undef M
#undef S
#undef AH
#undef WH
#undef AL
#undef WL
#else //ASM_CORHPL
// Plain-C version: n complete groups of four stores, then oddn extra stores.
int cor;
int i,m,t;
cor = 0;
m = 0;
t = 0;
for(i=n; i; i--)
{
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
t -= (NB_POS+1);
m += 8;
}
if(oddn >= 1) {
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
if(oddn >= 2) {
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
if(oddn >= 3) {
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
}
}
}
#endif //ASM_CORHPL
return;
}
  425. void cor_h_diag(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0)
  426. {
  427. #if ASM_CORHDL
  428. n = n * 4 + oddn;
  429. #define in edi
  430. #define inoff edx
  431. #define out esi
  432. #define out3 out+eax
  433. #define out2 out+ebx
  434. #define out1 out+ebp
  435. #define out0 out
  436. #define L(m,n) ASM movq mm##m, QP[in+8*n]
  437. #define M(m,n) ASM pmaddwd mm##m, QP[in+inoff+8*n]
  438. #define R(m) ASM psrad mm##m, 1
  439. #define S(m) ASM psrlq mm##m, 32
  440. #define AH(m,n) ASM paddd mm##m, mm##n
  441. #define WH(m,o) ASM movd DP[out##o], mm##m
  442. #define AL(m,n) ASM paddd mm##m, mm##n
  443. #define WL(m,o) ASM movd DP[out##o], mm##m
  444. ASM {
  445. push ebp;
  446. mov ecx, n;
  447. mov in, h;
  448. mov inoff, h2;
  449. sub inoff, in;
  450. mov out, p0;
  451. mov eax, p3;
  452. mov ebx, p2;
  453. mov ebp, p1;
  454. sub eax, out;
  455. sub ebx, out;
  456. sub ebp, out;
  457. }
  458. L(0,0);
  459. ASM pxor mm3,mm3;
  460. M(0,0);
  461. L(1,1);
  462. AL(3,0); //really a copy
  463. M(1,1);
  464. R(0);
  465. S(0);
  466. ASM sub ecx,8;
  467. ASM jl oddends;
  468. inner:
  469. L(2,2);
  470. AH(0,3);
  471. WL(3,3);
  472. R(1);
  473. WH(0,2);
  474. AL(0,1);
  475. M(2,2);
  476. S(1);
  477. L(3,3);
  478. AH(1,0);
  479. WL(0,1);
  480. R(2);
  481. WH(1,0);
  482. AL(1,2);
  483. M(3,3);
  484. S(2);
  485. ASM sub out, 4*1;
  486. L(0,4);
  487. AH(2,1);
  488. WL(1,3);
  489. R(3);
  490. WH(2,2);
  491. AL(2,3);
  492. M(0,4);
  493. S(3);
  494. L(1,5);
  495. AH(3,2);
  496. WL(2,1);
  497. R(0);
  498. WH(3,0);
  499. AL(3,0);
  500. M(1,5);
  501. S(0);
  502. ASM sub out, 4*1;
  503. ASM add in, 16*2;
  504. ASM sub ecx, 8;
  505. ASM jge inner;
  506. oddends:
  507. ASM add ecx, 4;
  508. ASM jl cleanup;
  509. //four more
  510. L(2,2);
  511. AH(0,3);
  512. WL(3,3);
  513. R(1);
  514. WH(0,2);
  515. AL(0,1);
  516. M(2,2);
  517. S(1);
  518. L(3,3);
  519. AH(1,0);
  520. WL(0,1);
  521. R(2);
  522. WH(1,0);
  523. AL(1,2);
  524. M(3,3);
  525. S(2);
  526. ASM sub out, 4*1;
  527. AH(2,1);
  528. ASM dec ecx;
  529. ASM jl innerdone;
  530. WL(1,3);
  531. ASM dec ecx;
  532. ASM jl innerdone;
  533. WH(2,2);
  534. AL(2,3);
  535. ASM dec ecx;
  536. ASM jl innerdone;
  537. WL(2,1);
  538. ASM jmp innerdone;
  539. cleanup:
  540. ASM add ecx, 4;
  541. ASM dec ecx;
  542. ASM jl innerdone;
  543. AH(0,3);
  544. WL(3,3);
  545. ASM dec ecx;
  546. ASM jl innerdone;
  547. WH(0,2);
  548. AL(0,1);
  549. ASM dec ecx;
  550. ASM jl innerdone;
  551. WL(0,1);
  552. innerdone:
  553. ASM emms;
  554. ASM pop ebp;
  555. #undef in
  556. #undef inoff
  557. #undef out
  558. #undef out3
  559. #undef out2
  560. #undef out1
  561. #undef out0
  562. #undef L
  563. #undef M
  564. #undef R
  565. #undef S
  566. #undef AH
  567. #undef WH
  568. #undef AL
  569. #undef WL
  570. #else //ASM_CORHDL
  571. int cor;
  572. int i,m,t;
  573. cor = 0;
  574. m = 0;
  575. t = 0;
  576. for(i=n; i; i--)
  577. {
  578. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor>>1;
  579. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor>>1;
  580. cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor>>1;
  581. cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor>>1;
  582. t -= 1;
  583. m += 8;
  584. }
  585. if(oddn >= 1) {
  586. cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
  587. if(oddn >= 2) {
  588. cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
  589. if(oddn >= 3) {
  590. cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
  591. }
  592. }
  593. }
  594. #endif //ASM_CORHDL
  595. return;
  596. }
  597. void Cor_hint1(short *H, int *rr)
  598. {
  599. // Compute correlations of h[] needed for the codebook search.
  600. // h[] :Impulse response.
  601. // rr[] :Correlations.
  602. int *rri0i0, *rri1i1, *rri2i2, *rri3i3;
  603. int *rri0i1, *rri0i2, *rri0i3;
  604. int *rri1i2, *rri1i3, *rri2i3;
  605. int *p0, *p1, *p2, *p3;
  606. short *h2;
  607. int i;
  608. DECLARE_SHORT(h,SubFrLen2);
  609. DECLARE_SHORT(hp2,SubFrLen2);
  610. ALIGN_ARRAY(h);
  611. ALIGN_ARRAY(hp2);
  612. for(i=0; i<4; i++)
  613. h[i] = (short)0;
  614. for(i=0; i<SubFrLen; i++)
  615. hp2[i+2] = h[i+4] = H[i];
  616. // Init pointers
  617. rri0i0 = rr;
  618. rri1i1 = rri0i0 + NB_POS;
  619. rri2i2 = rri1i1 + NB_POS;
  620. rri3i3 = rri2i2 + NB_POS;
  621. rri0i1 = rri3i3 + NB_POS;
  622. rri0i2 = rri0i1 + MSIZE;
  623. rri0i3 = rri0i2 + MSIZE;
  624. rri1i2 = rri0i3 + MSIZE;
  625. rri1i3 = rri1i2 + MSIZE;
  626. rri2i3 = rri1i3 + MSIZE;
  627. //TIMER_STAMP(a);
  628. // Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]
  629. cor_h_diag(NB_POS,0,h,h,&rri3i3[NB_POS-1],&rri2i2[NB_POS-1],&rri1i1[NB_POS-1],&rri0i0[NB_POS-1]);
  630. //TIMER_STAMP(b);
  631. // Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
  632. h2 = hp2;
  633. p3 = rri2i3 + MSIZE-1;
  634. p2 = rri1i2 + MSIZE-1;
  635. p1 = rri0i1 + MSIZE-1;
  636. p0 = rri0i3 + MSIZE-2;
  637. cor_h_prods(4-1,h,h2,p3,p2,p1,p0,NB_POS,NB_POS,NB_POS,1);
  638. // Compute elements of: rri0i2[], rri1i3[]
  639. h2 = h+4;
  640. p3 = rri1i3 + MSIZE-1;
  641. p2 = rri0i2 + MSIZE-1;
  642. p1 = rri1i3 + MSIZE-2;
  643. p0 = rri0i2 + MSIZE-2;
  644. cor_h_prods(4-2,h,h2,p3,p2,p1,p0,NB_POS,NB_POS,1,1);
  645. // Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
  646. h2 = hp2+4;
  647. p3 = rri0i3 + MSIZE-1;
  648. p2 = rri2i3 + MSIZE-2;
  649. p1 = rri1i2 + MSIZE-2;
  650. p0 = rri0i1 + MSIZE-2;
  651. cor_h_prods(4-3,h,h2,p3,p2,p1,p0,NB_POS,1,1,1);
  652. //TIMER_STAMP(c);
  653. return;
  654. }
  655. //---------------------------------------------------------------------------
  656. void Cor_h_Xint(short h[],short X[],int D[])
  657. {
  658. int i;
  659. DECLARE_SHORT(hh, 2*SubFrLen+16); //h[-1,0,0,1,1,2,2,3,3,4,4,5,...57,58,58,59]
  660. DECLARE_SHORT(XX, 2*SubFrLen+16); //X[ 0,1,0,1,2,3,2,3,4,5,4,5,...58,59,58,59]
  661. #if TESTME
  662. short htest[SubFrLen], Xtest[SubFrLen];
  663. for (i = 0; i<SubFrLen; i++)
  664. {
  665. htest[i] = 1;//(short)(i<30?i:60-i);
  666. Xtest[i] = 1;//(short)(i<30?i:60-i);
  667. }
  668. h = htest;
  669. X = Xtest;
  670. #endif //TESTME
  671. ALIGN_ARRAY(hh);
  672. ALIGN_ARRAY(XX);
  673. for (i=2*SubFrLen; i < 2*SubFrLen+16; i++) {
  674. XX[i] = hh[i] = (short)0;
  675. }
  676. // hh += 8; XX += 8;
  677. #define ASM_Cor_h_Xint 1
  678. #if ASM_Cor_h_Xint
  679. ab2ababw(X, XX, SubFrLen);
  680. ab2abzaw(h, hh, SubFrLen);
  681. //TIMER_STAMP(e);
  682. CorrelateIntTri (hh, XX, D, SubFrLen);
  683. #if TESTME
  684. {
  685. int D2[SubFrLen];
  686. CorrelateInt22 (hh, XX, D2, SubFrLen);
  687. for (i = 0; i<SubFrLen; i++) {
  688. // if(D[i] != D2[i])
  689. printf("%3d: %6d %6d %6d ", i,D[i], D2[i], D[i] - D2[i]);
  690. if(i&1) printf("\n");
  691. }
  692. }
  693. #endif TESTME
  694. #else //ASM_Cor_h_Xint
  695. for (i=0; i < SubFrLen; i+=2) {
  696. hh[2*i] = (i-1 >= 0) ? h[i-1] : (short)0;
  697. hh[2*i+1] = h[i];
  698. hh[2*i+2] = h[i];
  699. hh[2*i+3] = h[i+1];
  700. XX[2*i] = X[i];
  701. XX[2*i+1] = X[i+1];
  702. XX[2*i+2] = X[i];
  703. XX[2*i+3] = X[i+1];
  704. }
  705. for (i=0; i < 2*SubFrLen; i+=4) {
  706. int acc0 = 0, acc1 = 0;
  707. for (j=0; j < 2*SubFrLen - i; j+=4) {
  708. acc0 += (int)hh[j]*XX[i+j] + (int)hh[j+1]*XX[i+j+1];
  709. acc1 += (int)hh[j+2]*XX[i+j+2] + (int)hh[j+3]*XX[i+j+3];
  710. }
  711. D[i/2] = acc0 >> 16;
  712. D[i/2+1] = acc1 >> 16;
  713. }
  714. #endif //ASM_Cor_h_Xint
  715. return;
  716. }
  717. //---------------------------------------------------------------------------
#define ASM_CORR_TRI 1
//#if ASM_CORR_TRI
//------------------------------------------------------
// CorrelateIntTri: triangular correlations (MMX).
//
//   taps  : interleaved tap data (format below).
//   array : interleaved array data (format below).
//   corr  : output; each outer pass stores 16 bytes (four ints) here.
//   ncor  : count; halved on entry, then reduced by 2 per outer pass.
//
// The assembly writes back to the `array` and `ncor` parameters themselves
// to carry its outer-loop state.
//
// ASSUMES that array has 8 zero values beyond the end
// and can be read 8 more beyond that (without page fault etc)
// data format is
// taps: 0 t0 t0 t1 t1 t2 t2 t3 t3 t4 t4 t5 ... t57 t58 t58 t59
// arr: a0 a1 a0 a1 a2 a3 a2 a3 a4 a5 a4 a5 ... a58 a59 a58 a59
//
void CorrelateIntTri(short *taps, short *array, int *corr, int ncor)
{
// Three (array, taps) register pairs for the unrolled inner loop, plus two
// accumulators that each hold two dword sums.
#define rega0 mm0
#define regb0 mm1
#define rega1 mm2
#define regb1 mm3
#define rega2 mm4
#define regb2 mm5
#define acc0 mm6
#define acc1 mm7
#define arr esi
#define tap edi
#define cor eax
#define icnt ecx
// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
// We use "the convolution trick" of using la twice so that one
// of the pmadd's is reg,reg and thus can be in the V-slot.
// NOTE: we have read ahead up to 2 quadwords
// so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
// and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal
//
// la: load array quadword i.         lb: load taps quadword i-1.
// m0: regb = regb (*) rega           m1: rega = rega (*) taps quadword i.
// a0: acc0 += regb                   a1: acc1 += rega
#define la(n,i) ASM movq rega##n,QP[arr+8*i]
#define lb(n,i) ASM movq regb##n,QP[tap+8*i-8]
#define m0(n,i) ASM pmaddwd regb##n,rega##n
#define m1(n,i) ASM pmaddwd rega##n,QP[tap+8*i]
#define a0(n,i) ASM paddd acc0,regb##n
#define a1(n,i) ASM paddd acc1,rega##n
// Halve ncor in place and load the working registers.
ASM
{
shr ncor,1;
mov cor,corr;
mov tap,taps;
mov arr,array;
mov icnt,ncor;
}
// One pass per group of four output values.
ForEachCorrPair:
// prime the pump
la(0,0);
ASM pxor regb0,regb0; // to avoid lb(0,0) reading taps[-1]
la(1,1);
ASM pxor acc0,acc0; // clear accumulator
m1(0,0);
ASM pxor acc1,acc1; // clear accumulator
lb(1,1);
ASM sub icnt, 1; // account for pump priming
ASM jle cleanup; // bypass if only one to do
// Inner loop, unrolled three columns deep: each iteration multiplies three
// columns, adds the products of the three columns before them, and advances
// both pointers by three quadwords.
inner:
la(2,2);
m0(1,1);
m1(1,1);
a0(0,0);
lb(2,2);
a1(0,0);
la(0,3);
m0(2,2);
m1(2,2);
a0(1,1);
lb(0,3);
a1(1,1);
la(1,4);
m0(0,3);
m1(0,3);
a0(2,2);
lb(1,4);
a1(2,2);
ASM add arr,24;
ASM add tap,24;
ASM sub icnt,3;
ASM jg inner;
cleanup: // last two adds
a0(0,0);
a1(0,0);
// Done with one correlation pair. Store acc1 then acc0 (two dwords each) in
// the corr array, advance `array` by 16 bytes, restart taps from the
// beginning, reduce ncor by 2, and repeat while ncor is still positive.
ASM
{
add cor,16;
mov arr, array
mov tap,taps;
add arr,16;
mov icnt, ncor;
mov array, arr;
sub icnt,2; //set flags for jump
movq QP[cor-16],acc1;
movq QP[cor-8],acc0;
mov ncor, icnt;
jg ForEachCorrPair;
emms;
}
}
// Remove the register aliases and helper macros defined inside the function.
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1
#undef arr
#undef tap
#undef cor
#undef icnt
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
  835. // 16 bit output
  836. // psrad acc0,16;//this could be less in some cases
  837. // psrad acc1,16;
  838. // packssdw acc1,acc0;
  839. // movq QP[cor-8],acc0;
  840. //#else
  841. //------------------------------------------------------
  842. void CorrelateInt22(short *taps, short *array, int *corr, int ncor)
  843. {
  844. int i,j;
  845. for (i=0; i < 2*ncor; i+=4) {
  846. int acc0 = 0, acc1 = 0;
  847. for (j=0; j < 2*ncor - i; j+=4) {
  848. acc0 += (int)taps[j]*array[i+j] + (int)taps[j+1]*array[i+j+1];
  849. acc1 += (int)taps[j+2]*array[i+j+2] + (int)taps[j+3]*array[i+j+3];
  850. }
  851. corr[i/2] = acc0 ;
  852. corr[i/2+1] = acc1 ;
  853. }
  854. return;
  855. }
  856. //#endif
  857. #endif //COMPILE_MMX