// cb53mmx.c

#include "cst_lbc.h"
#include "mmxutil.h"
#include "opt.h"
#include "exc_lbc.h"
#include "timer.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "util_lbc.h"

#define ASM_CORHPL 1
#define ASM_CORHDL 1
#define TESTME 0
#define CHTEST 0

#if COMPILE_MMX
void CorrelateIntTri(short *taps, short *array, int *corr, int ncor);
void CorrelateInt22(short *taps, short *array, int *corr, int ncor);
void Cor_h_Xint(short h[],short X[],int D[]);
void Cor_hint0(short *H, int *rr);
void Cor_hint1(short *H, int *rr);
void cor_h_prodloop(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0);
void cor_h_diag(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0);

//------------------------------------------------------------
// Codebook search for one block of SubFrLen samples, using the integer
// correlation routines (Cor_hint1, Cor_h_Xint) in place of the float ones.
//
//   X[]       target vector; read here and passed on to G_code.
//   h[]       impulse response.  MODIFIED IN PLACE: when T0 < SubFrLen-2 the
//             term gain_T0*h[i-T0] is added to h[i] for i = T0..SubFrLen-1.
//   T0        lag used for the h[] and code[] recursions below
//             (presumably the pitch lag -- confirm with the caller).
//   code[]    output: tmp_code[] scaled by the quantized gain, then run
//             through the same T0/gain_T0 recursion as h[].
//   ind_gain  output: value returned by G_code.
//   shift, sign, flags
//             passed straight through to D4i64_LBC.
//   gain_T0   gain used in the h[] and code[] recursions.
//
// Returns the index produced by D4i64_LBC.
int ACELP_LBC_code_int(float X[], float h[], int T0, float code[],
		int *ind_gain, int *shift, int *sign, float gain_T0, int flags)
{
  int i, index;
  float gain_q;
  float Dn[SubFrLen2], tmp_code[SubFrLen2];
  float rr[DIM_RR];
  // Scratch buffers for the integer routines; each is passed through
  // ALIGN_ARRAY below before use.
  DECLARE_INT(rrint, DIM_RR);
  DECLARE_SHORT(hint, SubFrLen2);
  DECLARE_INT(Dnint, SubFrLen2);
  DECLARE_SHORT(Xint, SubFrLen2);
  int XScale;   // assigned below but not read again in this function
  float hScale;
  int m;
#if 0//TESTME
   // Disabled: would replace h and X with synthetic ramps for debugging.
   float htest[SubFrLen], Xtest[SubFrLen];

   for (i = 0; i<SubFrLen; i++)
   {
	 htest[i] = i; //(float)(i<30?i:60-i);
	 Xtest[i] = (float)(i<30?i:60-i);
   }
   h = htest;
   X = Xtest;
#endif //TESTME

// Include fixed-gain pitch contribution into impulse resp. h[]
// (in-place recursion: each h[i] uses the already-updated h[i-T0]).

  if (T0 < SubFrLen-2)
    for (i = T0; i < SubFrLen; i++)
      h[i] += gain_T0*h[i-T0];

  ALIGN_ARRAY(rrint);
  ALIGN_ARRAY(hint);
  ALIGN_ARRAY(Dnint);
  ALIGN_ARRAY(Xint);

  // Convert h[] to shorts.  hScale is the RMS of h[]; m is bits 23..30 of the
  // value returned by asint(hScale) (presumably the IEEE-754 biased exponent
  // of hScale -- confirm against asint), and m+3 selects the scaling.
  //hScale = FloatToShortScaled(h, hint, SubFrLen, 3);
  hScale = (float)sqrt(DotProd(h,h,SubFrLen)/(double)SubFrLen);
  m = (asint(hScale) & 0x7f800000) >> 23;
  ScaleFloatToShort(h, hint, SubFrLen, m+3);

  XScale = FloatToShortScaled(X, Xint, SubFrLen, 3);  //would be better to normalize based on energy, not max
#if 0
   // Disabled: would overwrite hint[] with a ramp for debugging.
   for (i = 0; i<SubFrLen; i++)
   {
	 hint[i] = i;
   }
#endif

  // Compute correlations of h[] needed for the codebook search
  // (integer result in rrint[], converted to float in rr[]).
//TIMER_STAMP(a);
  Cor_hint1(hint, rrint);
   IntToFloat(rrint, DIM_RR, rr);
//TIMER_STAMP(b);
//  Cor_h(h, rr);
////TIMER_STAMP(c);
#if CHTEST
{
  // Debug check: recompute with the plain-C Cor_hint0 and print every entry
  // where the two integer results differ.
  DECLARE_INT(rrint2, DIM_RR);

  ALIGN_ARRAY(rrint2);//debug
  Cor_hint0(hint, rrint2);
  for(i = 0; i<DIM_RR; i++)	 //debug
	  if(rrint[i] != rrint2[i])
	    printf("%3d: %8d %8d  %8d\n",i, rrint[i], rrint2[i], rrint[i] - rrint2[i]);
}
#endif //CHTEST

// Compute correlation of target vector with impulse response
// (integer result in Dnint[], converted to float in Dn[]).

//TIMER_STAMP(c);
   Cor_h_Xint(hint, Xint, Dnint);
//TIMER_STAMP(d);
   IntToFloat(Dnint, SubFrLen, Dn);

   //TIMER_STAMP(a);

#if TESTME //test
  {
    // Debug check: recompute Dn with the float Cor_h_X (this overwrites Dn),
    // convert to int and print every entry that differs from Dnint[].
    int fpDnint[SubFrLen2];
//	float scale;

//	scale  =
    Cor_h_X(h,X,Dn);
    FloatToIntScaled(Dn, fpDnint, SubFrLen, 7);
    for (i = 0; i<SubFrLen; i++)
      if(fpDnint[i] != Dnint[i])
        printf("%3d: %8x %8x %8x\n", i, Dnint[i] - fpDnint[i],Dnint[i], fpDnint[i]);
  }
#endif //test


// Find codebook index.
// Note that rr is passed twice to D4i64_LBC (2nd and 5th arguments), and the
// same rr buffer is then handed to G_code below.

//TIMER_STAMP(c);
  index = D4i64_LBC(Dn, rr, h, tmp_code, rr, shift, sign, flags);
//TIMER_STAMP(f);

// Compute innovation vector gain.
// Include fixed-gain pitch contribution into code[].

  *ind_gain = G_code(X, rr, &gain_q);

  for (i=0; i < SubFrLen; i++)
    code[i] = tmp_code[i]*gain_q;

  if(T0 < SubFrLen-2)
    for (i=T0; i < SubFrLen; i++)
      code[i] += code[i-T0]*gain_T0;

  return index;
}


//---------------------------------------------------------------
//---------------------------------------------------------------
// Plain-C computation of the correlations of h[] needed for the codebook
// search.  In this file it is only called from the CHTEST debug block, where
// its output is compared against Cor_hint1.
//
//   H[]   impulse response, SubFrLen shorts (read only).
//   rr[]  output, laid out as consecutive sub-arrays:
//           rri0i0, rri1i1, rri2i2, rri3i3      (NB_POS ints each)
//           rri0i1, rri0i2, rri0i3,
//           rri1i2, rri1i3, rri2i3              (MSIZE ints each)
//
// Unlike cor_h_diag, the rriXiX entries here are stored without halving.
void Cor_hint0(short *H, int *rr)
{
  int *rri0i0, *rri1i1, *rri2i2, *rri3i3;
  int *rri0i1, *rri0i2, *rri0i3;
  int *rri1i2, *rri1i3, *rri2i3;

  int *p0, *p1, *p2, *p3;
  // Second operand of the cross products.  It always points into h[]
  // (h+2, h+4 or h+6), so no separate scratch buffer is needed for it.
  short *h2;
  int cor;

  int i, k, m, t;
  DECLARE_SHORT(h,SubFrLen2);

  ALIGN_ARRAY(h);

  // Local copy of H[] preceded by four zero samples.
  for(i=0; i<4; i++)
    h[i] = (short)0;

  for(i=0; i<SubFrLen; i++)
    h[i+4] = H[i];


// Init pointers

  rri0i0 = rr;
  rri1i1 = rri0i0 + NB_POS;
  rri2i2 = rri1i1 + NB_POS;
  rri3i3 = rri2i2 + NB_POS;

  rri0i1 = rri3i3 + NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;

// Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]:
// running sum of h[]^2 taken two samples at a time, stored from the last
// index of each array down to the first.

  cor = 0;
  m = 0;
  for(i=NB_POS-1; i>=0; i--)
  {
    cor += h[m+0]*h[m+0] + h[m+1]*h[m+1];   rri3i3[i] = cor;
    cor += h[m+2]*h[m+2] + h[m+3]*h[m+3];   rri2i2[i] = cor;
    cor += h[m+4]*h[m+4] + h[m+5]*h[m+5];   rri1i1[i] = cor;
    cor += h[m+6]*h[m+6] + h[m+7]*h[m+7];   rri0i0[i] = cor;

    m += 8;
  }

// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
// (products of h[] with h[] advanced by 2 + k*STEP samples).
// For each k the running sum is written along a diagonal: t steps back by
// NB_POS+1 after every four outputs.

  h2 = h+2;
  p3 = rri2i3 + MSIZE-1;
  p2 = rri1i2 + MSIZE-1;
  p1 = rri0i1 + MSIZE-1;
  p0 = rri0i3 + MSIZE-2;

  for (k=0; k<NB_POS; k++)
  {
    cor = 0;
    m = 0;
    t = 0;

    for(i=k+1; i<NB_POS; i++)
    {
      cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

      t -= (NB_POS+1);
      m += 8;
    }
    // Three trailing outputs for this pass (no p0 store).
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
    cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
    cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;

    h2 += STEP;
    p3 -= NB_POS;
    p2 -= NB_POS;
    p1 -= NB_POS;
    p0 -= 1;
  }


// Compute elements of: rri0i2[], rri1i3[]
// (products of h[] with h[] advanced by 4 + k*STEP samples).

  h2 = h+4;
  p3 = rri1i3 + MSIZE-1;
  p2 = rri0i2 + MSIZE-1;
  p1 = rri1i3 + MSIZE-2;
  p0 = rri0i2 + MSIZE-2;

  for (k=0; k<NB_POS; k++)
  {
    cor = 0;
    m = 0;
    t = 0;

    for(i=k+1; i<NB_POS; i++)
    {
      cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

      t -= (NB_POS+1);
      m += 8;
    }
    // Two trailing outputs for this pass.
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
    cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;

    h2 += STEP;
    p3 -= NB_POS;
    p2 -= NB_POS;
    p1 -= 1;
    p0 -= 1;
  }

// Compute the remaining elements of: rri0i1[], rri0i3[], rri1i2[] and
// rri2i3[] (products of h[] with h[] advanced by 6 + k*STEP samples).

  h2 = h+6;
  p3 = rri0i3 + MSIZE-1;
  p2 = rri2i3 + MSIZE-2;
  p1 = rri1i2 + MSIZE-2;
  p0 = rri0i1 + MSIZE-2;

  for (k=0; k<NB_POS; k++)
  {
    cor = 0;
    m = 0;
    t = 0;

    for(i=k+1; i<NB_POS; i++)
    {
      cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

      t -= (NB_POS+1);
      m += 8;
    }
    // One trailing output for this pass.
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;

    h2 += STEP;
    p3 -= NB_POS;
    p2 -= 1;
    p1 -= 1;
    p0 -= 1;
  }

  return;
}

//---------------------------------------------------------------
// Runs cor_h_prodloop NB_POS times.  The number of full four-output groups
// requested goes NB_POS-1, NB_POS-2, ..., 0 while oddn stays fixed.  After
// every call h2 advances by STEP shorts and each output pointer steps back by
// its own stride (dp3, dp2, dp1, dp0, in ints).
void cor_h_prods(int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0,int dp3,int dp2,int dp1,int dp0){
  int groups;

  for (groups = NB_POS-1; groups >= 0; --groups)
  {
    cor_h_prodloop(groups, oddn, h, h2, p3, p2, p1, p0);

    // Set up operands for the next, shorter, row.
    h2 = h2 + STEP;
    p0 = p0 - dp0;
    p1 = p1 - dp1;
    p2 = p2 - dp2;
    p3 = p3 - dp3;
  }
}

#if _MSC_FULL_VER >= 13008827 && defined(_M_IX86)
#pragma warning(disable:4731)			// EBP modified with inline asm
#endif

// Running sum of products of h[] and h2[], taken two samples at a time.
//
// With pair(j) = h[2j]*h2[2j] + h[2j+1]*h2[2j+1] and c(j) = pair(0)+...+pair(j),
// the outputs are written in the order
//     p3[t]=c(0)  p2[t]=c(1)  p1[t]=c(2)  p0[t]=c(3)   then t -= NB_POS+1
//     p3[t]=c(4)  ...
//
//   n     number of complete four-output groups.
//   oddn  number of extra outputs after the last complete group (0..3),
//         written to p3, p2, p1 in that order.
//
// Two implementations are selected by ASM_CORHPL: inline MMX assembly, or the
// plain-C loop in the #else branch.
void cor_h_prodloop(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0)
{
#if ASM_CORHPL
	

	// From here on n is the total number of outputs to store.
	n = n * 4 + oddn;

// Register roles.  'in' walks h[]; 'inoff' holds (h2 - h) in bytes so that
// in+inoff addresses h2[]; 'out' walks p0 and eax/ebx/ebp hold the byte
// offsets of p3/p2/p1 relative to p0.
#define in edi
#define inoff edx
#define out esi
#define out3 out+eax
#define out2 out+ebx
#define out1 out+ebp
#define out0 out

// L  : load quadword n of h[] (four shorts) into mm<m>
// M  : pmaddwd with the matching quadword of h2[]; leaves two pair sums,
//      one in the low dword and one in the high dword
// S  : shift the high dword down into the low dword
// AH/AL : both are paddd (add the running sum carried in another register)
// WH/WL : both are movd (store the low dword to one of the four outputs)
#define L(m,n)  ASM movq mm##m, QP[in+8*n]
#define M(m,n)  ASM pmaddwd mm##m, QP[in+inoff+8*n]
#define S(m)    ASM psrlq mm##m, 32
#define AH(m,n)   ASM paddd mm##m, mm##n
#define WH(m,o) ASM movd DP[out##o], mm##m
#define AL(m,n)   ASM paddd mm##m, mm##n
#define WL(m,o) ASM movd DP[out##o], mm##m


// ebp is saved here and restored at innerdone.  It is the last register
// loaded; no C variable is referenced by name after it has been overwritten.
ASM {
   push ebp;
   mov ecx, n;
   mov in, h;
   mov inoff, h2;
   sub inoff, in;
   mov out, p0;
   mov eax, p3;
   mov ebx, p2;
   mov ebp, p1;
   sub eax, out;
   sub ebx, out;
   sub ebp, out;
}
// Prime the pipeline: products of quadwords 0 and 1 are started before the
// loop; mm3 (cleared, then added to) receives a copy of the first product.
L(0,0);
ASM pxor mm3,mm3;
M(0,0);
		L(1,1);
						AL(3,0); //really a copy
		M(1,1);
S(0);
ASM sub ecx,8;
ASM  jl oddends;

// Main loop: eight outputs per iteration (two four-output groups), consuming
// four quadwords of input.  Loads and multiplies for the next iteration are
// interleaved with the adds and stores of the current one.
inner:
				L(2,2);
AH(0,3);
						WL(3,3);
WH(0,2);
AL(0,1);
				M(2,2);
		S(1);

						L(3,3);
		AH(1,0);
WL(0,1);
		WH(1,0);
		AL(1,2);
						M(3,3);
				S(2);
// step all four outputs back by NB_POS+1 ints
ASM sub out, 4*(NB_POS+1);

L(0,4);
				AH(2,1);
		WL(1,3);
				WH(2,2);
				AL(2,3);
M(0,4);
						S(3);

		L(1,5);
						AH(3,2);
				WL(2,1);
						WH(3,0);
						AL(3,0);
		M(1,5);
S(0);
ASM sub out, 4*(NB_POS+1);
ASM  add in, 16*2;
ASM sub ecx, 8;
ASM  jge inner;

// Fewer than eight outputs remain.  If at least four remain, store one more
// complete group and then up to three extras; otherwise go to cleanup.
oddends:
ASM add ecx, 4;
ASM  jl cleanup;

//four more
				L(2,2);
AH(0,3);
						WL(3,3);
WH(0,2);
AL(0,1);
				M(2,2);
		S(1);

						L(3,3);
		AH(1,0);
WL(0,1);
		WH(1,0);
		AL(1,2);
						M(3,3);
				S(2);
ASM sub out, 4*(NB_POS+1);

// ecx now counts the extra outputs (0..3); each dec/jl pair stops as soon as
// the count is exhausted.
				AH(2,1);
ASM dec ecx;
ASM jl innerdone;
		WL(1,3);
ASM dec ecx;
ASM jl innerdone;
				WH(2,2);
				AL(2,3);
ASM dec ecx;
ASM jl innerdone;
				WL(2,1);
ASM jmp innerdone;

// Fewer than four outputs remain: restore the count (0..3) and store them
// from the products that are already in flight.
cleanup:
ASM add ecx, 4;
ASM dec ecx;
ASM jl innerdone;
AH(0,3);
						WL(3,3);
ASM dec ecx;
ASM jl innerdone;
WH(0,2);
AL(0,1);
ASM dec ecx;
ASM jl innerdone;
WL(0,1);

innerdone:
ASM emms;		
ASM pop ebp;
#undef in
#undef inoff
#undef out
#undef out3
#undef out2
#undef out1
#undef out0

#undef L
#undef M
#undef S
#undef AH
#undef WH
#undef AL
#undef WL
#else //ASM_CORHPL
  // Plain-C version.
  int cor;
  int i,m,t;

  cor = 0;
  m = 0;
  t = 0;

  // n complete groups of four outputs, eight input samples each.
  for(i=n; i; i--)
  {
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
    cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
    cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
    cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

    t -= (NB_POS+1);
    m += 8;
  }
  // Up to three extra outputs.
  if(oddn >= 1) {
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
    if(oddn >= 2) {
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      if(oddn >= 3) {
        cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      }
    }
  }
#endif //ASM_CORHPL

  return;
}

// Halved running sum of products of h[] and h2[], taken two samples at a
// time.  Same output pattern as cor_h_prodloop, except that every stored
// value is halved and the outputs step back by one int (not NB_POS+1) after
// each group of four:
//     p3[t]  p2[t]  p1[t]  p0[t]   then t -= 1
//
//   n     number of complete four-output groups.
//   oddn  number of extra outputs after the last complete group (0..3),
//         written to p3, p2, p1 in that order.
//
// Two implementations are selected by ASM_CORHDL.  They halve at different
// points: the assembly shifts each pair sum right by one before accumulating
// it, while the C fallback shifts the running sum when storing, so the two
// can differ in the low bits.
void cor_h_diag(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0)
{
#if ASM_CORHDL
	

	// From here on n is the total number of outputs to store.
	n = n * 4 + oddn;

// Register roles.  'in' walks h[]; 'inoff' holds (h2 - h) in bytes so that
// in+inoff addresses h2[]; 'out' walks p0 and eax/ebx/ebp hold the byte
// offsets of p3/p2/p1 relative to p0.
#define in edi
#define inoff edx
#define out esi
#define out3 out+eax
#define out2 out+ebx
#define out1 out+ebp
#define out0 out

// L  : load quadword n of h[] (four shorts) into mm<m>
// M  : pmaddwd with the matching quadword of h2[]; leaves two pair sums
// R  : arithmetic shift right by one of both dwords (the halving)
// S  : shift the high dword down into the low dword
// AH/AL : both are paddd (add the running sum carried in another register)
// WH/WL : both are movd (store the low dword to one of the four outputs)
#define L(m,n)  ASM movq mm##m, QP[in+8*n]
#define M(m,n)  ASM pmaddwd mm##m, QP[in+inoff+8*n]
#define R(m)    ASM psrad mm##m, 1
#define S(m)    ASM psrlq mm##m, 32
#define AH(m,n) ASM paddd mm##m, mm##n
#define WH(m,o) ASM movd DP[out##o], mm##m
#define AL(m,n) ASM paddd mm##m, mm##n
#define WL(m,o) ASM movd DP[out##o], mm##m


// ebp is saved here and restored at innerdone.  It is the last register
// loaded; no C variable is referenced by name after it has been overwritten.
ASM {
   push ebp;
   mov ecx, n;
   mov in, h;
   mov inoff, h2;
   sub inoff, in;
   mov out, p0;
   mov eax, p3;
   mov ebx, p2;
   mov ebp, p1;
   sub eax, out;
   sub ebx, out;
   sub ebp, out;
}
// Prime the pipeline.
// NOTE(review): the copy into mm3 is issued before R(0), so the very first
// pair sum enters the running total without being halved.  The only caller in
// this file passes an h[] whose first four samples are zero, for which that
// term is zero.
L(0,0);
ASM pxor mm3,mm3;
M(0,0);
		L(1,1);
						AL(3,0); //really a copy
		M(1,1);
R(0);
S(0);
ASM sub ecx,8;
ASM  jl oddends;

// Main loop: eight outputs per iteration (two four-output groups), consuming
// four quadwords of input.
inner:
				L(2,2);
AH(0,3);
						WL(3,3);
		R(1);
WH(0,2);
AL(0,1);
				M(2,2);
		S(1);

						L(3,3);
		AH(1,0);
WL(0,1);
				R(2);
		WH(1,0);
		AL(1,2);
						M(3,3);
				S(2);
// step all four outputs back by one int
ASM sub out, 4*1;

L(0,4);
				AH(2,1);
		WL(1,3);
						R(3);
				WH(2,2);
				AL(2,3);
M(0,4);
						S(3);

		L(1,5);
						AH(3,2);
				WL(2,1);
R(0);
						WH(3,0);
						AL(3,0);
		M(1,5);
S(0);
ASM sub out, 4*1;
ASM  add in, 16*2;
ASM sub ecx, 8;
ASM  jge inner;

// Fewer than eight outputs remain.  If at least four remain, store one more
// complete group and then up to three extras; otherwise go to cleanup.
oddends:
ASM add ecx, 4;
ASM  jl cleanup;

//four more
				L(2,2);
AH(0,3);
						WL(3,3);
		R(1);
WH(0,2);
AL(0,1);
				M(2,2);
		S(1);

						L(3,3);
		AH(1,0);
WL(0,1);
				R(2);
		WH(1,0);
		AL(1,2);
						M(3,3);
				S(2);
ASM sub out, 4*1;

// ecx now counts the extra outputs (0..3).
				AH(2,1);
ASM dec ecx;
ASM jl innerdone;
		WL(1,3);
ASM dec ecx;
ASM jl innerdone;
				WH(2,2);
				AL(2,3);
ASM dec ecx;
ASM jl innerdone;
				WL(2,1);
ASM jmp innerdone;

// Fewer than four outputs remain: restore the count (0..3) and store them
// from the products that are already in flight.
cleanup:
ASM add ecx, 4;
ASM dec ecx;
ASM jl innerdone;
AH(0,3);
						WL(3,3);
ASM dec ecx;
ASM jl innerdone;
WH(0,2);
AL(0,1);
ASM dec ecx;
ASM jl innerdone;
WL(0,1);

innerdone:
ASM emms;		
ASM pop ebp;
#undef in
#undef inoff
#undef out
#undef out3
#undef out2
#undef out1
#undef out0

#undef L
#undef M
#undef R
#undef S
#undef AH
#undef WH
#undef AL
#undef WL
#else //ASM_CORHDL
  // Plain-C version.
  int cor;
  int i,m,t;

  cor = 0;
  m = 0;
  t = 0;

  // n complete groups of four outputs, eight input samples each.
  for(i=n; i; i--)
  {
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor>>1;
    cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor>>1;
    cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor>>1;
    cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor>>1;

    t -= 1;
    m += 8;
  }
  // Up to three extra outputs.  These are halved like the ones above, so that
  // every value this function stores uses the same scaling.
  if(oddn >= 1) {
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor>>1;
    if(oddn >= 2) {
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor>>1;
      if(oddn >= 3) {
        cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor>>1;
      }
    }
  }
#endif //ASM_CORHDL

  return;
}

// Computes the correlations of h[] needed for the codebook search, using
// cor_h_diag and cor_h_prods for the inner loops.
//
//   H[]   impulse response, SubFrLen shorts (read only).
//   rr[]  output, laid out as consecutive sub-arrays:
//           rri0i0, rri1i1, rri2i2, rri3i3      (NB_POS ints each)
//           rri0i1, rri0i2, rri0i3,
//           rri1i2, rri1i3, rri2i3              (MSIZE ints each)
void Cor_hint1(short *H, int *rr)
{
  int *rri0i0, *rri1i1, *rri2i2, *rri3i3;
  int *rri0i1, *rri0i2, *rri0i3;
  int *rri1i2, *rri1i3, *rri2i3;

  int *p0, *p1, *p2, *p3;
  short *h2;

  int i;
  DECLARE_SHORT(h,SubFrLen2);
  DECLARE_SHORT(hp2,SubFrLen2);

  ALIGN_ARRAY(h);
  ALIGN_ARRAY(hp2);

  // h[] is H[] preceded by four zero samples.
  for(i=0; i<4; i++)
    h[i] = (short)0;

  // hp2[] holds the same data two samples earlier: hp2[j] == h[j+2].
  // It stands in for h+2 (and hp2+4 for h+6) below, presumably so that those
  // operands start on an aligned address -- confirm against ALIGN_ARRAY.
  // Its first two entries correspond to h[2] and h[3], which are zero; set
  // them explicitly so the first pass below never reads uninitialized memory.
  // They are only ever multiplied by h[0] and h[1], which are zero, so the
  // results are unaffected.
  hp2[0] = (short)0;
  hp2[1] = (short)0;

  for(i=0; i<SubFrLen; i++)
    hp2[i+2] = h[i+4] = H[i];


// Init pointers

  rri0i0 = rr;
  rri1i1 = rri0i0 + NB_POS;
  rri2i2 = rri1i1 + NB_POS;
  rri3i3 = rri2i2 + NB_POS;

  rri0i1 = rri3i3 + NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;
//TIMER_STAMP(a);
// Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]
// (h[] against itself; each array is filled from its last entry backwards).

  cor_h_diag(NB_POS,0,h,h,&rri3i3[NB_POS-1],&rri2i2[NB_POS-1],&rri1i1[NB_POS-1],&rri0i0[NB_POS-1]);
//TIMER_STAMP(b);

// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
// (h[] against h[] advanced by 2 samples; three trailing outputs per row).

  h2 = hp2;
  p3 = rri2i3 + MSIZE-1;
  p2 = rri1i2 + MSIZE-1;
  p1 = rri0i1 + MSIZE-1;
  p0 = rri0i3 + MSIZE-2;

  cor_h_prods(4-1,h,h2,p3,p2,p1,p0,NB_POS,NB_POS,NB_POS,1);

// Compute elements of: rri0i2[], rri1i3[]
// (h[] against h[] advanced by 4 samples; two trailing outputs per row).

  h2 = h+4;
  p3 = rri1i3 + MSIZE-1;
  p2 = rri0i2 + MSIZE-1;
  p1 = rri1i3 + MSIZE-2;
  p0 = rri0i2 + MSIZE-2;

  cor_h_prods(4-2,h,h2,p3,p2,p1,p0,NB_POS,NB_POS,1,1);

// Compute the remaining elements of: rri0i1[], rri0i3[], rri1i2[] and
// rri2i3[] (h[] against h[] advanced by 6 samples; one trailing output per
// row).

  h2 = hp2+4;
  p3 = rri0i3 + MSIZE-1;
  p2 = rri2i3 + MSIZE-2;
  p1 = rri1i2 + MSIZE-2;
  p0 = rri0i1 + MSIZE-2;

  cor_h_prods(4-3,h,h2,p3,p2,p1,p0,NB_POS,1,1,1);
//TIMER_STAMP(c);

  return;
}

//---------------------------------------------------------------------------
// Integer correlation of the target vector X[] with the impulse response h[].
//
//   h[]  impulse response, SubFrLen shorts.
//   X[]  target vector, SubFrLen shorts.
//   D[]  output, SubFrLen ints.
//
// Both inputs are first expanded into interleaved 2*SubFrLen-short buffers
// (layouts shown next to the declarations), each followed by 16 zeroed
// shorts, and the correlations are then computed from those buffers.
void Cor_h_Xint(short h[],short X[],int D[])
{
   int i;
   DECLARE_SHORT(hh, 2*SubFrLen+16); //h[-1,0,0,1,1,2,2,3,3,4,4,5,...57,58,58,59]
   DECLARE_SHORT(XX, 2*SubFrLen+16); //X[ 0,1,0,1,2,3,2,3,4,5,4,5,...58,59,58,59]
#if TESTME
   // Debug: replace both inputs with all-ones vectors.
   short htest[SubFrLen], Xtest[SubFrLen];

   for (i = 0; i<SubFrLen; i++)
   {
	 htest[i] = 1;//(short)(i<30?i:60-i);
	 Xtest[i] = 1;//(short)(i<30?i:60-i);
   }
   h = htest;
   X = Xtest;
#endif //TESTME

   ALIGN_ARRAY(hh);
   ALIGN_ARRAY(XX);
   // Zero the 16 shorts past the payload of each buffer; CorrelateIntTri
   // reads beyond the end of its inputs and expects zeros there.
   for (i=2*SubFrLen; i < 2*SubFrLen+16; i++) {
	 XX[i] = hh[i] = (short)0;
   }
//   hh += 8; XX += 8;

#define ASM_Cor_h_Xint 1
#if ASM_Cor_h_Xint
   ab2ababw(X, XX, SubFrLen);
   ab2abzaw(h, hh, SubFrLen);
   //TIMER_STAMP(e);
   CorrelateIntTri (hh, XX, D, SubFrLen);
#if TESTME
   {
	   // Debug: print the MMX result next to the plain-C CorrelateInt22 one.
	   int D2[SubFrLen];
       CorrelateInt22 (hh, XX, D2, SubFrLen);
	   for (i = 0; i<SubFrLen; i++)	{
		//  if(D[i] != D2[i])
			printf("%3d: %6d %6d %6d   ", i,D[i], D2[i], D[i] - D2[i]);
			if(i&1) printf("\n");
	   }
   }
#endif //TESTME

#else //ASM_Cor_h_Xint
   // Plain-C fallback: build the interleaved buffers by hand ...
   for (i=0; i < SubFrLen; i+=2)	 {
	 hh[2*i] = (i-1 >= 0) ? h[i-1] : (short)0;
     hh[2*i+1] = h[i];
     hh[2*i+2] = h[i];
     hh[2*i+3] = h[i+1];
	 XX[2*i] = X[i];
	 XX[2*i+1] = X[i+1];
	 XX[2*i+2] = X[i];
	 XX[2*i+3] = X[i+1];
}

   // ... then accumulate two correlations per four-short step.
   // NOTE(review): this path stores the sums shifted right by 16, whereas
   // CorrelateIntTri and CorrelateInt22 store them unshifted -- confirm which
   // scaling callers expect before enabling this branch.
   for (i=0; i < 2*SubFrLen; i+=4)	 {
	 int j;
	 int acc0 = 0, acc1 = 0;
	 for (j=0; j < 2*SubFrLen - i; j+=4) {
       acc0 += (int)hh[j]*XX[i+j] + (int)hh[j+1]*XX[i+j+1];
       acc1 += (int)hh[j+2]*XX[i+j+2] + (int)hh[j+3]*XX[i+j+3];
	 }
	 D[i/2] = acc0 >> 16;
	 D[i/2+1] = acc1 >> 16;
   }
#endif //ASM_Cor_h_Xint
	
   return;
}
//---------------------------------------------------------------------------
#define ASM_CORR_TRI 1
//#if ASM_CORR_TRI
//------------------------------------------------------
// triangular correlations
// ASSUMES that array has 8 zero values beyond the end
//   and can be read 8 more beyond that (without page fault etc)
// data format is
//   taps: 0 t0 t0 t1  t1 t2 t2 t3  t3 t4 t4 t5 ... t57 t58 t58 t59
//	 arr: a0 a1 a0 a1  a2 a3 a2 a3  a4 a5 a4 a5 ... a58 a59 a58 a59
//
// MMX triangular correlation of two interleaved short buffers.
//
//   taps   first operand, restarted from its beginning on every outer pass.
//   array  second operand, advanced by two quadwords (16 bytes) per outer
//          pass.
//   corr   output; four ints are stored per outer pass.
//   ncor   count; halved on entry, after which it counts quadwords per buffer
//          and is reduced by 2 per outer pass.
//
// Within one outer pass, acc1 accumulates pmaddwd(array quad i, taps quad i)
// and acc0 accumulates pmaddwd(array quad i, taps quad i-1), with the i == 0
// term of acc0 forced to zero.  acc1 is stored to the first two ints of the
// pass's output and acc0 to the next two.
//
// The parameters ncor and array are overwritten by the assembly.
void CorrelateIntTri(short *taps, short *array, int *corr, int ncor)
{
#define rega0  mm0
#define regb0  mm1
#define rega1  mm2
#define regb1  mm3
#define rega2  mm4
#define regb2  mm5
#define acc0   mm6
#define acc1   mm7

#define arr    esi
#define tap    edi
#define cor    eax
#define icnt   ecx

// In the following macros, 'n' is the column number and 'i' is the
// iteration number.

// we use "the convolution trick" or using la twice so that one
// of the pmadd's is reg,reg and thus can be in the V-slot.

// NOTE: we have read ahead up to 2 quadwords
//   so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
//   and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal

// la : load array quadword i                 lb : load taps quadword i-1
// m0 : regb = pmaddwd(taps[i-1], array[i])   m1 : rega = pmaddwd(array[i], taps[i])
// a0 : acc0 += regb                          a1 : acc1 += rega
#define la(n,i)  ASM movq  rega##n,QP[arr+8*i]
#define lb(n,i)  ASM movq  regb##n,QP[tap+8*i-8]
#define m0(n,i)  ASM pmaddwd regb##n,rega##n
#define m1(n,i)  ASM pmaddwd rega##n,QP[tap+8*i]
#define a0(n,i)  ASM paddd acc0,regb##n
#define a1(n,i)  ASM paddd acc1,rega##n

  // ncor is halved in place here.
  ASM
  {
    shr ncor,1;
    mov cor,corr;
    mov tap,taps;
    mov arr,array;
    mov icnt,ncor;
  }

// One outer pass per group of four output values.
ForEachCorrPair:

// prime the pump

  la(0,0);
  ASM pxor regb0,regb0;   // to  avoid lb(0,0) reading taps[-1]
          la(1,1);
  ASM pxor acc0,acc0;	  // clear accumulator
  m1(0,0);
  ASM pxor acc1,acc1;	  // clear accumulator
          lb(1,1);
  ASM sub icnt, 1;        // account for pump priming
  ASM jle cleanup;        // bypass if only one to do

// Three quadword columns per iteration, software-pipelined across three
// register pairs.  The count is not required to be a multiple of three: the
// loop may run past the end, which is why the inputs must be zero padded
// (see the NOTE above).
inner:
                  la(2,2);
          m0(1,1);
          m1(1,1);
  a0(0,0);
                  lb(2,2);
  a1(0,0);
  la(0,3);
                  m0(2,2);
                  m1(2,2);
          a0(1,1);
  lb(0,3);
          a1(1,1);
          la(1,4);
  m0(0,3);
  m1(0,3);
                  a0(2,2);
          lb(1,4);
                  a1(2,2);

  ASM add arr,24;
  ASM add tap,24;

  ASM sub icnt,3;
  ASM jg inner;

cleanup:  //  last two adds
  a0(0,0);
  a1(0,0);

// Done with one correlation pair.  Pack and store 2 results in corr array

  // Store four ints (acc1, then acc0), move 'array' forward by 16 bytes,
  // rewind tap to the start of taps, and repeat while the reduced count
  // (ncor - 2) is still positive.
  ASM
  {
    add cor,16;
     mov arr, array
    mov tap,taps;
	 add arr,16;
    mov icnt, ncor;
	
	mov array, arr;
	 sub icnt,2;	  //set flags for jump

	movq  QP[cor-16],acc1;
	movq  QP[cor-8],acc0;

	mov ncor, icnt;
    jg ForEachCorrPair;

    emms;
  }

}
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1

#undef arr
#undef tap
#undef cor
#undef icnt
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
// 16 bit output
//	 psrad acc0,16;//this could be less in some cases
//	 psrad acc1,16;
//	 packssdw acc1,acc0;
//   movq  QP[cor-8],acc0;

//#else
//------------------------------------------------------
// Plain-C counterpart of CorrelateIntTri, operating on the same interleaved
// buffers of 2*ncor shorts.
//
// For every offset (0, 4, 8, ... shorts) into array[], two sums are formed
// over the overlapping part of taps[] and the offset array[]: one over
// elements 0,1 of each group of four shorts, one over elements 2,3.  The two
// sums for offset s are stored in corr[s/2] and corr[s/2+1].
void CorrelateInt22(short *taps, short *array, int *corr, int ncor)
{
  const int nshorts = 2*ncor;
  int offset;

  for (offset = 0; offset < nshorts; offset += 4) {
    const short *shifted = array + offset;
    const int overlap = nshorts - offset;
    int sum01 = 0;
    int sum23 = 0;
    int pos = 0;

    while (pos < overlap) {
      sum01 += (int)taps[pos]*shifted[pos] + (int)taps[pos+1]*shifted[pos+1];
      sum23 += (int)taps[pos+2]*shifted[pos+2] + (int)taps[pos+3]*shifted[pos+3];
      pos += 4;
    }

    corr[offset/2] = sum01;
    corr[offset/2 + 1] = sum23;
  }
}
//#endif

#endif //COMPILE_MMX