//
//	ITU-T G.723 Floating Point Speech Coder	ANSI C Source Code.	Version 1.00
//	copyright (c) 1995, AudioCodes, DSP Group, France Telecom,
//	Universite de Sherbrooke, Intel Corporation.  All rights reserved.
//


#include <stdio.h>
#include <math.h>
#include "opt.h"
#include "typedef.h"
#include "cst_lbc.h"
#include "tab_lbc.h"
#include "util_lbc.h"
#include "lsp.h"
#include "timer.h"
#include "mmxutil.h"

#if COMPILE_MMX
//  This file includes all the Lsp related functions

//--------------------------------------------------------------
// Multiply two 16-bit values at 32-bit precision and return the product
// shifted right by 16 bits (i.e. the high half of the 32-bit product).
int mult(short x, short y)
{
  int product;

  product = (int)x * (int)y;
  return product >> 16;
}
//--------------------------------------------------------------
// LspSearchInt: integer codebook search for one LSP band.
//
// Scores every codebook entry and returns the index of the entry with the
// largest score.  Taking the C fallback at the bottom as the reference, the
// score of an entry is the 16-bit sum over d=0..3 of
//     mult(tab[d] - Lspw[d], mult(Lspw[4+d], tab[d]) << 2)
// where tab[d] is dimension d of that entry.
//
//   Lspw   - 8 shorts on entry: Lspw[0..3] are subtracted from the table
//            values, Lspw[4..7] multiply them.  When ASM_SVQ is set the
//            array is expanded IN PLACE to 32 shorts (each input replicated
//            4 times), so the buffer must be 32 shorts long and its original
//            contents are destroyed.
//   LspTab - codebook stored in groups of 4 entries, 16 shorts per group:
//            short s+4*d of a group is dimension d of entry s of that group.
//
// Both paths keep 4 running maxima (one per position s within a group) and
// use >= when comparing, so on equal scores the later entry wins.
int LspSearchInt(short *Lspw, short *LspTab)
{

#if ASM_SVQ

  // mem8000: per-word 0x8000 bias used to compare signed sums as unsigned.
  // zero is not referenced anywhere below.
  int mem8000[2] = {0x80008000,0x80008000}, zero[2] = {0,0};
  short maxes[4],mx;   // the 4 per-lane maxima copied out of the MMX register
  int retu;            // receives the 4 packed winning indices, then the result
  int *ptr,tmp,t,i,k;

#define lsp esi   // pointer to the expanded Lspw
#define tab edi   // pointer to LspTab
#define idx edx   // entry counter, advanced by 4 per loop pass
#define pidx eax  // packed index: n+3 n+2 n+1 n
#define maxi ecx  // packed indices (one byte per lane) of the current maxima

#define r0   mm0
#define r1   mm1
#define r2   mm2
#define r3   mm3
#define r4   mm4
#define max  mm5  // 4 running maxima, stored with the 0x8000 bias


// Expand Lspw table so it's 0000 1111 2222 3333 4444 5555 6666 7777

  // Walk from the last input to the first so no input is overwritten
  // before it has been read.
  ptr = (int *)Lspw;
  k = 14;
  for (i=7; i>=0; i--)
  {
    t = Lspw[i]; t = t & 0xffff;
    tmp = t;
    tmp = (tmp << 16) | t;   // same 16-bit value in both halves
    ptr[k] = ptr[k+1] = tmp; // 2 ints = 4 copies of Lspw[i]
    k -= 2;
  }

// Need LspTab to be ordered 0 4 8 12  1 5 9 13  etc.
// so that lo word of each qword sees 0123, next sees 4567, etc.
//
// 'idx' counts iterations, so it goes 0 to 256 in steps of 4.
// The 4 LspTab sets at any given time are 8*idx, 8*idx+8, +16, +24
// Lspw[n] is lsp+8*n, as defined below

#define a(n)  [tab+8*idx+8*n]
#define b(n)  [lsp+8*n]

  // NOTE(review): ebx is used as scratch inside the loop but is not
  // pushed/popped like the registers below -- confirm the compiler
  // preserves it around the ASM block.
  ASM
  {
    push lsp;
    push tab;
    push idx;
    push pidx;
    push maxi;

    mov lsp,Lspw;
    mov tab,LspTab;
    xor idx,idx;
    xor maxi,maxi;
    // pidx bytes start as entry numbers 0,1,2,3 (low byte = lane 0)
    mov pidx,003020100h;
    // biased 0x8000 == signed 0, matching Max[] = 0 in the C fallback
    movq max,mem8000;

/*
  The code below is interleaved with k=1.
  The structure is: four levels of indentation, one for each of the
  4 terms of the sum.  The instructions completely left-justified
  are the part of the loop that's wrapped around on itself.
  Note that the code reads past the end of the LspTab that's passed
  to it, so the table needs dummy bytes at the end.
  NOTE(review): the original comment said 8 bytes, but the last pass
  (idx=252) loads a(4)..a(7), i.e. tab+2048 .. tab+2079, which is 32
  bytes past a 2048-byte table -- confirm the padding on the tables.
*/

// Start up the pipeline

    movq r0,a(0);
    movq r1,r0;
    pmulhw r0,b(4);
    psubsw r1,b(0);
      movq r2,a(1);
      movq r3,r2;
    psllw r0,2;
    pmulhw r0,r1;
      pmulhw r2,b(5);
      psubsw r3,b(1);
        movq r1,a(2);
      psllw r2,2;
      pmulhw r2,r3;  
        movq r3,r1;  
        pmulhw r1,b(6);
        psubsw r3,b(2);
      paddw r0,r2;   
          movq r2,a(3);
        psllw r1,2;
        pmulhw r1,r3;
          movq r3,r2;
          pmulhw r2,b(7);
          psubsw r3,b(3);
        paddw r1,r0;

// Each pass finishes the sum for the group at idx (its 4th term is still
// pending in r2/r3), updates max/maxi, and starts the group at idx+4.
loop1:
    movq r0,a(4);
          movq r4,r1;   // save accum so not wiped out by first half of loop

          psllw r2,2;
          
    movq r1,r0;
          pmulhw r2,r3;
    
    pmulhw r0,b(4);

    psubsw r1,b(0);
          paddw r4,r2;      // now final answer is in r4

      movq r2,a(5);

paddw r4,mem8000  // make final sum unsigned
      movq r3,r2;      // 0123

    psllw r0,2;
    
psubusw max,r4    // start to compute max
    pmulhw r0,r1;      // 0.23

      pmulhw r2,b(5);
paddw max,r4      // max now done

      psubsw r3,b(1);
pcmpeqw r4,max    // now 1111's means a new max was found

        movq r1,a(6);  // 0123
packsswb r4,r4;    // put all fields in low 32 bits

      psllw r2,2;
      
movd ebx,r4;
      pmulhw r2,r3;    // 012.

xor ebx,0ffffffffh;   // invert mask

and maxi,ebx;       // get old index to keep
        movq r3,r1;    // 0123

        pmulhw r1,b(6);

        psubsw r3,b(2);

xor ebx,0ffffffffh;   // invert mask

      paddw r0,r2;     // 01.3
and ebx,pidx;       // get new index

        psllw r1,2;
        
          movq r2,a(7);// 0123
        pmulhw r1,r3;  // 012.

          movq r3,r2;  // 0123
or maxi,ebx;       // now maxi is done

          pmulhw r2,b(7);

          psubsw r3,b(3);

        paddw r1,r0;   // .123
add idx,4;

add pidx,004040404h;

cmp idx,256;
jl loop1;

// Remove the 0x8000 bias and hand the maxima and packed indices back to C.
psubw max,mem8000;
mov retu,maxi;
movq maxes,max;

    pop maxi;
    pop pidx;
    pop idx;
    pop tab;
    pop lsp;
  }

// find which of the 4 maxes is the max, and return the appropriate
// one of the 4 maxindices.

  // t is the bit offset of the winning lane's index byte inside retu.
  mx = maxes[0]; t = 0;
  if (maxes[1] >= mx) { mx = maxes[1]; t = 8; }
  if (maxes[2] >= mx) { mx = maxes[2]; t = 16; }
  if (maxes[3] >= mx) { mx = maxes[3]; t = 24; }

  ASM emms;
  retu = (retu >> t) & 0xff;

  return(retu);

#undef a
#undef b
#undef idx
#undef lsp
#undef tab
#undef max
#undef maxi

#else   // if assembly code not selected, use C code

  int Indx[4],i,s,ret;   // Indx[s]: best entry seen so far in lane s
  short Max[4],Err,mx;   // Max[s]: best score seen so far in lane s
  short m0,m1,m2,m3,m4,m5,m6,m7,t;

  for (i=0; i<4; i++)
  {
    Max[i] = Indx[i] = 0;
  }
    
  for (i=0; i < LspCbSize; i++)
  {
    s = (i&3);   // lane = position of entry i within its group of 4

    // One term per dimension; dimension d of this entry is LspTab[s+4*d].
    m0 = mult(Lspw[4],LspTab[s+0]);
    t = LspTab[s+0]-Lspw[0];  m1 = mult(t,m0<<2);
    m2 = mult(Lspw[5],LspTab[s+4]);
    t = LspTab[s+4]-Lspw[1];  m3 = mult(t,m2<<2);
    m4 = mult(Lspw[6],LspTab[s+8]);
    t = LspTab[s+8]-Lspw[2];  m5 = mult(t,m4<<2);
    m6 = mult(Lspw[7],LspTab[s+12]);
    t = LspTab[s+12]-Lspw[3];  m7 = mult(t,m6<<2);

    Err = m1+m3+m5+m7;

    if (Err >= Max[s])
    {
      Max[s] = Err;
      Indx[s] = i;
    }

    // After the 4th entry of a group, step to the next 16-short group.
    if (s==3)
      LspTab += 16;
  }
  // Pick the best of the 4 lanes; on ties the higher lane wins.
  mx = Max[0]; ret = Indx[0];
  if (Max[1] >= mx) { mx = Max[1]; ret = Indx[1]; }
  if (Max[2] >= mx) { mx = Max[2]; ret = Indx[2]; }
  if (Max[3] >= mx) { mx = Max[3]; ret = Indx[3]; }

  return(ret);

#endif
}
//--------------------------------------------------------------

// Integer split-VQ of the LSP residual: converts the LSPs and weights to
// 16-bit values, searches the three band codebooks with LspSearchInt and
// returns the three 8-bit indices packed as (band0<<16)+(band1<<8)+band2.
Word32  Svq_Int(float *Lsp, float *Wvect)
{
#define LSP_SCALE 256

// Band layout: index of the first LSP of the band, number of LSPs in the
// band, and the offset of the band's codebook inside LspTableInt.
  static const int FirstLsp[3] = {0, 3, 6};
  static const int BandLen[3]  = {3, 3, 4};
  static const int TabStart[3] = {0, 1024, 2048};

  int    band,n,src,scaled;
  Word32 Packed;
  short  Wint[LpcOrder],LspTemp[LpcOrder];
  DECLARE_SHORT(Lspw,32);

  ALIGN_ARRAY(Lspw);

// Scale the LSPs by 2*LSP_SCALE and saturate to 16 bits

  for (n=0; n<LpcOrder; n++)
  {
    scaled = (int)(LSP_SCALE*2*Lsp[n]);
    if (scaled > 32767)
      scaled = 32767;
    if (scaled < -32768)
      scaled = -32768;
    LspTemp[n] = scaled;
  }
  FloatToShortScaled(Wvect,Wint,10,0);
  Packed = (Word32) 0;

// For each of the 3 bands: slots 0..3 carry the LSPs, slots 4..7 the
// negated weights; slots beyond the band length are zeroed.

  for (band=0; band<3; band++)
  {
    for (n=0; n<4; n++)
    {
      if (n < BandLen[band])
      {
        src = FirstLsp[band] + n;
        Lspw[n] = LspTemp[src];
        Lspw[4+n] = -Wint[src];
      }
      else
      {
        Lspw[n] = 0;
        Lspw[4+n] = 0;
      }
    }
    Packed = (Packed<<8) + LspSearchInt(Lspw,&LspTableInt[TabStart[band]]);
  }

  return Packed;

}
#endif
//---------------------------------------------------------------
// Evaluates a 6-term cosine series: the coefficients Lpq[LpcOrder],
// Lpq[LpcOrder-2], ... Lpq[LpcOrder-10] are multiplied by CosineTable at
// 0, CosPtr, 2*CosPtr ... 5*CosPtr.  Multiples from 2*CosPtr upward are
// wrapped with (CosineTableSize-1) as a mask; CosPtr itself is used as is.
float Polynomial(float *Lpq, int CosPtr)
{
  const int Wrap = CosineTableSize-1;
  float *Top = Lpq + LpcOrder;

  return(Top[0]*CosineTable[0] +
    Top[-2]*CosineTable[CosPtr] +
    Top[-4]*CosineTable[(CosPtr*2)&Wrap] +
    Top[-6]*CosineTable[(CosPtr*3)&Wrap] +
    Top[-8]*CosineTable[(CosPtr*4)&Wrap] +
    Top[-10]*CosineTable[(CosPtr*5)&Wrap]);
}


//--------------------------------------------------------------
// Converts LPC coefficients to LSPs.
//   LspVect - output, LpcOrder values; also used as scratch for the
//             bandwidth-expanded LPC
//   Lpc     - input LPC coefficients
//   PrevLsp - copied to LspVect when fewer than LpcOrder roots are found
void  AtoLsp(float *LspVect, float *Lpc, float *PrevLsp)
{
  int    n,step;
  int    Sel,Found;
  float  Lpq[LpcOrder+2];
  float  Last,Here,MagLast,MagHere;

// Small additional bandwidth expansion

  for (n=0; n < LpcOrder; n++)
    LspVect[n] = Lpc[n]*BandExpTable[n];

// Build the two polynomials, interleaved: even slots one, odd slots the other

  Lpq[0] = 1.0f;
  Lpq[1] = 1.0f;

  for (n=0; n < LpcOrder/2; n++)
  {
    Lpq[2*n+2] = -Lpq[2*n+0] - LspVect[n] - LspVect[LpcOrder-1-n];
    Lpq[2*n+3] =  Lpq[2*n+1] - LspVect[n] + LspVect[LpcOrder-1-n];
  }
  Lpq[LpcOrder+0] *= 0.5f;
  Lpq[LpcOrder+1] *= 0.5f;

// Scan the cosine grid, alternating between the two polynomials after
// every root

  Sel = 0;
  Found = 0;
  Last = Polynomial(Lpq,0);

  for (step=1; step < CosineTableSize/2; step++)
  {
    Here = Polynomial(&Lpq[Sel],step);

// Same sign bit: no root in this interval, move on

    if ((asint(Here) ^ asint(Last)) >= 0)
    {
      Last = Here;
      continue;
    }

// Sign change: place the root by linear interpolation inside the interval

    MagLast = (float)fabs(Last);
    MagHere = (float)fabs(Here);

    LspVect[Found++] = (step-1 + MagLast/(MagLast+MagHere));

// All roots found, nothing more to do

    if (Found == LpcOrder)
      return;

// Switch to the other polynomial and re-evaluate at the same grid point

    Sel ^= 1;
    Last = Polynomial(&Lpq[Sel],step);
  }

// The scan ended without finding every root: fall back to the previous LSPs

  for (n=0; n < LpcOrder; n++)
    LspVect[n] = PrevLsp[n];
}
//--------------------------------------------------------------
// Quantizes the current LSP vector.
//   CurrLsp - current LSPs; overwritten with the prediction residual
//   PrevLsp - previous LSPs, used by the predictor
//   UseMMX  - when COMPILE_MMX is set and this is nonzero, the integer
//             search Svq_Int is used, otherwise the float search Lsp_Svq
// Returns the packed codebook indices produced by the selected search.
Word32 Lsp_Qnt(float *CurrLsp, float *PrevLsp, int UseMMX)
{
  int  i;

  float Wvect[LpcOrder];
  float Min,Tmp;

// Compute the weighting vector: the inverse of the distance to the nearest
// neighbour.  The end points have only one neighbour.

  Wvect[0] = 1.0f/(CurrLsp[1] - CurrLsp[0]);
  Wvect[LpcOrder-1] = 1.0f/(CurrLsp[LpcOrder-1] - CurrLsp[LpcOrder-2]);

  for (i=1; i < LpcOrder-1; i++)
  {
    Min = CurrLsp[i+1] - CurrLsp[i];
    Tmp = CurrLsp[i] - CurrLsp[i-1];

    if (Tmp < Min)
      Min = Tmp;

    // A non-positive distance gets unit weight instead of being inverted
    if (Min > 0.0f)
      Wvect[i] = 1.0f/Min;
    else
      Wvect[i] = 1.0f;
  }

// Generate predicted vector as (DC-removed-Curr) - b*(DC-removed-Prev).
// This was previously written out by hand for indices 0..9; it is now
// driven by LpcOrder like the rest of the function.  This must stay after
// the weights above, which need the unmodified CurrLsp.

  for (i=0; i < LpcOrder; i++)
    CurrLsp[i] = (CurrLsp[i] - LspDcTable[i]) -
      LspPred0*(PrevLsp[i] - LspDcTable[i]);

// Do the SVQ
#if COMPILE_MMX
  if (UseMMX)
    return Svq_Int(CurrLsp, Wvect);
  else
#endif
    return Lsp_Svq(CurrLsp, Wvect);
}


//--------------------------------------------------------------
Word32  Lsp_Svq(float *Lsp, float *Wvect)
{
  int  i,k;

  Word32 Rez;
  int    Indx,Start,Dim;
  float *LspQntPnt;
  float  Max,Err,lsp0,lsp1,lsp2,lsp3,w0,w1,w2,w3;
  float LspTemp[LpcOrder];

  for (i=0; i<LpcOrder; i++)
    LspTemp[i] = 2.0f*Lsp[i];
  Rez = (Word32) 0;

// For each of the 3 bands
  
  for (k=0; k < LspQntBands; k++)
  {

// Initialize the search
    
    Max = 0.0f;  //-1.0f;
    Indx = 0;
    LspQntPnt = BandQntTable[k];
    Start = BandInfoTable[k][0];
    Dim = BandInfoTable[k][1];

    lsp0 = LspTemp[Start+0];
    lsp1 = LspTemp[Start+1];
    lsp2 = LspTemp[Start+2];
    w0 = Wvect[Start+0];
    w1 = Wvect[Start+1];
    w2 = Wvect[Start+2];
    
    if (k < 2)
    {
      for (i=0; i < LspCbSize; i++)
      {
        Err = (lsp0 - LspQntPnt[0])*w0*LspQntPnt[0] +
          (lsp1 - LspQntPnt[1])*w1*LspQntPnt[1] +
          (lsp2 - LspQntPnt[2])*w2*LspQntPnt[2];

        LspQntPnt += 3;

        if (asint(Err) > asint(Max))
        {
          Max = Err;
          Indx = i;
        }
      }
    }
    else
    {
      lsp3 = LspTemp[Start+3];
      w3 = Wvect[Start+3];
      for (i=0; i < LspCbSize; i++)
      {
        Err = (lsp0 - LspQntPnt[0])*w0*LspQntPnt[0] +
          (lsp1 - LspQntPnt[1])*w1*LspQntPnt[1] +
          (lsp2 - LspQntPnt[2])*w2*LspQntPnt[2] +
          (lsp3 - LspQntPnt[3])*w3*LspQntPnt[3];

        LspQntPnt += 4;

        if (asint(Err) > asint(Max))
        {
          Max = Err;
          Indx = i;
        }
      }
    }
    Rez = (Rez << 8) | Indx;
  }

  return Rez;
}


//--------------------------------------------------------------
// Inverse LSP quantizer.
//   Lsp     - output: decoded LSP vector (PrevLsp is copied here when the
//             result cannot be made stable)
//   PrevLsp - previous LSP vector, used by the predictor
//   LspId   - packed codebook indices, 8 bits per band, last band lowest
//   Crc     - nonzero: LspId is ignored (treated as 0) and the alternative
//             predictor / wider minimum spacing are used
// Returns True when the vector was still unstable after the last pass.
Flag  Lsp_Inq(float *Lsp, float *PrevLsp, Word32 LspId, int Crc)
{
  int    band,n,pass;
  int    Entry,First,Len;
  float *Cb;
  float  Pred,Gap,MinGap,Overlap;
  Flag   Unstable;

  if (Crc != 0)
  {
    LspId = (Word32) 0;
    Gap = 4.0f;
    Pred = LspPred1;
  }
  else
  {
    Gap = 2.0f;
    Pred = LspPred0;
  }
  MinGap = Gap - 0.03125f;

// Unpack the indices, last band first, and copy the codebook entries

  for (band=LspQntBands-1; band >= 0; band--)
  {
    Entry = LspId & (Word32) 0x000000ff;
    LspId >>= 8;

    Cb = BandQntTable[band];
    First = BandInfoTable[band][0];
    Len = BandInfoTable[band][1];

    for (n=0; n < Len; n++)
      Lsp[First + n] = Cb[Entry*Len + n];
  }

// Add predicted vector and DC to decoded vector

  for (n=0; n < LpcOrder; n++)
    Lsp[n] = Lsp[n] + (PrevLsp[n] - LspDcTable[n])*Pred + LspDcTable[n];

// Stability check: at most LpcOrder correction passes

  for (pass=0; pass < LpcOrder; pass++)
  {

// Clamp the two end points

    if (Lsp[0] < 3.0)
      Lsp[0] = 3.0f;

    if (Lsp[LpcOrder-1] > 252.0f)
      Lsp[LpcOrder-1] = 252.0f;

// Push apart any neighbours closer than Gap, half the shortfall each way

    for (n=1; n < LpcOrder; n++)
    {
      Overlap = Gap + Lsp[n-1] - Lsp[n];
      if (Overlap > 0)
      {
        Overlap *= 0.5f;
        Lsp[n-1] -= Overlap;
        Lsp[n] += Overlap;
      }
    }

// Stable once every spacing is at least MinGap

    Unstable = False;
    for (n=1; n < LpcOrder; n++)
    {
      if ((Lsp[n] - Lsp[n-1]) < MinGap)
      {
        Unstable = True;
        break;
      }
    }

    if (Unstable == False)
      break;
  }

// Still unstable: fall back to the previous vector

  if (Unstable == True)
  {
    for (n=0; n < LpcOrder; n++)
      Lsp[n] = PrevLsp[n];
  }
  return Unstable;
}


//--------------------------------------------------------------
// Interpolates between the previous and current LSP vectors for every
// subframe, converts each interpolated vector to LPC in place, and then
// stores the current vector as the new previous one.
//   QntLpc  - output, SubFrames consecutive vectors of LpcOrder values
//   CurrLsp - LSPs of the current frame
//   PrevLsp - LSPs of the previous frame; overwritten with CurrLsp
void  Lsp_Int(float *QntLpc, float *CurrLsp, float *PrevLsp)
{
  int    sub,n;
  float *Out;
  float  wNew,wOld;
  float  Fac[4] = {0.25f, 0.5f, 0.75f, 1.0f};

  for (sub=0; sub < SubFrames; sub++)
  {
    Out = QntLpc + sub*LpcOrder;

// Weight of the current vector grows by a quarter per subframe

    wNew = Fac[sub];
    wOld = 1.0f - wNew;

    for (n=0; n < LpcOrder; n++)
      Out[n] = wOld*PrevLsp[n] + wNew*CurrLsp[n];

// Convert to Lpc

    LsptoA(Out);
  }

// Remember the current vector for the next frame

  for (n=0; n < LpcOrder; n++)
    PrevLsp[n] = CurrLsp[n];
}


//--------------------------------------------------------------
// Converts a vector of LpcOrder LSPs to LPC coefficients, in place.
void  LsptoA(float *Lsp)
{
  int   n,m,cell;
  float x,cp,cq;

  float P[LpcOrder/2+1];
  float Q[LpcOrder/2+1];
  float Fac[(LpcOrder/2)-2] = {1.0f,0.5f,0.25f};

// Replace each Lsp by minus its cosine, linearly interpolated between the
// two neighbouring CosineTable entries

  for (n=0; n < LpcOrder; n++)
  {
    x = Lsp[n];
    cell = MyFloor(x);
    Lsp[n] = -(CosineTable[cell] +
      (CosineTable[cell+1]-CosineTable[cell])*(x-cell));
  }

// Init P and Q.  Note that P,Q * 2^26 correspond to fixed-point code
// P is built from the even-indexed values, Q from the odd-indexed ones

  P[0] = 0.5f;
  P[1] = Lsp[0] + Lsp[2];
  P[2] = 1.0f + 2.0f*Lsp[0]*Lsp[2];

  Q[0] = 0.5f;
  Q[1] = Lsp[1] + Lsp[3];
  Q[2] = 1.0f + 2.0f*Lsp[1]*Lsp[3];

// Fold in the remaining pairs one at a time

  for (n=2; n < LpcOrder/2; n++)
  {
    cp = Lsp[2*n+0];
    cq = Lsp[2*n+1];

    P[n+1] = P[n-1] + P[n]*cp;
    Q[n+1] = Q[n-1] + Q[n]*cq;

// Update from the top down so each step still sees the old lower terms

    for (m=n; m >= 2; m--)
    {
      P[m] = P[m-1]*cp + 0.5f*(P[m]+P[m-2]);
      Q[m] = Q[m-1]*cq + 0.5f*(Q[m]+Q[m-2]);
    }

// Update PQ[01]

    P[0] *= 0.5f;
    Q[0] *= 0.5f;

    P[1] = (P[1] + cp*Fac[n-2])*0.5f;
    Q[1] = (Q[1] + cq*Fac[n-2])*0.5f;
  }

// Combine P and Q into the Lpc coefficients, filling from both ends

  for (n=0; n < LpcOrder/2; n++)
  {
    Lsp[n] =            (-P[n] - P[n+1] + Q[n] - Q[n+1])*8.0f;
    Lsp[LpcOrder-1-n] = (-P[n] - P[n+1] - Q[n] + Q[n+1])*8.0f;
  }
}