//cb53.c - 5.3 rate codebook code

#include "opt.h"

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <memory.h>
#include "typedef.h"
#include "cst_lbc.h"
#include "tab_lbc.h"
#include "util_lbc.h"
#include "exc_lbc.h"
#include "timer.h"
#include "mmxutil.h"

void fourPulseFlt (float *rr, float *Dn, float thres, int ip[], int *shiftPtr);

//--------------------------------------------------------
int extra;
void reset_max_time(void)
{
  extra = 120;
}


//------------------------------------------------------------
int ACELP_LBC_code(float X[], float h[], int T0, float code[],
		int *ind_gain, int *shift, int *sign, float gain_T0, int flags)
{
  int i, index;
  float gain_q;
  float Dn[SubFrLen2], tmp_code[SubFrLen2];
  float rr[DIM_RR];

// Include fixed-gain pitch contribution into impulse resp. h[]

  if (T0 < SubFrLen-2)
    for (i = T0; i < SubFrLen; i++)
      h[i] += gain_T0*h[i-T0];

// Compute correlations of h[] needed for the codebook search

  Cor_h(h, rr);
 
// Compute correlation of target vector with impulse response.

  Cor_h_X(h, X, Dn);
  
// Find codebook index

  index = D4i64_LBC(Dn, rr, h, tmp_code, rr, shift, sign, flags);

// Compute innovation vector gain.
// Include fixed-gain pitch contribution into code[].

  *ind_gain = G_code(X, rr, &gain_q);

  for (i=0; i < SubFrLen; i++)
    code[i] = tmp_code[i]*gain_q;

  if(T0 < SubFrLen-2)
    for (i=T0; i < SubFrLen; i++)
      code[i] += code[i-T0]*gain_T0;

  return index;
}


//---------------------------------------------------------------
void Cor_h(float *H, float *rr)
{

//  Compute  correlations of h[]  needed for the codebook search.
//    h[]              :Impulse response.
//    rr[]             :Correlations.

  float *rri0i0, *rri1i1, *rri2i2, *rri3i3;
  float *rri0i1, *rri0i2, *rri0i3;
  float *rri1i2, *rri1i3, *rri2i3;

  float *p0, *p1, *p2, *p3;
  float cor, *h2;
  int i, k, m, t;
  float h[SubFrLen2];

  for(i=0; i<SubFrLen; i++)
    h[i+4] = H[i];

  for(i=0; i<4; i++)
    h[i] = 0.0f;

// Init pointers

  rri0i0 = rr;
  rri1i1 = rri0i0 + NB_POS;
  rri2i2 = rri1i1 + NB_POS;
  rri3i3 = rri2i2 + NB_POS;

  rri0i1 = rri3i3 + NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;

// Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]

  cor = 0.0f;
  m = 0;
  for(i=NB_POS-1; i>=0; i--)
  {
    cor += h[m+0]*h[m+0] + h[m+1]*h[m+1];   rri3i3[i] = cor*0.5f;
    cor += h[m+2]*h[m+2] + h[m+3]*h[m+3];   rri2i2[i] = cor*0.5f;
    cor += h[m+4]*h[m+4] + h[m+5]*h[m+5];   rri1i1[i] = cor*0.5f;
    cor += h[m+6]*h[m+6] + h[m+7]*h[m+7];   rri0i0[i] = cor*0.5f;

    m += 8;
  }

// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]

  h2 = h+2;
  p3 = rri2i3 + MSIZE-1;
  p2 = rri1i2 + MSIZE-1;
  p1 = rri0i1 + MSIZE-1;
  p0 = rri0i3 + MSIZE-2;
    
  for (k=0; k<NB_POS; k++)
  {
    cor = 0.0f;
    m = 0;
    t = 0;

    for(i=k+1; i<NB_POS; i++)
    {
      cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

      t -= (NB_POS+1);
      m += 8;
    }
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
    cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
    cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;

    h2 += STEP;
    p3 -= NB_POS;
    p2 -= NB_POS;
    p1 -= NB_POS;
    p0 -= 1;
  }


// Compute elements of: rri0i2[], rri1i3[] 

  h2 = h+4;
  p3 = rri1i3 + MSIZE-1;
  p2 = rri0i2 + MSIZE-1;
  p1 = rri1i3 + MSIZE-2;
  p0 = rri0i2 + MSIZE-2;
    
  for (k=0; k<NB_POS; k++)
  {
    cor = 0.0f;
    m = 0;
    t = 0;

    for(i=k+1; i<NB_POS; i++)
    {
      cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

      t -= (NB_POS+1);
      m += 8;
    }
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
    cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
    
    h2 += STEP;
    p3 -= NB_POS;
    p2 -= NB_POS;
    p1 -= 1;
    p0 -= 1;
  }
  
// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]

  h2 = h+6;
  p3 = rri0i3 + MSIZE-1;
  p2 = rri2i3 + MSIZE-2;
  p1 = rri1i2 + MSIZE-2;
  p0 = rri0i1 + MSIZE-2;
    
  for (k=0; k<NB_POS; k++)
  {
    cor = 0.0f;
    m = 0;
    t = 0;

    for(i=k+1; i<NB_POS; i++)
    {
      cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;
      cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3];   p2[t] = cor;
      cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5];   p1[t] = cor;
      cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7];   p0[t] = cor;

      t -= (NB_POS+1);
      m += 8;
    }
    cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1];   p3[t] = cor;

    h2 += STEP;
    p3 -= NB_POS;
    p2 -= 1;
    p1 -= 1;
    p0 -= 1;
  }
  
  return;
}

//---------------------------------------------------------------------------
void Cor_h_X(float h[],float X[],float D[])
{
   int i;
  
   for (i=0; i < SubFrLen; i++)	 
     D[i] = DotProd(&X[i],h,(SubFrLen-i));
	
   return;
}

//-------------------------------------------------------------------------
Find_Pulse4(float *Dn,float *rri3i3,float *ptr_ri0i3,float *ptr_ri1i3,
  float *ptr_ri2i3,float *ptr, float ps2,float alp2,float *psbest,float *abest)
{
  int k,bestk;
  float ps3;
  float a[16];

  for (k=0; k<8; k++)
  {
    ps3 = ps2 + *ptr;
    a[k] = alp2 + rri3i3[k] + ptr_ri0i3[k] + ptr_ri1i3[k] + ptr_ri2i3[k];
    a[k+8] = ps3 * ps3;
    
    ptr += STEP;
  }

  bestk = -1;
  for (k=0; k<8; k++)
  {
    if((a[k+8] * (*abest)) > ((*psbest) * a[k]))
    {
      *psbest = a[k+8];
      *abest = a[k];
      bestk = k;
    }
  }
  return(bestk);
}

//-------------------------------------------------------------------------
//   routine   D4i64_LBC                                                  
//           ~~~~~~~~~                                                   
// Algebraic codebook for LBC.                                           
//  -> 17 bits; 4 pulses in a frame of 60 samples                        
//                                                                       
// The code length is 60, containing 4 nonzero pulses i0, i1, i2, i3.    
// Each pulses can have 8 possible positions (positive or negative):     
//                                                                       
// i0 (+-1) : 0, 8,  16, 24, 32, 40, 48, 56                              
// i1 (+-1) : 2, 10, 18, 26, 34, 42, 50, 58                              
// i2 (+-1) : 4, 12, 20, 28, 36, 44, 52, (60)                            
// i3 (+-1) : 6, 14, 22, 30, 38, 46, 54, (62)                            
//                                                                       
// All the pulse can be shift by one.                                    
// The last position of the last 2 pulse falls outside the               
// frame and signifies that the pulse is not present.                    
//
//  Input arguments:                                                     
//                                                                       
//   Dn[]       Correlation between target vector and impulse response h[]
//   rr[]       Correlations of impulse response h[]                     
//   h[]        Impulse response of filters                              
//                                                                       
//  Output arguments:                                                   
//                                                                       
//   cod[]      Selected algebraic codeword                              
//   y[]        Filtered codeword                                        
//   code_shift Shift of the codeword                                    
//   sign       Signs of the 4 pulses.                                   
//                                                                       
//   return:    Index of selected codevector                             
//
// The threshold control if a section of the innovative              
// codebook should be searched or not.                               
//                                                                   
//--------------------------------------------------------------------

int D4i64_LBC(float Dn[], float rr[], float h[], float cod[],
			  float y[], int *code_shift, int *sign, int flags)
{
   int  ip[4];
   int  i0, i1, i2, i3, ip0, ip1, ip2, ip3;
   int  i, j;
   int  shif;
   float   means, max0, max1, max2, thres;

   float *rri0i0,*rri1i1,*rri2i2,*rri3i3;
   float *rri0i1,*rri0i2,*rri0i3;
   float *rri1i2,*rri1i3,*rri2i3;

  // float *ptr_ri0i0,*ptr_ri1i1,*ptr_ri2i2;
   float *ptr_ri0i1,*ptr_ri0i2,*ptr_ri0i3;
   float *ptr_ri1i2,*ptr_ri1i3,*ptr_ri2i3;

   int  p_sign[SubFrLen2/2];
//   float  p_sign[SubFrLen2/2],p_sign2[SubFrLen2/2];

// Init pointers 

  rri0i0 = rr;
  rri1i1 = rri0i0 + NB_POS;
  rri2i2 = rri1i1 + NB_POS;
  rri3i3 = rri2i2 + NB_POS;

  rri0i1 = rri3i3 + NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;

 // Extend the backward filtered target vector by zeros                

   for (i=SubFrLen; i < SubFrLen2; i++) 
     Dn[i] = 0.0f;

// Chose the sign of the impulse.                                        

   for (i=0; i<SubFrLen; i+=2)
   {
     if((Dn[i] + Dn[i+1]) >= 0.0f)
     {
		 p_sign[i/2] = 0x00000000;
//       p_sign[i/2] = 1.0f;
//       p_sign2[i/2] = 2.0f;
     }
     else
     {
		 p_sign[i/2] = 0x80000000;
//       p_sign[i/2] = -1.0f;
//       p_sign2[i/2] = -2.0f;
       Dn[i] = -Dn[i];
       Dn[i+1] = -Dn[i+1];
     }
   }
   p_sign[30] = p_sign[31] = 0x00000000;
//   p_sign[30] = p_sign[31] = 1.0f;
//   p_sign2[30] = p_sign2[31] = 2.0f;

// - Compute the search threshold after three pulses                 
// odd positions 
// Find maximum of Dn[i0]+Dn[i1]+Dn[i2]

   max0 = Dn[0];
   max1 = Dn[2];
   max2 = Dn[4];
   for (i=8; i < SubFrLen; i+=STEP)
   {
     if (Dn[i] > max0)   max0 = Dn[i];
     if (Dn[i+2] > max1) max1 = Dn[i+2];
     if (Dn[i+4] > max2) max2 = Dn[i+4];
   }
   max0 = max0 + max1 + max2;

// Find means of Dn[i0]+Dn[i1]+Dn[i]

   means = 0.0f;
   for (i=0; i < SubFrLen; i+=STEP)
     means += Dn[i+4] + Dn[i+2] + Dn[i];

   means *= 0.125f;  
   if (flags & SC_THRES)
     thres = means*0.25f + max0*0.75f;
   else
     thres = means + (max0-means)*0.5f;
 
 // even positions 
 // Find maximum of Dn[i0]+Dn[i1]+Dn[i2]

   max0 = Dn[1];
   max1 = Dn[3];
   max2 = Dn[5];
   for (i=9; i < SubFrLen; i+=STEP)
   {
     if (Dn[i] > max0)   max0 = Dn[i];
     if (Dn[i+2] > max1) max1 = Dn[i+2];
     if (Dn[i+4] > max2) max2 = Dn[i+4];
   }
   max0 = max0 + max1 + max2;

// Find means of Dn[i0]+Dn[i1]+Dn[i2] 

   means = 0.0f;
   for (i=1; i < SubFrLen; i+=STEP)
     means += Dn[i+4] + Dn[i+2] + Dn[i];
  
   means *= 0.125f;
   if (flags & SC_THRES)
     max1 = means*0.25f + max0*0.75f;
   else
     max1 = means + (max0-means)*0.5f; 

// Keep maximum threshold between odd and even position 

   if(max1 > thres) thres = max1;

// Modification of rrixiy[] to take signs into account.
//TIMER_STAMP(a);            
  ptr_ri0i1 = rri0i1;
  ptr_ri0i2 = rri0i2;
  ptr_ri0i3 = rri0i3;

  for(i0=0; i0<SubFrLen/2; i0+=STEP/2)
  {
	 for(i1=2/2; i1<SubFrLen/2; i1+=STEP/2)
     {
	   (int)*ptr_ri0i1++ = (asint(*ptr_ri0i1) ^ p_sign[i0] ^ p_sign[i1]);
	   (int)*ptr_ri0i2++ = (asint(*ptr_ri0i2) ^ p_sign[i0] ^ p_sign[i1+1]);
	   (int)*ptr_ri0i3++ = (asint(*ptr_ri0i3) ^ p_sign[i0] ^ p_sign[i1+2]);
     }
  }

  ptr_ri1i2 = rri1i2;
  ptr_ri1i3 = rri1i3;
  for(i1=2/2; i1<SubFrLen/2; i1+=STEP/2)
  {
	 for(i2=4/2; i2<SubFrLen2/2; i2+=STEP/2)
     {
	   (int)*ptr_ri1i2++ = (asint(*ptr_ri1i2) ^ p_sign[i1] ^ p_sign[i2]);
	   (int)*ptr_ri1i3++ = (asint(*ptr_ri1i3) ^ p_sign[i1] ^ p_sign[i2+1]);
     }
  }

  ptr_ri2i3 = rri2i3;
  for(i2=4/2; i2<SubFrLen2/2; i2+=STEP/2)
  {
	 for(i3=6/2; i3<SubFrLen2/2; i3+=STEP/2)
	   (int)*ptr_ri2i3++ = (asint(*ptr_ri2i3) ^ p_sign[i2] ^ p_sign[i3]);
  }

//TIMER_STAMP(b);            
fourPulseFlt(rr, Dn, thres, ip, code_shift);
//TIMER_STAMP(c);            

ip0 = ip[0];
ip1 = ip[1];
ip2 = ip[2];
ip3 = ip[3];
shif = *code_shift;

// Set the sign of impulses 

 i0 = (p_sign[(ip0 >> 1)]>=0?1:-1);
 i1 = (p_sign[(ip1 >> 1)]>=0?1:-1);
 i2 = (p_sign[(ip2 >> 1)]>=0?1:-1);
 i3 = (p_sign[(ip3 >> 1)]>=0?1:-1);

// Find the codeword corresponding to the selected positions 

 for(i=0; i<SubFrLen; i++) 
   cod[i] = 0.0f;

 if(shif > 0)
 {
   ip0++;
   ip1++;
   ip2++;
   ip3++;
 }
 
//printf("%3d %3d %3d %3d\n",ip0*i0,ip1*i1,ip2*i2,ip3*i3);
 cod[ip0] =  (float)i0;
 cod[ip1] =  (float)i1;
 if(ip2<SubFrLen)
   cod[ip2] = (float)i2;
 if(ip3<SubFrLen)
   cod[ip3] = (float)i3;

// find the filtered codeword 

 for (i=0; i < SubFrLen; i++) 
   y[i] = 0.0f;

 if(i0 > 0)
   for(i=ip0, j=0; i<SubFrLen; i++, j++)
	   y[i] = y[i] + h[j];
 else
   for(i=ip0, j=0; i<SubFrLen; i++, j++)
       y[i] = y[i] - h[j];

 if(i1 > 0)
   for(i=ip1, j=0; i<SubFrLen; i++, j++)
	   y[i] = y[i] + h[j];
 else
   for(i=ip1, j=0; i<SubFrLen; i++, j++)
       y[i] = y[i] - h[j];

 if(ip2<SubFrLen)
 {
   if(i2 > 0)
	 for(i=ip2, j=0; i<SubFrLen; i++, j++)
         y[i] = y[i] + h[j];
   else
	 for(i=ip2, j=0; i<SubFrLen; i++, j++)
         y[i] = y[i] - h[j];
 }

 if(ip3<SubFrLen)
 {
   if(i3 > 0)
	 for(i=ip3, j=0; i<SubFrLen; i++, j++)
       y[i] = y[i] + h[j];
   else
	 for(i=ip3, j=0; i<SubFrLen; i++, j++)
       y[i] = y[i] - h[j];
 }

// find codebook index;  17-bit address 

 *code_shift = shif;

 *sign = 0;
 if(i0 > 0) *sign += 1;
 if(i1 > 0) *sign += 2;
 if(i2 > 0) *sign += 4;
 if(i3 > 0) *sign += 8;

 i = ((ip3 >> 3) << 9) + ((ip2 >> 3) << 6) + ((ip1 >> 3) << 3) + (ip0 >> 3);
//TIMER_STAMP(d);

 return i;
}

//--------------------------------------------------------------------
int G_code(float X[], float Y[], float *gain_q)
{
   int i;
   float xy, yy, gain_nq; 
   int gain;
   float dist, dist_min;

// Compute scalar product <X[],Y[]>
   
	xy = DotProd(X,Y,SubFrLen);

// Be sure xy < yy 

   if(xy <= 0) 
   {
	 gain = 0;
	 *gain_q =FcbkGainTable[gain];
	 return(gain);
   }

// Compute scalar product <Y[],Y[]> 
  
   yy = DotProd(Y,Y,SubFrLen);

   if (yy != 0.0f)
     gain_nq = xy/yy * 0.5f;
   else
     gain_nq = 0.0f;

   gain = 0;
   dist_min = (float)fabs(gain_nq - FcbkGainTable[0]);
 
   for (i=1; i <NumOfGainLev; i++) 
   {
	 dist = (float)fabs(gain_nq - FcbkGainTable[i]);
	 if (dist < dist_min) 
	 {
		dist_min = dist;
		gain = i;
	 }
   }
   *gain_q = FcbkGainTable[gain];
   return(gain);
}



 //-------------------------------------------------------------------
 // Search the optimum positions of the four  pulses which maximize   
 //     square(correlation) / energy                                  
 // The search is performed in four  nested loops. At each loop, one  
 // pulse contribution is added to the correlation and energy.        
 //                                                                   
 // The fourth loop is entered only if the correlation due to the     
 //  contribution of the first three pulses exceeds the preset        
 //  threshold.                                                       
 //-------------------------------------------------------------------
void fourPulseFlt (float *rr, float *Dn, float thres, int ip[], int *shifPtr){

 // Default values 

   int ip0    = 0;
   int ip1    = 2;
   int ip2    = 4;
   int ip3    = 6;
   int shif   = 0;
   int  i0, i1, i2;
   int  k, time;
   int  shift, bestk, lasti2, inc;
   float psc    = 0.0f;
   float alpha  = 1.0f;
   float  ps0, ps1, ps2, alp0;
   float  alp1, alp2;
   float  ps0a, ps1a, ps2a;
   float *ptr_ri0i0,*ptr_ri1i1,*ptr_ri2i2;
   float *ptr_ri0i1,*ptr_ri0i2,*ptr_ri0i3;
   float *ptr_ri1i2,*ptr_ri1i3,*ptr_ri2i3;

   float *rri0i0,*rri1i1,*rri2i2,*rri3i3;
   float *rri0i1,*rri0i2,*rri0i3;
   float *rri1i2,*rri1i3,*rri2i3;
   
   float a[16];
   float t1,t2,*pntr;
   float dmax4, dmax5, dmax2, dmax3; //used for bypass
#if !OPT_PULSE4
   int i3;
   float ps3;
#endif

 time   = max_time + extra;

 // Four loops to search innovation code.
 // Init. pointers that depend on first loop
  
  rri0i0 = rr;
  rri1i1 = rri0i0 + NB_POS;
  rri2i2 = rri1i1 + NB_POS;
  rri3i3 = rri2i2 + NB_POS;

  rri0i1 = rri3i3 + NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;

 ptr_ri0i0 = rri0i0;    
 ptr_ri0i1 = rri0i1;
 ptr_ri0i2 = rri0i2;
 ptr_ri0i3 = rri0i3;

 // Compute the Dn max's

 dmax2 = dmax3 = dmax4 = dmax5 = -1000000.0f; //i.e., large negative number
 for (k = 2; k<SubFrLen2; k+=STEP)
 {
   if (Dn[k] > dmax2) dmax2 = Dn[k];
   if (Dn[k+1] > dmax3) dmax3 = Dn[k+1];
   if (Dn[k+2] > dmax4) dmax4 = Dn[k+2];
   if (Dn[k+3] > dmax5) dmax5 = Dn[k+3];
 }

// first pulse loop  
 for (i0=0; i0 < SubFrLen; i0 +=STEP)        
 {
   ps0  = Dn[i0];
   ps0a = Dn[i0+1];
   alp0 = *ptr_ri0i0++;

// Init. pointers that depand on second loop
 
   ptr_ri1i1 = rri1i1;    
   ptr_ri1i2 = rri1i2;
   ptr_ri1i3 = rri1i3;

   ps1 = ps0 + dmax2 + dmax4;
   ps1a = ps0a + dmax3 + dmax5;
   if (asint(ps1) < asint(thres) && asint(ps1a) < asint(thres))
   {
	 ptr_ri0i1 += NB_POS;
	 goto skipsecond;
   }

 // second pulse loop

   for (i1=2; i1 < SubFrLen; i1 +=STEP)     
   {
	 ps1  = ps0 + Dn[i1];
	 ps1a = ps0a + Dn[i1+1];

	 alp1 = alp0 + *ptr_ri1i1++ + *ptr_ri0i1++; 

// Init. pointers that depend on third loop
 
     ptr_ri2i2 = rri2i2;     
     ptr_ri2i3 = rri2i3;
     lasti2 = 4;
 
     ps2 = ps1 + dmax4;
     ps2a = ps1a + dmax5;
     if (asint(ps2) < asint(thres) && asint(ps2a) < asint(thres))
     {
	   i2 = 68;
	   goto skipthird;
     }

// third pulse loop

	 for (i2 = 4; i2 < SubFrLen2; i2 +=STEP)    
   {
	   ps2  = ps1 + Dn[i2];
	   ps2a = ps1a + Dn[i2+1];

// Threshold test and 4th pulse loop.  Since the probability of
// entering this is low, we cram as much of the 3rd-pulse-loop
// logic inside the threshold test.  So the computation of shift,
// the choice of ps2 vs ps2a, the computation of alp2, and the
// incrementing of the 02,12,22 pointers are all done there.
     
     if (asint(ps2) > asint(thres) || asint(ps2a) > asint(thres))
	   {
       shift = 0;
       if(asint(ps2a) > asint(ps2))
       {
         shift = 1;
         ps2   = ps2a;
       }

       inc = (i2 - lasti2) >> 3;
       lasti2 = i2;
       ptr_ri0i2 += inc;
       ptr_ri1i2 += inc;
       ptr_ri2i2 += inc;

       alp2 = alp1 + *ptr_ri2i2 + *ptr_ri0i2 + *ptr_ri1i2; 
       pntr = &Dn[6+shift];

#if OPT_PULSE4

  ASM
  {
    push esi;
    push ebx;

    mov esi,pntr;

;// First half of first loop

    fld DP [esi+4*8*0];
    fld DP [esi+4*8*1];
    fld DP [esi+4*8*2];
    fld DP [esi+4*8*3];

    fxch ST(3);
    fadd ps2;
    fxch ST(2);
    fadd ps2;
    fxch ST(1);
    fadd ps2;
    fxch ST(3);
    fadd ps2;

    fxch ST(2);
    fmul ST,ST(0);
    fxch ST(1);
    fmul ST,ST(0);
    fxch ST(3);
    fmul ST,ST(0);
    fxch ST(2);
    fmul ST,ST(0);

    fxch ST(1);
    fstp a[4*8];
    fxch ST(2);
    fstp a[4*9];
    fstp a[4*10];
    fstp a[4*11];

;// Second half of first loop

    fld DP [esi+4*8*4];
    fld DP [esi+4*8*5];
    fld DP [esi+4*8*6];
    fld DP [esi+4*8*7];

    fxch ST(3);
    fadd ps2;
    fxch ST(2);
    fadd ps2;
    fxch ST(1);
    fadd ps2;
    fxch ST(3);
    fadd ps2;

    fxch ST(2);
    fmul ST,ST(0);
    fxch ST(1);
    fmul ST,ST(0);
    fxch ST(3);
    fmul ST,ST(0);
    fxch ST(2);
    fmul ST,ST(0);

    fxch ST(1);
    fstp a[4*12];
    fxch ST(2);
    fstp a[4*13];
    fstp a[4*14];
    fstp a[4*15];

;// First half of second loop

    mov eax,rri3i3;
    mov ebx,ptr_ri0i3;
    mov ecx,ptr_ri1i3;
    mov edx,ptr_ri2i3;

    fld alp2;
    fld alp2;
    fld alp2;
    fld alp2;

    fxch ST(3);
    fadd DP [eax+4*0];
    fxch ST(2);
    fadd DP [eax+4*1];
    fxch ST(1);
    fadd DP [eax+4*2];
    fxch ST(3);
    fadd DP [eax+4*3];

    fxch ST(2);
    fadd DP [ebx+4*0];
    fxch ST(1);
    fadd DP [ebx+4*1];
    fxch ST(3);
    fadd DP [ebx+4*2];
    fxch ST(2);
    fadd DP [ebx+4*3];

    fxch ST(1);
    fadd DP [ecx+4*0];
    fxch ST(3);
    fadd DP [ecx+4*1];
    fxch ST(2);
    fadd DP [ecx+4*2];
    fxch ST(1);
    fadd DP [ecx+4*3];

    fxch ST(3);
    fadd DP [edx+4*0];
    fxch ST(2);
    fadd DP [edx+4*1];
    fxch ST(1);
    fadd DP [edx+4*2];
    fxch ST(3);
    fadd DP [edx+4*3];

    fxch ST(2);
    fstp a[4*0];
    fstp a[4*1];
    fxch ST(1);
    fstp a[4*2];
    fstp a[4*3];

;// Second half of second loop

    fld alp2;
    fld alp2;
    fld alp2;
    fld alp2;

    fxch ST(3);
    fadd DP [eax+4*4];
    fxch ST(2);
    fadd DP [eax+4*5];
    fxch ST(1);
    fadd DP [eax+4*6];
    fxch ST(3);
    fadd DP [eax+4*7];

    fxch ST(2);
    fadd DP [ebx+4*4];
    fxch ST(1);
    fadd DP [ebx+4*5];
    fxch ST(3);
    fadd DP [ebx+4*6];
    fxch ST(2);
    fadd DP [ebx+4*7];

    fxch ST(1);
    fadd DP [ecx+4*4];
    fxch ST(3);
    fadd DP [ecx+4*5];
    fxch ST(2);
    fadd DP [ecx+4*6];
    fxch ST(1);
    fadd DP [ecx+4*7];

    fxch ST(3);
    fadd DP [edx+4*4];
    fxch ST(2);
    fadd DP [edx+4*5];
    fxch ST(1);
    fadd DP [edx+4*6];
    fxch ST(3);
    fadd DP [edx+4*7];

    fxch ST(2);
    fstp a[4*4];
    fstp a[4*5];
    fxch ST(1);
    fstp a[4*6];
    fstp a[4*7];
    
    pop ebx;
    pop esi;
  }

#else

       for (k=0; k<8; k++)
       {
         ps3 = ps2 + *pntr;
         pntr += STEP;
         a[k+8] = ps3 * ps3;
       }

       for (k=0; k<8; k++)
         a[k] = alp2 + rri3i3[k] + ptr_ri0i3[k] + ptr_ri1i3[k] + ptr_ri2i3[k];

#endif

       bestk = -1;
       for (k=0; k<8; k++)
       {
         t1 = a[k+8] * alpha;
         t2 = psc * a[k];
         if (asint(t1) > asint(t2))
         {
           psc = a[k+8];
           alpha = a[k];
           bestk = k;
         }
       }
          
       if (bestk >= 0)
       {
         ip0 = i0;
         ip1 = i1;
         ip2 = i2;
         ip3 = 6 + (bestk << 3);
         shif = shift;
//#define t32 4294967296.0f
//		 printf("  %3d %3d %3d %3d %d %f %f %f\n",ip0,ip1,ip2,ip3,shift,psc/thres/thres,alpha/thres,(float)psc/(float)alpha/thres);
       }
       
       time--;
       if(time <= 0) 
         goto end_search;     
     }
     ptr_ri2i3 += NB_POS;
	 }
skipthird:
   inc = (i2 - lasti2) >> 3;
   ptr_ri0i2 += inc;
   ptr_ri1i2 += inc;
   ptr_ri2i2 += inc;
	 
 // end of for i2 = 

     ptr_ri0i2 -= NB_POS;
     ptr_ri1i3 += NB_POS;
   } 
skipsecond:

 // end of for i1 =

   ptr_ri0i2 += NB_POS;
   ptr_ri0i3 += NB_POS;
 }
 // end of for i0 = 

end_search:

extra = time;
 
 ip[0] = ip0;
 ip[1] = ip1;
 ip[2] = ip2;
 ip[3] = ip3;
 *shifPtr = shif;

 return;
}