|
|
//cb53.c - 5.3 rate codebook code
#include "opt.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <memory.h>
#include "typedef.h"
#include "cst_lbc.h"
#include "tab_lbc.h"
#include "util_lbc.h"
#include "exc_lbc.h"
#include "timer.h"
#include "mmxutil.h"
void fourPulseFlt (float *rr, float *Dn, float thres, int ip[], int *shiftPtr);
//--------------------------------------------------------
// Search-iteration budget carried over between calls of fourPulseFlt():
// that routine adds it to max_time on entry and stores the unused
// remainder back here on exit.
int extra;

// Restore the carried-over search budget to its initial value.
void reset_max_time(void)
{
  extra = 120;
}
//------------------------------------------------------------
// Fixed (algebraic) codebook search for one subframe.
//
//   X[]       target vector
//   h[]       impulse response; modified in place when the pitch
//             contribution is folded in (T0 < SubFrLen-2)
//   T0        pitch lag used for the fixed-gain pitch contribution
//   code[]    output: gain-scaled innovation, SubFrLen samples
//   ind_gain  output: index of the quantized codebook gain
//   shift     output: codeword shift reported by D4i64_LBC()
//   sign      output: pulse sign bits reported by D4i64_LBC()
//   gain_T0   fixed pitch gain applied at lag T0
//   flags     passed through to D4i64_LBC()
//
// Returns the codebook index produced by D4i64_LBC().
int ACELP_LBC_code(float X[], float h[], int T0, float code[], int *ind_gain,
                   int *shift, int *sign, float gain_T0, int flags)
{
  float corr[DIM_RR];            // correlations of h[]; reused below as the filtered codeword
  float target_corr[SubFrLen2];  // correlation of X[] with h[]
  float pulses[SubFrLen2];       // unscaled codeword chosen by the search
  float q_gain;
  int   codebook_index;
  int   n;
  const int pitch_active = (T0 < SubFrLen-2);

  // Fold the fixed-gain pitch contribution into the impulse response.
  if (pitch_active) {
    for (n = T0; n < SubFrLen; n++)
      h[n] += gain_T0*h[n-T0];
  }

  // Correlations of h[] needed by the codebook search.
  Cor_h(h, corr);

  // Correlation of the target vector with the impulse response.
  Cor_h_X(h, X, target_corr);

  // Codebook search. corr[] is handed over twice on purpose: once as the
  // correlation input and once as the buffer that receives the filtered
  // codeword, so after this call corr[] holds the filtered codeword.
  codebook_index = D4i64_LBC(target_corr, corr, h, pulses, corr, shift, sign, flags);

  // Quantize the innovation gain against the filtered codeword.
  *ind_gain = G_code(X, corr, &q_gain);

  // Scale the codeword by the quantized gain.
  for (n = 0; n < SubFrLen; n++)
    code[n] = pulses[n]*q_gain;

  // Fold the fixed-gain pitch contribution into code[].
  if (pitch_active) {
    for (n = T0; n < SubFrLen; n++)
      code[n] += code[n-T0]*gain_T0;
  }

  return codebook_index;
}
//---------------------------------------------------------------
// Compute the correlations of the impulse response needed by the
// codebook search.
//
//   H[]   impulse response, SubFrLen samples
//   rr[]  output, laid out as four NB_POS-long arrays
//         (rri0i0, rri1i1, rri2i2, rri3i3) followed by six MSIZE-long
//         arrays (rri0i1, rri0i2, rri0i3, rri1i2, rri1i3, rri2i3)
void Cor_h(float *H, float *rr)
{
  float  padded[SubFrLen2];   // H[] preceded by four zero samples
  float *x01, *x02, *x03, *x12, *x13, *x23;
  float *dst[4];              // the four arrays written by the current pass
  float *lag;                 // padded[] advanced by the current lag
  float  acc;
  int    pass, tail, row, col, base, off, j, n;

  for (n = 0; n < 4; n++)
    padded[n] = 0.0f;
  for (n = 0; n < SubFrLen; n++)
    padded[n+4] = H[n];

  // Start of each cross-term array inside rr[].
  x01 = rr + 4*NB_POS;
  x02 = x01 + MSIZE;
  x03 = x02 + MSIZE;
  x12 = x03 + MSIZE;
  x13 = x12 + MSIZE;
  x23 = x13 + MSIZE;

  // Energy terms: one running sum of squares, sampled every two taps and
  // stored (halved) from the last position backwards, cycling through
  // rri3i3, rri2i2, rri1i1, rri0i0.
  acc = 0.0f;
  base = 0;
  for (n = NB_POS-1; n >= 0; n--) {
    for (j = 0; j < 4; j++) {
      acc += padded[base]*padded[base] + padded[base+1]*padded[base+1];
      rr[(3-j)*NB_POS + n] = acc*0.5f;
      base += 2;
    }
  }

  // Cross terms, three passes using lags of 2, 4 and 6 samples.
  // In pass p the first (3-p) destinations start at the last element of
  // their array and step back NB_POS per row; the remaining ones start
  // one element earlier and step back 1 per row. The same first (3-p)
  // destinations also receive one extra value at the end of every row.
  for (pass = 0; pass < 3; pass++) {
    switch (pass) {
    case 0:  dst[0] = x23; dst[1] = x12; dst[2] = x01; dst[3] = x03; break;
    case 1:  dst[0] = x13; dst[1] = x02; dst[2] = x13; dst[3] = x02; break;
    default: dst[0] = x03; dst[1] = x23; dst[2] = x12; dst[3] = x01; break;
    }
    tail = 3 - pass;
    for (j = 0; j < 4; j++)
      dst[j] += MSIZE - ((j < tail) ? 1 : 2);
    lag = padded + 2*(pass+1);

    for (row = 0; row < NB_POS; row++) {
      acc = 0.0f;
      base = 0;
      off = 0;

      // Full groups of eight taps: each pair of taps extends the running
      // sum and stores it into the next destination in turn.
      for (col = row+1; col < NB_POS; col++) {
        for (j = 0; j < 4; j++) {
          acc += padded[base]*lag[base] + padded[base+1]*lag[base+1];
          dst[j][off] = acc;
          base += 2;
        }
        off -= (NB_POS+1);
      }

      // Partial last group.
      for (j = 0; j < tail; j++) {
        acc += padded[base]*lag[base] + padded[base+1]*lag[base+1];
        dst[j][off] = acc;
        base += 2;
      }

      lag += STEP;
      for (j = 0; j < 4; j++)
        dst[j] -= (j < tail) ? NB_POS : 1;
    }
  }
}
//---------------------------------------------------------------------------
// Correlate the target vector with the impulse response:
// D[i] = DotProd(&X[i], h, SubFrLen-i) for every i in [0, SubFrLen).
void Cor_h_X(float h[],float X[],float D[])
{
  int start = 0;

  while (start < SubFrLen) {
    D[start] = DotProd(X + start, h, SubFrLen - start);
    start++;
  }
}
//-------------------------------------------------------------------------
// Evaluate the eight candidate positions of the fourth pulse.
//
//   Dn          not used by this routine (kept for interface compatibility)
//   rri3i3      energy terms of the fourth pulse, 8 entries
//   ptr_ri0i3,
//   ptr_ri1i3,
//   ptr_ri2i3   cross terms between pulses 0/1/2 and the fourth pulse,
//               8 entries each
//   ptr         first candidate correlation value; successive candidates
//               are STEP floats apart
//   ps2, alp2   correlation and energy accumulated for the first three pulses
//   psbest,
//   abest       in/out: best squared correlation and matching energy so far
//
// A candidate k replaces the current best when
//   (ps2 + ptr[k*STEP])^2 * (*abest) > (*psbest) * energy(k),
// i.e. when its squared-correlation / energy ratio is larger, compared by
// cross-multiplication rather than division.
//
// Returns the index (0..7) of the last candidate that became the new best,
// or -1 when none improved on *psbest / *abest.
//
// The return type is now spelled out; the definition previously relied on
// implicit int, which C99 and later no longer allow.
int Find_Pulse4(float *Dn,float *rri3i3,float *ptr_ri0i3,float *ptr_ri1i3,
                float *ptr_ri2i3,float *ptr,
                float ps2,float alp2,float *psbest,float *abest)
{
  int k,bestk;
  float ps3;
  float a[16];   // a[0..7]: energies, a[8..15]: squared correlations

  (void)Dn;

  for (k=0; k<8; k++) {
    ps3 = ps2 + *ptr;
    a[k] = alp2 + rri3i3[k] + ptr_ri0i3[k] + ptr_ri1i3[k] + ptr_ri2i3[k];
    a[k+8] = ps3 * ps3;
    ptr += STEP;
  }

  bestk = -1;
  for (k=0; k<8; k++) {
    if((a[k+8] * (*abest)) > ((*psbest) * a[k])) {
      *psbest = a[k+8];
      *abest = a[k];
      bestk = k;
    }
  }
  return(bestk);
}
//-------------------------------------------------------------------------
// routine D4i64_LBC
// ~~~~~~~~~
// Algebraic codebook for LBC.
// -> 17 bits; 4 pulses in a frame of 60 samples
//
// The code length is 60, containing 4 nonzero pulses i0, i1, i2, i3.
// Each pulse can have 8 possible positions (positive or negative):
//
// i0 (+-1) : 0, 8, 16, 24, 32, 40, 48, 56
// i1 (+-1) : 2, 10, 18, 26, 34, 42, 50, 58
// i2 (+-1) : 4, 12, 20, 28, 36, 44, 52, (60)
// i3 (+-1) : 6, 14, 22, 30, 38, 46, 54, (62)
//
// All the pulses can be shifted by one.
// The last position of each of the last 2 pulses falls outside the
// frame and signifies that the pulse is not present.
//
// Input arguments:
//
// Dn[] Correlation between target vector and impulse response h[]
// rr[] Correlations of impulse response h[]
// h[] Impulse response of filters
//
// Output arguments:
//
// cod[] Selected algebraic codeword
// y[] Filtered codeword
// code_shift Shift of the codeword
// sign Signs of the 4 pulses.
//
// return: Index of selected codevector
//
// The threshold controls whether a section of the innovative
// codebook should be searched or not.
//
//--------------------------------------------------------------------
// Algebraic codebook search: chooses pulse signs, computes the search
// threshold, applies the signs to the cross-correlation tables, runs
// fourPulseFlt() and builds the codeword, the filtered codeword and the
// packed index.
//
//   Dn[]        in/out: correlation of target with h[]; must have room for
//               SubFrLen2 entries (zero-extended and sign-adjusted here)
//   rr[]        in/out: correlation tables from Cor_h(); the cross terms
//               are sign-adjusted in place
//   h[]         impulse response
//   cod[]       output: codeword with +/-1 at the pulse positions
//   y[]         output: filtered codeword. It is written only after the
//               search has finished with rr[], so it may share storage
//               with rr[]
//   code_shift  output: 1 when all pulses are shifted by one sample, else 0
//   sign        output: bit k is set when pulse k is positive
//   flags       SC_THRES selects the stricter search threshold
//
// Returns the four position indices (position >> 3) packed three bits each.
int D4i64_LBC(float Dn[], float rr[], float h[], float cod[], float y[], int *code_shift, int *sign, int flags)
{
  int   ip[4];                 // pulse positions found by the search
  int   amp[4];                // +1 or -1 for each pulse
  int   negated[SubFrLen2/2];  // 1 where the Dn[] pair was sign-inverted
  float *rri0i1, *rri0i2, *rri0i3;
  float *rri1i2, *rri1i3, *rri2i3;
  float *pa, *pb, *pc;
  float means, max0, max1, max2, cand, thres;
  int   i0, i1, i2, i3;
  int   i, j, k, off, shif;

  // Cross-term tables follow the four NB_POS-long energy tables in rr[].
  rri0i1 = rr + 4*NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;

  // Extend the backward filtered target vector by zeros
  for (i = SubFrLen; i < SubFrLen2; i++)
    Dn[i] = 0.0f;

  // Choose the sign of each pulse position. A position and its shifted
  // neighbour share one sign; where the pair sums negative, both Dn[]
  // entries are inverted so the search only sees the magnitude.
  for (i = 0; i < SubFrLen; i += 2) {
    if ((Dn[i] + Dn[i+1]) >= 0.0f) {
      negated[i/2] = 0;
    } else {
      negated[i/2] = 1;
      Dn[i]   = -Dn[i];
      Dn[i+1] = -Dn[i+1];
    }
  }
  // Positions past the end of the subframe count as positive.
  for (i = SubFrLen/2; i < SubFrLen2/2; i++)
    negated[i] = 0;

  // Search threshold after three pulses, evaluated once for the unshifted
  // grid (off = 0) and once for the grid shifted by one sample (off = 1);
  // the larger of the two values is kept.
  thres = 0.0f;
  for (off = 0; off < 2; off++) {
    // Maximum of Dn[i0]+Dn[i1]+Dn[i2]
    max0 = Dn[off];
    max1 = Dn[off+2];
    max2 = Dn[off+4];
    for (i = 8+off; i < SubFrLen; i += STEP) {
      if (Dn[i]   > max0) max0 = Dn[i];
      if (Dn[i+2] > max1) max1 = Dn[i+2];
      if (Dn[i+4] > max2) max2 = Dn[i+4];
    }
    max0 = max0 + max1 + max2;

    // Mean of Dn[i0]+Dn[i1]+Dn[i2]
    means = 0.0f;
    for (i = off; i < SubFrLen; i += STEP)
      means += Dn[i+4] + Dn[i+2] + Dn[i];
    means *= 0.125f;

    if (flags & SC_THRES)
      cand = means*0.25f + max0*0.75f;
    else
      cand = means + (max0-means)*0.5f;

    if (off == 0 || cand > thres)
      thres = cand;
  }

  // Take the signs into account in the cross-correlation tables: an entry
  // is negated when its two pulses have opposite signs. Plain float
  // negation flips the same sign bit the earlier integer XOR flipped,
  // without a cast used as an lvalue or an unsequenced pointer increment.
  pa = rri0i1;
  pb = rri0i2;
  pc = rri0i3;
  for (i0 = 0; i0 < SubFrLen/2; i0 += STEP/2) {
    for (i1 = 2/2; i1 < SubFrLen/2; i1 += STEP/2) {
      if (negated[i0] != negated[i1])   *pa = -*pa;
      if (negated[i0] != negated[i1+1]) *pb = -*pb;
      if (negated[i0] != negated[i1+2]) *pc = -*pc;
      pa++;
      pb++;
      pc++;
    }
  }

  pa = rri1i2;
  pb = rri1i3;
  for (i1 = 2/2; i1 < SubFrLen/2; i1 += STEP/2) {
    for (i2 = 4/2; i2 < SubFrLen2/2; i2 += STEP/2) {
      if (negated[i1] != negated[i2])   *pa = -*pa;
      if (negated[i1] != negated[i2+1]) *pb = -*pb;
      pa++;
      pb++;
    }
  }

  pa = rri2i3;
  for (i2 = 4/2; i2 < SubFrLen2/2; i2 += STEP/2) {
    for (i3 = 6/2; i3 < SubFrLen2/2; i3 += STEP/2) {
      if (negated[i2] != negated[i3]) *pa = -*pa;
      pa++;
    }
  }

  // Pulse position search.
  fourPulseFlt(rr, Dn, thres, ip, code_shift);
  shif = *code_shift;

  // Set the sign of the impulses (looked up at the unshifted positions).
  for (k = 0; k < 4; k++)
    amp[k] = negated[ip[k] >> 1] ? -1 : 1;

  // Apply the common one-sample shift.
  if (shif > 0) {
    for (k = 0; k < 4; k++)
      ip[k]++;
  }

  // Codeword for the selected positions; a pulse whose position falls
  // outside the subframe is not present.
  for (i = 0; i < SubFrLen; i++)
    cod[i] = 0.0f;
  for (k = 0; k < 4; k++) {
    if (ip[k] < SubFrLen)
      cod[ip[k]] = (float)amp[k];
  }

  // Filtered codeword: add or subtract h[] starting at each pulse.
  // rr[] is no longer needed at this point, so y[] may overwrite it.
  for (i = 0; i < SubFrLen; i++)
    y[i] = 0.0f;
  for (k = 0; k < 4; k++) {
    if (ip[k] >= SubFrLen)
      continue;
    if (amp[k] > 0) {
      for (i = ip[k], j = 0; i < SubFrLen; i++, j++)
        y[i] = y[i] + h[j];
    } else {
      for (i = ip[k], j = 0; i < SubFrLen; i++, j++)
        y[i] = y[i] - h[j];
    }
  }

  *code_shift = shif;

  // One sign bit per pulse, set when the pulse is positive.
  *sign = 0;
  for (k = 0; k < 4; k++) {
    if (amp[k] > 0)
      *sign += (1 << k);
  }

  // Pack the four position indices, three bits each.
  return ((ip[3] >> 3) << 9) + ((ip[2] >> 3) << 6) + ((ip[1] >> 3) << 3) + (ip[0] >> 3);
}
//--------------------------------------------------------------------
// Quantize the codebook gain.
//
//   X[]     target vector
//   Y[]     filtered codeword
//   gain_q  output: quantized gain taken from FcbkGainTable
//
// Returns the index of the chosen FcbkGainTable entry. Index 0 is returned
// straight away when <X,Y> is not positive.
int G_code(float X[], float Y[], float *gain_q)
{
  float cross, energy, wanted;
  float err, best_err;
  int   level;
  int   best = 0;

  // Scalar product <X[],Y[]>
  cross = DotProd(X,Y,SubFrLen);
  if (cross <= 0) {
    *gain_q = FcbkGainTable[0];
    return 0;
  }

  // Scalar product <Y[],Y[]>
  energy = DotProd(Y,Y,SubFrLen);

  // Unquantized gain: half of <X,Y>/<Y,Y>, or zero when Y has no energy.
  if (energy == 0.0f)
    wanted = 0.0f;
  else
    wanted = cross/energy * 0.5f;

  // Nearest table entry; the lowest index wins on ties.
  best_err = (float)fabs(wanted - FcbkGainTable[0]);
  for (level = 1; level < NumOfGainLev; level++) {
    err = (float)fabs(wanted - FcbkGainTable[level]);
    if (err < best_err) {
      best_err = err;
      best = level;
    }
  }

  *gain_q = FcbkGainTable[best];
  return best;
}
//-------------------------------------------------------------------
// Search the optimum positions of the four pulses which maximize
// square(correlation) / energy
// The search is performed in four nested loops. At each loop, one
// pulse contribution is added to the correlation and energy.
//
// The fourth loop is entered only if the correlation due to the
// contribution of the first three pulses exceeds the preset
// threshold.
//-------------------------------------------------------------------
// Nested search for the four pulse positions.
//
//   rr[]     correlation tables laid out as in Cor_h(), already sign-adjusted
//   Dn[]     sign-adjusted target/impulse-response correlation, zero-extended
//            to SubFrLen2 entries
//   thres    correlation threshold that gates the fourth-pulse evaluation
//   ip[]     output: the four selected positions (before any shift)
//   shifPtr  output: 0 for the unshifted grid, 1 for the grid shifted by one
//
// The number of fourth-pulse evaluations is limited to max_time + extra;
// whatever is left of that budget is stored back into extra on exit.
//
// NOTE(review): asint() is defined outside this file; it presumably
// reinterprets a float's bits as an int so that floats can be compared with
// integer instructions. Such comparisons only order values like a float
// comparison when the values are non-negative - confirm against its
// definition.
void fourPulseFlt (float *rr, float *Dn, float thres, int ip[], int *shifPtr){
  // Default values
  int ip0 = 0;
  int ip1 = 2;
  int ip2 = 4;
  int ip3 = 6;
  int shif = 0;
  int i0, i1, i2;
  int k, time;
  int shift, bestk, lasti2, inc;
  float psc = 0.0f;
  float alpha = 1.0f;
  float ps0, ps1, ps2, alp0;
  float alp1, alp2;
  float ps0a, ps1a, ps2a;
  float *ptr_ri0i0,*ptr_ri1i1,*ptr_ri2i2;
  float *ptr_ri0i1,*ptr_ri0i2,*ptr_ri0i3;
  float *ptr_ri1i2,*ptr_ri1i3,*ptr_ri2i3;
  float *rri0i0,*rri1i1,*rri2i2,*rri3i3;
  float *rri0i1,*rri0i2,*rri0i3;
  float *rri1i2,*rri1i3,*rri2i3;
  float a[16];     // a[0..7]: energies, a[8..15]: squared correlations
  float t1,t2,*pntr;
  float dmax4, dmax5, dmax2, dmax3; //used for bypass
#if !OPT_PULSE4
  float ps3;
#endif

  time = max_time + extra;

  // Four loops to search innovation code.
  // Init. pointers that depend on first loop
  rri0i0 = rr;
  rri1i1 = rri0i0 + NB_POS;
  rri2i2 = rri1i1 + NB_POS;
  rri3i3 = rri2i2 + NB_POS;
  rri0i1 = rri3i3 + NB_POS;
  rri0i2 = rri0i1 + MSIZE;
  rri0i3 = rri0i2 + MSIZE;
  rri1i2 = rri0i3 + MSIZE;
  rri1i3 = rri1i2 + MSIZE;
  rri2i3 = rri1i3 + MSIZE;

  ptr_ri0i0 = rri0i0;
  ptr_ri0i1 = rri0i1;
  ptr_ri0i2 = rri0i2;
  ptr_ri0i3 = rri0i3;

  // Compute the Dn max's: largest value on each of the position grids
  // starting at 2, 3, 4 and 5. They bound what later pulses can add and
  // let whole loops be bypassed.
  dmax2 = dmax3 = dmax4 = dmax5 = -1000000.0f; //i.e., large negative number
  for (k = 2; k<SubFrLen2; k+=STEP) {
    if (Dn[k] > dmax2) dmax2 = Dn[k];
    if (Dn[k+1] > dmax3) dmax3 = Dn[k+1];
    if (Dn[k+2] > dmax4) dmax4 = Dn[k+2];
    if (Dn[k+3] > dmax5) dmax5 = Dn[k+3];
  }

  // first pulse loop
  for (i0=0; i0 < SubFrLen; i0 +=STEP) {
    ps0 = Dn[i0];
    ps0a = Dn[i0+1];
    alp0 = *ptr_ri0i0++;

    // Init. pointers that depend on second loop
    ptr_ri1i1 = rri1i1;
    ptr_ri1i2 = rri1i2;
    ptr_ri1i3 = rri1i3;

    // Bypass the second loop when even the best second and third pulses
    // cannot reach the threshold on either grid. ptr_ri0i1 is advanced by
    // the NB_POS entries the skipped loop would have consumed.
    ps1 = ps0 + dmax2 + dmax4;
    ps1a = ps0a + dmax3 + dmax5;
    if (asint(ps1) < asint(thres) && asint(ps1a) < asint(thres)) {
      ptr_ri0i1 += NB_POS;
      goto skipsecond;
    }

    // second pulse loop
    for (i1=2; i1 < SubFrLen; i1 +=STEP) {
      ps1 = ps0 + Dn[i1];
      ps1a = ps0a + Dn[i1+1];

      alp1 = alp0 + *ptr_ri1i1++ + *ptr_ri0i1++;

      // Init. pointers that depend on third loop
      ptr_ri2i2 = rri2i2;
      ptr_ri2i3 = rri2i3;
      lasti2 = 4;

      // Bypass the third loop when even the best third pulse cannot reach
      // the threshold. i2 is set to the value it has when the loop runs to
      // completion so the pointer catch-up at skipthird still works.
      ps2 = ps1 + dmax4;
      ps2a = ps1a + dmax5;
      if (asint(ps2) < asint(thres) && asint(ps2a) < asint(thres)) {
        i2 = 68;
        goto skipthird;
      }

      // third pulse loop
      for (i2 = 4; i2 < SubFrLen2; i2 +=STEP) {
        ps2 = ps1 + Dn[i2];
        ps2a = ps1a + Dn[i2+1];

        // Threshold test and 4th pulse loop. Since the probability of
        // entering this is low, we cram as much of the 3rd-pulse-loop
        // logic inside the threshold test. So the computation of shift,
        // the choice of ps2 vs ps2a, the computation of alp2, and the
        // incrementing of the 02,12,22 pointers are all done there.
        if (asint(ps2) > asint(thres) || asint(ps2a) > asint(thres)) {
          shift = 0;
          if(asint(ps2a) > asint(ps2)) {
            shift = 1;
            ps2 = ps2a;
          }

          // Catch the 02/12/22 pointers up with i2; they are only moved
          // when actually needed.
          inc = (i2 - lasti2) >> 3;
          lasti2 = i2;
          ptr_ri0i2 += inc;
          ptr_ri1i2 += inc;
          ptr_ri2i2 += inc;

          alp2 = alp1 + *ptr_ri2i2 + *ptr_ri0i2 + *ptr_ri1i2;
          pntr = &Dn[6+shift];

#if OPT_PULSE4
          // Hand-scheduled x87 version, intended to fill a[] like the C
          // code in the #else branch (instruction sequence kept as is, not
          // re-verified here). One instruction per line: if ASM expands to
          // an MSVC-style __asm block, ';' starts a comment, so several
          // instructions must not share a line.
          ASM {
            push esi;
            push ebx;

            mov esi,pntr;

            ;// First half of first loop
            fld DP [esi+4*8*0];
            fld DP [esi+4*8*1];
            fld DP [esi+4*8*2];
            fld DP [esi+4*8*3];

            fxch ST(3);
            fadd ps2;
            fxch ST(2);
            fadd ps2;
            fxch ST(1);
            fadd ps2;
            fxch ST(3);
            fadd ps2;

            fxch ST(2);
            fmul ST,ST(0);
            fxch ST(1);
            fmul ST,ST(0);
            fxch ST(3);
            fmul ST,ST(0);
            fxch ST(2);
            fmul ST,ST(0);

            fxch ST(1);
            fstp a[4*8];
            fxch ST(2);
            fstp a[4*9];
            fstp a[4*10];
            fstp a[4*11];

            ;// Second half of first loop
            fld DP [esi+4*8*4];
            fld DP [esi+4*8*5];
            fld DP [esi+4*8*6];
            fld DP [esi+4*8*7];

            fxch ST(3);
            fadd ps2;
            fxch ST(2);
            fadd ps2;
            fxch ST(1);
            fadd ps2;
            fxch ST(3);
            fadd ps2;

            fxch ST(2);
            fmul ST,ST(0);
            fxch ST(1);
            fmul ST,ST(0);
            fxch ST(3);
            fmul ST,ST(0);
            fxch ST(2);
            fmul ST,ST(0);

            fxch ST(1);
            fstp a[4*12];
            fxch ST(2);
            fstp a[4*13];
            fstp a[4*14];
            fstp a[4*15];

            ;// First half of second loop
            mov eax,rri3i3;
            mov ebx,ptr_ri0i3;
            mov ecx,ptr_ri1i3;
            mov edx,ptr_ri2i3;

            fld alp2;
            fld alp2;
            fld alp2;
            fld alp2;

            fxch ST(3);
            fadd DP [eax+4*0];
            fxch ST(2);
            fadd DP [eax+4*1];
            fxch ST(1);
            fadd DP [eax+4*2];
            fxch ST(3);
            fadd DP [eax+4*3];

            fxch ST(2);
            fadd DP [ebx+4*0];
            fxch ST(1);
            fadd DP [ebx+4*1];
            fxch ST(3);
            fadd DP [ebx+4*2];
            fxch ST(2);
            fadd DP [ebx+4*3];

            fxch ST(1);
            fadd DP [ecx+4*0];
            fxch ST(3);
            fadd DP [ecx+4*1];
            fxch ST(2);
            fadd DP [ecx+4*2];
            fxch ST(1);
            fadd DP [ecx+4*3];

            fxch ST(3);
            fadd DP [edx+4*0];
            fxch ST(2);
            fadd DP [edx+4*1];
            fxch ST(1);
            fadd DP [edx+4*2];
            fxch ST(3);
            fadd DP [edx+4*3];

            fxch ST(2);
            fstp a[4*0];
            fstp a[4*1];
            fxch ST(1);
            fstp a[4*2];
            fstp a[4*3];

            ;// Second half of second loop
            fld alp2;
            fld alp2;
            fld alp2;
            fld alp2;

            fxch ST(3);
            fadd DP [eax+4*4];
            fxch ST(2);
            fadd DP [eax+4*5];
            fxch ST(1);
            fadd DP [eax+4*6];
            fxch ST(3);
            fadd DP [eax+4*7];

            fxch ST(2);
            fadd DP [ebx+4*4];
            fxch ST(1);
            fadd DP [ebx+4*5];
            fxch ST(3);
            fadd DP [ebx+4*6];
            fxch ST(2);
            fadd DP [ebx+4*7];

            fxch ST(1);
            fadd DP [ecx+4*4];
            fxch ST(3);
            fadd DP [ecx+4*5];
            fxch ST(2);
            fadd DP [ecx+4*6];
            fxch ST(1);
            fadd DP [ecx+4*7];

            fxch ST(3);
            fadd DP [edx+4*4];
            fxch ST(2);
            fadd DP [edx+4*5];
            fxch ST(1);
            fadd DP [edx+4*6];
            fxch ST(3);
            fadd DP [edx+4*7];

            fxch ST(2);
            fstp a[4*4];
            fstp a[4*5];
            fxch ST(1);
            fstp a[4*6];
            fstp a[4*7];

            pop ebx;
            pop esi;
          }
#else
          // Squared correlation for each of the eight 4th-pulse positions.
          for (k=0; k<8; k++) {
            ps3 = ps2 + *pntr;
            pntr += STEP;
            a[k+8] = ps3 * ps3;
          }

          // Energy for each of the eight 4th-pulse positions.
          for (k=0; k<8; k++)
            a[k] = alp2 + rri3i3[k] + ptr_ri0i3[k] + ptr_ri1i3[k] + ptr_ri2i3[k];
#endif

          // Keep the candidate with the largest squared-correlation/energy
          // ratio, compared by cross-multiplication.
          bestk = -1;
          for (k=0; k<8; k++) {
            t1 = a[k+8] * alpha;
            t2 = psc * a[k];
            if (asint(t1) > asint(t2)) {
              psc = a[k+8];
              alpha = a[k];
              bestk = k;
            }
          }

          if (bestk >= 0) {
            ip0 = i0;
            ip1 = i1;
            ip2 = i2;
            ip3 = 6 + (bestk << 3);
            shif = shift;
          }

          // Each 4th-pulse evaluation uses up one unit of the budget.
          time--;
          if(time <= 0)
            goto end_search;
        }
        ptr_ri2i3 += NB_POS;
      }
    skipthird:
      // Finish the deferred advance so that each of the 02/12/22 pointers
      // has moved NB_POS entries in total for this i1.
      inc = (i2 - lasti2) >> 3;
      ptr_ri0i2 += inc;
      ptr_ri1i2 += inc;
      ptr_ri2i2 += inc;
      // end of for i2 =

      ptr_ri0i2 -= NB_POS;   // rewind: the same 02 row is used for every i1
      ptr_ri1i3 += NB_POS;
    }
  skipsecond:
    // end of for i1 =
    ptr_ri0i2 += NB_POS;
    ptr_ri0i3 += NB_POS;
  } // end of for i0 =

end_search:
  extra = time;     // carry the unused budget over to the next call
  ip[0] = ip0;
  ip[1] = ip1;
  ip[2] = ip2;
  ip[3] = ip3;
  *shifPtr = shif;

  return;
}
|