|
|
// SAC MMx utilities
#include <memory.h>
#include "mmxutil.h"
#include "opt.h"
#define I2FTEST 0
#if I2FTEST
#include "stdio.h"
#endif
//------------------------------------------------------
int IsMMX() // does the processor I'm running have MMX(tm) technology?
{ int retu;
#ifdef _ALPHA_
return 0; #endif
#ifdef _X86_
__asm { push ebx pushfd pop edx mov eax,edx xor edx,200000h push edx popfd pushfd pop edx //
// DON'T do this. This clears EAX, but the code is relying
// on edx being 0 in the bail out case!!!
//
// -mikeg
//
// xor eax,edx
//
//
xor edx,eax //This is the right way
je no_cpuid
mov eax,1 _emit 0x0f //CPUID magic incantation
_emit 0xa2 and edx,000800000h shr edx,23 no_cpuid: mov retu,edx pop ebx } return(retu); #endif
} //------------------------------------------------------
/* The following 4 routines make an 8-byte-aligned 'output' array
from an 'input' array with various alignments. MakeAlignedN assumes that 'input' starts on an address equal to N mod 8. For now we only handle even N. */
//------------------------------------------------------
void MakeAligned0(void *input, void *output, int numbytes) { memcpy(output,input,numbytes); } //------------------------------------------------------
void MakeAligned2(void *input, void *output, int numbytes) { memcpy(output,input,numbytes); } //------------------------------------------------------
void MakeAligned4(void *input, void *output, int numbytes) { memcpy(output,input,numbytes); } //------------------------------------------------------
void MakeAligned6(void *input, void *output, int numbytes) { memcpy(output,input,numbytes); }
//------------------------------------------------------
int FloatToShortScaled(float *input, short *output, int len, int guard) { int max;
/* Convert an array of floats to an array of shorts with dynamic scaling.
If guard=0 the array is scaled so that the largest power of 2 contained in the input comes out as 16384, which means all values fit in 16 bits without overflow. If guard>0 the outputs are shifted an extra 'guard' bits to the right. */
max = FloatMaxExp(input, len); ScaleFloatToShort(input, output, len, max + guard);
return max; }
int FloatToIntScaled(float *input, int *output, int len, int guard) { int max;
/* Convert an array of floats to an array of shorts with dynamic scaling.
If guard=0 the array is scaled so that the largest power of 2 contained in the input comes out as 2^30, which means all values fit in 32 bits without overflow. If guard>0 the outputs are shifted an extra 'guard' bits to the right. */
max = FloatMaxExp(input, len); ScaleFloatToInt(input, output, len, max + guard);
return max; }
int FloatMaxExp(float *input, int len) { int max;
#if ASM_FTOSS
ASM { mov esi,input; xor eax,eax; mov ebx,len; xor edi,edi; // max
loop2: mov ecx,DP[esi+4*eax]; mov edx,DP[esi+4*eax+4];
and ecx,07f800000h; and edx,07f800000h;
cmp edi,ecx; jge skip1; mov edi,ecx; skip1:
cmp edi,edx; jge skip2; mov edi,edx; skip2:
add eax,2; cmp eax,ebx; jl loop2;
mov max,edi; }
#else
int exp,i;
max = 0; for (i=0; i<len; i++) { exp = (*((int *)(input + i))) & 0x7f800000; if (exp > max) max = exp; } #endif
return max >> 23; }
void ScaleFloatToShort(float *input, short *output, int len, int newmax) { int i; float scale; /*
If max exponent is 14, we want a scale factor of 1, since then values will be at most +/- 32727. So scale factor multiplier should be 2^(14 - max - guard). But 'max' has the exponent bias built in, so we must add BIAS once to the exponent to get a "real" exponent. But then we want a FP exponent that has bias, so we need to add BIAS again! So we get 2^(2*BIAS+14 - max - guard). 2*BIAS+14 is 254 + 14 = 252+12, so it's 0x86000000 (first 9 bits 1 0000 1100) */
i = 0x86000000 - (newmax << 23); scale = (*(float *)&i);
#if ASM_FTOSS
ASM { mov esi,input; mov edi,output; xor eax,eax; mov ebx,len;
loop1: fld DP[esi+4*eax]; fmul scale; fld DP[esi+4*eax+4]; fmul scale; fxch(1); fistp WP[edi+2*eax]; fistp WP[edi+2*eax+2];
add eax,2; cmp eax,ebx; jl loop1; }
#else
for (i=0; i<len; i++) output[i] = (short)(input[i]*scale);
#endif
return; }
void ConstFloatToShort(float *input, short *output, int len, float scale) {
#if ASM_FTOSS
ASM { mov esi,input; mov edi,output; xor eax,eax; mov ebx,len;
loop1: fld DP[esi+4*eax]; fmul scale; fld DP[esi+4*eax+4]; fmul scale; fxch(1); fistp WP[edi+2*eax]; fistp WP[edi+2*eax+2];
add eax,2; cmp eax,ebx; jl loop1; }
#else
int i;
for (i=0; i<len; i++) output[i] = (short)(input[i]*scale);
#endif
return; }
//------------------------------------------------------
void ScaleFloatToInt(float *input, int *output, int len, int newmax) { int i; float scale;
i = 0x8E000000 - (newmax << 23); scale = (*(float *)&i);
#if ASM_FTOSS
ASM { mov esi,input; mov edi,output; xor eax,eax; mov ebx,len;
loop1: fld DP[esi+4*eax]; fmul scale; fld DP[esi+4*eax+4]; fmul scale; fxch(1); fistp DP[edi+4*eax]; fistp DP[edi+4*eax+4];
add eax,2; cmp eax,ebx; jl loop1; }
#else
for (i=0; i<len; i++) output[i] = (int)(input[i]*scale);
#endif
return; }
void ConstFloatToInt(float *input, int *output, int len, float scale) {
#if ASM_FTOSS
ASM { mov esi,input; mov edi,output; xor eax,eax; mov ebx,len;
loop1: fld DP[esi+4*eax]; fmul scale; fld DP[esi+4*eax+4]; fmul scale; fxch(1); fistp DP[edi+4*eax]; fistp DP[edi+4*eax+4];
add eax,2; cmp eax,ebx; jl loop1; }
#else
int i;
for (i=0; i<len; i++) output[i] = (int)(input[i]*scale);
#endif
return; }
//------------------------------------------------------
void CorrelateInt(short *taps, short *array, int *corr, int len, int num) { int i,j;
for (i=0; i<num; i++) // for each correlation
{ corr[i] = 0; for (j=0; j<len; j++) corr[i] += (int)taps[j] * (int)array[i+j]; } }
#if ASM_CORR
//------------------------------------------------------
void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor) {
#define rega0 mm0
#define regb0 mm1
#define rega1 mm2
#define regb1 mm3
#define rega2 mm4
#define regb2 mm5
#define acc0 mm6
#define acc1 mm7
#define arr esi
#define tap edi
#define cor eax
#define icnt ebx
// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
#define la(n,i) ASM movq rega##n,QP[arr+8*i]
#define lb(n,i) ASM movq regb##n,QP[tap+8*i+8]
#define m0(n,i) ASM pmaddwd regb##n,rega##n
#define m1(n,i) ASM pmaddwd rega##n,QP[tap+8*i]
#define a0(n,i) ASM paddd acc0,regb##n
#define a1(n,i) ASM paddd acc1,rega##n
ASM { shr ntaps,2; sub taps,8; // point to 1 before start of taps array
mov cor,corr;
ForEachCorrPair:
mov icnt,ntaps; pxor acc0,acc0; pxor acc1,acc1; mov tap,taps; mov arr,array; }
// prime the pump
la(0,0); lb(0,0); m0(0,0); ASM pxor rega0,rega0; // to make first a1(0,0) a nop
la(1,1); lb(1,1);
inner: la(2,2); m0(1,1); m1(1,1); a0(0,0); lb(2,2); a1(0,0); la(0,3); m0(2,2); m1(2,2); a0(1,1); lb(0,3); a1(1,1); la(1,4); m0(0,3); m1(0,3); a0(2,2); lb(1,4); a1(2,2);
ASM add arr,24; ASM add tap,24;
ASM sub icnt,3; ASM jg inner;
a1(0,0);
// Done with one correlation pair. First need to add halves of
// acc0 and acc1 together and then store 2 results in corr array
ASM { movq mm0,acc0; psrlq acc0,32; paddd acc0,mm0; movq mm1,acc1; psrlq acc1,32; movd DP[cor],acc0; paddd acc1,mm1; movd DP[cor+16],acc1;
add cor,32; add array,16; sub ncor,2; jg ForEachCorrPair;
emms; }
} #undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1
#undef arr
#undef tap
#undef cor
#undef icnt
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
#else
//------------------------------------------------------
void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor) { int i,j,k;
k = 0; for (i=0; i<ncor; i++) // for each correlation
{ corr[k] = 0; for (j=0; j<ntaps; j++) corr[k] += (int)taps[j] * (int)array[k+j]; k += 4; } } #endif
#if COMPILE_MMX
#undef icnt
void ab2abbcw(const short *input, short *output, int n) {
#define in edi
#define out esi
#define icnt ecx
#define L(m,i) ASM movq mm##m,QP[in+8*(i/2)]
#define PL(m) ASM punpcklwd mm##m,mm##m
#define PH(m) ASM punpckhwd mm##m,mm##m
#define SL(m) ASM psllq mm##m,16
#define SR(m) ASM psrlq mm##m,48
#define O(m,n) ASM por mm##m,mm##n
#define S(m,i) ASM movq QP[out+8*i],mm##m
ASM { mov in, input; mov out, output; mov icnt, n; ASM pxor mm3,mm3; sub icnt, 8; jl odd_ends; }
//prime pump
L(0,0); PL(0); L(1,1); SL(0); PH(1); SL(1); O(3,0); L(2,2); SR(0); S(3,0); PL(2);
ASM sub icnt, 8; ASM jl cleanup; inner: SL(2); O(0,1); L(3,3) SR(1); S(0,1); PH(3); SL(3); O(1,2); L(0,4); SR(2); S(1,2); PL(0); SL(0); O(2,3); L(1,5); SR(3); S(2,3); PH(1); SL(1); O(3,0); L(2,6); SR(0); S(3,4); PL(2);
ASM add in, 16; ASM add out, 32; ASM sub icnt, 8; ASM jg inner;
cleanup: SL(2); O(0,1); L(3,2); SR(1); S(0,1); PH(3); SL(3); O(1,2); SR(2); S(1,2); O(2,3); S(2,3);
odd_ends: ASM add icnt, 8-4; ASM jl end; // jump if no sign change
L(0,4); SR(3); PL(0); L(1,5); SL(0); PH(1); O(3,0); SL(1); SR(0); S(3,4); O(0,1); S(0,5);
end: ASM emms; #undef in
#undef out
#undef icnt
#undef L
#undef PL
#undef PH
#undef SL
#undef SR
#undef O
#undef S
return; } void ab2ababw(const short *input, short *output, int n) {
#define in edi
#define out esi
#define icnt ecx
#define L(m,i) ASM movq mm##m,QP[in+4*i]
#define C(m,n) ASM movq mm##m,mm##n
#define PL(m) ASM punpckldq mm##m,mm##m
#define PH(m) ASM punpckhdq mm##m,mm##m
#define S(m,i) ASM movq [out+8*i],mm##m
ASM { mov in, input; mov out, output; mov icnt, n; sub icnt, 8; jl odd_ends; } //prime pump
L(0,0); C(1,0); PL(0); L(2,2); PH(1); S(0,0); C(3,2); S(1,1); PL(2); ASM add in, 16; ASM add out, 32; ASM sub icnt, 8; ASM jl cleanup;
inner: L(0,0); PH(3); S(2,-2); C(1,0); S(3,-1); PL(0); L(2,2); PH(1); S(0,0); C(3,2); S(1,1); PL(2); ASM add in, 16; ASM add out, 32; ASM sub icnt, 8; ASM jg inner;
cleanup: PH(3); S(2,-2); S(3,-1); odd_ends: ASM add icnt, 8-2; ASM jl end; // jump if no sign change
inner_by2: ASM movd mm0, DP[in]; PL(0); S(0,0); ASM add in, 4; ASM add out, 8; ASM sub icnt, 2; ASM jge inner_by2;
end: ASM emms;
return; } #undef in
#undef out
#undef icnt
#undef L
#undef C
#undef PL
#undef PH
#undef S
void ConvMMX(short *input1, short *input2, int *output, int ncor) { #define rega0 mm0
#define regb0 mm1
#define rega1 mm2
#define regb1 mm3
#define rega2 mm4
#define regb2 mm5
#define acc0 mm6
#define acc1 mm7
#define in2 esi
#define in1 edi
#define out eax
#define icnt ecx
#define tmp ebx
// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
// we use "the convolution trick" or using la twice so that one
// of the pmadd's is reg,reg and thus can be in the V-slot.
// NOTE: we have read ahead up to 2 quadwords
// so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
// and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal
#define la(n,i) ASM movq rega##n,QP[in2+8*i]
#define lb(n,i) ASM movq regb##n,QP[in1+8*i-8]
#define m0(n,i) ASM pmaddwd regb##n,rega##n
#define m1(n,i) ASM pmaddwd rega##n,QP[in1+8*i]
#define a0(n,i) ASM paddd acc0,regb##n
#define a1(n,i) ASM paddd acc1,rega##n
ASM { mov tmp,ncor; shl tmp,2; shr ncor,1; mov out,output; add out,tmp; add out,16; mov in1,input1; mov in2,input2; mov icnt,ncor; }
ForEachCorrPair:
// prime the pump
la(0,0); ASM pxor regb0,regb0; // to avoid lb(0,0) reading taps[-1]
la(1,1); ASM pxor acc0,acc0; // clear accumulator
m1(0,0); ASM pxor acc1,acc1; // clear accumulator
lb(1,1); ASM sub icnt, 1; // account for pump priming
ASM jle cleanup; // bypass if only one to do
inner: la(2,2); m0(1,1); m1(1,1); a0(0,0); lb(2,2); a1(0,0); la(0,3); m0(2,2); m1(2,2); a0(1,1); lb(0,3); a1(1,1); la(1,4); m0(0,3); m1(0,3); a0(2,2); lb(1,4); a1(2,2);
ASM add in2,24; ASM add in1,24;
ASM sub icnt,3; ASM jg inner;
cleanup: // last two adds
a0(0,0); a1(0,0);
// Done with one correlation pair. Pack and store 2 results in corr array
ASM { sub out,16; mov in2, input2; mov in1,input1; add in2,16; mov icnt, ncor; mov input2, in2; sub icnt,2; //set flags for jump
movq QP[out-16],acc0; movq QP[out-8],acc1;
mov ncor, icnt; jg ForEachCorrPair;
emms; }
} #undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1
#undef in2
#undef in1
#undef out
#undef icnt
#undef tmp
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
// 16 bit output
// psrad acc0,16;//this could be less in some cases
// psrad acc1,16;
// packssdw acc1,acc0;
// movq QP[cor-8],acc0;
//#else
//------------------------------------------------------
/*
void ConvMMX(short *in1, short *in2, int *out, int ncor) { int i,j;
for (i=0; i < 2*ncor; i+=4) { int acc0 = 0, acc1 = 0; for (j=0; j < 2*ncor - i; j+=4) { acc0 += (int)taps[j]*array[i+j] + (int)taps[j+1]*array[i+j+1]; acc1 += (int)taps[j+2]*array[i+j+2] + (int)taps[j+3]*array[i+j+3]; } corr[i/2] = acc0 ; corr[i/2+1] = acc1 ; }
return; }*/
void ab2abzaw(const short *input, short *output, int n) { register int i; register unsigned *in, *out; register unsigned x, y; //tread two words at a time as raw bits
in = (unsigned *)input; out = (unsigned *)output; //unroll by two
for (i = n/2 - 2; i>0; i-=2) { x = in[i]; y = in[i+1]; out[2*(i+1)] = y; out[2*(i+1)+1] = (y<<16 | x>>16); x = in[i-1]; y = in[i]; out[2*i] = y; out[2*i+1] = (y<<16 | x>>16); } //odd ends
for (i++; i>=0; i--) { x = (i>0)?in[i-1]:0; y = in[i]; out[2*i] = y; out[2*i+1] = (y<<16 | x>>16); } return; }
void ShortToFloatScale(short *x, float scale, int N, float *y) {
/*
short i; float yy[100]; for (i=0; i<N; i++) { yy[i]=x[i]*scale; }
ASM { mov esi,x; mov edi,y; lea ecx,scale; mov eax, N sub eax, 2 loop1: fild WORD PTR [esi+eax*2] fmul DWORD PTR [ecx] fstp DWORD PTR [edi+eax*4]
fild WORD PTR [esi+eax*2+2] fmul DWORD PTR [ecx] fstp DWORD PTR [edi+eax*4+4]
sub eax, 2 jge loop1; }
*/
ASM { mov esi,x; mov edi,y; lea ecx,scale; mov eax, N sub eax, 6 fld DP [ecx] ; c
fild WORD PTR [esi+eax*2+8] ; L0 c
fild WORD PTR [esi+eax*2+10] ; L1 L0 c fxch ST(1) ; L0 L1 c fmul ST(0), ST(2) ; M0 L1 c fxch ST(1) ; L1 M0 c fmul ST(0),ST(2) ; M1 M0 c
fild WORD PTR [esi+eax*2+4] ; L0 M1 M0 c
fild WORD PTR [esi+eax*2+6]; L1 L0 M1 M0 c fxch ST(3) ; M0 L0 M1 L1 c fstp DWORD PTR [edi+eax*4+16]; L0 M1 L1 c loop1: ; L0 M1 L1 c
fmul ST(0),ST(3) ; M0 M1 L1 c fxch ST(1) ; M1 M0 L1 c fstp DWORD PTR [edi+eax*4+20]; M0 L1 c fxch ST(1) ; L1 M0 c fmul ST(0),ST(2) ; M1 M0 c fild WORD PTR [esi+eax*2] ; L0 M1 M0 c
fild WORD PTR [esi+eax*2+2] ; L1 L0 M1 M0 c fxch ST(3) ; M0 L0 M1 L1 c fstp DWORD PTR [edi+eax*4+8]; L0 M1 L1 c
sub eax, 2 jge loop1; fmul ST(0),ST(3) ;eax==-2 M0 M1 L1 c fxch ST(1) ; M1 M0 L1 c fstp DWORD PTR [edi+eax*4+20] ; M0 L1 c fxch ST(1) ; L1 M0 c fmulp ST(2), st(0) ; M0 M1
fstp DWORD PTR [edi+eax*4+8] ; M1
fstp DWORD PTR [edi+eax*4+12] ; } /*
for (i=0; i<N; i++) { if (y[i]!=yy[i]) { fprintf(stdout,"\nfloat problem\n"); break; } }
*/
}
//assumes N is even
void IntToFloatScale(int *x, float scale, int N, float *y) { #if I2FTEST //test code
int i; float yy[1000]; for (i=0; i<N; i++) { yy[i]=(float)x[i]*scale; } #endif //test code
#if 0 //simple code
//simple assembly version
ASM { mov esi,x; mov edi,y; lea ecx,scale; mov eax, N sub eax, 2 loop1: fild DWORD PTR [esi+eax*4] fmul DWORD PTR [ecx] fstp DWORD PTR [edi+eax*4]
fild DWORD PTR [esi+eax*4+4] fmul DWORD PTR [ecx] fstp DWORD PTR [edi+eax*4+4]
sub eax, 2 jge loop1; } #endif //test code
ASM { mov esi,x; mov edi,y; lea ecx,scale; mov eax, N sub eax, 6 fld DP [ecx] ; c
fild DWORD PTR [esi+eax*4+16] ; L0 c
fild DWORD PTR [esi+eax*4+20] ; L1 L0 c fxch ST(1) ; L0 L1 c fmul ST(0), ST(2) ; M0 L1 c fxch ST(1) ; L1 M0 c fmul ST(0),ST(2) ; M1 M0 c
fild DWORD PTR [esi+eax*4+8] ; L0 M1 M0 c
fild DWORD PTR [esi+eax*4+12];L1 L0 M1 M0 c fxch ST(3) ; M0 L0 M1 L1 c fstp DWORD PTR [edi+eax*4+16]; L0 M1 L1 c loop1: ; L0 M1 L1 c
fmul ST(0),ST(3) ; M0 M1 L1 c fxch ST(1) ; M1 M0 L1 c fstp DWORD PTR [edi+eax*4+20]; M0 L1 c fxch ST(1) ; L1 M0 c fmul ST(0),ST(2) ; M1 M0 c fild DWORD PTR [esi+eax*4] ; L0 M1 M0 c
fild DWORD PTR [esi+eax*4+4] ;L1 L0 M1 M0 c fxch ST(3) ; M0 L0 M1 L1 c fstp DWORD PTR [edi+eax*4+8]; L0 M1 L1 c
sub eax, 2 jge loop1; fmul ST(0),ST(3) ;eax==-2 M0 M1 L1 c fxch ST(1) ; M1 M0 L1 c fstp DWORD PTR [edi+eax*4+20] ; M0 L1 c fxch ST(1) ; L1 M0 c fmulp ST(2), st(0) ; M0 M1
fstp DWORD PTR [edi+eax*4+8] ; M1
fstp DWORD PTR [edi+eax*4+12] ; }
#if I2FTEST
for (i=0; i<N; i++) { if (y[i]!=yy[i]) { printf("F2I %3d %8f %8f\n", i, y[i], yy[i]); } } #endif //test code
}
//assumes N is even
void IntToFloat(int *x, int N, float *y) { #if I2FTEST //test code
int i; float yy[1000]; for (i=0; i<N; i++) { yy[i]=(float)x[i]; } #endif //test code
//simple assembly version
ASM { mov esi,x; mov edi,y; mov eax, N sub eax, 2 loop1: fild DWORD PTR [esi+eax*4] fild DWORD PTR [esi+eax*4+4] fxch ST(1) ; fstp DWORD PTR [edi+eax*4] fstp DWORD PTR [edi+eax*4+4]
sub eax, 2 jge loop1; }
#if I2FTEST
for (i=0; i<N; i++) { if (y[i]!=yy[i]) { printf("F2I %3d %8f %8f\n", i, y[i], yy[i]); } } #endif //test code
} #endif
|