//  SAC MMx utilities
#include <memory.h>

#include "mmxutil.h"
#include "opt.h"
#define I2FTEST 0
#if I2FTEST
#include "stdio.h"
#endif

//------------------------------------------------------
int IsMMX()     // does the processor I'm running have MMX(tm) technology?
{
  // Returns 1 when the CPU reports MMX support, 0 otherwise.
  //
  // The detection code is only built for x86 targets (_X86_ defined and
  // _ALPHA_ not defined).  Every other configuration returns 0 so that the
  // function never falls off its end without a return value.
#if defined(_X86_) && !defined(_ALPHA_)
  int retu;

  __asm
  {
	push ebx            // CPUID overwrites ebx; preserve it by hand
    pushfd
    pop edx             // edx = current EFLAGS
    mov eax,edx         // eax = original EFLAGS
    xor edx,200000h     // toggle bit 21 (the ID flag)
    push edx
    popfd               // try to write the toggled value back
    pushfd
    pop edx             // edx = EFLAGS as the CPU actually kept them
//
//  DON'T do this. This clears EAX, but the code is relying
//  on edx being 0 in the bail out case!!!
//
//  -mikeg
//
//    xor       eax,edx
//
//
    xor edx,eax     //This is the right way
    je  no_cpuid    // ID flag did not change -> no CPUID; edx is 0 here

    mov eax,1       // CPUID leaf 1: feature flags are returned in edx
    _emit 0x0f     //CPUID magic incantation
    _emit 0xa2
    and  edx,000800000h   // keep only bit 23 (MMX feature flag)
    shr  edx,23           // normalise to 0 or 1
no_cpuid:
    mov  retu,edx
	pop ebx
  }
  return(retu);
#else
  // Alpha, or a target for which no detection code exists: report "no MMX".
  return 0;
#endif
}
//------------------------------------------------------
/* The following 4 routines make an 8-byte-aligned 'output' array
   from an 'input' array with various alignments.  MakeAlignedN assumes
   that 'input' starts on an address equal to N mod 8.  For now we
   only handle even N.
*/

//------------------------------------------------------
/* Copy 'numbytes' bytes from 'input' (expected at an address equal to
   0 mod 8) into 'output'.  Currently a plain memcpy.  A zero or negative
   count copies nothing; previously a negative count was converted to a
   huge unsigned size by memcpy. */
void MakeAligned0(void *input, void *output, int numbytes)
{
  if (numbytes <= 0)
    return;
  memcpy(output,input,(size_t)numbytes);
}
//------------------------------------------------------
/* Copy 'numbytes' bytes from 'input' (expected at an address equal to
   2 mod 8) into 'output'.  Currently a plain memcpy.  A zero or negative
   count copies nothing; previously a negative count was converted to a
   huge unsigned size by memcpy. */
void MakeAligned2(void *input, void *output, int numbytes)
{
  if (numbytes <= 0)
    return;
  memcpy(output,input,(size_t)numbytes);
}
//------------------------------------------------------
/* Copy 'numbytes' bytes from 'input' (expected at an address equal to
   4 mod 8) into 'output'.  Currently a plain memcpy.  A zero or negative
   count copies nothing; previously a negative count was converted to a
   huge unsigned size by memcpy. */
void MakeAligned4(void *input, void *output, int numbytes)
{
  if (numbytes <= 0)
    return;
  memcpy(output,input,(size_t)numbytes);
}
//------------------------------------------------------
/* Copy 'numbytes' bytes from 'input' (expected at an address equal to
   6 mod 8) into 'output'.  Currently a plain memcpy.  A zero or negative
   count copies nothing; previously a negative count was converted to a
   huge unsigned size by memcpy. */
void MakeAligned6(void *input, void *output, int numbytes)
{
  if (numbytes <= 0)
    return;
  memcpy(output,input,(size_t)numbytes);
}

//------------------------------------------------------
int FloatToShortScaled(float *input, short *output, int len, int guard)
{
/* Block-scaled float -> short conversion.

   The biased exponent of the largest-magnitude input (as reported by
   FloatMaxExp) selects a power-of-two scale so that, with guard == 0, the
   largest power of 2 present in the input maps to 16384 and every value
   fits in 16 bits.  Each unit of 'guard' shifts the outputs one more bit
   to the right.

   Returns the exponent reported by FloatMaxExp (without 'guard' added).
*/
  int peakExp = FloatMaxExp(input, len);

  ScaleFloatToShort(input, output, len, peakExp + guard);
  return peakExp;
}

int FloatToIntScaled(float *input, int *output, int len, int guard)
{
/* Block-scaled float -> int conversion.

   The biased exponent of the largest-magnitude input (as reported by
   FloatMaxExp) selects a power-of-two scale so that, with guard == 0, the
   largest power of 2 present in the input maps to 2^30 and every value
   fits in 32 bits.  Each unit of 'guard' shifts the outputs one more bit
   to the right.

   Returns the exponent reported by FloatMaxExp (without 'guard' added).
*/
  int peakExp = FloatMaxExp(input, len);

  ScaleFloatToInt(input, output, len, peakExp + guard);
  return peakExp;
}

int FloatMaxExp(float *input, int len)
{
/* Scan 'len' floats and return the largest exponent field found, i.e.
   bits 23..30 of the raw single-precision pattern (sign bit masked off),
   shifted down to the range 0..255.  The value is still biased.

   NOTE(review): the ASM_FTOSS path examines two elements per iteration with
   a bottom-tested loop, so it appears to assume len is even and > 0 --
   confirm with callers.
*/
  int max;

#if ASM_FTOSS

  ASM
  {
    mov esi,input;
    xor eax,eax;
    mov ebx,len;
    xor edi,edi;   // max

loop2:
    mov ecx,DP[esi+4*eax];
     mov edx,DP[esi+4*eax+4];

    and ecx,07f800000h;
     and edx,07f800000h;

    cmp edi,ecx;
     jge skip1;
    mov edi,ecx;
skip1:

    cmp edi,edx;
     jge skip2;
    mov edi,edx;
skip2:

    add eax,2;
    cmp eax,ebx;
    jl loop2;

    mov max,edi;
  }

#else

  int idx = 0;
  int field;     /* exponent bits of the current element, still in place */

  max = 0;
  while (idx < len)
  {
    field = (*((int *)(input + idx))) & 0x7f800000;
    max = (field > max) ? field : max;
    idx++;
  }
#endif

  return max >> 23;
}


void ScaleFloatToShort(float *input, short *output, int len, int newmax)
{
/*
  Multiply each input by a power of two and store it as a short.

  'newmax' is a biased exponent (max exponent plus guard bits).  An input
  whose real exponent is 14 should get a scale factor of 1, since values are
  then at most +/- 32767, so the multiplier is 2^(14 - max - guard).  'max'
  carries the exponent bias, so BIAS is added once to recover the real
  exponent and once more to build a biased exponent for the scale itself:
  2^(2*BIAS+14 - max - guard).  2*BIAS+14 = 254+14 = 268, which placed in the
  exponent field is 0x86000000.

  The C path truncates in the (short) cast; the ASM_FTOSS path stores with
  fistp.
  NOTE(review): the ASM_FTOSS loop converts two elements per iteration and
  tests at the bottom, so it appears to assume len is even and > 0 -- confirm.
*/
  int bits;      /* raw bit pattern of the scale factor */
  float scale;

  bits = 0x86000000 - (newmax << 23);
  scale = (*(float *)&bits);

#if ASM_FTOSS

  ASM
  {
    mov esi,input;
    mov edi,output;
    xor eax,eax;
    mov ebx,len;

loop1:
    fld DP[esi+4*eax];
    fmul scale;
    fld DP[esi+4*eax+4];
    fmul scale;
    fxch(1);
    fistp WP[edi+2*eax];
    fistp WP[edi+2*eax+2];

    add eax,2;
    cmp eax,ebx;
    jl loop1;
  }

#else

  {
    float *src = input;
    short *dst = output;
    int left;

    for (left = len; left > 0; left--)
      *dst++ = (short)(*src++ * scale);
  }

#endif
  return;
}

void ConstFloatToShort(float *input, short *output, int len, float scale)
{
/* Multiply each of 'len' floats by the caller-supplied 'scale' and store the
   result as a short.  The C path truncates in the (short) cast; the
   ASM_FTOSS path stores with fistp.
   NOTE(review): the ASM_FTOSS loop converts two elements per iteration and
   tests at the bottom, so it appears to assume len is even and > 0 -- confirm.
*/

#if ASM_FTOSS

  ASM
  {
    mov esi,input;
    mov edi,output;
    xor eax,eax;
    mov ebx,len;

loop1:
    fld DP[esi+4*eax];
    fmul scale;
    fld DP[esi+4*eax+4];
    fmul scale;
    fxch(1);
    fistp WP[edi+2*eax];
    fistp WP[edi+2*eax+2];

    add eax,2;
    cmp eax,ebx;
    jl loop1;
  }

#else
  float *src = input;
  short *dst = output;
  int left;

  for (left = len; left > 0; left--)
    *dst++ = (short)(*src++ * scale);

#endif
  return;
}


//------------------------------------------------------
void ScaleFloatToInt(float *input, int *output, int len, int newmax)
{
/*
  Multiply each input by a power of two and store it as an int.

  'newmax' is a biased exponent.  The scale factor is built directly as a
  float bit pattern: 0x8E000000 is (2*BIAS + 30) placed in the exponent
  field, so the multiplier is 2^(2*BIAS+30 - newmax) in biased form, i.e. an
  input whose biased exponent equals 'newmax' lands at 2^30.

  The C path truncates in the (int) cast; the ASM_FTOSS path stores with
  fistp.
  NOTE(review): the ASM_FTOSS loop converts two elements per iteration and
  tests at the bottom, so it appears to assume len is even and > 0 -- confirm.
*/
  int bits;      /* raw bit pattern of the scale factor */
  float scale;

  bits = 0x8E000000 - (newmax << 23);
  scale = (*(float *)&bits);

#if ASM_FTOSS

  ASM
  {
    mov esi,input;
    mov edi,output;
    xor eax,eax;
    mov ebx,len;

loop1:
    fld DP[esi+4*eax];
    fmul scale;
    fld DP[esi+4*eax+4];
    fmul scale;
    fxch(1);
    fistp DP[edi+4*eax];
    fistp DP[edi+4*eax+4];

    add eax,2;
    cmp eax,ebx;
    jl loop1;
  }

#else

  {
    float *src = input;
    int *dst = output;
    int left;

    for (left = len; left > 0; left--)
      *dst++ = (int)(*src++ * scale);
  }

#endif
  return;
}

void ConstFloatToInt(float *input, int *output, int len, float scale)
{
/* Multiply each of 'len' floats by the caller-supplied 'scale' and store the
   result as an int.  The C path truncates in the (int) cast; the ASM_FTOSS
   path stores with fistp.
   NOTE(review): the ASM_FTOSS loop converts two elements per iteration and
   tests at the bottom, so it appears to assume len is even and > 0 -- confirm.
*/

#if ASM_FTOSS

  ASM
  {
    mov esi,input;
    mov edi,output;
    xor eax,eax;
    mov ebx,len;

loop1:
    fld DP[esi+4*eax];
    fmul scale;
    fld DP[esi+4*eax+4];
    fmul scale;
    fxch(1);
    fistp DP[edi+4*eax];
    fistp DP[edi+4*eax+4];

    add eax,2;
    cmp eax,ebx;
    jl loop1;
  }

#else
  float *src = input;
  int *dst = output;
  int left;

  for (left = len; left > 0; left--)
    *dst++ = (int)(*src++ * scale);

#endif
  return;
}


//------------------------------------------------------
/* Reference correlation in plain C.
   For each lag i in [0, num): corr[i] = sum over j in [0, len) of
   taps[j] * array[i+j], accumulated in int. */
void CorrelateInt(short *taps, short *array, int *corr, int len, int num)
{
  int lag;

  for (lag = 0; lag < num; lag++)
  {
    short *window = array + lag;   /* start of the samples for this lag */
    int *dst = corr + lag;
    int t = 0;

    *dst = 0;
    while (t < len)
    {
      *dst += (int)taps[t] * (int)window[t];
      t++;
    }
  }
}

#if ASM_CORR
//------------------------------------------------------
// MMX implementation of CorrelateInt4 (built when ASM_CORR is nonzero).
//
// Each pass through ForEachCorrPair accumulates two sums with pmaddwd/paddd
// (acc0 and acc1), folds the two dword halves of each accumulator together,
// and stores the results as dwords at byte offsets 0 and 16 from 'cor'.
// It then advances 'cor' by 32 bytes and 'array' by 16 bytes and subtracts 2
// from ncor, looping while ncor stays positive.
//
// The inner loop is software pipelined over three register "columns"
// (rega0/regb0, rega1/regb1, rega2/regb2) and consumes three quadwords of
// 'arr' and 'tap' per iteration (add 24 / sub icnt,3).
//
// Note that the asm modifies the parameters ntaps, taps, array and ncor in
// place.
// NOTE(review): the loads run ahead of the data being accumulated, so the
// arrays presumably need readable padding past their nominal end -- confirm
// against the callers.
void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor)
{

// MMX register roles: a/b operand registers for the three pipeline columns,
// plus the two running accumulators.
#define rega0  mm0
#define regb0  mm1
#define rega1  mm2
#define regb1  mm3
#define rega2  mm4
#define regb2  mm5
#define acc0   mm6
#define acc1   mm7

// General-purpose register roles.
#define arr    esi
#define tap    edi
#define cor    eax
#define icnt   ebx

// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
//   la: load a quadword of 'arr' into rega<n>
//   lb: load the quadword of 'tap' one slot further on into regb<n>
//   m0: regb<n> = pmaddwd(regb<n>, rega<n>)
//   m1: rega<n> = pmaddwd(rega<n>, tap quadword i)
//   a0: acc0 += regb<n>      a1: acc1 += rega<n>

#define la(n,i)  ASM movq  rega##n,QP[arr+8*i]
#define lb(n,i)  ASM movq  regb##n,QP[tap+8*i+8]
#define m0(n,i)  ASM pmaddwd regb##n,rega##n
#define m1(n,i)  ASM pmaddwd rega##n,QP[tap+8*i]
#define a0(n,i)  ASM paddd acc0,regb##n
#define a1(n,i)  ASM paddd acc1,rega##n

  ASM
  {
    shr ntaps,2;
    sub taps,8;  // point to 1 before start of taps array
    mov cor,corr;

ForEachCorrPair:

    mov icnt,ntaps;
    pxor acc0,acc0;
    pxor acc1,acc1;
    mov tap,taps;
    mov arr,array;
  }

// prime the pump

  la(0,0);
  lb(0,0);
  m0(0,0);
  ASM pxor rega0,rega0;   // to make first a1(0,0) a nop
	  la(1,1);
	  lb(1,1);

// Steady state: the indentation shows which pipeline column each macro
// belongs to (column 0 at the left, column 2 at the right).
inner:
		  la(2,2);
	  m0(1,1);
	  m1(1,1);
  a0(0,0);
		  lb(2,2);
  a1(0,0);
  la(0,3);
		  m0(2,2);
		  m1(2,2);
	  a0(1,1);
  lb(0,3);
	  a1(1,1);
	  la(1,4);
  m0(0,3);
  m1(0,3);
		  a0(2,2);
	  lb(1,4);
		  a1(2,2);

  ASM add arr,24;
  ASM add tap,24;

  ASM sub icnt,3;
  ASM jg inner;

// Drain: add the last pending column-0 product into acc1.
  a1(0,0);

// Done with one correlation pair.  First need to add halves of
// acc0 and acc1 together and then store 2 results in corr array

  ASM
  {
    movq  mm0,acc0;
    psrlq acc0,32;
    paddd acc0,mm0;
    movq  mm1,acc1;
    psrlq acc1,32;
    movd  DP[cor],acc0;
    paddd acc1,mm1;
    movd  DP[cor+16],acc1;

    add cor,32;
    add array,16;
    sub ncor,2;
    jg ForEachCorrPair;

    emms;
  }

}
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1

#undef arr
#undef tap
#undef cor
#undef icnt
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1

#else
//------------------------------------------------------
/* Plain C version of CorrelateInt4.
   Computes 'ncor' correlations of 'ntaps' taps each.  Successive
   correlations start 4 samples apart in 'array', and result number i is
   stored at corr[4*i]; the entries in between are not written. */
void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor)
{
  int done;
  int base = 0;    /* sample offset and output index of this correlation */

  for (done = 0; done < ncor; done++)
  {
    int *dst = corr + base;
    short *window = array + base;
    int t = 0;

    *dst = 0;
    while (t < ntaps)
    {
      *dst += (int)taps[t] * (int)window[t];
      t++;
    }
    base += 4;
  }
}
#endif
#if COMPILE_MMX
#undef icnt
// Word-rearranging copy from 'input' to 'output' using MMX; 'n' is the count
// loaded into icnt and consumed 8 at a time in the main loop.
//
// Each input quadword is loaded twice.  One copy has its low two words
// duplicated (punpcklwd reg,reg), the other its high two words duplicated
// (punpckhwd reg,reg).  Each copy is shifted left one word, OR-ed with the
// top word carried over from the previous copy (psrlq 48), and stored.
// NOTE(review): from the macro sequence the output appears to be the word
// stream 0,a,a,b,b,c,c,d,... (overlapping neighbour pairs, with a leading
// zero from the cleared mm3) -- confirm against a reference implementation.
//
// The loop is software pipelined across mm0..mm3; the indentation shows
// which register "column" each macro belongs to.
void ab2abbcw(const short *input, short *output, int n)
{

#define in edi
#define out esi
#define icnt ecx

// L : load quadword number (i/2) of 'in' into mm<m>
//     (assumes the assembler evaluates i/2 as an integer division, so odd
//     and even i address the same quadword -- TODO confirm)
// PL: duplicate the low two words of mm<m>    PH: duplicate the high two words
// SL: shift mm<m> left by one word            SR: leave only the top word, moved to the bottom
// O : mm<m> |= mm<n>                          S : store mm<m> to output quadword i
#define L(m,i)  ASM movq mm##m,QP[in+8*(i/2)]
#define PL(m)   ASM punpcklwd mm##m,mm##m
#define PH(m)   ASM punpckhwd mm##m,mm##m
#define SL(m) ASM psllq mm##m,16
#define SR(m) ASM psrlq mm##m,48
#define O(m,n)  ASM por mm##m,mm##n
#define S(m,i)  ASM movq QP[out+8*i],mm##m
	ASM {
	mov in, input;
	mov out, output;
	mov icnt, n;
	ASM     pxor mm3,mm3;
	sub icnt, 8;
	jl odd_ends;
	}
	// mm3 starts at zero so the first stored quadword has nothing carried in.
	// Fewer than 8 remaining: skip straight to the tail handling.

	//prime pump
	L(0,0);
	PL(0);
			L(1,1);
	SL(0);
			PH(1);
			SL(1);
							O(3,0);
					L(2,2);
	SR(0);
							S(3,0);
					PL(2);

	ASM sub icnt, 8;
	ASM jl cleanup;
// Steady state: stores output quadwords 1..4 and loads ahead from input
// quadwords 1..3, then advances in by 16 bytes and out by 32 bytes.
inner:
					SL(2);
	O(0,1);
							L(3,3)
			SR(1);
	S(0,1);
							PH(3);
							SL(3);
			O(1,2);
	L(0,4);
					SR(2);
			S(1,2);
	PL(0);
	SL(0);          
					O(2,3);
			L(1,5);
							SR(3);
					S(2,3);
			PH(1);
			SL(1);
							O(3,0);
					L(2,6);
	SR(0);
							S(3,4);
					PL(2);

	ASM add in, 16;
	ASM  add out, 32;
	ASM sub icnt, 8;
	ASM  jg inner;

// Drain the pipeline: finish and store the quadwords still in flight.
cleanup:
					SL(2);
	O(0,1);
							L(3,2);
			SR(1);
	S(0,1);
							PH(3);
							SL(3);
			O(1,2);
					SR(2);
			S(1,2);
					O(2,3);
					S(2,3);

// Tail: icnt was biased by -8 above; adding 8-4 leaves it non-negative only
// when at least 4 more items remain, in which case one further input
// quadword is expanded and stored to output quadwords 4 and 5.
odd_ends:
	ASM add icnt, 8-4;
	ASM  jl end;     // jump if no sign change

	L(0,4);
							SR(3);
	PL(0);
			L(1,5);
	SL(0);          
			PH(1);
							O(3,0);
			SL(1);
	SR(0);
							S(3,4);
	O(0,1);
	S(0,5);

end:
	ASM emms;
#undef in
#undef out
#undef icnt

#undef L
#undef PL
#undef PH
#undef SL
#undef SR
#undef O
#undef S

	return;
}
// Copy from 'input' to 'output' writing every 32-bit unit (two shorts) twice,
// using MMX: each input quadword is loaded, copied, and the low dword of one
// copy and the high dword of the other are duplicated across the register
// (punpckldq / punpckhdq reg,reg) before both are stored.  So the input word
// pairs ab cd ... become ab ab cd cd ...
//
// 'n' is the count loaded into icnt; the main loop consumes 8 per iteration
// (16 input bytes, 32 output bytes) and the inner_by2 tail consumes 2 per
// iteration (4 input bytes, 8 output bytes).
void ab2ababw(const short *input, short *output, int n)
{

#define in edi
#define out esi
#define icnt ecx

// L : load a quadword from byte offset 4*i of 'in' into mm<m>
// C : copy mm<n> into mm<m>
// PL: duplicate the low dword of mm<m>     PH: duplicate the high dword
// S : store mm<m> to output quadword i (negative i addresses quadwords
//     behind the already-advanced 'out')
#define L(m,i) ASM movq mm##m,QP[in+4*i]
#define C(m,n) ASM movq mm##m,mm##n
#define PL(m)  ASM punpckldq mm##m,mm##m
#define PH(m)  ASM punpckhdq mm##m,mm##m
#define S(m,i) ASM movq [out+8*i],mm##m

	ASM {
	mov in, input;
	mov out, output;
	mov icnt, n;
	sub icnt, 8;
	jl odd_ends;
	}
	// Fewer than 8 remaining: skip straight to the 2-at-a-time tail.
	//prime pump
	L(0,0);
			C(1,0);
	PL(0);
					L(2,2);
			PH(1);
	S(0,0);
							C(3,2);
			S(1,1);
					PL(2);
	ASM add in, 16;
	ASM  add out, 32;
	ASM sub icnt, 8;
	ASM  jl cleanup;

// Steady state: finish the second input quadword of the previous group
// (stored at out-16 and out-8) while starting the next group.
inner:
	L(0,0);
							PH(3);
					S(2,-2);
			C(1,0);
							S(3,-1);
	PL(0);
					L(2,2);
			PH(1);
	S(0,0);
					C(3,2);
			S(1,1);
					PL(2);
	ASM add in, 16;
	ASM  add out, 32;
	ASM sub icnt, 8;
	ASM  jg inner;

// Drain: store the two quadwords still pending from the last group.
cleanup:
							PH(3);
					S(2,-2);
							S(3,-1);
// Tail: icnt was biased by -8; adding 8-2 leaves it non-negative only when
// at least 2 more items remain.
odd_ends:
	ASM add icnt, 8-2;
	ASM  jl end;     // jump if no sign change

// One dword in, one quadword (the dword twice) out, per iteration.
inner_by2:
	ASM movd mm0, DP[in];
	PL(0);
	S(0,0);
	ASM add in, 4;
	ASM  add out, 8;
	ASM sub icnt, 2;
	ASM  jge inner_by2;

end:
	ASM emms;

	return;
}
#undef in
#undef out
#undef icnt

#undef L
#undef C
#undef PL
#undef PH
#undef S

// MMX multiply-accumulate of 'input1' against successively advanced windows
// of 'input2', producing two accumulator quadwords per outer pass.
//
// Setup: 'out' starts at output + 4*ncor bytes + 16, and ncor is halved in
// place.  Each pass of ForEachCorrPair:
//   - accumulates pmaddwd products into acc0 and acc1, consuming three
//     quadwords of in1/in2 per inner iteration;
//   - moves 'out' back by 16 bytes and stores acc0 at out-16 and acc1 at
//     out-8 as whole quadwords (the two dword halves are not summed here),
//     so results are written from high addresses towards low ones;
//   - advances the stored input2 pointer by 16 bytes, resets in1 to input1,
//     and subtracts 2 from ncor, which is also the next pass's inner count.
// The parameters ncor and input2 are modified in place by the asm.
void ConvMMX(short *input1, short *input2, int *output, int ncor)
{
// MMX register roles: a/b operand registers for the three pipeline columns,
// plus the two running accumulators.
#define rega0  mm0
#define regb0  mm1
#define rega1  mm2
#define regb1  mm3
#define rega2  mm4
#define regb2  mm5
#define acc0   mm6
#define acc1   mm7

// General-purpose register roles.
#define in2    esi
#define in1    edi
#define out    eax
#define icnt   ecx
#define tmp        ebx

// In the following macros, 'n' is the column number and 'i' is the
// iteration number.

// we use "the convolution trick" or using la twice so that one
// of the pmadd's is reg,reg and thus can be in the V-slot.

// NOTE: we have read ahead up to 2 quadwords
//   so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
//   and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal

//   la: load quadword i of in2 into rega<n>
//   lb: load quadword i-1 of in1 into regb<n>
//   m0: regb<n> = pmaddwd(regb<n>, rega<n>)
//   m1: rega<n> = pmaddwd(rega<n>, in1 quadword i)
//   a0: acc0 += regb<n>      a1: acc1 += rega<n>
#define la(n,i)  ASM movq  rega##n,QP[in2+8*i]
#define lb(n,i)  ASM movq  regb##n,QP[in1+8*i-8]
#define m0(n,i)  ASM pmaddwd regb##n,rega##n
#define m1(n,i)  ASM pmaddwd rega##n,QP[in1+8*i]
#define a0(n,i)  ASM paddd acc0,regb##n
#define a1(n,i)  ASM paddd acc1,rega##n

  ASM
  {
	mov tmp,ncor;
	shl tmp,2;
    shr ncor,1;
    mov out,output;
	add out,tmp;
	add out,16;
    mov in1,input1;
    mov in2,input2;
    mov icnt,ncor;
  }

ForEachCorrPair:

// prime the pump

  la(0,0);
  ASM pxor regb0,regb0;   // to  avoid lb(0,0) reading taps[-1]
	  la(1,1);
  ASM pxor acc0,acc0;     // clear accumulator
  m1(0,0);
  ASM pxor acc1,acc1;     // clear accumulator
	  lb(1,1);
  ASM sub icnt, 1;        // account for pump priming
  ASM jle cleanup;        // bypass if only one to do

// Steady state: the indentation shows which pipeline column each macro
// belongs to (column 0 at the left, column 2 at the right).
inner:
		  la(2,2);
	  m0(1,1);
	  m1(1,1);
  a0(0,0);
		  lb(2,2);
  a1(0,0);
  la(0,3);
		  m0(2,2);
		  m1(2,2);
	  a0(1,1);
  lb(0,3);
	  a1(1,1);
	  la(1,4);
  m0(0,3);
  m1(0,3);
		  a0(2,2);
	  lb(1,4);
		  a1(2,2);

  ASM add in2,24;
  ASM add in1,24;

  ASM sub icnt,3;
  ASM jg inner;

cleanup:  //  last two adds
  a0(0,0);
  a1(0,0);

// Done with one correlation pair.  Pack and store 2 results in corr array
// The jg at the end tests the flags left by "sub icnt,2"; the mov and movq
// instructions in between do not modify the flags.

  ASM
  {
    sub out,16;
	
     mov in2, input2;
    mov in1,input1;
	 add in2,16;
    mov icnt, ncor;
	
	mov input2, in2;
	 sub icnt,2;      //set flags for jump

	movq  QP[out-16],acc0;
	movq  QP[out-8],acc1;

	mov ncor, icnt;
    jg ForEachCorrPair;

    emms;
  }

}
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1

#undef in2
#undef in1
#undef out
#undef icnt
#undef tmp

#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
// 16 bit output
//       psrad acc0,16;//this could be less in some cases
//       psrad acc1,16;
//       packssdw acc1,acc0;
//   movq  QP[cor-8],acc0;

//#else
//------------------------------------------------------
/*
void ConvMMX(short *in1, short *in2, int *out, int ncor)
{
  int i,j;

  for (i=0; i < 2*ncor; i+=4)    {
    int acc0 = 0, acc1 = 0;
    for (j=0; j < 2*ncor - i; j+=4) {
      acc0 += (int)taps[j]*array[i+j] + (int)taps[j+1]*array[i+j+1];
      acc1 += (int)taps[j+2]*array[i+j+2] + (int)taps[j+3]*array[i+j+3];
    }
    corr[i/2] = acc0 ;
    corr[i/2+1] = acc1 ;
  }

  return;
}*/

// Expand n shorts, taken two at a time as raw 32-bit units, into twice as
// much output.  For 32-bit unit k the output receives
//   out[2k]   = unit k unchanged
//   out[2k+1] = (unit k << 16) | (unit k-1 >> 16)
// where the unit before the first one is taken as 0.
// Units are processed from the last one down to the first, each one reading
// its predecessor before reading itself.
void ab2abzaw(const short *input, short *output, int n)
{
	unsigned *src = (unsigned *)input;   // two shorts per element, raw bits
	unsigned *dst = (unsigned *)output;
	unsigned prev, cur;
	int k;

	for (k = n/2 - 1; k >= 0; k--) {
		if (k > 0)
			prev = src[k-1];
		else
			prev = 0;
		cur = src[k];
		dst[2*k] = cur;
		dst[2*k+1] = (cur<<16 | prev>>16);
	}
	return;
}

// y[i] = x[i] * scale for i in [0, N), converting shorts to floats.
//
// The hand-scheduled x87 loop works from the end of the arrays towards the
// start, two elements per iteration, with loads and multiplies running two
// pairs ahead of the stores.  That schedule is only correct when N is even
// and N >= 6: for smaller N it loads and stores before the start of the
// arrays, and for odd N it never writes y[0].  Those cases are handled by a
// plain C loop instead.
void ShortToFloatScale(short *x, float scale, int N, float *y)
{
  int i;

  if (N < 6 || (N & 1))
  {
    for (i = 0; i < N; i++)
      y[i] = (float)x[i] * scale;
    return;
  }

/*
	short i;
	float yy[100];
	for (i=0; i<N; i++)
	{ yy[i]=x[i]*scale; }


  ASM
	{       
    mov esi,x;
    mov edi,y;
	lea ecx,scale;
	mov     eax, N
	sub     eax, 2
loop1:
	fild    WORD PTR [esi+eax*2]
	fmul    DWORD PTR [ecx]
	fstp    DWORD PTR [edi+eax*4]

	fild    WORD PTR [esi+eax*2+2]
	fmul    DWORD PTR [ecx]
	fstp    DWORD PTR [edi+eax*4+4]

	sub     eax, 2
	jge loop1;
	}

*/

  // The trailing comments show the FPU stack, top of stack first:
  // c = scale, Lk = loaded value, Mk = value already multiplied by c.
  ASM
	{
	mov esi,x;
	mov edi,y;
	lea ecx,scale;
	mov     eax, N
	sub     eax, 6
	fld     DP [ecx]        ;                     c

	fild    WORD PTR [esi+eax*2+8] ;          L0  c

	fild    WORD PTR [esi+eax*2+10] ;      L1 L0  c
	 fxch   ST(1) ;                        L0 L1  c
	fmul    ST(0), ST(2) ;                        M0 L1  c
	 fxch    ST(1) ;                       L1 M0  c
	fmul   ST(0),ST(2) ;                         M1 M0  c

	fild    WORD PTR [esi+eax*2+4] ;    L0 M1 M0  c

	fild    WORD PTR [esi+eax*2+6];  L1 L0 M1 M0  c
	 fxch    ST(3) ;                 M0 L0 M1 L1  c
	fstp    DWORD PTR [edi+eax*4+16];   L0 M1 L1  c
loop1:  ;                                   L0 M1 L1  c

	fmul    ST(0),ST(3) ;                     M0 M1 L1  c
	 fxch    ST(1) ;                    M1 M0 L1  c
	fstp    DWORD PTR [edi+eax*4+20];      M0 L1  c
	 fxch    ST(1) ;                       L1 M0  c
	fmul   ST(0),ST(2) ;                         M1 M0  c
	fild    WORD PTR [esi+eax*2] ;      L0 M1 M0  c

	fild    WORD PTR [esi+eax*2+2] ; L1 L0 M1 M0  c
	 fxch    ST(3) ;                 M0 L0 M1 L1  c
	fstp    DWORD PTR [edi+eax*4+8];    L0 M1 L1  c

	sub     eax, 2
	 jge loop1;
	fmul    ST(0),ST(3) ;eax==-2              M0 M1 L1  c
	 fxch    ST(1) ;                    M1 M0 L1  c
	fstp    DWORD PTR [edi+eax*4+20] ;     M0 L1  c
	 fxch    ST(1) ;                       L1 M0  c
	fmulp   ST(2), st(0) ;                           M0 M1

	fstp    DWORD PTR [edi+eax*4+8] ;            M1

	fstp    DWORD PTR [edi+eax*4+12] ;
	}
/*


for (i=0; i<N; i++)
{
if (y[i]!=yy[i])
{
fprintf(stdout,"\nfloat problem\n");
break;
}
}

*/


}

//assumes N is even
// y[i] = (float)x[i] * scale for i in [0, N).
//
// The hand-scheduled x87 loop works from the end of the arrays towards the
// start, two elements per iteration, with loads and multiplies running two
// pairs ahead of the stores.  That schedule is only correct when N is even
// and N >= 6: for smaller N it loads and stores before the start of the
// arrays, and for odd N it never writes y[0].  Those cases are handled by a
// plain C loop instead.
void IntToFloatScale(int *x, float scale, int N, float *y)
{
	int k;      // index for the scalar fallback
#if I2FTEST //test code
	int i;
	float yy[1000];
	for (i=0; i<N; i++)
	{ yy[i]=(float)x[i]*scale; }
#endif //test code

#if 0 //simple code
//simple assembly version       
	ASM
	{       
    mov esi,x;
    mov edi,y;
	lea ecx,scale;
	mov     eax, N
	sub     eax, 2
loop1:
	fild    DWORD PTR [esi+eax*4]
	fmul    DWORD PTR [ecx]
	fstp    DWORD PTR [edi+eax*4]

	fild    DWORD PTR [esi+eax*4+4]
	fmul    DWORD PTR [ecx]
	fstp    DWORD PTR [edi+eax*4+4]

	sub     eax, 2
	jge loop1;
	}
#endif //test code

  if (N < 6 || (N & 1))
  {
    // Too short or odd-length for the pipelined loop below.
    for (k = 0; k < N; k++)
      y[k] = (float)x[k] * scale;
  }
  else
  {
  // The trailing comments show the FPU stack, top of stack first:
  // c = scale, Lk = loaded value, Mk = value already multiplied by c.
  ASM
	{
	mov esi,x;
	mov edi,y;
	lea ecx,scale;
	mov     eax, N
	sub     eax, 6
	fld     DP [ecx]        ;                     c

	fild    DWORD PTR [esi+eax*4+16] ;        L0  c

	fild    DWORD PTR [esi+eax*4+20] ;     L1 L0  c
	 fxch   ST(1) ;                        L0 L1  c
	fmul    ST(0), ST(2) ;                 M0 L1  c
	 fxch    ST(1) ;                       L1 M0  c
	fmul   ST(0),ST(2) ;                   M1 M0  c

	fild    DWORD PTR [esi+eax*4+8] ;   L0 M1 M0  c

	fild    DWORD PTR [esi+eax*4+12];L1 L0 M1 M0  c
	 fxch    ST(3) ;                 M0 L0 M1 L1  c
	fstp    DWORD PTR [edi+eax*4+16];   L0 M1 L1  c
loop1:  ;                                   L0 M1 L1  c

	fmul    ST(0),ST(3) ;               M0 M1 L1  c
	 fxch    ST(1) ;                    M1 M0 L1  c
	fstp    DWORD PTR [edi+eax*4+20];      M0 L1  c
	 fxch    ST(1) ;                       L1 M0  c
	fmul   ST(0),ST(2) ;                   M1 M0  c
	fild    DWORD PTR [esi+eax*4] ;     L0 M1 M0  c

	fild    DWORD PTR [esi+eax*4+4] ;L1 L0 M1 M0  c
	 fxch    ST(3) ;                 M0 L0 M1 L1  c
	fstp    DWORD PTR [edi+eax*4+8];    L0 M1 L1  c

	sub     eax, 2
	 jge loop1;
	fmul    ST(0),ST(3) ;eax==-2        M0 M1 L1  c
	 fxch    ST(1) ;                    M1 M0 L1  c
	fstp    DWORD PTR [edi+eax*4+20] ;     M0 L1  c
	 fxch    ST(1) ;                       L1 M0  c
	fmulp   ST(2), st(0) ;                    M0 M1

	fstp    DWORD PTR [edi+eax*4+8] ;            M1

	fstp    DWORD PTR [edi+eax*4+12] ;
	}
  }


#if I2FTEST
  for (i=0; i<N; i++)
  {
    if (y[i]!=yy[i])
    {
      printf("F2I %3d %8f %8f\n", i, y[i], yy[i]);
    }
  }
#endif //test code


}

//assumes N is even
// y[i] = (float)x[i] for i in [0, N).
//
// The x87 loop converts two elements per iteration, from the end of the
// arrays down to index 0, and tests its counter only after the first pair.
// It therefore needs an even, positive element count.  An odd last element
// is converted in C first, and the loop is skipped when no pair remains, so
// that odd or non-positive N no longer touches memory before the arrays.
void IntToFloat(int *x, int N, float *y)
{
	int pairCount = N;   // even number of elements handed to the asm loop
#if I2FTEST //test code
	int i;
	float yy[1000];
	for (i=0; i<N; i++)
	{ yy[i]=(float)x[i]; }
#endif //test code

	if (pairCount > 0 && (pairCount & 1))
	{
		// Peel the odd element off the end.
		y[pairCount-1] = (float)x[pairCount-1];
		pairCount--;
	}

	if (pairCount >= 2)
	{
//simple assembly version       
	ASM
	{       
    mov esi,x;
    mov edi,y;
	mov     eax, pairCount
	sub     eax, 2
loop1:
	fild    DWORD PTR [esi+eax*4]
	fild    DWORD PTR [esi+eax*4+4]
	 fxch    ST(1) ;
	fstp    DWORD PTR [edi+eax*4]
	fstp    DWORD PTR [edi+eax*4+4]

	sub     eax, 2
	jge loop1;
	}
	}


#if I2FTEST
  for (i=0; i<N; i++)
  {
    if (y[i]!=yy[i])
    {
      printf("F2I %3d %8f %8f\n", i, y[i], yy[i]);
    }
  }
#endif //test code


}
#endif