/* *************************************************************************
**    INTEL Corporation Proprietary Information
**
**    This listing is supplied under the terms of a license
**    agreement with INTEL Corporation and may not be copied
**    nor disclosed except in accordance with the terms of
**    that agreement.
**
**    Copyright (c) 1995 Intel Corporation.
**    All Rights Reserved.
**
** *************************************************************************
*/

//////////////////////////////////////////////////////////////////////////
// $Author:   AGUPTA2  $
// $Date:   08 Mar 1996 16:46:18  $
// $Archive:   S:\h26x\src\dec\d3halfmc.cpv  $
// $Header:   S:\h26x\src\dec\d3halfmc.cpv   1.15   08 Mar 1996 16:46:18   AGUPTA2  $
// $Log:   S:\h26x\src\dec\d3halfmc.cpv  $
// 
//    Rev 1.15   08 Mar 1996 16:46:18   AGUPTA2
// Added pragma code_seg.
// 
// 
//    Rev 1.14   29 Jan 1996 17:53:56   RMCKENZX
// Completely re-wrote all 3 routines.  The loops no longer use pseudo
// SIMD logic and have been tightened to 256, 169, and 169 cycles
// for half-half, half-int, and int-half respectively.
// 
//    Rev 1.13   19 Jan 1996 17:40:36   RMCKENZX
// fixed half-int so it will correctly round
// 
//    Rev 1.12   19 Jan 1996 13:29:32   RHAZRA
// Fixed halfpixel prediction by bilinear interpolation in ASM code
// 
//    Rev 1.11   27 Dec 1995 14:36:06   RMCKENZX
// Added copyright notice
// 
//    Rev 1.10   09 Oct 1995 09:43:36   CZHU
// Fixed bug in (half,half) interpolation optimization
// 
//    Rev 1.9   08 Oct 1995 13:40:14   CZHU
// Added C version of (half,half) and use it for now until we fix the bug
// in the optimized version
// 
//    Rev 1.8   03 Oct 1995 15:06:30   CZHU
// 
// Adding debug assistance
// 
//    Rev 1.7   28 Sep 1995 15:32:22   CZHU
// Fixed bugs mast off bits after shift
// 
//    Rev 1.6   26 Sep 1995 11:13:36   CZHU
// 
// Adjust pitch back to normal, and changed UINT to U32
// 
//    Rev 1.5   25 Sep 1995 09:04:14   CZHU
// Added and cleaned some comments 
// 
//    Rev 1.4   22 Sep 1995 16:42:00   CZHU
// 
// improve pairing
// 
//    Rev 1.3   22 Sep 1995 15:59:48   CZHU
// finished first around coding of half pel interpolation and tested
// with the standalone program
// 
//    Rev 1.2   21 Sep 1995 16:56:28   CZHU
// Unit tested (half, int) case
// 
//    Rev 1.1   21 Sep 1995 12:06:22   CZHU
// More development
// 
//    Rev 1.0   20 Sep 1995 16:27:56   CZHU
// Initial revision.
// 

#include "precomp.h"

#define FRAMEPOINTER		esp

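// The Interpolate_* routines in this file interpolate the pels of the
// pRef block and write the result to pNewRef.
// They assume that the pRef area has been expanded.
// Todo: Loop control and set up the stack for locals, CZHU, 9/20/95
//       preload output cache lines, 9/21
//       Cache preload is no longer needed, 9/21/95
// Cycle count: 50*4 = 200 cycles (older estimate; the timing tables inside
// the routines and the Rev 1.14 log entry give 169, 169 and 256 cycles)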

#pragma code_seg("IACODE2")
//--------------------------------------------------------------------------//
//	Interpolate_Half_Int
//
//	Horizontal half-pel interpolation of one 8x8 block:
//
//	    dst[y][x] = (src[y][x] + src[y][x+1] + 1) >> 1      x, y = 0..7
//
//	Rows of both the source and the destination are PITCH bytes apart.
//	Nine source pels (columns 0..8) are read on every row.
//
//	Arguments (the function is naked, so they are read straight off the
//	stack after the four register pushes):
//	    pRef     address of the source block, passed as a U32
//	    pNewRef  address of the destination block, passed as a U32
//
//	ebp, ebx, edi and esi are saved and restored.  eax, ecx, edx and the
//	flags are not preserved.  The routine returns with a plain ret.
//--------------------------------------------------------------------------//
__declspec(naked)
void Interpolate_Half_Int (U32 pRef, U32 pNewRef)
{		
__asm {
	push	ebp
	 push	ebx
	push	edi
	 push	esi

	mov 	esi, [esp+20] 		// pRef = esp + 4 pushes + ret
	 mov	edi, [esp+24]		// pNewRef = esp + 4 pushes + ret + pRef
	sub 	edi, PITCH			// pre-decrement destination (re-incremented at loop top)
	 mov	ebp, 8				// loop counter: 8 rows
	xor 	eax, eax			// clear registers: only their low bytes are
	 xor 	ebx, ebx			// loaded below, so the upper 24 bits must
	xor 	ecx, ecx			// start (and stay) zero
	 xor	edx, edx

//--------------------------------------------------------------------------//
//
//	This loop is, basically, a 4 instruction, 2 cycle loop.
//	It is 3-folded, meaning that it works on 3 results per each 
//	2 cycle unit.  It is 8-unrolled, meaning that it does 8 results
//	(one block's row) per loop iteration.  The basic calculations
//	follow this pattern:
//
//	   pass-> 1      2       3
//	cycle	
//	  1     load |       | shift
//	      -----------------------
//	  2          |  add  | store 
//
//	This assumes that the prior pel's value was loaded and 
//	preserved from the prior result's calculation.  Therefore
//	each result uses 2 registers -- one to load (and preserve)
//	the right-hand pel, and the other (overwriting the previous
//	result's stored pel value) to add into, shift, and store out
//	of.  The add is accomplished with the lea instruction, allowing
//	a round bit to be added in without using a separate instruction.
//
//	Every sum is at most 255+255+1, so after the shift each register
//	is back in the range 0..255 and its upper bits are zero again
//	before the next byte load into it.
//	
//	The preamble loads & adds for the first result, and loads 
//	for the second.  The body executes the basic pattern six times.
//	The postamble shifts and stores for the seventh result and 
//	adds, shifts, and stores for the eighth.
//
//	Timing:
//		  4	preamble (including bank conflict)
//		 12	body
//		  4	postamble
//		----------------
//		 20	per loop
//		x 8	loops
//		----------------
//		160 subtotal
//		  6	initialize
//	 	  3	finalize
//		================
//		169 total cycles
//--------------------------------------------------------------------------//

main_loop:	
// preamble
	mov 	al, 0[esi]			// pel 0
	 mov	bl, 1[esi]			// pel 1 -- probable BANK CONFLICT
	mov 	dl, PITCH[edi]		// heat the cache: edi is still one row behind
								// here, so +PITCH is the row about to be
								// written; the loaded value is discarded
	 add	edi, PITCH			// increment destination at top
	lea 	eax, [1+eax+ebx]	// sum 0 = pel 0 + pel 1 + round bit
	 mov	cl, 2[esi]			// pel 2

// body (6 pels)
	shr 	eax, 1				// value 0
	 mov	dl, 3[esi]			// pel 3
	lea 	ebx, [ebx+ecx+1]	// sum 1 = pel 1 + pel 2 + round bit
	 mov	0[edi], al			// write value 0

	shr 	ebx, 1				// value 1
	 mov	al, 4[esi]			// pel 4
	lea 	ecx, [ecx+edx+1]	// sum 2
	 mov	1[edi], bl			// write value 1

	shr 	ecx, 1				// value 2
	 mov	bl, 5[esi]			// pel 5
	lea 	edx, [edx+eax+1]	// sum 3
	 mov	2[edi], cl			// write value 2

	shr 	edx, 1				// value 3
	 mov	cl, 6[esi]			// pel 6
	lea 	eax, [eax+ebx+1]	// sum 4
	 mov	3[edi], dl			// write value 3

	shr 	eax, 1				// value 4
	 mov	dl, 7[esi]			// pel 7
	lea 	ebx, [ebx+ecx+1]	// sum 5
	 mov	4[edi], al			// write value 4

	shr 	ebx, 1				// value 5
	 mov	al, 8[esi]			// pel 8 (one past the 8-wide block)
	lea 	ecx, [ecx+edx+1]	// sum 6
	 mov	5[edi], bl			// write value 5

// postamble
	shr 	ecx, 1				// value 6
	 lea 	edx, [edx+eax+1]	// sum 7 = pel 7 + pel 8 + round bit
	shr 	edx, 1				// value 7
	 mov	6[edi], cl			// write value 6
	add 	esi, PITCH			// increment source pointer
	 mov	7[edi], dl			// write value 7
	dec 	ebp					// loop counter
	 jne	main_loop

// restore registers and return
	pop 	esi
	 pop	edi
	pop 	ebx
	 pop	ebp
	ret
  }	 //end of asm
}
// end Interpolate_Half_Int()
//--------------------------------------------------------------------------//


//--------------------------------------------------------------------------//
//	Interpolate_Int_Half
//
//	Vertical half-pel interpolation of one 8x8 block:
//
//	    dst[y][x] = (src[y][x] + src[y+1][x] + 1) >> 1      x, y = 0..7
//
//	Rows of both the source and the destination are PITCH bytes apart.
//	The loop walks the block one COLUMN per iteration, reading nine source
//	pels (rows 0..8) and writing eight results down the column.
//
//	Arguments (the function is naked, so they are read straight off the
//	stack after the four register pushes):
//	    pRef     address of the source block, passed as a U32
//	    pNewRef  address of the destination block, passed as a U32
//
//	ebp, ebx, edi and esi are saved and restored.  eax, ecx, edx and the
//	flags are not preserved.  The routine returns with a plain ret.
//--------------------------------------------------------------------------//
__declspec(naked)
void Interpolate_Int_Half (U32 pRef, U32 pNewRef)
{		
__asm {
	push	ebp
	 push	ebx
	push	edi
	 push	esi

	mov 	esi, [esp+20] 		// pRef = esp + 4 pushes + ret
	 mov	edi, [esp+24]		// pNewRef = esp + 4 pushes + ret + pRef
	dec 	edi					// pre-decrement destination (re-incremented at loop top)
	 mov	ebp, 8				// loop counter: 8 columns
	xor 	eax, eax			// clear registers: only their low bytes are
	 xor 	ebx, ebx			// loaded below, so the upper 24 bits must
	xor 	ecx, ecx			// start (and stay) zero
	 xor	edx, edx

//--------------------------------------------------------------------------//
//
//	This loop is, basically, a 4 instruction, 2 cycle loop.
//	It is 3-folded, meaning that it works on 3 results per each 
//	2 cycle unit.  It is 8-unrolled, meaning that it does 8 results
//	(one block's column) per loop iteration.  The basic calculations
//	follow this pattern:
//
//	   pass-> 1      2       3
//	cycle	
//	  1     load |       | shift
//	      -----------------------
//	  2          |  add  | store 
//
//	This assumes that the prior pel's value was loaded and 
//	preserved from the prior result's calculation.  Therefore
//	each result uses 2 registers -- one to load (and preserve)
//	the lower pel, and the other (overwriting the previous
//	result's stored pel value) to add into, shift, and store out
//	of.  The add is accomplished with the lea instruction, allowing
//	a round bit to be added in without using a separate instruction.
//
//	Every sum is at most 255+255+1, so after the shift each register
//	is back in the range 0..255 and its upper bits are zero again
//	before the next byte load into it.
//	
//	The preamble loads & adds for the first result, and loads 
//	for the second.  The body executes the basic pattern six times.
//	The postamble shifts and stores for the seventh result and 
//	adds, shifts, and stores for the eighth.
//
//	Timing:
//		  4	preamble (including bank conflict)
//		 12	body
//		  4	postamble
//		----------------
//		 20	per loop
//		x 8	loops
//		----------------
//		160 subtotal
//		  6	initialize
//	 	  3	finalize
//		================
//		169 total cycles
//--------------------------------------------------------------------------//

main_loop:	
// preamble
	mov 	al, [esi]			// pel in row 0
	 mov	bl, PITCH[esi]		// pel in row 1 -- probable BANK CONFLICT
	mov 	dl, [edi]			// heat the cache; the loaded value is discarded
								// NOTE(review): edi is still one byte to the
								// left of the current column here, so the first
								// pass reads the byte at pNewRef-1 -- confirm
								// whether [edi+1] was intended
	 inc	edi					// increment destination at top
	lea 	eax, [1+eax+ebx]	// sum 0 = row 0 + row 1 + round bit
	 mov	cl, [2*PITCH+esi]	// pel in row 2

// body (6 pels)
	shr 	eax, 1				// value 0
	 mov	dl, [3*PITCH+esi]	// pel in row 3
	lea 	ebx, [ebx+ecx+1]	// sum 1 = row 1 + row 2 + round bit
	 mov	[edi], al			// write value 0

	shr 	ebx, 1				// value 1
	 mov	al, [4*PITCH+esi]	// pel in row 4
	lea 	ecx, [ecx+edx+1]	// sum 2
	 mov	[PITCH+edi], bl		// write value 1

	shr 	ecx, 1				// value 2
	 mov	bl, [5*PITCH+esi]	// pel in row 5
	lea 	edx, [edx+eax+1]	// sum 3
	 mov	[2*PITCH+edi], cl	// write value 2

	shr 	edx, 1				// value 3
	 mov	cl, [6*PITCH+esi]	// pel in row 6
	lea 	eax, [eax+ebx+1]	// sum 4
	 mov	[3*PITCH+edi], dl	// write value 3

	shr 	eax, 1				// value 4
	 mov	dl, [7*PITCH+esi]	// pel in row 7
	lea 	ebx, [ebx+ecx+1]	// sum 5
	 mov	[4*PITCH+edi], al	// write value 4

	shr 	ebx, 1				// value 5
	 mov	al, [8*PITCH+esi]	// pel in row 8 (one below the 8-high block)
	lea 	ecx, [ecx+edx+1]	// sum 6
	 mov	[5*PITCH+edi], bl	// write value 5

// postamble
	shr 	ecx, 1				// value 6
	 lea 	edx, [edx+eax+1]	// sum 7 = row 7 + row 8 + round bit
	shr 	edx, 1				// value 7
	 mov	[6*PITCH+edi], cl	// write value 6
	inc 	esi					// increment source pointer to the next column
	 mov	[7*PITCH+edi], dl	// write value 7
	dec 	ebp					// loop counter
	 jne	main_loop

// restore registers and return
	pop 	esi
	 pop	edi
	pop 	ebx
	 pop	ebp
	ret
  }	 // end of asm
}
// end Interpolate_Int_Half()
//--------------------------------------------------------------------------//


//--------------------------------------------------------------------------//
//	Interpolate_Half_Half
//
//	Bilinear (half-pel in both directions) interpolation of one 8x8 block:
//
//	    dst[y][x] = (src[y][x]   + src[y][x+1] +
//	                 src[y+1][x] + src[y+1][x+1] + 2) >> 2      x, y = 0..7
//
//	Rows of both the source and the destination are PITCH bytes apart.
//	Each loop iteration produces one destination row from two source rows,
//	reading columns 0..8 of each.
//
//	Arguments (the function is naked, so they are read straight off the
//	stack after the four register pushes):
//	    pRef     address of the source block, passed as a U32
//	    pNewRef  address of the destination block, passed as a U32
//
//	ebp, ebx, edi and esi are saved and restored.  eax, ecx, edx and the
//	flags are not preserved.  The routine returns with a plain ret.
//--------------------------------------------------------------------------//
__declspec(naked)
void Interpolate_Half_Half (U32 pRef, U32 pNewRef)
{		
__asm {
	push	ebp
	 push	ebx
	push	edi
	 push	esi

	mov 	esi, [esp+20] 		// pRef = esp + 4 pushes + ret
	 mov	edi, [esp+24]		// pNewRef = esp + 4 pushes + ret + pRef
	mov		ebp, 8				// loop counter: 8 rows
	 sub 	edi, PITCH			// pre-decrement destination pointer
	xor 	ecx, ecx			// ecx and edx only ever receive byte loads,
	 xor	edx, edx			// so their upper 24 bits stay zero throughout

//--------------------------------------------------------------------------//
//
//	This loop is, basically, a 6 instruction, 3 cycle loop.
//	It is 3-folded, meaning that it works on 3 results per each 
//	3 cycle unit.  It is 8-unrolled, meaning that it does 8 results
//	(one block's row) per loop iteration.  The basic calculations
//	follow this pattern:
//
//	   pass-> 1        2        3
//	cycle	
//	  1     load | add left | 
//	      ----------------------------
//	  2     load |          | shift
//	      ----------------------------
//	  3          | add  all | store 
//
//	Five registers are used to preserve values from one pass to the next: 
//	  cl & dl		hold the last two pel values (upper row, lower row)
//	  ebp or ebx	holds the sum of the two left-hand pels + 1
//	  eax			holds the sum of all four pels
//	Both adds are accomplished with the lea instruction.  For the sum
//	of the two left-hand pels, this allows a rounding bit to be added
//	in without using a separate instruction.  For both sums it allows
//	the result to be placed into a register independent of the sources'.
//	Since the sum of the two left-hand pels is used twice, it is placed
//	alternately into ebx and ebp.
//
//	Each column sum carries its own +1, so adding two adjacent column
//	sums yields the four-pel total + 2, which is the rounding term
//	required before the shift right by 2.
//	
//	The preamble does two preliminary loads plus passes 1 & 2 for the
//	first result, and pass 1 for the second.  The body executes the basic 
//	pattern six times.  The postamble does pass 3 for the  
//	seventh result and passes 2 & 3 for the eighth.
//
//	Due to the need for five registers, the loop counter is kept on
//	the stack.
//
//	Timing:
//		  8	preamble
//		 18	body
//		  5	postamble
//		----------------
//		 31	per loop
//		x 8	loops
//		----------------
//		248 subtotal
//		  5	initialize
//	 	  3	finalize
//		================
//		256 total cycles
//--------------------------------------------------------------------------//

main_loop:	
// preamble
	mov 	cl, [esi]					// pel 0, upper row
	 xor	eax, eax					// eax and ebx are re-zeroed every pass
	mov 	al, [esi+PITCH]				// pel 0, lower row
	 xor	ebx, ebx
	mov 	dl, [esi+1]					// pel 1, upper row
	 add 	eax, ecx					// partial sum 0 sans round
	mov 	bl, [esi+PITCH+1]			// pel 1, lower row
	 inc 	eax							// partial sum 0
	mov 	cl, [esi+2]					// pel 2, upper row
	 add	ebx, edx					// partial sum 1 sans round
	mov 	dl, [esi+PITCH+2]			// pel 2, lower row
	 inc	ebx							// partial sum 1
	add 	eax, ebx					// full sum 0
	 push	ebp							// save loop counter on stack
 	mov 	ebp, [edi+PITCH]			// heat the cache: touch the row about to be
										// written; ebp is free now and the loaded
										// value is overwritten by the next lea
	 add 	edi, PITCH					// increment dst. pointer at top of loop

// body (x 6)
	lea 	ebp, [ecx+edx+1]			// partial sum 2 with round
	 mov	cl, [esi+3]					// pel 3, upper row
	shr 	eax, 2						// value 0
	 mov	dl, [esi+PITCH+3]			// pel 3, lower row
	mov 	[edi], al					// write value 0
	 lea	eax, [ebx+ebp]				// full sum 1

	lea 	ebx, [ecx+edx+1]			// partial sum 3 with round
	 mov	cl, [esi+4]					// pel 4, upper row
	shr 	eax, 2						// value 1
	 mov	dl, [esi+PITCH+4]			// pel 4, lower row
	mov 	[edi+1], al					// write value 1
	 lea	eax, [ebx+ebp]				// full sum 2

	lea 	ebp, [ecx+edx+1]			// partial sum 4 with round
	 mov	cl, [esi+5]					// pel 5, upper row
	shr 	eax, 2						// value 2
	 mov	dl, [esi+PITCH+5]			// pel 5, lower row
	mov 	[edi+2], al					// write value 2
	 lea	eax, [ebx+ebp]				// full sum 3

	lea 	ebx, [ecx+edx+1]			// partial sum 5 with round
	 mov	cl, [esi+6]					// pel 6, upper row
	shr 	eax, 2						// value 3
	 mov	dl, [esi+PITCH+6]			// pel 6, lower row
	mov 	[edi+3], al					// write value 3
	 lea	eax, [ebx+ebp]				// full sum 4

	lea 	ebp, [ecx+edx+1]			// partial sum 6 with round
	 mov	cl, [esi+7]					// pel 7, upper row
	shr 	eax, 2						// value 4
	 mov	dl, [esi+PITCH+7]			// pel 7, lower row
	mov 	[edi+4], al					// write value 4
	 lea	eax, [ebx+ebp]				// full sum 5

	lea 	ebx, [ecx+edx+1]			// partial sum 7 with round
	 mov	cl, [esi+8]					// pel 8, upper row (one past the 8-wide block)
	shr 	eax, 2						// value 5
	 mov	dl, [esi+PITCH+8]			// pel 8, lower row
	mov 	[edi+5], al					// write value 5
	 lea	eax, [ebx+ebp]				// full sum 6

// postamble
	shr 	eax, 2						// value 6
	 lea 	ebp, [ecx+edx+1]			// partial sum 8 with round
	mov 	[edi+6], al					// write value 6
	 add	esi, PITCH					// increment read pointer
	lea		eax, [ebx+ebp]				// full sum 7
	 pop	ebp							// restore loop counter
	shr 	eax, 2						// value 7
	 dec	ebp							// decrement loop counter
	mov 	[edi+7], al					// write value 7 (mov leaves the flags from dec intact)
	 jne	main_loop					// loop if not done

// restore registers and return
	pop 	esi
	 pop	edi
	pop 	ebx
	 pop	ebp
	ret
  }	 //end of asm
}
#pragma code_seg()
// end Interpolate_Half_Half()
//--------------------------------------------------------------------------//


/*
void Interpolate_Half_Half_C (U32 pRef, U32 pNewRef)
{
  U8 * pSrc = (U8 *) pRef;
  U8 * pDst = (U8 *) pNewRef;
  int i, j;

  for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
   	 for (j=0; j<8; j++)
	 	pDst[j] = (pSrc[j] + pSrc[j+1] + pSrc[PITCH+j] + pSrc[PITCH+j+1] + 2) >> 2;
}

void Interpolate_Int_Half_C (U32 pRef, U32 pNewRef)
{
  U8 * pSrc = (U8 *) pRef;
  U8 * pDst = (U8 *) pNewRef;
  int i, j;

  for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
   	 for (j=0; j<8; j++)
	 	pDst[j] = (pSrc[j] + pSrc[PITCH+j] + 1) >> 1;
}

void Interpolate_Half_Int_C (U32 pRef, U32 pNewRef)
{
  U8 * pSrc = (U8 *) pRef;
  U8 * pDst = (U8 *) pNewRef;
  int i, j;

  for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
   	 for (j=0; j<8; j++)
	 	pDst[j] = (pSrc[j] + pSrc[j+1] + 1) >> 1;
}
*/