|
|
/* *************************************************************************
** INTEL Corporation Proprietary Information ** ** This listing is supplied under the terms of a license ** agreement with INTEL Corporation and may not be copied ** nor disclosed except in accordance with the terms of ** that agreement. ** ** Copyright (c) 1995 Intel Corporation. ** All Rights Reserved. ** ** ************************************************************************* */
//////////////////////////////////////////////////////////////////////////
// $Author: AGUPTA2 $
// $Date: 08 Mar 1996 16:46:18 $
// $Archive: S:\h26x\src\dec\d3halfmc.cpv $
// $Header: S:\h26x\src\dec\d3halfmc.cpv 1.15 08 Mar 1996 16:46:18 AGUPTA2 $
// $Log: S:\h26x\src\dec\d3halfmc.cpv $
//
// Rev 1.15 08 Mar 1996 16:46:18 AGUPTA2
// Added pragma code_seg.
//
//
// Rev 1.14 29 Jan 1996 17:53:56 RMCKENZX
// Completely re-wrote all 3 routines. The loops no longer use pseudo
// SIMD logic and have been tightened to 256, 169, and 169 cycles
// for half-half, half-int, and int-half respectively.
//
// Rev 1.13 19 Jan 1996 17:40:36 RMCKENZX
// fixed half-int so it will correctly round
//
// Rev 1.12 19 Jan 1996 13:29:32 RHAZRA
// Fixed halfpixel prediction by bilinear interpolation in ASM code
//
// Rev 1.11 27 Dec 1995 14:36:06 RMCKENZX
// Added copyright notice
//
// Rev 1.10 09 Oct 1995 09:43:36 CZHU
// Fixed bug in (half,half) interpolation optimization
//
// Rev 1.9 08 Oct 1995 13:40:14 CZHU
// Added C version of (half,half) and use it for now until we fix the bug
// in the optimized version
//
// Rev 1.8 03 Oct 1995 15:06:30 CZHU
//
// Adding debug assistance
//
// Rev 1.7 28 Sep 1995 15:32:22 CZHU
// Fixed bugs mast off bits after shift
//
// Rev 1.6 26 Sep 1995 11:13:36 CZHU
//
// Adjust pitch back to normal, and changed UINT to U32
//
// Rev 1.5 25 Sep 1995 09:04:14 CZHU
// Added and cleaned some comments
//
// Rev 1.4 22 Sep 1995 16:42:00 CZHU
//
// improve pairing
//
// Rev 1.3 22 Sep 1995 15:59:48 CZHU
// finished first around coding of half pel interpolation and tested
// with the standalone program
//
// Rev 1.2 21 Sep 1995 16:56:28 CZHU
// Unit tested (half, int) case
//
// Rev 1.1 21 Sep 1995 12:06:22 CZHU
// More development
//
// Rev 1.0 20 Sep 1995 16:27:56 CZHU
// Initial revision.
//
#include "precomp.h"
#define FRAMEPOINTER esp
// Interpolate_Int_Half interpolates the pels from the pRef block and
// writes the result to pNewRef.
// Assumes that the pRef area has been expanded.
// Todo: Loop control and setup the stack for locals, CZHU, 9/20/95
//       preload output cache lines, 9/21
//       Cache preload is no longer needed, 9/21/95
// Cycle count: 50*4 = 200 cycles (early estimate; see the Rev 1.14 log entry
// above for the figures of the rewritten routines: 256, 169 and 169 cycles)
#pragma code_seg("IACODE2")
//--------------------------------------------------------------------------//
// Interpolate_Half_Int
//
// Horizontal half-pel interpolation of one 8x8 block.  For each of the 8
// rows (row stride PITCH on both source and destination):
//
//     dst[j] = (src[j] + src[j+1] + 1) >> 1        j = 0..7
//
// so 9 source bytes are read per row and 8 destination bytes are written.
//
// Parameters (read from the stack; the function is naked and returns with a
// plain `ret`, so argument cleanup is left to the caller):
//     pRef     address of the top-left source pel
//     pNewRef  address of the top-left destination pel
//
// Register roles:
//     esi  source row pointer         edi  destination row pointer
//     ebp  row counter (8 .. 1)
//     eax, ebx, ecx, edx  rotating pel / sum registers
//
// Invariant: the upper 24 bits of eax..edx are zero whenever a byte is
// loaded into al/bl/cl/dl.  They are cleared on entry and afterwards each
// register only ever holds a byte load or a sum already shifted down to at
// most 255, which is what makes the 32-bit lea additions safe.
//--------------------------------------------------------------------------//
__declspec(naked) void Interpolate_Half_Int (U32 pRef, U32 pNewRef)
{
  __asm {
    push  ebp
    push  ebx
    push  edi
    push  esi

    mov   esi, [esp+20]             // pRef    = esp + 4 pushes + ret
    mov   edi, [esp+24]             // pNewRef = esp + 4 pushes + ret + pRef
    sub   edi, PITCH                // pre-decrement: edi is advanced at loop top
    mov   ebp, 8                    // row counter
    xor   eax, eax                  // clear the four working registers
    xor   ebx, ebx
    xor   ecx, ecx
    xor   edx, edx

    //----------------------------------------------------------------------//
    // The loop is, basically, a 4 instruction, 2 cycle unit that is
    // 3-folded (three results are in flight per unit) and 8-unrolled
    // (one block row per iteration):
    //
    //     pass->   1        2        3
    //     cycle
    //       1     load  |        |  shift
    //             -----------------------
    //       2           |  add   |  store
    //
    // Each result uses two registers: one holding the freshly loaded
    // right-hand pel (kept for the next result) and one, holding the
    // previous pel, that is added into, shifted and stored from.  The add
    // is an lea so the rounding +1 costs no extra instruction.
    //
    // The preamble loads and adds for result 0 and loads for result 1.
    // The body runs the basic pattern six times.  The postamble finishes
    // results 6 and 7.
    //
    // Timing (original author's figures):
    //      4  preamble (including bank conflict)
    //     12  body
    //      4  postamble
    //     --  ------------
    //     20  per row  x 8 rows = 160
    //      6  initialize
    //      3  finalize
    //    ===  ============
    //    169  total cycles
    //----------------------------------------------------------------------//
  main_loop:
    // preamble
    mov   al, 0[esi]                // pel 0
    mov   bl, 1[esi]                // pel 1 -- probable BANK CONFLICT
    mov   dl, [edi+PITCH]           // heat the cache: touch the row that is
                                    // about to be written (edi still points
                                    // one row back); dl is reloaded before use
    add   edi, PITCH                // advance destination at top of loop
    lea   eax, [1+eax+ebx]          // pel0 + pel1 + round
    mov   cl, 2[esi]                // pel 2

    // body (6 pels)
    shr   eax, 1                    // value 0
    mov   dl, 3[esi]                // pel 3
    lea   ebx, [ebx+ecx+1]          // pel1 + pel2 + round
    mov   0[edi], al                // write value 0

    shr   ebx, 1                    // value 1
    mov   al, 4[esi]                // pel 4
    lea   ecx, [ecx+edx+1]          // pel2 + pel3 + round
    mov   1[edi], bl                // write value 1

    shr   ecx, 1                    // value 2
    mov   bl, 5[esi]                // pel 5
    lea   edx, [edx+eax+1]          // pel3 + pel4 + round
    mov   2[edi], cl                // write value 2

    shr   edx, 1                    // value 3
    mov   cl, 6[esi]                // pel 6
    lea   eax, [eax+ebx+1]          // pel4 + pel5 + round
    mov   3[edi], dl                // write value 3

    shr   eax, 1                    // value 4
    mov   dl, 7[esi]                // pel 7
    lea   ebx, [ebx+ecx+1]          // pel5 + pel6 + round
    mov   4[edi], al                // write value 4

    shr   ebx, 1                    // value 5
    mov   al, 8[esi]                // pel 8
    lea   ecx, [ecx+edx+1]          // pel6 + pel7 + round
    mov   5[edi], bl                // write value 5

    // postamble
    shr   ecx, 1                    // value 6
    lea   edx, [edx+eax+1]          // pel7 + pel8 + round
    shr   edx, 1                    // value 7
    mov   6[edi], cl                // write value 6
    add   esi, PITCH                // advance source to the next row
    mov   7[edi], dl                // write value 7
    dec   ebp                       // row counter; sets ZF for the branch
    jne   main_loop

    // restore registers and return
    pop   esi
    pop   edi
    pop   ebx
    pop   ebp
    ret
  } // end of asm
} // end Interpolate_Half_Int()
//--------------------------------------------------------------------------//
//--------------------------------------------------------------------------//
// Interpolate_Int_Half
//
// Vertical half-pel interpolation of one 8x8 block.  The block is processed
// one COLUMN per loop iteration; for each of the 8 columns:
//
//     dst[r*PITCH] = (src[r*PITCH] + src[(r+1)*PITCH] + 1) >> 1    r = 0..7
//
// so 9 source rows are read per column and 8 destination bytes are written.
//
// Parameters (read from the stack; the function is naked and returns with a
// plain `ret`, so argument cleanup is left to the caller):
//     pRef     address of the top-left source pel
//     pNewRef  address of the top-left destination pel
//
// Register roles:
//     esi  source column pointer      edi  destination column pointer
//     ebp  column counter (8 .. 1)
//     eax, ebx, ecx, edx  rotating pel / sum registers
//
// Invariant: the upper 24 bits of eax..edx are zero whenever a byte is
// loaded into al/bl/cl/dl.  They are cleared on entry and afterwards each
// register only ever holds a byte load or a sum already shifted down to at
// most 255, which is what makes the 32-bit lea additions safe.
//--------------------------------------------------------------------------//
__declspec(naked) void Interpolate_Int_Half (U32 pRef, U32 pNewRef)
{
  __asm {
    push  ebp
    push  ebx
    push  edi
    push  esi

    mov   esi, [esp+20]             // pRef    = esp + 4 pushes + ret
    mov   edi, [esp+24]             // pNewRef = esp + 4 pushes + ret + pRef
    dec   edi                       // pre-decrement: edi is advanced at loop top
    mov   ebp, 8                    // column counter
    xor   eax, eax                  // clear the four working registers
    xor   ebx, ebx
    xor   ecx, ecx
    xor   edx, edx

    //----------------------------------------------------------------------//
    // The loop is, basically, a 4 instruction, 2 cycle unit that is
    // 3-folded (three results are in flight per unit) and 8-unrolled
    // (one block column per iteration):
    //
    //     pass->   1        2        3
    //     cycle
    //       1     load  |        |  shift
    //             -----------------------
    //       2           |  add   |  store
    //
    // Each result uses two registers: one holding the freshly loaded
    // lower pel (kept for the next result) and one, holding the previous
    // pel, that is added into, shifted and stored from.  The add is an lea
    // so the rounding +1 costs no extra instruction.
    //
    // The preamble loads and adds for result 0 and loads for result 1.
    // The body runs the basic pattern six times.  The postamble finishes
    // results 6 and 7.
    //
    // Timing (original author's figures):
    //      4  preamble (including bank conflict)
    //     12  body
    //      4  postamble
    //     --  ------------
    //     20  per column  x 8 columns = 160
    //      6  initialize
    //      3  finalize
    //    ===  ============
    //    169  total cycles
    //----------------------------------------------------------------------//
  main_loop:
    // preamble
    mov   al, [esi]                 // pel in row 0
    mov   bl, PITCH[esi]            // pel in row 1 -- probable BANK CONFLICT
    mov   dl, [edi+1]               // heat the cache: touch the first byte this
                                    // pass will write (edi still points one
                                    // column back); dl is reloaded before use
    inc   edi                       // advance destination at top of loop
    lea   eax, [1+eax+ebx]          // row0 + row1 + round
    mov   cl, [2*PITCH+esi]         // pel in row 2

    // body (6 pels)
    shr   eax, 1                    // value 0
    mov   dl, [3*PITCH+esi]         // pel in row 3
    lea   ebx, [ebx+ecx+1]          // row1 + row2 + round
    mov   [edi], al                 // write value 0

    shr   ebx, 1                    // value 1
    mov   al, [4*PITCH+esi]         // pel in row 4
    lea   ecx, [ecx+edx+1]          // row2 + row3 + round
    mov   [PITCH+edi], bl           // write value 1

    shr   ecx, 1                    // value 2
    mov   bl, [5*PITCH+esi]         // pel in row 5
    lea   edx, [edx+eax+1]          // row3 + row4 + round
    mov   [2*PITCH+edi], cl         // write value 2

    shr   edx, 1                    // value 3
    mov   cl, [6*PITCH+esi]         // pel in row 6
    lea   eax, [eax+ebx+1]          // row4 + row5 + round
    mov   [3*PITCH+edi], dl         // write value 3

    shr   eax, 1                    // value 4
    mov   dl, [7*PITCH+esi]         // pel in row 7
    lea   ebx, [ebx+ecx+1]          // row5 + row6 + round
    mov   [4*PITCH+edi], al         // write value 4

    shr   ebx, 1                    // value 5
    mov   al, [8*PITCH+esi]         // pel in row 8
    lea   ecx, [ecx+edx+1]          // row6 + row7 + round
    mov   [5*PITCH+edi], bl         // write value 5

    // postamble
    shr   ecx, 1                    // value 6
    lea   edx, [edx+eax+1]          // row7 + row8 + round
    shr   edx, 1                    // value 7
    mov   [6*PITCH+edi], cl         // write value 6
    inc   esi                       // advance source to the next column
    mov   [7*PITCH+edi], dl         // write value 7
    dec   ebp                       // column counter; sets ZF for the branch
    jne   main_loop

    // restore registers and return
    pop   esi
    pop   edi
    pop   ebx
    pop   ebp
    ret
  } // end of asm
} // end Interpolate_Int_Half()
//--------------------------------------------------------------------------//
//--------------------------------------------------------------------------//
// Interpolate_Half_Half
//
// Bilinear (half, half) interpolation of one 8x8 block.  For each of the 8
// rows (row stride PITCH on both source and destination):
//
//     dst[j] = (src[j] + src[j+1] + src[PITCH+j] + src[PITCH+j+1] + 2) >> 2
//                                                              j = 0..7
//
// so 9 pels from each of two adjacent source rows are read per output row.
//
// Parameters (read from the stack; the function is naked and returns with a
// plain `ret`, so argument cleanup is left to the caller):
//     pRef     address of the top-left source pel
//     pNewRef  address of the top-left destination pel
//
// Terminology used below:
//     column sum k = src[k] + src[PITCH+k] + 1      (vertical pair + round)
//     full sum j   = column sum j + column sum j+1  (four pels + 2)
//--------------------------------------------------------------------------//
__declspec(naked) void Interpolate_Half_Half (U32 pRef, U32 pNewRef)
{
  __asm {
    push  ebp
    push  ebx
    push  edi
    push  esi

    mov   esi, [esp+20]             // pRef    = esp + 4 pushes + ret
    mov   edi, [esp+24]             // pNewRef = esp + 4 pushes + ret + pRef
    mov   ebp, 8                    // row counter
    sub   edi, PITCH                // pre-decrement: edi is advanced at loop top
    xor   ecx, ecx                  // ecx/edx are only ever written through
    xor   edx, edx                  // cl/dl, so their upper bits stay zero

    //----------------------------------------------------------------------//
    // The loop is, basically, a 6 instruction, 3 cycle unit that is
    // 3-folded (three results in flight per unit) and 8-unrolled (one
    // block row per iteration):
    //
    //     pass->   1        2           3
    //     cycle
    //       1     load  | add left  |
    //             ----------------------------
    //       2     load  |           |  shift
    //             ----------------------------
    //       3           | add all   |  store
    //
    // Five registers carry values from one pass to the next:
    //     cl, dl       the most recently loaded upper / lower pel
    //     ebx or ebp   a column sum (including its rounding +1)
    //     eax          the sum of all four pels
    // Both adds are lea instructions: the column sum gets its rounding bit
    // for free, and either sum can land in a register other than its
    // sources.  Every column sum is consumed twice, so consecutive column
    // sums alternate between ebx and ebp.
    //
    // The preamble does two preliminary loads plus passes 1 and 2 for
    // result 0 and pass 1 for result 1.  The body runs the basic pattern
    // six times.  The postamble does pass 3 for result 6 and passes 2 and 3
    // for result 7.
    //
    // Because ebp is needed as a working register, the row counter lives
    // on the stack for the duration of each row.
    //
    // Timing (original author's figures):
    //      8  preamble
    //     18  body
    //      5  postamble
    //     --  ------------
    //     31  per row  x 8 rows = 248
    //      5  initialize
    //      3  finalize
    //    ===  ============
    //    256  total cycles
    //----------------------------------------------------------------------//
  next_row:
    // preamble
    mov   cl, [esi]                 // upper pel 0
    xor   eax, eax
    mov   al, [esi+PITCH]           // lower pel 0
    xor   ebx, ebx
    mov   dl, [esi+1]               // upper pel 1
    add   eax, ecx                  // column sum 0, round still missing
    mov   bl, [esi+PITCH+1]         // lower pel 1
    inc   eax                       // column sum 0
    mov   cl, [esi+2]               // upper pel 2
    add   ebx, edx                  // column sum 1, round still missing
    mov   dl, [esi+PITCH+2]         // lower pel 2
    inc   ebx                       // column sum 1
    add   eax, ebx                  // full sum 0
    push  ebp                       // park the row counter on the stack
    mov   ebp, [edi+PITCH]          // heat the cache (loaded value is discarded)
    add   edi, PITCH                // advance destination at top of loop

    // body, result 1 in flight
    lea   ebp, [ecx+edx+1]          // column sum 2
    mov   cl, [esi+3]               // upper pel 3
    shr   eax, 2                    // value 0
    mov   dl, [esi+PITCH+3]         // lower pel 3
    mov   [edi], al                 // store value 0
    lea   eax, [ebx+ebp]            // full sum 1

    // body, result 2 in flight
    lea   ebx, [ecx+edx+1]          // column sum 3
    mov   cl, [esi+4]               // upper pel 4
    shr   eax, 2                    // value 1
    mov   dl, [esi+PITCH+4]         // lower pel 4
    mov   [edi+1], al               // store value 1
    lea   eax, [ebx+ebp]            // full sum 2

    // body, result 3 in flight
    lea   ebp, [ecx+edx+1]          // column sum 4
    mov   cl, [esi+5]               // upper pel 5
    shr   eax, 2                    // value 2
    mov   dl, [esi+PITCH+5]         // lower pel 5
    mov   [edi+2], al               // store value 2
    lea   eax, [ebx+ebp]            // full sum 3

    // body, result 4 in flight
    lea   ebx, [ecx+edx+1]          // column sum 5
    mov   cl, [esi+6]               // upper pel 6
    shr   eax, 2                    // value 3
    mov   dl, [esi+PITCH+6]         // lower pel 6
    mov   [edi+3], al               // store value 3
    lea   eax, [ebx+ebp]            // full sum 4

    // body, result 5 in flight
    lea   ebp, [ecx+edx+1]          // column sum 6
    mov   cl, [esi+7]               // upper pel 7
    shr   eax, 2                    // value 4
    mov   dl, [esi+PITCH+7]         // lower pel 7
    mov   [edi+4], al               // store value 4
    lea   eax, [ebx+ebp]            // full sum 5

    // body, result 6 in flight
    lea   ebx, [ecx+edx+1]          // column sum 7
    mov   cl, [esi+8]               // upper pel 8
    shr   eax, 2                    // value 5
    mov   dl, [esi+PITCH+8]         // lower pel 8
    mov   [edi+5], al               // store value 5
    lea   eax, [ebx+ebp]            // full sum 6

    // postamble
    shr   eax, 2                    // value 6
    lea   ebp, [ecx+edx+1]          // column sum 8
    mov   [edi+6], al               // store value 6
    add   esi, PITCH                // advance source to the next row
    lea   eax, [ebx+ebp]            // full sum 7
    pop   ebp                       // recover the row counter
    shr   eax, 2                    // value 7
    dec   ebp                       // count the row; sets ZF for the branch
    mov   [edi+7], al               // store value 7 (mov leaves flags intact)
    jne   next_row                  // another row to do?

    // restore registers and return
    pop   esi
    pop   edi
    pop   ebx
    pop   ebp
    ret
  } // end of asm
}
#pragma code_seg()
// end Interpolate_Half_Half()
//--------------------------------------------------------------------------//
/*
void Interpolate_Half_Half_C (U32 pRef, U32 pNewRef) { U8 * pSrc = (U8 *) pRef; U8 * pDst = (U8 *) pNewRef; int i, j;
for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH) for (j=0; j<8; j++) pDst[j] = (pSrc[j] + pSrc[j+1] + pSrc[PITCH+j] + pSrc[PITCH+j+1] + 2) >> 2; }
void Interpolate_Int_Half_C (U32 pRef, U32 pNewRef) { U8 * pSrc = (U8 *) pRef; U8 * pDst = (U8 *) pNewRef; int i, j;
for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH) for (j=0; j<8; j++) pDst[j] = (pSrc[j] + pSrc[PITCH+j] + 1) >> 1; }
void Interpolate_Half_Int_C (U32 pRef, U32 pNewRef) { U8 * pSrc = (U8 *) pRef; U8 * pDst = (U8 *) pNewRef; int i, j;
for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH) for (j=0; j<8; j++) pDst[j] = (pSrc[j] + pSrc[j+1] + 1) >> 1; } */
|