/* *************************************************************************
**    INTEL Corporation Proprietary Information
**
**    This listing is supplied under the terms of a license
**    agreement with INTEL Corporation and may not be copied
**    nor disclosed except in accordance with the terms of
**    that agreement.
**
**    Copyright (c) 1995 Intel Corporation.
**    All Rights Reserved.
**
** *************************************************************************
*/

//////////////////////////////////////////////////////////////////////////
// $Author: AGUPTA2 $
// $Date: 08 Mar 1996 16:46:18 $
// $Archive: S:\h26x\src\dec\d3halfmc.cpv $
// $Header: S:\h26x\src\dec\d3halfmc.cpv 1.15 08 Mar 1996 16:46:18 AGUPTA2 $
// $Log: S:\h26x\src\dec\d3halfmc.cpv $
//
//    Rev 1.15 08 Mar 1996 16:46:18 AGUPTA2
// Added pragma code_seg.
//
//
//    Rev 1.14 29 Jan 1996 17:53:56 RMCKENZX
// Completely re-wrote all 3 routines.  The loops no longer use pseudo
// SIMD logic and have been tightened to 256, 169, and 169 cycles
// for half-half, half-int, and int-half respectively.
//
//    Rev 1.13 19 Jan 1996 17:40:36 RMCKENZX
// fixed half-int so it will correctly round
//
//    Rev 1.12 19 Jan 1996 13:29:32 RHAZRA
// Fixed halfpixel prediction by bilinear interpolation in ASM code
//
//    Rev 1.11 27 Dec 1995 14:36:06 RMCKENZX
// Added copyright notice
//
//    Rev 1.10 09 Oct 1995 09:43:36 CZHU
// Fixed bug in (half,half) interpolation optimization
//
//    Rev 1.9 08 Oct 1995 13:40:14 CZHU
// Added C version of (half,half) and use it for now until we fix the bug
// in the optimized version
//
//    Rev 1.8 03 Oct 1995 15:06:30 CZHU
//
// Adding debug assistance
//
//    Rev 1.7 28 Sep 1995 15:32:22 CZHU
// Fixed bugs: mask off bits after shift
//
//    Rev 1.6 26 Sep 1995 11:13:36 CZHU
//
// Adjust pitch back to normal, and changed UINT to U32
//
//    Rev 1.5 25 Sep 1995 09:04:14 CZHU
// Added and cleaned some comments
//
//    Rev 1.4 22 Sep 1995 16:42:00 CZHU
//
// improve pairing
//
//    Rev 1.3 22 Sep 1995 15:59:48 CZHU
// finished first round of coding of half pel interpolation and tested
// with the standalone program
//
//    Rev 1.2 21 Sep 1995 16:56:28 CZHU
// Unit tested (half, int) case
//
//    Rev 1.1 21 Sep 1995 12:06:22 CZHU
// More development
//
//    Rev 1.0 20 Sep 1995 16:27:56 CZHU
// Initial revision.
//

// NOTE(review): U32 and PITCH are used by the routines below but are not
// defined in this file; presumably precomp.h supplies them -- confirm.
#include "precomp.h"

// Name for the register used as the stack frame base.  None of the code
// shown in this file references it; the routines address their arguments
// through esp directly.
#define FRAMEPOINTER esp

//Interpolat_Int_half interpolated the pels from the pRef block
|
|
//Write to pNewRef.
|
|
//Assumes that pRef area has been expanded
|
|
// Todo: Loop control and setup the stack for locals,CZHU,9/20/95
|
|
// preload output cache lines, 9/21
|
|
// Cache preload is no longer needed, 9/21/95
|
|
// Cycles count: 50*4 =200 cycles
|
|
|
|
#pragma code_seg("IACODE2")
|
|
__declspec(naked)
|
|
void Interpolate_Half_Int (U32 pRef, U32 pNewRef)
|
|
{
|
|
__asm {
|
|
push ebp
|
|
push ebx
|
|
push edi
|
|
push esi
|
|
|
|
mov esi, [esp+20] // pRef = esp + 4 pushes + ret
|
|
mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef
|
|
sub edi, PITCH // pre-decrement destination
|
|
mov ebp, 8 // loop counter
|
|
xor eax, eax // clear registers
|
|
xor ebx, ebx
|
|
xor ecx, ecx
|
|
xor edx, edx
|
|
|
|
//--------------------------------------------------------------------------//
|
|
//
|
|
// This loop is, basically, a 4 instruction, 2 cycle loop.
|
|
// It is 3-folded, meaning that it works on 3 results per each
|
|
// 2 cycle unit. It is 8-unrolled, meaning that it does 8 results
|
|
// (one block's row) per loop iteration. The basic calculations
|
|
// follow this pattern:
|
|
//
|
|
// pass-> 1 2 3
|
|
// cycle
|
|
// 1 load | | shift
|
|
// -----------------------
|
|
// 2 | add | store
|
|
//
|
|
// This assumes that the prior pell's value was loaded and
|
|
// preserved from the prior result's calculation. Therefore
|
|
// each result uses 2 registers -- one to load (and preserve)
|
|
// the right-hand pell, and the other (overwriting the previous
|
|
// result's stored pell value) to add into, shift, and store out
|
|
// of. The add is accomplished with the lea instruction, allowing
|
|
// a round bit to be added in without using a separate instruction.
|
|
//
|
|
// The preamble loads & adds for the first result, and loads
|
|
// for the second. The body executes the basic pattern six times.
|
|
// The postamble shifts and stores for the seventh result and
|
|
// adds, shifts, and stores for the eighth.
|
|
//
|
|
// Timing:
|
|
// 4 preamble (including bank conflict)
|
|
// 12 body
|
|
// 4 postamble
|
|
// ----------------
|
|
// 20 per loop
|
|
// x 8 loops
|
|
// ----------------
|
|
// 160 subtotal
|
|
// 6 initialize
|
|
// 3 finalize
|
|
// ================
|
|
// 169 total cycles
|
|
//--------------------------------------------------------------------------//
|
|
|
|
main_loop:
|
|
// preamble
|
|
mov al, 0[esi]
|
|
mov bl, 1[esi] // probable BANK CONFLICT
|
|
mov dl, 0[edi] // heat the cache
|
|
add edi, PITCH // increment destination at top
|
|
lea eax, [1+eax+ebx] // use a regular add in the preamble
|
|
mov cl, 2[esi]
|
|
|
|
// body (6 pels)
|
|
shr eax, 1
|
|
mov dl, 3[esi]
|
|
lea ebx, [ebx+ecx+1]
|
|
mov 0[edi], al
|
|
|
|
shr ebx, 1
|
|
mov al, 4[esi]
|
|
lea ecx, [ecx+edx+1]
|
|
mov 1[edi], bl
|
|
|
|
shr ecx, 1
|
|
mov bl, 5[esi]
|
|
lea edx, [edx+eax+1]
|
|
mov 2[edi], cl
|
|
|
|
shr edx, 1
|
|
mov cl, 6[esi]
|
|
lea eax, [eax+ebx+1]
|
|
mov 3[edi], dl
|
|
|
|
shr eax, 1
|
|
mov dl, 7[esi]
|
|
lea ebx, [ebx+ecx+1]
|
|
mov 4[edi], al
|
|
|
|
shr ebx, 1
|
|
mov al, 8[esi]
|
|
lea ecx, [ecx+edx+1]
|
|
mov 5[edi], bl
|
|
|
|
// postamble
|
|
shr ecx, 1
|
|
lea edx, [edx+eax+1]
|
|
shr edx, 1
|
|
mov 6[edi], cl
|
|
add esi, PITCH // increment source pointer
|
|
mov 7[edi], dl
|
|
dec ebp // loop counter
|
|
jne main_loop
|
|
|
|
// restore registers and return
|
|
pop esi
|
|
pop edi
|
|
pop ebx
|
|
pop ebp
|
|
ret
|
|
} //end of asm
|
|
}
|
|
// end Interpolate_Half_Int()
|
|
//--------------------------------------------------------------------------//
|
|
|
|
|
|
//--------------------------------------------------------------------------//
// Interpolate_Int_Half
//
// Interpolates the pels from the pRef block and writes them to pNewRef.
// Each output pel is the rounded average of a source pel and the pel
// directly below it:
//
//     dst[row][j] = (src[row][j] + src[row+1][j] + 1) >> 1
//
// for an 8x8 block, with consecutive rows PITCH bytes apart in both the
// source and the destination.  The block is processed one column per loop
// iteration, top to bottom.
//
//   pRef     address (passed as a U32) of the top-left source pel; rows
//            0..8 of each of the 8 columns are read, so one row below the
//            8x8 block must be readable.
//   pNewRef  address (passed as a U32) of the top-left destination pel;
//            rows 0..7 of each of the 8 columns are written.
//
// The function is naked: it pushes/pops ebp, ebx, edi and esi itself,
// reads its arguments relative to esp, and returns with a plain "ret"
// (the arguments are left on the stack for the caller).  eax, ecx, edx
// and the flags are not preserved.
//--------------------------------------------------------------------------//
__declspec(naked)
void Interpolate_Int_Half (U32 pRef, U32 pNewRef)
{
    __asm {
        push    ebp
        push    ebx
        push    edi
        push    esi

        mov     esi, [esp+20]           // pRef    = esp + 4 pushes + ret
        mov     edi, [esp+24]           // pNewRef = esp + 4 pushes + ret + pRef
        dec     edi                     // pre-decrement destination
        mov     ebp, 8                  // loop counter (columns)
        xor     eax, eax                // clear registers: only the low bytes
        xor     ebx, ebx                //  are loaded below, so the upper 24
        xor     ecx, ecx                //  bits must start (and stay) zero
        xor     edx, edx

//--------------------------------------------------------------------------//
//
//  This loop is, basically, a 4 instruction, 2 cycle loop.
//  It is 3-folded, meaning that it works on 3 results per each
//  2 cycle unit.  It is 8-unrolled, meaning that it does 8 results
//  (one block's column) per loop iteration.  The basic calculations
//  follow this pattern:
//
//         pass->   1        2        3
//     cycle
//       1        load  |        |  shift
//               -----------------------
//       2              |  add   |  store
//
//  This assumes that the prior pel's value was loaded and
//  preserved from the prior result's calculation.  Therefore
//  each result uses 2 registers -- one to load (and preserve)
//  the lower pel, and the other (overwriting the previous
//  result's stored pel value) to add into, shift, and store out
//  of.  The add is accomplished with the lea instruction, allowing
//  a round bit to be added in without using a separate instruction.
//
//  Every value left in eax..edx at the bottom of the loop is a pel or a
//  shifted sum (at most 255), so the upper 24 bits are still zero when
//  the next iteration reloads the low bytes.
//
//  The preamble loads & adds for the first result, and loads
//  for the second.  The body executes the basic pattern six times.
//  The postamble shifts and stores for the seventh result and
//  adds, shifts, and stores for the eighth.
//
//  Timing:
//        4     preamble (including bank conflict)
//       12     body
//        4     postamble
//      ----------------
//       20     per loop
//      x 8     loops
//      ----------------
//      160     subtotal
//        6     initialize
//        3     finalize
//      ================
//      169     total cycles
//--------------------------------------------------------------------------//

    main_loop:
        // preamble
        mov     al, [esi]               // pel in row 0
        mov     bl, PITCH[esi]          // pel in row 1 -- probable BANK CONFLICT
        mov     dl, [edi+1]             // heat the cache: edi is still one
                                        //  column behind, so touch the first
                                        //  byte that is about to be written
                                        //  (dl is reloaded before its next use)
        inc     edi                     // increment destination at top
        lea     eax, [1+eax+ebx]        // row 0 + row 1 + round
        mov     cl, [2*PITCH+esi]       // pel in row 2

        // body (6 pels)
        shr     eax, 1                  // result 0
        mov     dl, [3*PITCH+esi]       // pel in row 3
        lea     ebx, [ebx+ecx+1]        // row 1 + row 2 + round
        mov     [edi], al               // store result 0

        shr     ebx, 1                  // result 1
        mov     al, [4*PITCH+esi]       // pel in row 4
        lea     ecx, [ecx+edx+1]        // row 2 + row 3 + round
        mov     [PITCH+edi], bl         // store result 1

        shr     ecx, 1                  // result 2
        mov     bl, [5*PITCH+esi]       // pel in row 5
        lea     edx, [edx+eax+1]        // row 3 + row 4 + round
        mov     [2*PITCH+edi], cl       // store result 2

        shr     edx, 1                  // result 3
        mov     cl, [6*PITCH+esi]       // pel in row 6
        lea     eax, [eax+ebx+1]        // row 4 + row 5 + round
        mov     [3*PITCH+edi], dl       // store result 3

        shr     eax, 1                  // result 4
        mov     dl, [7*PITCH+esi]       // pel in row 7
        lea     ebx, [ebx+ecx+1]        // row 5 + row 6 + round
        mov     [4*PITCH+edi], al       // store result 4

        shr     ebx, 1                  // result 5
        mov     al, [8*PITCH+esi]       // pel in row 8 (one below the block)
        lea     ecx, [ecx+edx+1]        // row 6 + row 7 + round
        mov     [5*PITCH+edi], bl       // store result 5

        // postamble
        shr     ecx, 1                  // result 6
        lea     edx, [edx+eax+1]        // row 7 + row 8 + round
        shr     edx, 1                  // result 7
        mov     [6*PITCH+edi], cl       // store result 6
        inc     esi                     // increment source pointer (next column)
        mov     [7*PITCH+edi], dl       // store result 7
        dec     ebp                     // loop counter
        jne     main_loop

        // restore registers and return
        pop     esi
        pop     edi
        pop     ebx
        pop     ebp
        ret
    }   // end of asm
}
// end Interpolate_Int_Half()
//--------------------------------------------------------------------------//


//--------------------------------------------------------------------------//
// Interpolate_Half_Half
//
// Interpolates the pels from the pRef block and writes them to pNewRef.
// Each output pel is the rounded average of a 2x2 neighborhood:
//
//     dst[row][j] = (src[row][j]   + src[row][j+1] +
//                    src[row+1][j] + src[row+1][j+1] + 2) >> 2
//
// for 8 rows of 8 pels, with consecutive rows PITCH bytes apart in both
// the source and the destination.
//
//   pRef     address (passed as a U32) of the top-left source pel; bytes
//            0..8 of rows 0..8 are read, so one extra column and one
//            extra row beyond the 8x8 block must be readable.
//   pNewRef  address (passed as a U32) of the top-left destination pel;
//            bytes 0..7 of each of the 8 rows are written.
//
// The function is naked: it pushes/pops ebp, ebx, edi and esi itself,
// reads its arguments relative to esp, and returns with a plain "ret"
// (the arguments are left on the stack for the caller).  eax, ecx, edx
// and the flags are not preserved.
//--------------------------------------------------------------------------//
__declspec(naked)
void Interpolate_Half_Half (U32 pRef, U32 pNewRef)
{
    __asm {
        push    ebp
        push    ebx
        push    edi
        push    esi

        mov     esi, [esp+20]           // pRef    = esp + 4 pushes + ret
        mov     edi, [esp+24]           // pNewRef = esp + 4 pushes + ret + pRef
        mov     ebp, 8                  // loop counter (rows)
        sub     edi, PITCH              // pre-decrement destination pointer
        xor     ecx, ecx                // only cl and dl are ever loaded, so
        xor     edx, edx                //  the upper 24 bits stay zero

//--------------------------------------------------------------------------//
//
//  This loop is, basically, a 6 instruction, 3 cycle loop.
//  It is 3-folded, meaning that it works on 3 results per each
//  3 cycle unit.  It is 8-unrolled, meaning that it does 8 results
//  (one block's row) per loop iteration.  The basic calculations
//  follow this pattern:
//
//         pass->   1        2          3
//     cycle
//       1        load  | add left  |
//               ----------------------------
//       2        load  |           |  shift
//               ----------------------------
//       3              | add all   |  store
//
//  Five registers are used to preserve values from one pass to the next:
//     cl & dl       hold the last two pel values
//     ebp or ebx    holds the sum of the two left-hand pels + 1
//     eax           holds the sum of all four pels
//  Both adds are accomplished with the lea instruction.  For the sum
//  of the two left-hand pels, this allows a rounding bit to be added
//  in without using a separate instruction.  For both sums it allows
//  the result to be placed into a register independent of the sources'.
//  Since the sum of the two left-hand pels is used twice, it is placed
//  alternately into ebx and ebp.
//
//  The preamble does two preliminary loads plus passes 1 & 2 for the
//  first result, and pass 1 for the second.  The body executes the basic
//  pattern six times.  The postamble does pass 3 for the
//  seventh result and passes 2 & 3 for the eighth.
//
//  Due to the need for five registers, the loop counter is kept on
//  the stack.
//
//  Timing:
//        8     preamble
//       18     body
//        5     postamble
//      ----------------
//       31     per loop
//      x 8     loops
//      ----------------
//      248     subtotal
//        5     initialize
//        3     finalize
//      ================
//      256     total cycles
//--------------------------------------------------------------------------//

    main_loop:
        // preamble
        mov     cl, [esi]               // pel 0, upper row
        xor     eax, eax
        mov     al, [esi+PITCH]         // pel 0, lower row
        xor     ebx, ebx
        mov     dl, [esi+1]             // pel 1, upper row
        add     eax, ecx                // partial sum 0 sans round
        mov     bl, [esi+PITCH+1]       // pel 1, lower row
        inc     eax                     // partial sum 0
        mov     cl, [esi+2]             // pel 2, upper row
        add     ebx, edx                // partial sum 1 sans round
        mov     dl, [esi+PITCH+2]       // pel 2, lower row
        inc     ebx                     // partial sum 1
        add     eax, ebx                // full sum 0
        push    ebp                     // save loop counter on stack
        mov     ebp, [edi+PITCH]        // heat the cache (row about to be written)
        add     edi, PITCH              // increment dst. pointer at top of loop

        // body (x 6)
        lea     ebp, [ecx+edx+1]        // partial sum 2 with round
        mov     cl, [esi+3]             // pel 3
        shr     eax, 2                  // value 0
        mov     dl, [esi+PITCH+3]       // pel 3
        mov     [edi], al               // write value 0
        lea     eax, [ebx+ebp]          // full sum 1

        lea     ebx, [ecx+edx+1]        // partial sum 3 with round
        mov     cl, [esi+4]             // pel 4
        shr     eax, 2                  // value 1
        mov     dl, [esi+PITCH+4]       // pel 4
        mov     [edi+1], al             // write value 1
        lea     eax, [ebx+ebp]          // full sum 2

        lea     ebp, [ecx+edx+1]        // partial sum 4 with round
        mov     cl, [esi+5]             // pel 5
        shr     eax, 2                  // value 2
        mov     dl, [esi+PITCH+5]       // pel 5
        mov     [edi+2], al             // write value 2
        lea     eax, [ebx+ebp]          // full sum 3

        lea     ebx, [ecx+edx+1]        // partial sum 5 with round
        mov     cl, [esi+6]             // pel 6
        shr     eax, 2                  // value 3
        mov     dl, [esi+PITCH+6]       // pel 6
        mov     [edi+3], al             // write value 3
        lea     eax, [ebx+ebp]          // full sum 4

        lea     ebp, [ecx+edx+1]        // partial sum 6 with round
        mov     cl, [esi+7]             // pel 7
        shr     eax, 2                  // value 4
        mov     dl, [esi+PITCH+7]       // pel 7
        mov     [edi+4], al             // write value 4
        lea     eax, [ebx+ebp]          // full sum 5

        lea     ebx, [ecx+edx+1]        // partial sum 7 with round
        mov     cl, [esi+8]             // pel 8
        shr     eax, 2                  // value 5
        mov     dl, [esi+PITCH+8]       // pel 8
        mov     [edi+5], al             // write value 5
        lea     eax, [ebx+ebp]          // full sum 6

        // postamble
        shr     eax, 2                  // value 6
        lea     ebp, [ecx+edx+1]        // partial sum 8 with round
        mov     [edi+6], al             // write value 6
        add     esi, PITCH              // increment read pointer
        lea     eax, [ebx+ebp]          // full sum 7
        pop     ebp                     // restore loop counter
        shr     eax, 2                  // value 7
        dec     ebp                     // decrement loop counter
        mov     [edi+7], al             // write value 7 (mov leaves the flags
                                        //  from dec intact for the jne)
        jne     main_loop               // loop if not done

        // restore registers and return
        pop     esi
        pop     edi
        pop     ebx
        pop     ebp
        ret
    }   // end of asm
}
#pragma code_seg()
// end Interpolate_Half_Half()
//--------------------------------------------------------------------------//


/*
// Reference C versions of the three assembly routines (disabled).

void Interpolate_Half_Half_C (U32 pRef, U32 pNewRef)
{
    U8 * pSrc = (U8 *) pRef;
    U8 * pDst = (U8 *) pNewRef;
    int i, j;

    for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
        for (j=0; j<8; j++)
            pDst[j] = (pSrc[j] + pSrc[j+1] + pSrc[PITCH+j] + pSrc[PITCH+j+1] + 2) >> 2;
}

void Interpolate_Int_Half_C (U32 pRef, U32 pNewRef)
{
    U8 * pSrc = (U8 *) pRef;
    U8 * pDst = (U8 *) pNewRef;
    int i, j;

    for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
        for (j=0; j<8; j++)
            pDst[j] = (pSrc[j] + pSrc[PITCH+j] + 1) >> 1;
}

void Interpolate_Half_Int_C (U32 pRef, U32 pNewRef)
{
    U8 * pSrc = (U8 *) pRef;
    U8 * pDst = (U8 *) pNewRef;
    int i, j;

    for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
        for (j=0; j<8; j++)
            pDst[j] = (pSrc[j] + pSrc[j+1] + 1) >> 1;
}
*/