/* ************************************************************************* ** INTEL Corporation Proprietary Information ** ** This listing is supplied under the terms of a license ** agreement with INTEL Corporation and may not be copied ** nor disclosed except in accordance with the terms of ** that agreement. ** ** Copyright (c) 1995 Intel Corporation. ** All Rights Reserved. ** ** ************************************************************************* */ ////////////////////////////////////////////////////////////////////////// // $Author: AGUPTA2 $ // $Date: 08 Mar 1996 16:46:18 $ // $Archive: S:\h26x\src\dec\d3halfmc.cpv $ // $Header: S:\h26x\src\dec\d3halfmc.cpv 1.15 08 Mar 1996 16:46:18 AGUPTA2 $ // $Log: S:\h26x\src\dec\d3halfmc.cpv $ // // Rev 1.15 08 Mar 1996 16:46:18 AGUPTA2 // Added pragma code_seg. // // // Rev 1.14 29 Jan 1996 17:53:56 RMCKENZX // Completely re-wrote all 3 routines. The loops no longer use pseudo // SIMD logic and have been tightened to 256, 169, and 169 cycles // for half-half, half-int, and int-half respectively. // // Rev 1.13 19 Jan 1996 17:40:36 RMCKENZX // fixed half-int so it will correctly round // // Rev 1.12 19 Jan 1996 13:29:32 RHAZRA // Fixed halfpixel prediction by bilinear interpolation in ASM code // // Rev 1.11 27 Dec 1995 14:36:06 RMCKENZX // Added copyright notice // // Rev 1.10 09 Oct 1995 09:43:36 CZHU // Fixed bug in (half,half) interpolation optimization // // Rev 1.9 08 Oct 1995 13:40:14 CZHU // Added C version of (half,half) and use it for now until we fix the bug // in the optimized version // // Rev 1.8 03 Oct 1995 15:06:30 CZHU // // Adding debug assistance // // Rev 1.7 28 Sep 1995 15:32:22 CZHU // Fixed bugs mast off bits after shift // // Rev 1.6 26 Sep 1995 11:13:36 CZHU // // Adjust pitch back to normal, and changed UINT to U32 // // Rev 1.5 25 Sep 1995 09:04:14 CZHU // Added and cleaned some comments // // Rev 1.4 22 Sep 1995 16:42:00 CZHU // // improve pairing // // Rev 1.3 22 Sep 1995 15:59:48 CZHU // finished first around coding of half pel interpolation and tested // with the standalone program // // Rev 1.2 21 Sep 1995 16:56:28 CZHU // Unit tested (half, int) case // // Rev 1.1 21 Sep 1995 12:06:22 CZHU // More development // // Rev 1.0 20 Sep 1995 16:27:56 CZHU // Initial revision. // #include "precomp.h" #define FRAMEPOINTER esp //Interpolat_Int_half interpolated the pels from the pRef block //Write to pNewRef. //Assumes that pRef area has been expanded // Todo: Loop control and setup the stack for locals,CZHU,9/20/95 // preload output cache lines, 9/21 // Cache preload is no longer needed, 9/21/95 // Cycles count: 50*4 =200 cycles #pragma code_seg("IACODE2") __declspec(naked) void Interpolate_Half_Int (U32 pRef, U32 pNewRef) { __asm { push ebp push ebx push edi push esi mov esi, [esp+20] // pRef = esp + 4 pushes + ret mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef sub edi, PITCH // pre-decrement destination mov ebp, 8 // loop counter xor eax, eax // clear registers xor ebx, ebx xor ecx, ecx xor edx, edx //--------------------------------------------------------------------------// // // This loop is, basically, a 4 instruction, 2 cycle loop. // It is 3-folded, meaning that it works on 3 results per each // 2 cycle unit. It is 8-unrolled, meaning that it does 8 results // (one block's row) per loop iteration. The basic calculations // follow this pattern: // // pass-> 1 2 3 // cycle // 1 load | | shift // ----------------------- // 2 | add | store // // This assumes that the prior pell's value was loaded and // preserved from the prior result's calculation. Therefore // each result uses 2 registers -- one to load (and preserve) // the right-hand pell, and the other (overwriting the previous // result's stored pell value) to add into, shift, and store out // of. The add is accomplished with the lea instruction, allowing // a round bit to be added in without using a separate instruction. // // The preamble loads & adds for the first result, and loads // for the second. The body executes the basic pattern six times. // The postamble shifts and stores for the seventh result and // adds, shifts, and stores for the eighth. // // Timing: // 4 preamble (including bank conflict) // 12 body // 4 postamble // ---------------- // 20 per loop // x 8 loops // ---------------- // 160 subtotal // 6 initialize // 3 finalize // ================ // 169 total cycles //--------------------------------------------------------------------------// main_loop: // preamble mov al, 0[esi] mov bl, 1[esi] // probable BANK CONFLICT mov dl, 0[edi] // heat the cache add edi, PITCH // increment destination at top lea eax, [1+eax+ebx] // use a regular add in the preamble mov cl, 2[esi] // body (6 pels) shr eax, 1 mov dl, 3[esi] lea ebx, [ebx+ecx+1] mov 0[edi], al shr ebx, 1 mov al, 4[esi] lea ecx, [ecx+edx+1] mov 1[edi], bl shr ecx, 1 mov bl, 5[esi] lea edx, [edx+eax+1] mov 2[edi], cl shr edx, 1 mov cl, 6[esi] lea eax, [eax+ebx+1] mov 3[edi], dl shr eax, 1 mov dl, 7[esi] lea ebx, [ebx+ecx+1] mov 4[edi], al shr ebx, 1 mov al, 8[esi] lea ecx, [ecx+edx+1] mov 5[edi], bl // postamble shr ecx, 1 lea edx, [edx+eax+1] shr edx, 1 mov 6[edi], cl add esi, PITCH // increment source pointer mov 7[edi], dl dec ebp // loop counter jne main_loop // restore registers and return pop esi pop edi pop ebx pop ebp ret } //end of asm } // end Interpolate_Half_Int() //--------------------------------------------------------------------------// __declspec(naked) void Interpolate_Int_Half (U32 pRef, U32 pNewRef) { __asm { push ebp push ebx push edi push esi mov esi, [esp+20] // pRef = esp + 4 pushes + ret mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef dec edi // pre-decrement destination mov ebp, 8 // loop counter xor eax, eax // clear registers xor ebx, ebx xor ecx, ecx xor edx, edx //--------------------------------------------------------------------------// // // This loop is, basically, a 4 instruction, 2 cycle loop. // It is 3-folded, meaning that it works on 3 results per each // 2 cycle unit. It is 8-unrolled, meaning that it does 8 results // (one block's row) per loop iteration. The basic calculations // follow this pattern: // // pass-> 1 2 3 // cycle // 1 load | | shift // ----------------------- // 2 | add | store // // This assumes that the prior pell's value was loaded and // preserved from the prior result's calculation. Therefore // each result uses 2 registers -- one to load (and preserve) // the right-hand pell, and the other (overwriting the previous // result's stored pell value) to add into, shift, and store out // of. The add is accomplished with the lea instruction, allowing // a round bit to be added in without using a separate instruction. // // The preamble loads & adds for the first result, and loads // for the second. The body executes the basic pattern six times. // The postamble shifts and stores for the seventh result and // adds, shifts, and stores for the eighth. // // Timing: // 4 preamble (including bank conflict) // 12 body // 4 postamble // ---------------- // 20 per loop // x 8 loops // ---------------- // 160 subtotal // 6 initialize // 3 finalize // ================ // 169 total cycles //--------------------------------------------------------------------------// main_loop: // preamble mov al, [esi] mov bl, PITCH[esi] // probable BANK CONFLICT mov dl, [edi] // heat the cache inc edi // increment destination at top lea eax, [1+eax+ebx] // use a regular add in the preamble mov cl, [2*PITCH+esi] // body (6 pels) shr eax, 1 mov dl, [3*PITCH+esi] lea ebx, [ebx+ecx+1] mov [edi], al shr ebx, 1 mov al, [4*PITCH+esi] lea ecx, [ecx+edx+1] mov [PITCH+edi], bl shr ecx, 1 mov bl, [5*PITCH+esi] lea edx, [edx+eax+1] mov [2*PITCH+edi], cl shr edx, 1 mov cl, [6*PITCH+esi] lea eax, [eax+ebx+1] mov [3*PITCH+edi], dl shr eax, 1 mov dl, [7*PITCH+esi] lea ebx, [ebx+ecx+1] mov [4*PITCH+edi], al shr ebx, 1 mov al, [8*PITCH+esi] lea ecx, [ecx+edx+1] mov [5*PITCH+edi], bl // postamble shr ecx, 1 lea edx, [edx+eax+1] shr edx, 1 mov [6*PITCH+edi], cl inc esi // increment source pointer mov [7*PITCH+edi], dl dec ebp // loop counter jne main_loop // restore registers and return pop esi pop edi pop ebx pop ebp ret } // end of asm } // end Interpolate_Int_Half() //--------------------------------------------------------------------------// __declspec(naked) void Interpolate_Half_Half (U32 pRef, U32 pNewRef) { __asm { push ebp push ebx push edi push esi mov esi, [esp+20] // pRef = esp + 4 pushes + ret mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef mov ebp, 8 // loop counter sub edi, PITCH // pre-decrement destination pointer xor ecx, ecx xor edx, edx //--------------------------------------------------------------------------// // // This loop is, basically, a 6 instruction, 3 cycle loop. // It is 3-folded, meaning that it works on 3 results per each // 3 cycle unit. It is 8-unrolled, meaning that it does 8 results // (one block's row) per loop iteration. The basic calculations // follow this pattern: // // pass-> 1 2 3 // cycle // 1 load | add left | // ---------------------------- // 2 load | | shift // ---------------------------- // 3 | add all | store // // Five registers are used to preserve values from one pass to the next: // cl & dl hold the last two pell values // ebp or ebx holds the sum of the two left-hand pells + 1 // eax holds the sum of all four pells // Both adds are accomplished with the lea instruction. For the sum // of the two left-hand pells, this allows a rounding bit to be added // in without using a separate instruction. For both sums it allows // the result to be placed into a register independent of the sources'. // Since the sum of the two left-hand pells is used twice, it is place // alternately into ebx and ebp. // // The preamble does two preliminary loads plus passes 1 & 2 for the // first result, and pass 1 for the second. The body executes the basic // pattern six times. The postamble does pass 3 for the // seventh result and passes 2 & 3 for the eighth. // // Due to the need for five registers, the loop counter is kept on // the stack. // // Timing: // 8 preamble // 18 body // 5 postamble // ---------------- // 31 per loop // x 8 loops // ---------------- // 248 subtotal // 5 initialize // 3 finalize // ================ // 256 total cycles //--------------------------------------------------------------------------// main_loop: // preamble mov cl, [esi] // pell 0 xor eax, eax mov al, [esi+PITCH] // pell 0 xor ebx, ebx mov dl, [esi+1] // pell 1 add eax, ecx // partial sum 0 sans round mov bl, [esi+PITCH+1] // pell 1 inc eax // partial sum 0 mov cl, [esi+2] // pell 2 add ebx, edx // partial sum 1 sans round mov dl, [esi+PITCH+2] // pell 2 inc ebx // partial sum 1 add eax, ebx // full sum 0 push ebp // save loop counter on stack mov ebp, [edi+PITCH] // heat the cache add edi, PITCH // increment dst. pointer at top of loop // body (x 6) lea ebp, [ecx+edx+1] // partial sum 2 with round mov cl, [esi+3] // pell 3 shr eax, 2 // value 0 mov dl, [esi+PITCH+3] // pell 3 mov [edi], al // write value 0 lea eax, [ebx+ebp] // full sum 1 lea ebx, [ecx+edx+1] // partial sum 3 with round mov cl, [esi+4] // pell 4 shr eax, 2 // value 1 mov dl, [esi+PITCH+4] // pell 4 mov [edi+1], al // write value 1 lea eax, [ebx+ebp] // full sum 2 lea ebp, [ecx+edx+1] // partial sum 4 with round mov cl, [esi+5] // pell 5 shr eax, 2 // value 2 mov dl, [esi+PITCH+5] // pell 5 mov [edi+2], al // write value 2 lea eax, [ebx+ebp] // full sum 3 lea ebx, [ecx+edx+1] // partial sum 5 with round mov cl, [esi+6] // pell 6 shr eax, 2 // value 3 mov dl, [esi+PITCH+6] // pell 6 mov [edi+3], al // write value 3 lea eax, [ebx+ebp] // full sum 4 lea ebp, [ecx+edx+1] // partial sum 6 with round mov cl, [esi+7] // pell 7 shr eax, 2 // value 4 mov dl, [esi+PITCH+7] // pell 7 mov [edi+4], al // write value 4 lea eax, [ebx+ebp] // full sum 5 lea ebx, [ecx+edx+1] // partial sum 7 with round mov cl, [esi+8] // pell 8 shr eax, 2 // value 5 mov dl, [esi+PITCH+8] // pell 8 mov [edi+5], al // write value 5 lea eax, [ebx+ebp] // full sum 6 // postamble shr eax, 2 // value 6 lea ebp, [ecx+edx+1] // partial sum 8 with round mov [edi+6], al // write value 6 add esi, PITCH // increment read pointer lea eax, [ebx+ebp] // full sum 7 pop ebp // restore loop counter shr eax, 2 // value 7 dec ebp // decrement loop counter mov [edi+7], al // write value 7 jne main_loop // loop if not done // restore registers and return pop esi pop edi pop ebx pop ebp ret } //end of asm } #pragma code_seg() // end Interpolate_Half_Half() //--------------------------------------------------------------------------// /* void Interpolate_Half_Half_C (U32 pRef, U32 pNewRef) { U8 * pSrc = (U8 *) pRef; U8 * pDst = (U8 *) pNewRef; int i, j; for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH) for (j=0; j<8; j++) pDst[j] = (pSrc[j] + pSrc[j+1] + pSrc[PITCH+j] + pSrc[PITCH+j+1] + 2) >> 2; } void Interpolate_Int_Half_C (U32 pRef, U32 pNewRef) { U8 * pSrc = (U8 *) pRef; U8 * pDst = (U8 *) pNewRef; int i, j; for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH) for (j=0; j<8; j++) pDst[j] = (pSrc[j] + pSrc[PITCH+j] + 1) >> 1; } void Interpolate_Half_Int_C (U32 pRef, U32 pNewRef) { U8 * pSrc = (U8 *) pRef; U8 * pDst = (U8 *) pNewRef; int i, j; for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH) for (j=0; j<8; j++) pDst[j] = (pSrc[j] + pSrc[j+1] + 1) >> 1; } */