You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1515 lines
72 KiB
1515 lines
72 KiB
/* *************************************************************************
|
|
** INTEL Corporation Proprietary Information
|
|
**
|
|
** This listing is supplied under the terms of a license
|
|
** agreement with INTEL Corporation and may not be copied
|
|
** nor disclosed except in accordance with the terms of
|
|
** that agreement.
|
|
**
|
|
** Copyright (c) 1995-1996 Intel Corporation.
|
|
** All Rights Reserved.
|
|
**
|
|
** *************************************************************************
|
|
*/
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// $Author: MBODART $
|
|
// $Date: 05 Aug 1996 11:03:52 $
|
|
// $Archive: S:\h26x\src\dec\d1idct.cpv $
|
|
// $Header: S:\h26x\src\dec\d1idct.cpv 1.0 05 Aug 1996 11:03:52 MBODART $
|
|
// $Log: S:\h26x\src\dec\d1idct.cpv $
|
|
//
|
|
// Rev 1.0 05 Aug 1996 11:03:52 MBODART
|
|
// Initial revision.
|
|
//
|
|
// Started from d3idct.cpp
|
|
//
|
|
// Rev 1.8 08 Mar 1996 16:46:20 AGUPTA2
|
|
// Added pragma code_seg. Rolled the initialization code. Got rid of most
|
|
// of 32-bit displacements in instructions. Aligned frequently executed loops
|
|
// at 4-byte boundary. Made changes to reflect new size of MapMatrix. Removed
|
|
// nop instructions. Deleted code that prefetches output lines in case of
|
|
// INTRA blocks. Use ClampTbl instead of ClipPixIntra. Do not clip output
|
|
// of INTER blocks; clipping is done in dxblkadd().
|
|
//
|
|
//
|
|
//Block level decoding for H.261 decoder
|
|
|
|
#include "precomp.h"
|
|
|
|
/////////////////////////////////////////////////////////////////////////
|
|
// Decode each none-empty block
|
|
// Input: lpInst: decoder instance,
|
|
// lpSrc: input bitstream,
|
|
// lpBlockAction:
|
|
// the pointer to the block action stream structure
|
|
// bitsread: number of bits in the buffer already,
|
|
/////////////////////////////////////////////////////////////////////////
|
|
|
|
// local variable definitions
|
|
#define FRAMEPOINTER esp
|
|
#define L_BITSUSED FRAMEPOINTER + 0 // 4 byte
|
|
#define L_ACCUM L_BITSUSED + 4 //
|
|
#define L_DESTBLOCK L_ACCUM + 256 //64 DWORD
|
|
#define L_NO_COEFF L_DESTBLOCK + 4
|
|
#define L_PRODUCT L_NO_COEFF + 4
|
|
|
|
#define L_LOOPCOUNTER L_PRODUCT + 80 //20 DWORD
|
|
#define L_INPUT_INTER L_LOOPCOUNTER + 4
|
|
#define L_esi L_INPUT_INTER + 4
|
|
|
|
#define L_DESTBLOCK_1 L_esi + 4 // akk
|
|
#define L_DESTBLOCK_2 L_DESTBLOCK_1 + 4 // akk
|
|
|
|
#ifdef PTEL_WORK_AROUND
|
|
#define L_COEFFCOUNT L_DESTBLOCK_1 + 4 //akk
|
|
#define L_COEFFVALUE L_COEFFCOUNT + 4 //akk
|
|
#endif
|
|
|
|
#define L_END_OF_FRAME FRAMEPOINTER + 512
|
|
#define LOCALSIZE ((512+3)&~3) // keep aligned
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Input:
|
|
// pIQ_INDEX, pointer to pointer for Inverse quantization and index
|
|
// for the current block.
|
|
// No_Coeff, A 32 bit number indicate block types, etc.
|
|
// 0--63, inter block, number of coeff
|
|
// 64--127 64+ intra block, number of coeff
|
|
// pIntraBuf, Buffer pointer for intra blocks.
|
|
//
|
|
// pInterBuf, Buffer pointer for inter blocks.
|
|
//
|
|
//
|
|
// return:
|
|
//
|
|
//////////////////////////////////////////////////////////////////////////////////
|
|
#pragma code_seg("IACODE2")
|
|
__declspec(naked)
|
|
U32 DecodeBlock_IDCT ( U32 pIQ_INDEX,
|
|
U32 No_Coeff,
|
|
U32 pIntraBuf,
|
|
U32 pInterBuf)
|
|
{
|
|
__asm
|
|
{
|
|
push ebp // save callers frame pointer
|
|
mov ebp, esp // make parameters accessible
|
|
push esi // assumed preserved
|
|
push edi
|
|
push ebx
|
|
sub esp, LOCALSIZE // reserve local storage
|
|
mov eax, pInterBuf
|
|
lea edi, [L_ACCUM+128]
|
|
mov [L_INPUT_INTER], eax
|
|
;add edi, 128 // Adjust offset to save code space
|
|
mov edx, No_Coeff
|
|
;
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
// Initialize accumulators for IDCT
|
|
// ROUNDER was pre-computed.
|
|
//
|
|
// C code:
|
|
//
|
|
// for (x=0; x<16; x++)
|
|
// acc[x] = rounder;
|
|
// for (x=16; x<64; x++)
|
|
// acc[x] = 0L;
|
|
//
|
|
mov esi, [edi-128] ; pre-fetch accumulators
|
|
mov ebx, [edi-96] ; pre-fetch
|
|
mov esi, [edi-64] ; pre-fetch more
|
|
mov ebx, [edi-32] ; pre-fetch more
|
|
mov esi, [edi] ; pre-fetch more
|
|
mov ebx, [edi+32] ; pre-fetch more
|
|
mov esi, [edi+64] ; pre-fetch more
|
|
mov ebx, [edi+96] ; pre-fetch more
|
|
xor esi, esi
|
|
|
|
sub edi, 128
|
|
mov eax, ROUNDER
|
|
mov ebx, 64
|
|
loop_for_init:
|
|
mov [edi], eax
|
|
mov [edi+4], eax
|
|
mov [edi+ebx], esi
|
|
mov [edi+ebx+4], esi
|
|
mov [edi+ebx+8], esi
|
|
mov [edi+ebx+12], esi
|
|
mov [edi+ebx+16], esi
|
|
mov [edi+ebx+20], esi
|
|
add edi, 8
|
|
add ebx, 16
|
|
cmp ebx, 192
|
|
jl loop_for_init
|
|
|
|
//end of IDCT init.
|
|
|
|
#ifdef PTEL_WORK_AROUND
|
|
mov [L_COEFFCOUNT], esi // zero out coefficient counter
|
|
mov [L_COEFFVALUE], esi // zero out coefficient value
|
|
#endif
|
|
|
|
cmp edx, 65
|
|
jg intra_block
|
|
|
|
mov ebx, pInterBuf
|
|
jmp pre_acc_loop
|
|
|
|
intra_block:
|
|
mov ebx, pIntraBuf
|
|
sub edx, 65
|
|
|
|
// register:
|
|
// ebp: loop counter
|
|
// ebx: inverse quant
|
|
// ecx: index [0,63]
|
|
|
|
pre_acc_loop:
|
|
mov esi, pIQ_INDEX
|
|
mov [L_DESTBLOCK], ebx
|
|
mov [L_esi], esi
|
|
|
|
ALIGN 4
|
|
acc_loop:
|
|
mov ebx,[esi+edx*8-8] //Invserse Quant
|
|
mov ecx,[esi+edx*8-4] //Coeff index
|
|
mov [L_NO_COEFF], edx
|
|
call idct_acc
|
|
mov esi, [L_esi]
|
|
mov edx, [L_NO_COEFF]
|
|
dec edx
|
|
jnz acc_loop
|
|
|
|
mov edx, [L_DESTBLOCK]
|
|
mov ecx, [L_INPUT_INTER]
|
|
cmp edx, ecx
|
|
jnz call_intra_bfly
|
|
|
|
call idct_bfly_inter
|
|
|
|
add esp, LOCALSIZE // free locals
|
|
add eax, edi
|
|
pop ebx
|
|
pop edi
|
|
pop esi
|
|
pop ebp
|
|
ret
|
|
|
|
|
|
call_intra_bfly:
|
|
call idct_bfly_intra
|
|
|
|
add esp, LOCALSIZE // free locals
|
|
add eax, edi
|
|
pop ebx
|
|
pop edi
|
|
pop esi
|
|
pop ebp
|
|
ret
|
|
|
|
///////////////////////////////////////////////////////////////
|
|
// This "subroutine" idct_acc performs the accumulator phase of
|
|
// the fmidct.
|
|
//
|
|
// assume parameter passed in by registers
|
|
// ebx, inversed quantized value, input
|
|
// ecx, index [0,63]
|
|
//
|
|
// C code:
|
|
//
|
|
// for (i=0; i<NUM_ELEM; i++) // Loop through each input
|
|
// {
|
|
// if (input[i])
|
|
// {
|
|
// pNKernel = &NKernel[i]; // initialize kernel pointer
|
|
// totalU = pNKernel->totalUnique;
|
|
// for (x=0; x<totalU; x++) // compute positive and negative products
|
|
// {
|
|
// product[x] = input[i] * pNKernel->coeff[x];
|
|
// product[x+totalU] = -product[x];
|
|
// }
|
|
// // Loop through each entry in the output matrix
|
|
// acc[pNKernel->PClass] += product[ pNKernel->matrix[0] ];
|
|
// acc[1+pNKernel->PClass] += product[ pNKernel->matrix[1] ];
|
|
// acc[2+pNKernel->PClass] += product[ pNKernel->matrix[2] ];
|
|
// acc[3+pNKernel->PClass] += product[ pNKernel->matrix[3] ];
|
|
// acc[4+pNKernel->PClass] += product[ pNKernel->matrix[4] ];
|
|
// acc[5+pNKernel->PClass] += product[ pNKernel->matrix[5] ];
|
|
// acc[6+pNKernel->PClass] += product[ pNKernel->matrix[6] ];
|
|
// acc[7+pNKernel->PClass] += product[ pNKernel->matrix[7] ];
|
|
// acc[8+pNKernel->PClass] += product[ pNKernel->matrix[8] ];
|
|
// acc[9+pNKernel->PClass] += product[ pNKernel->matrix[9] ];
|
|
// acc[10+pNKernel->PClass] += product[ pNKernel->matrix[10] ];
|
|
// acc[11+pNKernel->PClass] += product[ pNKernel->matrix[11] ];
|
|
// acc[12+pNKernel->PClass] += product[ pNKernel->matrix[12] ];
|
|
// acc[13+pNKernel->PClass] += product[ pNKernel->matrix[13] ];
|
|
// acc[14+pNKernel->PClass] += product[ pNKernel->matrix[14] ];
|
|
// acc[15+pNKernel->PClass] += product[ pNKernel->matrix[15] ];
|
|
// }
|
|
// }
|
|
///////////////////////////////////////////////////////////////
|
|
// assume parameter passed in by registers
|
|
// ebx, inverse quant
|
|
// ecx, index [0,63]
|
|
idct_acc:
|
|
|
|
; For every non-zero coefficient:
|
|
; LoopCounter, on local stack, has index
|
|
; ecx = index (0-63)
|
|
; ebx = non-zero input
|
|
; Note i = index
|
|
;
|
|
#ifdef PTEL_WORK_AROUND
|
|
mov edx, [L_COEFFCOUNT+4] ; get coefficient counter
|
|
mov [L_COEFFVALUE+4], ebx ; store coefficient value
|
|
inc edx
|
|
;
|
|
mov [L_COEFFCOUNT+4], edx ; store updated coefficient counter
|
|
;
|
|
#endif
|
|
and ecx, 03fh ; Chad added to prevent GPF
|
|
xor edx, edx ; zero out for byte read, use as dword
|
|
mov [L_LOOPCOUNTER+4], ecx ; Store Loop counter
|
|
mov esi, ecx ; move index to esi
|
|
lea eax, Unique ; eax = Address of Unique[0]
|
|
mov ebp, ecx ; move index to ebp
|
|
shl esi, 3 ; index*8
|
|
add ecx, ecx ; index*2
|
|
add esi, ecx ; index*10
|
|
lea ecx, KernelCoeff ; get KernelCoeff[0][0]
|
|
lea edi, [L_PRODUCT+4] ; edi = address of product[0]
|
|
mov dl, [eax+ebp] ; get Unique[i]
|
|
lea esi, [ecx+4*esi] ; address of KernelCoeff[i][0]
|
|
mov ebp, edx ; ebp = Unique[i]
|
|
lea eax, [edi+edx*4] ; eax = address of product[totalU]
|
|
;nop
|
|
|
|
; ----------------------------------------------------------------------
|
|
|
|
; Register usage
|
|
; eax = addr of product[Unique[i]]
|
|
; ebx = input[i]
|
|
; ecx = 0, -product[x]
|
|
; edx = KernelCoeff[i][x], product[x]= KernelCoeff[i][x] * input[i]
|
|
; ebp = x
|
|
; edi = addr of product[0]
|
|
; esi = addr of KernelCoeff[i][x]
|
|
|
|
ALIGN 4
|
|
loop_for_x:
|
|
xor ecx, ecx
|
|
mov edx, [esi+ebp*4-4] ; read KernelCoeff[i][x]
|
|
imul edx, ebx ; KernelCoeff[i][x] * input[i]
|
|
mov [edi+ebp*4-4], edx ; product[x] = result of imul
|
|
sub ecx, edx
|
|
mov [eax+ebp*4-4], ecx ; product[totalU+x] = -product[x]
|
|
dec ebp ; decrement x
|
|
jnz loop_for_x
|
|
|
|
; ----------------------------------------------------------------------
|
|
|
|
; Register usage
|
|
; eax = MapMatrix[i][0-15]
|
|
; ebx = address of PClass[0], accum[PClass[i]]
|
|
; ecx = LoopCounter, addr of MapMatrix[i][0]
|
|
; edx = [0-15]+PClass[i], accum[[0-15]+PClass[i]]
|
|
; ebp = product[MapMatrix[i][0-15]]
|
|
; edi = addr of product[0]
|
|
; esi = address of accum[0], address of accum[PClass[i]]
|
|
|
|
mov ecx, [L_LOOPCOUNTER+4] ; get i
|
|
and ecx, 03fh ; Chad added to prevent GPF
|
|
lea ebx, PClass ; get addr of PClass[0]
|
|
mov esi, ecx ; save i in esi
|
|
shl ecx, 4 ; i*16
|
|
lea eax, MapMatrix ; get addr of MapMatrix[0][0]
|
|
xor edx, edx
|
|
nop
|
|
mov dl, [ebx+esi] ; get PClass[i]
|
|
lea ecx, [eax+ecx] ; get addr of MapMatrix[i][0]
|
|
shl edx, 2 ; PClass[i]*4
|
|
lea esi, [L_ACCUM+4] ; get addr of accum[0]
|
|
; ----------------------------------------------------------------------
|
|
xor eax, eax
|
|
add esi, edx ; esi = address of accum[PClass[i]]
|
|
mov al, [ecx] ; get MapMatrix[i][0]
|
|
nop
|
|
;nop
|
|
mov ebx, [esi] ; get accum[PClass[i]]
|
|
nop
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][0]]
|
|
mov al, [ecx+1] ; get MapMatrix[i][1]
|
|
add ebx, ebp ; accum[PClass[i]] += product[
|
|
; MapMatrix[i][0]]
|
|
mov edx, [esi+4] ; get accum[1+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][1]]
|
|
mov al, [ecx+2] ; get MapMatrix[i][2]
|
|
add edx, ebp ; accum[1+PClass[i]] += product[
|
|
; MapMatrix[i][1]]
|
|
mov [esi], ebx ; store accum[PClass[i]] += product[
|
|
; MapMatrix[i][0]]
|
|
mov [esi+4], edx ; store accum[1+PClass[i]] +=
|
|
; product[MapMatrix[i][1]]
|
|
mov ebx, [esi+8] ; get accum[2+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][2]]
|
|
mov al, [ecx+3] ; get MapMatrix[i][3]
|
|
add ebx, ebp ; accum[2+PClass[i]] += product[
|
|
; MapMatrix[i][2]]
|
|
mov edx, [esi+12] ; get accum[3+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][3]]
|
|
mov al, [ecx+4] ; get MapMatrix[i][4]
|
|
add edx, ebp ; accum[3+PClass[i]] += product[
|
|
; MapMatrix[i][3]]
|
|
mov [esi+8], ebx ; store accum[2+PClass[i]] +=
|
|
; product[MapMatrix[i][2]]
|
|
mov [esi+12], edx ; store accum[3+PClass[i]] +=
|
|
; product[MapMatrix[i][3]]
|
|
|
|
; ----------------------------------------------------------------------
|
|
mov ebx, [esi+16] ; get accum[4+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][4]]
|
|
mov al, [ecx+5] ; get MapMatrix[i][5]
|
|
add ebx, ebp ; accum[4+PClass[i]] += product[
|
|
; MapMatrix[i][4]]
|
|
mov edx, [esi+20] ; get accum[5+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][5]]
|
|
mov al, [ecx+6] ; get MapMatrix[i][6]
|
|
add edx, ebp ; accum[5+pNkernel->PClass] += product[
|
|
; MapMatrix[i][5]]
|
|
mov [esi+16], ebx ; store accum[4+PClass[i]] +=
|
|
; product[MapMatrix[i][4]]
|
|
mov [esi+20], edx ; store accum[5+PClass[i]] +=
|
|
; product[MapMatrix[i][5]]
|
|
mov ebx, [esi+24] ; get accum[6+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][6]]
|
|
mov al, [ecx+7] ; get MapMatrix[i][7]
|
|
add ebx, ebp
|
|
mov edx, [esi+28] ; get accum[7+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][7]]
|
|
mov al, [ecx+8] ; get MapMatrix[i][8]
|
|
add edx, ebp ; accum[7+PClass[i]] += product[
|
|
; MapMatrix[i][7]]
|
|
mov [esi+24], ebx ; store accum[6+PClass[i]] +=
|
|
; product[MapMatrix[i][6]]
|
|
mov [esi+28], edx ; store accum[7+PClass[i]] +=
|
|
; product[MapMatrix[i][7]]
|
|
|
|
; ----------------------------------------------------------------------
|
|
mov ebx, [esi+32] ; get accum[8+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][8]]
|
|
mov al, [ecx+9] ; get MapMatrix[i][9]
|
|
add ebx, ebp ; accum[8+PClass[i]] += product[
|
|
; MapMatrix[i][8]]
|
|
mov edx, [esi+36] ; get accum[9+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][9]]
|
|
mov al, [ecx+10] ; get MapMatrix[i][10]
|
|
add edx, ebp ; accum[9+pNkernel->PClass] += product[
|
|
; MapMatrix[i][9]]
|
|
mov [esi+32], ebx ; store accum[8+PClass[i]] +=
|
|
; product[MapMatrix[i][8]]
|
|
mov [esi+36], edx ; store accum[9+PClass[i]] +=
|
|
; product[MapMatrix[i][9]]
|
|
mov ebx, [esi+40] ; get accum[10+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][10]]
|
|
mov al, [ecx+11] ; get MapMatrix[i][11]
|
|
add ebx, ebp
|
|
mov edx, [esi+44] ; get accum[11+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][11]]
|
|
; product[MapMatrix[i][11]]
|
|
mov al, [ecx+12] ; get MapMatrix[i][12]
|
|
add edx, ebp ; accum[11+PClass[i]] += product[
|
|
; MapMatrix[i][11]]
|
|
mov [esi+40], ebx ; store accum[10+PClass[i]] +=
|
|
; product[MapMatrix[i][10]]
|
|
mov [esi+44], edx ; store accum[11+PClass[i]] +=
|
|
; product[MapMatrix[i][11]]
|
|
; ----------------------------------------------------------------------
|
|
mov ebx, [esi+48] ; get accum[12+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][12]]
|
|
mov al, [ecx+13] ; get MapMatrix[i][13]
|
|
add ebx, ebp ; accum[12+PClass[i]] += product[
|
|
; MapMatrix[i][12]]
|
|
mov edx, [esi+52] ; get accum[13+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][13]]
|
|
mov al, [ecx+14] ; get MapMatrix[i][14]
|
|
add edx, ebp ; accum[13+pNkernel->PClass] += product[
|
|
; MapMatrix[i][13]]
|
|
mov [esi+48], ebx ; store accum[PClass[i]] += product[
|
|
; MapMatrix[i][13]]
|
|
mov [esi+52], edx ; store accum[13+PClass[i]] +=
|
|
; product[MapMatrix[i][13]]
|
|
mov ebx, [esi+56] ; get accum[14+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][14]]
|
|
mov al, [ecx+15] ; get MapMatrix[i][15]
|
|
add ebx, ebp
|
|
mov edx, [esi+60] ; get accum[15+PClass[i]]
|
|
mov ebp, [edi+eax*4] ; get product[MapMatrix[i][15]]
|
|
mov [esi+56], ebx ; store accum[14+PClass[i]] +=
|
|
; product[MapMatrix[i][14]]
|
|
add edx, ebp ; accum[15+PClass[i]] += product[
|
|
; MapMatrix[i][15]]
|
|
mov [esi+60], edx ; store accum[15+PClass[i]] +=
|
|
; product[MapMatrix[i][15]]
|
|
ret
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// This "subroutine" idct_bfly_intra performs the butterfly phase of
|
|
// the fmidct for intra blocks.
|
|
//
|
|
// assume parameters passed in by registers
|
|
//
|
|
// C code:
|
|
//
|
|
// Upper Left Quadrant
|
|
// Upper Right Quadrant
|
|
// Lower Left Quadrant
|
|
// Lower Right Quadrant
|
|
//
|
|
// lOut[0][0] = CLIP_INTRA[acc[0]+acc[16] + acc[32]+acc[48]];
|
|
// lOut[0][7] = CLIP_INTRA[acc[0]+acc[16] - (acc[32]+acc[48])];
|
|
// lOut[7][0] = CLIP_INTRA[(acc[0]-acc[16]) + (acc[32]-acc[48])];
|
|
// lOut[7][7] = CLIP_INTRA[(acc[0]-acc[16]) - (acc[32]-acc[48])];
|
|
//
|
|
// lOut[0][1] = CLIP_INTRA[acc[1]+acc[17] + acc[33]+acc[49]];
|
|
// lOut[0][6] = CLIP_INTRA[acc[1]+acc[17] - (acc[33]+acc[49])];
|
|
// lOut[7][1] = CLIP_INTRA[(acc[1]-acc[17]) + (acc[33]-acc[49])];
|
|
// lOut[7][6] = CLIP_INTRA[(acc[1]-acc[17]) - (acc[33]-acc[49])];
|
|
//
|
|
// lOut[0][2] = CLIP_INTRA[acc[2]+acc[18] + acc[34]+acc[50]];
|
|
// lOut[0][5] = CLIP_INTRA[acc[2]+acc[18] - (acc[34]+acc[50])];
|
|
// lOut[7][2] = CLIP_INTRA[(acc[2]-acc[18]) + (acc[34]-acc[50])];
|
|
// lOut[7][5] = CLIP_INTRA[(acc[2]-acc[18]) - (acc[34]-acc[50])];
|
|
//
|
|
// lOut[0][3] = CLIP_INTRA[acc[3]+acc[19] + acc[35]+acc[51]];
|
|
// lOut[0][4] = CLIP_INTRA[acc[3]+acc[19] - (acc[35]+acc[51])];
|
|
// lOut[7][3] = CLIP_INTRA[(acc[3]-acc[19]) + (acc[35]-acc[51])];
|
|
// lOut[7][4] = CLIP_INTRA[(acc[3]-acc[19]) - (acc[35]-acc[51])];
|
|
//
|
|
//
|
|
// lOut[1][0] = CLIP_INTRA[acc[4]+acc[20] + acc[36]+acc[52]];
|
|
// lOut[1][7] = CLIP_INTRA[acc[4]+acc[20] - (acc[36]+acc[52])];
|
|
// lOut[6][0] = CLIP_INTRA[(acc[4]-acc[20]) + (acc[36]-acc[52])];
|
|
// lOut[6][7] = CLIP_INTRA[(acc[4]-acc[20]) - (acc[36]-acc[52])];
|
|
//
|
|
// lOut[1][1] = CLIP_INTRA[acc[5]+acc[21] + acc[37]+acc[53]];
|
|
// lOut[1][6] = CLIP_INTRA[acc[5]+acc[21] - (acc[37]+acc[53])];
|
|
// lOut[6][1] = CLIP_INTRA[(acc[5]-acc[21]) + (acc[37]-acc[53])];
|
|
// lOut[6][6] = CLIP_INTRA[(acc[5]-acc[21]) - (acc[37]-acc[53])];
|
|
//
|
|
// lOut[1][2] = CLIP_INTRA[acc[6]+acc[22] + acc[38]+acc[54]];
|
|
// lOut[1][5] = CLIP_INTRA[acc[6]+acc[22] - (acc[38]+acc[54])];
|
|
// lOut[6][2] = CLIP_INTRA[(acc[6]-acc[22]) + (acc[38]-acc[54])];
|
|
// lOut[6][5] = CLIP_INTRA[(acc[6]-acc[22]) - (acc[38]-acc[54])];
|
|
//
|
|
// lOut[1][3] = CLIP_INTRA[acc[7]+acc[23] + acc[39]+acc[55]];
|
|
// lOut[1][4] = CLIP_INTRA[acc[7]+acc[23] - (acc[39]+acc[55])];
|
|
// lOut[6][3] = CLIP_INTRA[(acc[7]-acc[23]) + (acc[39]-acc[55])];
|
|
// lOut[6][4] = CLIP_INTRA[(acc[7]-acc[23]) - (acc[39]-acc[55])];
|
|
//
|
|
//
|
|
// lOut[2][0] = CLIP_INTRA[acc[8]+acc[24] + acc[40]+acc[56]];
|
|
// lOut[2][7] = CLIP_INTRA[acc[8]+acc[24] - (acc[40]+acc[56])];
|
|
// lOut[5][0] = CLIP_INTRA[(acc[8]-acc[24]) + (acc[40]-acc[56])];
|
|
// lOut[5][7] = CLIP_INTRA[(acc[8]-acc[24]) - (acc[40]-acc[56])];
|
|
//
|
|
// lOut[2][1] = CLIP_INTRA[acc[9]+acc[25] + acc[41]+acc[57]];
|
|
// lOut[2][6] = CLIP_INTRA[acc[9]+acc[25] - (acc[41]+acc[57])];
|
|
// lOut[5][1] = CLIP_INTRA[(acc[9]-acc[25]) + (acc[41]-acc[57])];
|
|
// lOut[5][6] = CLIP_INTRA[(acc[9]-acc[25]) - (acc[41]-acc[57])];
|
|
//
|
|
// lOut[2][2] = CLIP_INTRA[acc[10]+acc[26] + acc[42]+acc[58]];
|
|
// lOut[2][5] = CLIP_INTRA[acc[10]+acc[26] - (acc[42]+acc[58])];
|
|
// lOut[5][2] = CLIP_INTRA[(acc[10]-acc[26]) + (acc[42]-acc[58])];
|
|
// lOut[5][5] = CLIP_INTRA[(acc[10]-acc[26]) - (acc[42]-acc[58])];
|
|
//
|
|
// lOut[2][3] = CLIP_INTRA[acc[11]+acc[27] + acc[43]+acc[59]];
|
|
// lOut[2][4] = CLIP_INTRA[acc[11]+acc[27] - (acc[43]+acc[59])];
|
|
// lOut[5][3] = CLIP_INTRA[(acc[11]-acc[27]) + (acc[43]-acc[59])];
|
|
// lOut[5][4] = CLIP_INTRA[(acc[11]-acc[27]) - (acc[43]-acc[59])];
|
|
//
|
|
//
|
|
// lOut[3][0] = CLIP_INTRA[acc[12]+acc[28] + acc[44]+acc[60]];
|
|
// lOut[3][7] = CLIP_INTRA[acc[12]+acc[28] - (acc[44]+acc[60])];
|
|
// lOut[4][0] = CLIP_INTRA[(acc[12]-acc[28]) + (acc[44]-acc[60])];
|
|
// lOut[4][7] = CLIP_INTRA[(acc[12]-acc[28]) - (acc[44]-acc[60])];
|
|
//
|
|
// lOut[3][1] = CLIP_INTRA[acc[13]+acc[29] + acc[45]+acc[61]];
|
|
// lOut[3][6] = CLIP_INTRA[acc[13]+acc[29] - (acc[45]+acc[61])];
|
|
// lOut[4][1] = CLIP_INTRA[(acc[13]-acc[29]) + (acc[45]-acc[61])];
|
|
// lOut[4][6] = CLIP_INTRA[(acc[13]-acc[29]) - (acc[45]-acc[61])];
|
|
//
|
|
// lOut[3][2] = CLIP_INTRA[acc[14]+acc[30] + acc[46]+acc[62]];
|
|
// lOut[3][5] = CLIP_INTRA[acc[14]+acc[30] - (acc[46]+acc[62])];
|
|
// lOut[4][2] = CLIP_INTRA[(acc[14]-acc[30]) + (acc[46]-acc[62])];
|
|
// lOut[4][5] = CLIP_INTRA[(acc[14]-acc[30]) - (acc[46]-acc[62])];
|
|
//
|
|
// lOut[3][3] = CLIP_INTRA[acc[15]+acc[31] + acc[47]+acc[63]];
|
|
// lOut[3][4] = CLIP_INTRA[acc[15]+acc[31] - (acc[47]+acc[63])];
|
|
// lOut[4][3] = CLIP_INTRA[(acc[15]-acc[31]) + (acc[47]-acc[63])];
|
|
// lOut[4][4] = CLIP_INTRA[(acc[15]-acc[31]) - (acc[47]-acc[63])];
|
|
//
|
|
; ----------------------------------------------------------------------
|
|
////////////////////////////////////////////////////////////////////////////
|
|
//assume parameters passed in by registers
|
|
|
|
idct_bfly_intra:
|
|
|
|
; ----------------------------------------------------------------------
|
|
; INTRA ONLY Butterfly and clamp
|
|
; Uses all registers.
|
|
; Uses all accumulators[64], accum
|
|
; Uses ClipPixIntra[2048] of BYTES, ClipPixIntra
|
|
; Writes to Output matrix of BYTES, OutputCoeff
|
|
;
|
|
; Process 4 outputs per group, 0-7, 8-15
|
|
; 0
|
|
|
|
mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of OutputCoeff
|
|
lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
mov [L_DESTBLOCK_1+4], edi
|
|
mov edx, 2 ; just loop 2 times
|
|
mov [L_DESTBLOCK_2+4], edi
|
|
|
|
ALIGN 4
|
|
loop_intra_bfly:
|
|
mov [L_LOOPCOUNTER+4], edx ; Store local loop counter
|
|
nop
|
|
mov eax, [esi-128] ; get acc[0]
|
|
mov ebx, [esi-64] ; get acc[16]
|
|
mov ebp, [esi] ; get acc[32]
|
|
mov edx, [esi+64] ; get acc[48]
|
|
lea ecx, [eax+ebx] ; acc[0]+acc[16]
|
|
sub eax, ebx ; acc[0]-acc[16]
|
|
lea ebx, [ebp+edx] ; acc[32]+acc[48]
|
|
sub ebp, edx ; acc[32]-acc[48]
|
|
|
|
mov edx, [edi] ; pre-fetch output cache line 0
|
|
mov edi, [edi+7*PITCH] ; pre-fetch output cache line 7
|
|
;mov esi, [edi+7*PITCH] ; pre-fetch output cache line 7
|
|
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
|
|
sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
|
|
sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi], dl ; output[0][0] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+7], cl ; output[0][7] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+7*PITCH], bl ; output[7][0] = tmp3
|
|
mov ebx, [esi-60] ; get acc[17]
|
|
|
|
; -------------------------------------------------------------------------
|
|
; 1
|
|
mov BYTE PTR [edi+7*PITCH+7], al; output[7][7] = tmp4
|
|
mov eax, [esi-124] ; get acc[1]
|
|
mov ebp, [esi+4] ; get acc[33]
|
|
mov edx, [esi+68] ; get acc[49]
|
|
lea ecx, [eax+ebx] ; acc[1]+acc[17]
|
|
sub eax, ebx ; acc[1]-acc[17]
|
|
lea ebx, [ebp+edx] ; acc[33]+acc[49]
|
|
sub ebp, edx ; acc[33]-acc[49]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
|
|
sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
|
|
sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;nop
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+1], dl ; output[0][1] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+6], cl ; output[0][6] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+7*PITCH+1], bl ; output[7][1] = tmp3
|
|
mov ebx, [esi-56] ; get acc[18]
|
|
; -------------------------------------------------------------------------
|
|
; 2
|
|
mov BYTE PTR [edi+7*PITCH+6], al ; output[7][6] = tmp4
|
|
mov eax, [esi-120] ; get acc[2]
|
|
mov ebp, [esi+8] ; get acc[34]
|
|
mov edx, [esi+72] ; get acc[50]
|
|
lea ecx, [eax+ebx] ; acc[2]+acc[18]
|
|
sub eax, ebx ; acc[2]-acc[18]
|
|
lea ebx, [ebp+edx] ; acc[34]+acc[50]
|
|
sub ebp, edx ; acc[34]-acc[50]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
|
|
sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
|
|
sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;nop
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+2], dl ; output[0][2] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+5], cl ; output[0][5] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+7*PITCH+2], bl ; output[7][2] = tmp3
|
|
mov ebx, [esi-52] ; get acc[19]
|
|
; -------------------------------------------------------------------------
|
|
; 3
|
|
mov BYTE PTR [edi+7*PITCH+5], al ; output[7][5] = tmp4
|
|
mov eax, [esi-116] ; get acc[3]
|
|
mov ebp, [esi+12] ; get acc[35]
|
|
mov edx, [esi+76] ; get acc[51]
|
|
lea ecx, [eax+ebx] ; acc[3]+acc[19]
|
|
sub eax, ebx ; acc[3]-acc[19]
|
|
lea ebx, [ebp+edx] ; acc[35]+acc[51]
|
|
sub ebp, edx ; acc[35]-acc[51]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
|
|
sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
|
|
sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;nop
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+3], dl ; output[0][3] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+4], cl ; output[0][4] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+7*PITCH+3], bl ; output[7][3] = tmp3
|
|
mov ebx, [esi-48] ; get acc[20]
|
|
; -------------------------------------------------------------------------
|
|
; 4
|
|
mov BYTE PTR [edi+7*PITCH+4], al ; output[7][4] = tmp4
|
|
mov eax, [esi-112] ; get acc[4]
|
|
mov ebp, [esi+16] ; get acc[36]
|
|
mov edx, [esi+80] ; get acc[52]
|
|
lea ecx, [eax+ebx] ; acc[4]+acc[20]
|
|
sub eax, ebx ; acc[4]-acc[20]
|
|
lea ebx, [ebp+edx] ; acc[36]+acc[52]
|
|
sub ebp, edx ; acc[36]-acc[52]
|
|
|
|
mov edx, [edi+PITCH] ; pre-fetch output cache line 1
|
|
mov edi, [edi+6*PITCH] ; pre-fetch output cache line 6
|
|
;mov esi, [edi+6*PITCH] ; pre-fetch output cache line 6
|
|
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
|
|
sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
|
|
sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+PITCH], dl ; output[1][0] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+PITCH+7], cl ; output[1][7] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+6*PITCH], bl ; output[6][0] = tmp3
|
|
mov ebx, [esi-44] ; get acc[21]
|
|
|
|
; -------------------------------------------------------------------------
|
|
; 5
|
|
mov BYTE PTR [edi+6*PITCH+7], al ; output[6][7] = tmp4
|
|
mov eax, [esi-108] ; get acc[5]
|
|
mov ebp, [esi+20] ; get acc[37]
|
|
mov edx, [esi+84] ; get acc[53]
|
|
lea ecx, [eax+ebx] ; acc[5]+acc[21]
|
|
sub eax, ebx ; acc[5]-acc[21]
|
|
lea ebx, [ebp+edx] ; acc[37]+acc[53]
|
|
sub ebp, edx ; acc[37]-acc[53]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
|
|
sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
|
|
sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;nop
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+PITCH+1], dl ; output[1][1] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+PITCH+6], cl ; output[1][6] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+6*PITCH+1], bl ; output[6][1] = tmp3
|
|
mov ebx, [esi-40] ; get acc[22]
|
|
; -------------------------------------------------------------------------
|
|
; 6
|
|
mov BYTE PTR [edi+6*PITCH+6], al ; output[6][6] = tmp4
|
|
mov eax, [esi-104] ; get acc[6]
|
|
mov ebp, [esi+24] ; get acc[38]
|
|
mov edx, [esi+88] ; get acc[54]
|
|
lea ecx, [eax+ebx] ; acc[6]+acc[22]
|
|
sub eax, ebx ; acc[6]-acc[22]
|
|
lea ebx, [ebp+edx] ; acc[38]+acc[54]
|
|
sub ebp, edx ; acc[38]-acc[54]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
|
|
sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
|
|
sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;nop
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+PITCH+2], dl ; output[1][2] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+PITCH+5], cl ; output[1][5] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+6*PITCH+2], bl ; output[6][2] = tmp3
|
|
mov ebx, [esi-36] ; get acc[23]
|
|
; -------------------------------------------------------------------------
|
|
; 7
|
|
mov BYTE PTR [edi+6*PITCH+5], al ; output[6][5] = tmp4
|
|
mov eax, [esi-100] ; get acc[7]
|
|
mov ebp, [esi+28] ; get acc[39]
|
|
mov edx, [esi+92] ; get acc[55]
|
|
lea ecx, [eax+ebx] ; acc[7]+acc[23]
|
|
sub eax, ebx ; acc[7]-acc[23]
|
|
lea ebx, [ebp+edx] ; acc[39]+acc[55]
|
|
sub ebp, edx ; acc[39]-acc[55]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
|
|
sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
|
|
sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
;nop
|
|
mov edi, [L_DESTBLOCK_1+4]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
|
|
mov BYTE PTR [edi+PITCH+3], dl ; output[1][3] = tmp1
|
|
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
|
|
mov BYTE PTR [edi+PITCH+4], cl ; output[1][4] = tmp2
|
|
mov edi, [L_DESTBLOCK_2+4]
|
|
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
|
|
nop
|
|
mov BYTE PTR [edi+6*PITCH+3], bl ; output[6][3] = tmp3
|
|
mov edx, [L_LOOPCOUNTER+4] ; fetch local loop counter
|
|
mov BYTE PTR [edi+6*PITCH+4], al ; output[6][4] = tmp4
|
|
add edi, 2*PITCH
|
|
add esi, 32 ; add 32 to esi for second pass
|
|
mov [L_DESTBLOCK_1+4], edi
|
|
sub edi, 4*PITCH
|
|
dec edx
|
|
mov [L_DESTBLOCK_2+4], edi
|
|
jnz loop_intra_bfly
|
|
|
|
|
|
ret
|
|
|
|
//////////////////////////////////////////////////////////////////////
|
|
// This "subroutine" idct_bfly_inter performs the butterfly phase of
|
|
// the fmidct for inter blocks.
|
|
//
|
|
// assume parameters passed in by registers
|
|
//
|
|
// C code:
|
|
//
|
|
// Upper Left Quadrant
|
|
// Upper Right Quadrant
|
|
// Lower Left Quadrant
|
|
// Lower Right Quadrant
|
|
//
|
|
// lOut[0][0] = CLIP_INTER[acc[0]+acc[16] + acc[32]+acc[48]];
|
|
// lOut[0][7] = CLIP_INTER[acc[0]+acc[16] - (acc[32]+acc[48])];
|
|
// lOut[7][0] = CLIP_INTER[(acc[0]-acc[16]) + (acc[32]-acc[48])];
|
|
// lOut[7][7] = CLIP_INTER[(acc[0]-acc[16]) - (acc[32]-acc[48])];
|
|
//
|
|
// lOut[0][1] = CLIP_INTER[acc[1]+acc[17] + acc[33]+acc[49]];
|
|
// lOut[0][6] = CLIP_INTER[acc[1]+acc[17] - (acc[33]+acc[49])];
|
|
// lOut[7][1] = CLIP_INTER[(acc[1]-acc[17]) + (acc[33]-acc[49])];
|
|
// lOut[7][6] = CLIP_INTER[(acc[1]-acc[17]) - (acc[33]-acc[49])];
|
|
//
|
|
// lOut[0][2] = CLIP_INTER[acc[2]+acc[18] + acc[34]+acc[50]];
|
|
// lOut[0][5] = CLIP_INTER[acc[2]+acc[18] - (acc[34]+acc[50])];
|
|
// lOut[7][2] = CLIP_INTER[(acc[2]-acc[18]) + (acc[34]-acc[50])];
|
|
// lOut[7][5] = CLIP_INTER[(acc[2]-acc[18]) - (acc[34]-acc[50])];
|
|
//
|
|
// lOut[0][3] = CLIP_INTER[acc[3]+acc[19] + acc[35]+acc[51]];
|
|
// lOut[0][4] = CLIP_INTER[acc[3]+acc[19] - (acc[35]+acc[51])];
|
|
// lOut[7][3] = CLIP_INTER[(acc[3]-acc[19]) + (acc[35]-acc[51])];
|
|
// lOut[7][4] = CLIP_INTER[(acc[3]-acc[19]) - (acc[35]-acc[51])];
|
|
//
|
|
//
|
|
// lOut[1][0] = CLIP_INTER[acc[4]+acc[20] + acc[36]+acc[52]];
|
|
// lOut[1][7] = CLIP_INTER[acc[4]+acc[20] - (acc[36]+acc[52])];
|
|
// lOut[6][0] = CLIP_INTER[(acc[4]-acc[20]) + (acc[36]-acc[52])];
|
|
// lOut[6][7] = CLIP_INTER[(acc[4]-acc[20]) - (acc[36]-acc[52])];
|
|
//
|
|
// lOut[1][1] = CLIP_INTER[acc[5]+acc[21] + acc[37]+acc[53]];
|
|
// lOut[1][6] = CLIP_INTER[acc[5]+acc[21] - (acc[37]+acc[53])];
|
|
// lOut[6][1] = CLIP_INTER[(acc[5]-acc[21]) + (acc[37]-acc[53])];
|
|
// lOut[6][6] = CLIP_INTER[(acc[5]-acc[21]) - (acc[37]-acc[53])];
|
|
//
|
|
// lOut[1][2] = CLIP_INTER[acc[6]+acc[22] + acc[38]+acc[54]];
|
|
// lOut[1][5] = CLIP_INTER[acc[6]+acc[22] - (acc[38]+acc[54])];
|
|
// lOut[6][2] = CLIP_INTER[(acc[6]-acc[22]) + (acc[38]-acc[54])];
|
|
// lOut[6][5] = CLIP_INTER[(acc[6]-acc[22]) - (acc[38]-acc[54])];
|
|
//
|
|
// lOut[1][3] = CLIP_INTER[acc[7]+acc[23] + acc[39]+acc[55]];
|
|
// lOut[1][4] = CLIP_INTER[acc[7]+acc[23] - (acc[39]+acc[55])];
|
|
// lOut[6][3] = CLIP_INTER[(acc[7]-acc[23]) + (acc[39]-acc[55])];
|
|
// lOut[6][4] = CLIP_INTER[(acc[7]-acc[23]) - (acc[39]-acc[55])];
|
|
//
|
|
//
|
|
// lOut[2][0] = CLIP_INTER[acc[8]+acc[24] + acc[40]+acc[56]];
|
|
// lOut[2][7] = CLIP_INTER[acc[8]+acc[24] - (acc[40]+acc[56])];
|
|
// lOut[5][0] = CLIP_INTER[(acc[8]-acc[24]) + (acc[40]-acc[56])];
|
|
// lOut[5][7] = CLIP_INTER[(acc[8]-acc[24]) - (acc[40]-acc[56])];
|
|
//
|
|
// lOut[2][1] = CLIP_INTER[acc[9]+acc[25] + acc[41]+acc[57]];
|
|
// lOut[2][6] = CLIP_INTER[acc[9]+acc[25] - (acc[41]+acc[57])];
|
|
// lOut[5][1] = CLIP_INTER[(acc[9]-acc[25]) + (acc[41]-acc[57])];
|
|
// lOut[5][6] = CLIP_INTER[(acc[9]-acc[25]) - (acc[41]-acc[57])];
|
|
//
|
|
// lOut[2][2] = CLIP_INTER[acc[10]+acc[26] + acc[42]+acc[58]];
|
|
// lOut[2][5] = CLIP_INTER[acc[10]+acc[26] - (acc[42]+acc[58])];
|
|
// lOut[5][2] = CLIP_INTER[(acc[10]-acc[26]) + (acc[42]-acc[58])];
|
|
// lOut[5][5] = CLIP_INTER[(acc[10]-acc[26]) - (acc[42]-acc[58])];
|
|
//
|
|
// lOut[2][3] = CLIP_INTER[acc[11]+acc[27] + acc[43]+acc[59]];
|
|
// lOut[2][4] = CLIP_INTER[acc[11]+acc[27] - (acc[43]+acc[59])];
|
|
// lOut[5][3] = CLIP_INTER[(acc[11]-acc[27]) + (acc[43]-acc[59])];
|
|
// lOut[5][4] = CLIP_INTER[(acc[11]-acc[27]) - (acc[43]-acc[59])];
|
|
//
|
|
//
|
|
// lOut[3][0] = CLIP_INTER[acc[12]+acc[28] + acc[44]+acc[60]];
|
|
// lOut[3][7] = CLIP_INTER[acc[12]+acc[28] - (acc[44]+acc[60])];
|
|
// lOut[4][0] = CLIP_INTER[(acc[12]-acc[28]) + (acc[44]-acc[60])];
|
|
// lOut[4][7] = CLIP_INTER[(acc[12]-acc[28]) - (acc[44]-acc[60])];
|
|
//
|
|
// lOut[3][1] = CLIP_INTER[acc[13]+acc[29] + acc[45]+acc[61]];
|
|
// lOut[3][6] = CLIP_INTER[acc[13]+acc[29] - (acc[45]+acc[61])];
|
|
// lOut[4][1] = CLIP_INTER[(acc[13]-acc[29]) + (acc[45]-acc[61])];
|
|
// lOut[4][6] = CLIP_INTER[(acc[13]-acc[29]) - (acc[45]-acc[61])];
|
|
//
|
|
// lOut[3][2] = CLIP_INTER[acc[14]+acc[30] + acc[46]+acc[62]];
|
|
// lOut[3][5] = CLIP_INTER[acc[14]+acc[30] - (acc[46]+acc[62])];
|
|
// lOut[4][2] = CLIP_INTER[(acc[14]-acc[30]) + (acc[46]-acc[62])];
|
|
// lOut[4][5] = CLIP_INTER[(acc[14]-acc[30]) - (acc[46]-acc[62])];
|
|
//
|
|
// lOut[3][3] = CLIP_INTER[acc[15]+acc[31] + acc[47]+acc[63]];
|
|
// lOut[3][4] = CLIP_INTER[acc[15]+acc[31] - (acc[47]+acc[63])];
|
|
// lOut[4][3] = CLIP_INTER[(acc[15]-acc[31]) + (acc[47]-acc[63])];
|
|
// lOut[4][4] = CLIP_INTER[(acc[15]-acc[31]) - (acc[47]-acc[63])];
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////
|
|
//assume parameters passed in by registers
|
|
|
|
idct_bfly_inter:
|
|
|
|
; ----------------------------------------------------------------------
|
|
; INTER ONLY Butterfly and clamp
|
|
; Uses all registers.
|
|
; Uses all accumulators[64], accum
|
|
; Uses ClipPixIntra[2048] of DWORDS, ClipPixIntra
|
|
; Writes to Intermediate matrix [8][8] of DWORDS, Intermediate
|
|
;
|
|
; Process 4 outputs per group, 0-15
|
|
; 0
|
|
|
|
#ifdef PTEL_WORK_AROUND
|
|
mov eax, [L_COEFFCOUNT+4] ; get coefficient counter
|
|
mov ebx, [L_COEFFVALUE+4] ; get coefficient value
|
|
cmp eax, 1 ; compare counter with 1
|
|
jg Normal_Process ; if greater than 1 jump to normal process
|
|
cmp ebx, 3
|
|
jz Zero_Output ; if value == 3 zero output
|
|
cmp ebx, -3
|
|
jnz Normal_Process ; if value != -3 Process as usual
|
|
|
|
Zero_Output:
|
|
////////////////////////////////////////////////////////////////////////
|
|
// Zero out intermediate matrix [8][8] of DWORDS
|
|
//
|
|
// C code:
|
|
//
|
|
// for (x=0; x<8; x++)
|
|
// for (y=16; y<8; y++)
|
|
// Intermediate[x][y] = 0L;
|
|
//
|
|
|
|
mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of Intermediate
|
|
xor eax, eax
|
|
mov ebx, 8
|
|
;
|
|
|
|
ALIGN 4
|
|
loop_for_reinit:
|
|
mov [edi], eax
|
|
mov [edi+4], eax
|
|
mov [edi+8], eax
|
|
mov [edi+12], eax
|
|
mov [edi+16], eax
|
|
mov [edi+20], eax
|
|
mov [edi+24], eax
|
|
mov [edi+28], eax
|
|
add edi, 32
|
|
dec ebx
|
|
jnz loop_for_reinit
|
|
|
|
|
|
ret
|
|
|
|
Normal_Process:
|
|
#endif
|
|
lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of Intermediate
|
|
add edi, 128
|
|
nop
|
|
mov eax, [esi-128] ; get acc[0]
|
|
mov ebx, [esi-64] ; get acc[16]
|
|
mov ebp, [esi] ; get acc[32]
|
|
mov edx, [esi+64] ; get acc[48]
|
|
lea ecx, [eax+ebx] ; acc[0]+acc[16]
|
|
sub eax, ebx ; acc[0]-acc[16]
|
|
lea ebx, [ebp+edx] ; acc[32]+acc[48]
|
|
sub ebp, edx ; acc[32]-acc[48]
|
|
|
|
mov edx, [edi-128] ; pre-fetch output cache line 0
|
|
mov esi, [edi+96] ; pre-fetch output cache line 7
|
|
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
|
|
sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
|
|
sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-128], edx ; Intermediate[0][0] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-128+7*4], ecx ; Intermediate[0][7] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+96], ebx ; Intermediate[7][0] = tmp3
|
|
mov ebx, [esi-60] ; get acc[17]
|
|
|
|
; -------------------------------------------------------------------------
|
|
; 1
|
|
mov DWORD PTR [edi+96+7*4], eax ; Intermediate[7][7] = tmp4
|
|
mov eax, [esi-124] ; get acc[1]
|
|
mov ebp, [esi+4] ; get acc[33]
|
|
mov edx, [esi+68] ; get acc[49]
|
|
lea ecx, [eax+ebx] ; acc[1]+acc[17]
|
|
sub eax, ebx ; acc[1]-acc[17]
|
|
lea ebx, [ebp+edx] ; acc[33]+acc[49]
|
|
sub ebp, edx ; acc[33]-acc[49]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
|
|
sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
|
|
sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-128+1*4], edx ; Intermediate[0][1] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-128+6*4], ecx ; Intermediate[0][6] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+96+1*4], ebx ; Intermediate[7][1] = tmp3
|
|
mov ebx, [esi-56] ; get acc[18]
|
|
; -------------------------------------------------------------------------
|
|
; 2
|
|
mov DWORD PTR [edi+96+6*4], eax ; Intermediate[7][6] = tmp4
|
|
mov eax, [esi-120] ; get acc[2]
|
|
mov ebp, [esi+8] ; get acc[34]
|
|
mov edx, [esi+72] ; get acc[50]
|
|
lea ecx, [eax+ebx] ; acc[2]+acc[18]
|
|
sub eax, ebx ; acc[2]-acc[18]
|
|
lea ebx, [ebp+edx] ; acc[34]+acc[50]
|
|
sub ebp, edx ; acc[34]-acc[50]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
|
|
sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
|
|
sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-128+2*4], edx ; Intermediate[0][2] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-128+5*4], ecx ; Intermediate[0][5] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+96+2*4], ebx ; Intermediate[7][2] = tmp3
|
|
mov ebx, [esi-52] ; get acc[19]
|
|
; -------------------------------------------------------------------------
|
|
; 3
|
|
mov DWORD PTR [edi+96+5*4], eax ; Intermediate[7][5] = tmp4
|
|
mov eax, [esi-116] ; get acc[3]
|
|
mov ebp, [esi+12] ; get acc[35]
|
|
mov edx, [esi+76] ; get acc[51]
|
|
lea ecx, [eax+ebx] ; acc[3]+acc[19]
|
|
sub eax, ebx ; acc[3]-acc[19]
|
|
lea ebx, [ebp+edx] ; acc[35]+acc[51]
|
|
sub ebp, edx ; acc[35]-acc[51]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
|
|
sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
|
|
sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-128+3*4], edx ; Intermediate[0][3] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-128+4*4], ecx ; Intermediate[0][4] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+96+3*4], ebx ; Intermediate[7][3] = tmp3
|
|
mov ebx, [esi-48] ; get acc[20]
|
|
; -------------------------------------------------------------------------
|
|
; 4
|
|
mov DWORD PTR [edi+96+4*4], eax ; Intermediate[7][4] = tmp4
|
|
mov eax, [esi-112] ; get acc[4]
|
|
mov ebp, [esi+16] ; get acc[36]
|
|
mov edx, [esi+80] ; get acc[52]
|
|
lea ecx, [eax+ebx] ; acc[4]+acc[20]
|
|
sub eax, ebx ; acc[4]-acc[20]
|
|
lea ebx, [ebp+edx] ; acc[36]+acc[52]
|
|
sub ebp, edx ; acc[36]-acc[52]
|
|
|
|
mov edx, [edi-96] ; pre-fetch output cache line 1
|
|
mov esi, [edi+64] ; pre-fetch output cache line 6
|
|
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
|
|
sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
|
|
sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-96], edx ; Intermediate[1][0] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-96+7*4], ecx ; Intermediate[1][7] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+64], ebx ; Intermediate[6][0] = tmp3
|
|
mov ebx, [esi-44] ; get acc[21]
|
|
|
|
; -------------------------------------------------------------------------
|
|
; 5
|
|
mov DWORD PTR [edi+64+7*4], eax ; Intermediate[6][7] = tmp4
|
|
mov eax, [esi-108] ; get acc[5]
|
|
mov ebp, [esi+20] ; get acc[37]
|
|
mov edx, [esi+84] ; get acc[53]
|
|
lea ecx, [eax+ebx] ; acc[5]+acc[21]
|
|
sub eax, ebx ; acc[5]-acc[21]
|
|
lea ebx, [ebp+edx] ; acc[37]+acc[53]
|
|
sub ebp, edx ; acc[37]-acc[53]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
|
|
sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
|
|
sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-96+1*4], edx ; Intermediate[1][1] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-96+6*4], ecx ; Intermediate[1][6] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+64+1*4], ebx ; Intermediate[6][1] = tmp3
|
|
mov ebx, [esi-40] ; get acc[22]
|
|
; -------------------------------------------------------------------------
|
|
; 6
|
|
mov DWORD PTR [edi+64+6*4], eax ; Intermediate[6][6] = tmp4
|
|
mov eax, [esi-104] ; get acc[6]
|
|
mov ebp, [esi+24] ; get acc[38]
|
|
mov edx, [esi+88] ; get acc[54]
|
|
lea ecx, [eax+ebx] ; acc[6]+acc[22]
|
|
sub eax, ebx ; acc[6]-acc[22]
|
|
lea ebx, [ebp+edx] ; acc[38]+acc[54]
|
|
sub ebp, edx ; acc[38]-acc[54]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
|
|
sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
|
|
sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-96+2*4], edx ; Intermediate[1][2] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-96+5*4], ecx ; Intermediate[1][5] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+64+2*4], ebx ; Intermediate[6][2] = tmp3
|
|
mov ebx, [esi-36] ; get acc[23]
|
|
; -------------------------------------------------------------------------
|
|
; 7
|
|
mov DWORD PTR [edi+64+5*4], eax ; Intermediate[6][5] = tmp4
|
|
mov eax, [esi-100] ; get acc[7]
|
|
mov ebp, [esi+28] ; get acc[39]
|
|
mov edx, [esi+92] ; get acc[55]
|
|
lea ecx, [eax+ebx] ; acc[7]+acc[23]
|
|
sub eax, ebx ; acc[7]-acc[23]
|
|
lea ebx, [ebp+edx] ; acc[39]+acc[55]
|
|
sub ebp, edx ; acc[39]-acc[55]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
|
|
sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
|
|
sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-96+3*4], edx ; Intermediate[1][3] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-96+4*4], ecx ; Intermediate[1][4] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+64+3*4], ebx ; Intermediate[6][3] = tmp3
|
|
mov ebx, [esi-32] ; get acc[24]
|
|
; -------------------------------------------------------------------------
|
|
; 8
|
|
mov DWORD PTR [edi+64+4*4], eax ; Intermediate[6][4] = tmp4
|
|
mov eax, [esi-96] ; get acc[8]
|
|
mov ebp, [esi+32] ; get acc[40]
|
|
mov edx, [esi+96] ; get acc[56]
|
|
lea ecx, [eax+ebx] ; acc[8]+acc[24]
|
|
sub eax, ebx ; acc[8]-acc[24]
|
|
lea ebx, [ebp+edx] ; acc[40]+acc[56]
|
|
sub ebp, edx ; acc[40]-acc[56]
|
|
|
|
mov edx, [edi-64] ; pre-fetch output cache line 2
|
|
mov esi, [edi+32] ; pre-fetch output cache line 5
|
|
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56]
|
|
sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56])
|
|
sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-64], edx ; Intermediate[2][0] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-64+7*4], ecx ; Intermediate[2][7] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+32], ebx ; Intermediate[5][0] = tmp3
|
|
mov ebx, [esi-28] ; get acc[25]
|
|
|
|
; -------------------------------------------------------------------------
|
|
; 9
|
|
mov DWORD PTR [edi+32+7*4], eax ; Intermediate[5][7] = tmp4
|
|
mov eax, [esi-92] ; get acc[9]
|
|
mov ebp, [esi+36] ; get acc[41]
|
|
mov edx, [esi+100] ; get acc[57]
|
|
lea ecx, [eax+ebx] ; acc[9]+acc[25]
|
|
sub eax, ebx ; acc[9]-acc[25]
|
|
lea ebx, [ebp+edx] ; acc[41]+acc[57]
|
|
sub ebp, edx ; acc[41]-acc[57]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57]
|
|
sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57])
|
|
sub eax, ebp ; tmp4 = acc[9]-acc[25] - (acc[41]-acc[57])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-64+1*4], edx ; Intermediate[2][1] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-64+6*4], ecx ; Intermediate[2][6] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+32+1*4], ebx ; Intermediate[5][1] = tmp3
|
|
mov ebx, [esi-24] ; get acc[26]
|
|
; -------------------------------------------------------------------------
|
|
; 10
|
|
mov DWORD PTR [edi+32+6*4], eax ; Intermediate[5][6] = tmp4
|
|
mov eax, [esi-88] ; get acc[10]
|
|
mov ebp, [esi+40] ; get acc[42]
|
|
mov edx, [esi+104] ; get acc[58]
|
|
lea ecx, [eax+ebx] ; acc[10]+acc[26]
|
|
sub eax, ebx ; acc[10]-acc[26]
|
|
lea ebx, [ebp+edx] ; acc[42]+acc[58]
|
|
sub ebp, edx ; acc[42]-acc[58]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58]
|
|
sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58])
|
|
sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-64+2*4], edx ; Intermediate[2][2] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-64+5*4], ecx ; Intermediate[2][5] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+32+2*4], ebx ; Intermediate[5][2] = tmp3
|
|
mov ebx, [esi-20] ; get acc[27]
|
|
; -------------------------------------------------------------------------
|
|
; 11
|
|
mov DWORD PTR [edi+32+5*4], eax ; Intermediate[5][5] = tmp4
|
|
mov eax, [esi-84] ; get acc[11]
|
|
mov ebp, [esi+44] ; get acc[43]
|
|
mov edx, [esi+108] ; get acc[59]
|
|
lea ecx, [eax+ebx] ; acc[11]+acc[27]
|
|
sub eax, ebx ; acc[11]-acc[27]
|
|
lea ebx, [ebp+edx] ; acc[43]+acc[59]
|
|
sub ebp, edx ; acc[43]-acc[59]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59]
|
|
sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59])
|
|
sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-64+3*4], edx ; Intermediate[2][3] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-64+4*4], ecx ; Intermediate[2][4] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+32+3*4], ebx ; Intermediate[5][3] = tmp3
|
|
mov ebx, [esi-16] ; get acc[28]
|
|
; -------------------------------------------------------------------------
|
|
; 12
|
|
mov DWORD PTR [edi+32+4*4], eax ; Intermediate[5][4] = tmp4
|
|
mov eax, [esi-80] ; get acc[12]
|
|
mov ebp, [esi+48] ; get acc[44]
|
|
mov edx, [esi+112] ; get acc[60]
|
|
lea ecx, [eax+ebx] ; acc[12]+acc[28]
|
|
sub eax, ebx ; acc[12]-acc[28]
|
|
lea ebx, [ebp+edx] ; acc[44]+acc[60]
|
|
sub ebp, edx ; acc[44]-acc[60]
|
|
|
|
mov edx, [edi-32] ; pre-fetch output cache line 3
|
|
mov esi, [edi] ; pre-fetch output cache line 4
|
|
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60]
|
|
sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60])
|
|
sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-32], edx ; Intermediate[3][0] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-32+7*4], ecx ; Intermediate[3][7] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi], ebx ; Intermediate[4][0] = tmp3
|
|
mov ebx, [esi-12] ; get acc[29]
|
|
|
|
; -------------------------------------------------------------------------
|
|
; 13
|
|
mov DWORD PTR [edi+7*4], eax ; Intermediate[4][7] = tmp4
|
|
mov eax, [esi-76] ; get acc[13]
|
|
mov ebp, [esi+52] ; get acc[45]
|
|
mov edx, [esi+116] ; get acc[61]
|
|
lea ecx, [eax+ebx] ; acc[13]+acc[29]
|
|
sub eax, ebx ; acc[13]-acc[29]
|
|
lea ebx, [ebp+edx] ; acc[45]+acc[61]
|
|
sub ebp, edx ; acc[45]-acc[61]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61]
|
|
sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61])
|
|
sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-32+1*4], edx ; Intermediate[3][1] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-32+6*4], ecx ; Intermediate[3][6] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+1*4], ebx ; Intermediate[4][1] = tmp3
|
|
mov ebx, [esi-8] ; get acc[30]
|
|
; -------------------------------------------------------------------------
|
|
; 14
|
|
mov DWORD PTR [edi+6*4], eax ; Intermediate[4][6] = tmp4
|
|
mov eax, [esi-72] ; get acc[14]
|
|
mov ebp, [esi+56] ; get acc[46]
|
|
mov edx, [esi+120] ; get acc[62]
|
|
lea ecx, [eax+ebx] ; acc[14]+acc[30]
|
|
sub eax, ebx ; acc[14]-acc[30]
|
|
lea ebx, [ebp+edx] ; acc[46]+acc[62]
|
|
sub ebp, edx ; acc[46]-acc[62]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62]
|
|
sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[14]-acc[30] + (acc[46]-acc[62])
|
|
sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-32+2*4], edx ; Intermediate[3][2] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-32+5*4], ecx ; Intermediate[3][5] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+2*4], ebx ; Intermediate[4][2] = tmp3
|
|
mov ebx, [esi-4] ; get acc[31]
|
|
; -------------------------------------------------------------------------
|
|
; 15
|
|
mov DWORD PTR [edi+5*4], eax ; Intermediate[4][5] = tmp4
|
|
mov eax, [esi-68] ; get acc[15]
|
|
mov ebp, [esi+60] ; get acc[47]
|
|
mov edx, [esi+124] ; get acc[63]
|
|
lea ecx, [eax+ebx] ; acc[15]+acc[31]
|
|
sub eax, ebx ; acc[15]-acc[31]
|
|
lea ebx, [ebp+edx] ; acc[47]+acc[63]
|
|
sub ebp, edx ; acc[47]-acc[63]
|
|
;nop
|
|
;nop
|
|
lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63]
|
|
sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63])
|
|
lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63])
|
|
sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63])
|
|
sar edx, SCALER ; tmp1 >> 13
|
|
lea ebp, ClipPixInter ; ecx gets Base addr of ClipPixInter
|
|
sar ecx, SCALER ; tmp2 >> 13
|
|
nop
|
|
sar ebx, SCALER ; tmp3 >> 13
|
|
mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
|
|
sar eax, SCALER ; tmp4 >> 13
|
|
mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
|
|
mov DWORD PTR [edi-32+3*4], edx ; Intermediate[3][3] = tmp1
|
|
mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
|
|
mov DWORD PTR [edi-32+4*4], ecx ; Intermediate[3][4] = tmp2
|
|
mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
|
|
mov DWORD PTR [edi+3*4], ebx ; Intermediate[4][3] = tmp3
|
|
mov DWORD PTR [edi+4*4], eax ; Intermediate[4][4] = tmp4
|
|
ret
|
|
} //end of asm
|
|
}
|
|
|
|
#pragma code_seg()
|