// version 003; everything except 1) segment // /* ************************************************************************* ** INTEL Corporation Proprietary Information ** ** This listing is supplied under the terms of a license ** agreement with INTEL Corporation and may not be copied ** nor disclosed except in accordance with the terms of ** that agreement. ** ** Copyright (c) 1995 Intel Corporation. ** All Rights Reserved. ** ** ************************************************************************* */ ////////////////////////////////////////////////////////////////////////// // $Author: AGUPTA2 $ // $Date: 25 Oct 1996 13:32:28 $ // $Archive: S:\h26x\src\dec\d3idct.cpv $ // $Header: S:\h26x\src\dec\d3idct.cpv 1.11 25 Oct 1996 13:32:28 AGUPTA2 $ // $Log: S:\h26x\src\dec\d3idct.cpv $ // // Rev 1.11 25 Oct 1996 13:32:28 AGUPTA2 // Re-scheduled butterfly code; re-arranged local var declarations. // // Rev 1.10 30 Aug 1996 08:39:56 KLILLEVO // added C version of block edge filter, and changed the bias in // ClampTbl[] from 128 to CLAMP_BIAS (defined to 128) // The C version of the block edge filter takes up way too much CPU time // relative to the rest of the decode time (4 ms for QCIF and 16 ms // for CIF on a P120, so this needs to be coded in assembly) // // Rev 1.9 17 Jul 1996 15:33:18 AGUPTA2 // Increased the size of clamping table ClampTbl to 128+256+128. // // Rev 1.8 08 Mar 1996 16:46:20 AGUPTA2 // Added pragma code_seg. Rolled the initialization code. Got rid of most // of 32-bit displacements in instructions. Aligned frequently executed loops // at 4-byte boundary. Made changes to reflect new size of MapMatrix. Removed // nop instructions. Deleted code that prefetches output lines in case of // INTRA blocks. Use ClampTbl instead of ClipPixIntra. Do not clip output // of INTER blocks; clipping is done in dxblkadd(). 
// // // Rev 1.7 27 Dec 1995 14:36:06 RMCKENZX // Added copyright notice // // Rev 1.6 09 Dec 1995 17:33:20 RMCKENZX // Re-checked in module to support decoder re-architecture (thru PB Frames) // // Rev 1.4 30 Nov 1995 18:02:14 CZHU // Save and restore register before and after idct_acc // // Rev 1.1 27 Nov 1995 13:13:28 CZHU // // // Rev 1.0 27 Nov 1995 13:08:24 CZHU // Initial revision. // //Block level decoding for H.26x decoder #include "precomp.h" ///////////////////////////////////////////////////////////////////////// // Decode each none-empty block // Input: lpInst: decoder instance, // lpSrc: input bitstream, // lpBlockAction: // the pointer to the block action stream structure // bitsread: number of bits in the buffer already, ///////////////////////////////////////////////////////////////////////// // local variable definitions #define FRAMEPOINTER esp ////////////////////////////////////////////////////////////// // L_ACCUM MUST BE LAST 256 BYTES OF A PAGE ///////////////////////////////////////////////////////////// #define L_PRODUCT FRAMEPOINTER + 0 // 20 DWORD #define L_INPUT_INTER L_PRODUCT + 20*4 // DWORD #define L_esi L_INPUT_INTER + 1*4 // DWORD #define L_NO_COEFF L_esi + 1*4 // DWORD #define L_DESTBLOCK L_NO_COEFF + 1*4 // DWORD #define L_LOOPCOUNTER L_DESTBLOCK + 1*4 // DWORD #define L_STASHESP L_LOOPCOUNTER + 1*4 // DWORD #define L_dummy L_STASHESP + 1*4 // 6 DWORDS #define L_ACCUM L_dummy + 6*4 // 64 DWORD #define LOCALSIZE (96*4) // 96 DWORDS;multiple of cache line size //////////////////////////////////////////////////////////////////////////////// // Input: // pIQ_INDEX, pointer to pointer for Inverse quantization and index // for the current block. // No_Coeff, A 32 bit number indicate block types, etc. // 0--63, inter block, number of coeff // 64--127 64+ intra block, number of coeff // pIntraBuf, Buffer pointer for intra blocks. // // pInterBuf, Buffer pointer for inter blocks. 
// // // return: // ////////////////////////////////////////////////////////////////////////////////// #pragma code_seg("IACODE2") __declspec(naked) U32 DecodeBlock_IDCT ( U32 pIQ_INDEX, U32 No_Coeff, U32 pIntraBuf, U32 pInterBuf) { __asm { //////////////////////////////////////////////////////////////// // DON'T CHANGE LOCAL DECLARATIONS OR STACK POINTER ADJUSTMENT // CODE WITHOUT TALKING TO ATUL //////////////////////////////////////////////////////////////// push ebp // save callers frame pointer mov ebp, esp // make parameters accessible push esi // assumed preserved push edi push ebx mov eax, pInterBuf mov edx, esp // Save old ESP in edx and esp, -4096 // align at page boundary xor esi, esi // loop init sub esp, LOCALSIZE // last 96 DWORDS of page lea edi, [L_ACCUM] mov ebx, 64 // loop init mov [L_STASHESP], edx // Save old esp mov edx, No_Coeff mov [L_INPUT_INTER], eax mov eax, ROUNDER // loop init ; ///////////////////////////////////////////////////////////////// // There is no point in pre-loading the cache. That is because // after the first block it is likely to be in the cache. // loop_for_init: mov [edi], eax mov [edi+4], eax mov [edi+ebx], esi mov [edi+ebx+4], esi mov [edi+ebx+8], esi mov [edi+ebx+12], esi mov [edi+ebx+16], esi mov [edi+ebx+20], esi add edi, 8 add ebx, 16 cmp ebx, 192 jl loop_for_init ///////////////////////////////////////////////////////////////////// // end of new init code //end of IDCT init. 
cmp edx, 65 jg intra_block mov ebx, pInterBuf jmp pre_acc_loop intra_block: mov ebx, pIntraBuf sub edx, 65 // register: // ebp: loop counter // ebx: inverse quant // ecx: index [0,63] pre_acc_loop: mov esi, pIQ_INDEX mov [L_DESTBLOCK], ebx mov [L_esi], esi ALIGN 4 acc_loop: mov ebx,[esi+edx*8-8] //Invserse Quant mov ecx,[esi+edx*8-4] //Coeff index mov [L_NO_COEFF], edx call idct_acc mov esi, [L_esi] mov edx, [L_NO_COEFF] dec edx jnz acc_loop mov edx, [L_DESTBLOCK] mov ecx, [L_INPUT_INTER] cmp edx, ecx jnz call_intra_bfly call idct_bfly_inter mov esp, [L_STASHESP] // free locals add eax, edi pop ebx pop edi pop esi pop ebp ret call_intra_bfly: call idct_bfly_intra mov esp, [L_STASHESP] // free locals add eax, edi pop ebx pop edi pop esi pop ebp ret /////////////////////////////////////////////////////////////// // assume parameter passed in by registers // ebx, inverse quant // ecx, index [0,63] idct_acc: ; For every non-zero coefficient: ; LoopCounter, on local stack, has index ; ecx = index (0-63) ; ebx = non-zero input ; Note i = index ; and ecx, 03fh ; Chad added to prevent GPF mov [L_LOOPCOUNTER+4], ecx ; Store Loop counter xor edx, edx ; zero out for byte read, use as dword mov esi, ecx ; move index to esi lea eax, Unique ; eax = Address of Unique[0] mov ebp, ecx ; move index to ebp shl esi, 3 ; index*8 add ecx, ecx ; index*2 add esi, ecx ; index*10 lea ecx, KernelCoeff ; get KernelCoeff[0][0] lea edi, [L_PRODUCT+4] ; edi = address of product[0] mov dl, [eax+ebp] ; get Unique[i] lea esi, [ecx+4*esi] ; address of KernelCoeff[i][0] mov ebp, edx ; ebp = Unique[i] lea eax, [edi+edx*4] ; eax = address of product[totalU] nop ; ---------------------------------------------------------------------- ; Register usage ; eax = addr of product[Unique[i]] ; ebx = input[i] ; ecx = 0, -product[x] ; edx = KernelCoeff[i][x], product[x]= KernelCoeff[i][x] * input[i] ; ebp = x ; edi = addr of product[0] ; esi = addr of KernelCoeff[i][x] ALIGN 4 loop_for_x: xor ecx, ecx mov edx, 
[esi+ebp*4-4] ; read KernelCoeff[i][x] imul edx, ebx ; KernelCoeff[i][x] * input[i] mov [edi+ebp*4-4], edx ; product[x] = result of imul sub ecx, edx mov [eax+ebp*4-4], ecx ; product[totalU+x] = -product[x] dec ebp ; decrement x jnz loop_for_x ; ---------------------------------------------------------------------- ; Register usage ; eax = MapMatrix[0][0] ; ebx = PClass[0], accum[xxx] ; ecx = LoopCounter, addr of MapMatrix[i][0] ; edx = product[0], accum[PClass[i][0-15]] ; ebp = addr of accum[0], product[MapMatrix[i][0-15]] ; edi = addr of product[0] ; esi = PClass[i], address of accum[PClass[i]] mov ecx, [L_LOOPCOUNTER+4] ; get i and ecx, 0ffh ; Chad added to prevent GPF lea ebx, PClass ; get addr of PClass[0] mov esi, ecx shl ecx, 4 lea eax, MapMatrix ; get addr of MapMatrix[0][0] xor edx, edx nop mov dl, [ebx+esi] ; get PClass[i] lea ecx, [eax+1*ecx] ; get addr of MapMatrix[i][0] shl edx, 2 ; esi*4 lea esi, [L_ACCUM+4] ; get addr of accum[0] ; ---------------------------------------------------------------------- xor eax, eax ; get MapMatrix[i][0] add esi, edx ; esi = address of accum[PClass[i]] mov al, [ecx] mov ebx, [esi] ; get accum[PClass[i]] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[0]] mov al, [ecx+1] ; get pNKernel->matrix[1] add ebx, ebp ; accum[pNKernel->PClass] += product[ ; pNKernel->matrix[0]] mov edx, [esi+4] ; get accum[1+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[1]] mov al, [ecx+2] ; get pNKernel->matrix[2] add edx, ebp ; accum[1+pNkernel->PClass] += product[ ; pNKernel->matrix[1]] mov [esi], ebx ; store accum[pNKernel->PClass] += product[ ; pNKernel->matrix[0]] mov [esi+4], edx ; store accum[1+pNKernel->PClass] += ; product[pNKernel->matrix[1]] mov ebx, [esi+8] ; get accum[2+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[2]] mov al, [ecx+3] ; get pNKernel->matrix[3] add ebx, ebp ; accum[2+pNKernel->PClass] += product[ ; pNKernel->matrix[2]] mov edx, [esi+12] ; get 
accum[3+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[3]] mov al, [ecx+4] ; get pNKernel->matrix[4] add edx, ebp ; accum[3+pNKernel->PClass] += product[ ; pNKernel->matrix[3]] mov [esi+8], ebx ; store accum[2+pNKernel->PClass] += ; product[pNKernel->matrix[2]] mov [esi+12], edx ; store accum[3+pNKernel->PClass] += ; product[pNKernel->matrix[3]] ; ---------------------------------------------------------------------- mov ebx, [esi+16] ; get accum[4+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[4]] mov al, [ecx+5] ; get pNKernel->matrix[5] add ebx, ebp ; accum[4+pNKernel->PClass] += product[ ; pNKernel->matrix[4]] mov edx, [esi+20] ; get accum[5+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[5]] mov al, [ecx+6] ; get pNKernel->matrix[6] add edx, ebp ; accum[5+pNkernel->PClass] += product[ ; pNKernel->matrix[5]] mov [esi+16], ebx ; store accum[4+pNKernel->PClass] += ; product[pNKernel->matrix[4]] mov [esi+20], edx ; store accum[5+pNKernel->PClass] += ; product[pNKernel->matrix[5]] mov ebx, [esi+24] ; get accum[6+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[6]] mov al, [ecx+7] ; get pNKernel->matrix[7] add ebx, ebp mov edx, [esi+28] ; get accum[7+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[7]] mov al, [ecx+8] ; get pNKernel->matrix[8] add edx, ebp ; accum[7+pNKernel->PClass] += product[ ; pNKernel->matrix[7]] mov [esi+24], ebx ; store accum[6+pNKernel->PClass] += ; product[pNKernel->matrix[6]] mov [esi+28], edx ; store accum[7+pNKernel->PClass] += ; product[pNKernel->matrix[7]] ; ---------------------------------------------------------------------- mov ebx, [esi+32] ; get accum[8+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[8]] mov al, [ecx+9] ; get pNKernel->matrix[9] add ebx, ebp ; accum[8+pNKernel->PClass] += product[ ; pNKernel->matrix[8]] mov edx, [esi+36] ; get accum[9+pNKernel->PClass] mov ebp, [edi+eax*4] ; get 
product[pNKernel->matrix[9]] mov al, [ecx+10] ; get pNKernel->matrix[10] add edx, ebp ; accum[9+pNkernel->PClass] += product[ ; pNKernel->matrix[9]] mov [esi+32], ebx ; store accum[8+pNKernel->PClass] += ; product[pNKernel->matrix[8]] mov [esi+36], edx ; store accum[9+pNKernel->PClass] += ; product[pNKernel->matrix[9]] mov ebx, [esi+40] ; get accum[10+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[10]] mov al, [ecx+11] ; get pNKernel->matrix[11] add ebx, ebp mov edx, [esi+44] ; get accum[11+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[11]] ; product[pNKernel->matrix[11]] mov al, [ecx+12] ; get pNKernel->matrix[12] add edx, ebp ; accum[11+pNKernel->PClass] += product[ ; pNKernel->matrix[11]] mov [esi+40], ebx ; store accum[10+pNKernel->PClass] += ; product[pNKernel->matrix[10]] mov [esi+44], edx ; store accum[11+pNKernel->PClass] += ; product[pNKernel->matrix[11]] ; ---------------------------------------------------------------------- mov ebx, [esi+48] ; get accum[12+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[12]] mov al, [ecx+13] ; get pNKernel->matrix[13] add ebx, ebp ; accum[12+pNKernel->PClass] += product[ ; pNKernel->matrix[12]] mov edx, [esi+52] ; get accum[13+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[13]] mov al, [ecx+14] ; get pNKernel->matrix[14] add edx, ebp ; accum[13+pNkernel->PClass] += product[ ; pNKernel->matrix[13]] mov [esi+48], ebx ; store accum[pNKernel->PClass] += product[ ; pNKernel->matrix[13]] mov [esi+52], edx ; store accum[13+pNKernel->PClass] += ; product[pNKernel->matrix[13]] mov ebx, [esi+56] ; get accum[14+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[14]] mov al, [ecx+15] ; get pNKernel->matrix[15] add ebx, ebp mov edx, [esi+60] ; get accum[15+pNKernel->PClass] mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[15]] mov [esi+56], ebx ; store accum[14+pNKernel->PClass] += ; product[pNKernel->matrix[14]] add 
edx, ebp ; accum[15+pNKernel->PClass] += product[ ; pNKernel->matrix[15]] mov [esi+60], edx ; store accum[15+pNKernel->PClass] += ; product[pNKernel->matrix[15]] ret //////////////////////////////////////////////////////////////////////////// //assume parameters passed in by registers idct_bfly_intra: ; ---------------------------------------------------------------------- ; INTRA ONLY Butterfly and clamp ; Uses all registers. ; Uses all accumulators[64], accum ; Uses ClipPixIntra[2048] of DWORDS, ClipPixIntra ; Writes to Output matrix of BYTES, OutputCoeff ; ; Process 4 outputs per group, 0-15 ; 0 lea esi, [L_ACCUM+4] ; get addr of accum[0] mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of OutputCoeff add esi, 128 nop mov eax, [esi-128] ; get acc[0] mov ebx, [esi+64-128] ; get acc[16] mov ebp, [esi+128-128] ; get acc[32] mov edx, [esi+192-128] ; get acc[48] lea ecx, [eax+ebx] ; acc[0]+acc[16] sub eax, ebx ; acc[0]-acc[16] lea ebx, [ebp+edx] ; acc[32]+acc[48] sub ebp, edx ; acc[32]-acc[48] lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48] sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48]) lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48]) sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ;lea esi, [L_ACCUM+4] ; get addr of accum[0] sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi], dl ; output[0][0] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+7], cl ; output[0][7] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+7*PITCH], bl ; output[7][0] = tmp3 mov ebx, [esi+68-128] ; get acc[17] ; ------------------------------------------------------------------------- ; 1 mov BYTE PTR [edi+7*PITCH+7], al ; output[7][7] = tmp4 mov eax, [esi+4-128] ; 
get acc[1] mov ebp, [esi+132-128] ; get acc[33] mov edx, [esi+196-128] ; get acc[49] lea ecx, [eax+ebx] ; acc[1]+acc[17] sub eax, ebx ; acc[1]-acc[17] lea ebx, [ebp+edx] ; acc[33]+acc[49] sub ebp, edx ; acc[33]-acc[49] lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49] sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49]) lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49]) sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+1], dl ; output[0][1] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+6], cl ; output[0][6] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+7*PITCH+1], bl ; output[7][1] = tmp3 mov ebx, [esi+72-128] ; get acc[18] ; ------------------------------------------------------------------------- ; 2 mov BYTE PTR [edi+7*PITCH+6], al ; output[7][6] = tmp4 mov eax, [esi+8-128] ; get acc[2] mov ebp, [esi+136-128] ; get acc[34] mov edx, [esi+200-128] ; get acc[50] lea ecx, [eax+ebx] ; acc[2]+acc[18] sub eax, ebx ; acc[2]-acc[18] lea ebx, [ebp+edx] ; acc[34]+acc[50] sub ebp, edx ; acc[34]-acc[50] nop nop lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50] sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50]) lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50]) sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+2], dl ; output[0][2] = tmp1 mov bl, [ebp+ebx] ; tmp3 = 
ClipPixIntra[tmp3] mov BYTE PTR [edi+5], cl ; output[0][5] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+7*PITCH+2], bl ; output[7][2] = tmp3 mov ebx, [esi+76-128] ; get acc[19] ; ------------------------------------------------------------------------- ; 3 mov BYTE PTR [edi+7*PITCH+5], al ; output[7][5] = tmp4 mov eax, [esi+12-128] ; get acc[3] mov ebp, [esi+140-128] ; get acc[35] mov edx, [esi+204-128] ; get acc[51] lea ecx, [eax+ebx] ; acc[3]+acc[19] sub eax, ebx ; acc[3]-acc[19] lea ebx, [ebp+edx] ; acc[35]+acc[51] sub ebp, edx ; acc[35]-acc[51] lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51] sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51]) lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51]) sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+3], dl ; output[0][3] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+4], cl ; output[0][4] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+7*PITCH+3], bl ; output[7][3] = tmp3 mov ebx, [esi+80-128] ; get acc[20] ; ------------------------------------------------------------------------- ; 4 mov BYTE PTR [edi+7*PITCH+4], al ; output[7][4] = tmp4 mov eax, [esi+16-128] ; get acc[4] mov ebp, [esi+144-128] ; get acc[36] mov edx, [esi+208-128] ; get acc[52] lea ecx, [eax+ebx] ; acc[4]+acc[20] sub eax, ebx ; acc[4]-acc[20] lea ebx, [ebp+edx] ; acc[36]+acc[52] sub ebp, edx ; acc[36]-acc[52] lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52] sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52]) lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52]) sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52]) sar 
edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ;lea esi, [L_ACCUM+4] ; get addr of accum[0] sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+PITCH], dl ; output[1][0] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+PITCH+7], cl ; output[1][7] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+6*PITCH], bl ; output[6][0] = tmp3 mov ebx, [esi+84-128] ; get acc[21] ; ------------------------------------------------------------------------- ; 5 mov BYTE PTR [edi+6*PITCH+7], al ; output[6][7] = tmp4 mov eax, [esi+20-128] ; get acc[5] mov ebp, [esi+148-128] ; get acc[37] mov edx, [esi+212-128] ; get acc[53] lea ecx, [eax+ebx] ; acc[5]+acc[21] sub eax, ebx ; acc[5]-acc[21] lea ebx, [ebp+edx] ; acc[37]+acc[53] sub ebp, edx ; acc[37]-acc[53] lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53] sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53]) lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53]) sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+PITCH+1], dl ; output[1][1] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+PITCH+6], cl ; output[1][6] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+6*PITCH+1], bl ; output[6][1] = tmp3 mov ebx, [esi+88-128] ; get acc[22] ; ------------------------------------------------------------------------- ; 6 mov BYTE PTR [edi+6*PITCH+6], al ; output[6][6] = tmp4 mov eax, [esi+24-128] ; get acc[6] mov ebp, [esi+152-128] ; get acc[38] 
mov edx, [esi+216-128] ; get acc[54] lea ecx, [eax+ebx] ; acc[6]+acc[22] sub eax, ebx ; acc[6]-acc[22] lea ebx, [ebp+edx] ; acc[38]+acc[54] sub ebp, edx ; acc[38]-acc[54] lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54] sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54]) lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54]) sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+PITCH+2], dl ; output[1][2] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+PITCH+5], cl ; output[1][5] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+6*PITCH+2], bl ; output[6][2] = tmp3 mov ebx, [esi+92-128] ; get acc[23] ; ------------------------------------------------------------------------- ; 7 mov BYTE PTR [edi+6*PITCH+5], al ; output[6][5] = tmp4 mov eax, [esi+28-128] ; get acc[7] mov ebp, [esi+156-128] ; get acc[39] mov edx, [esi+220-128] ; get acc[55] lea ecx, [eax+ebx] ; acc[7]+acc[23] sub eax, ebx ; acc[7]-acc[23] lea ebx, [ebp+edx] ; acc[39]+acc[55] sub ebp, edx ; acc[39]-acc[55] lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55] sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55]) lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55]) sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+PITCH+3], dl ; output[1][3] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR 
[edi+PITCH+4], cl ; output[1][4] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+6*PITCH+3], bl ; output[6][3] = tmp3 mov ebx, [esi+96-128] ; get acc[24] ; ------------------------------------------------------------------------- ; 8 mov BYTE PTR [edi+6*PITCH+4], al ; output[6][4] = tmp4 mov eax, [esi+32-128] ; get acc[8] mov ebp, [esi+160-128] ; get acc[40] mov edx, [esi+224-128] ; get acc[56] lea ecx, [eax+ebx] ; acc[8]+acc[24] sub eax, ebx ; acc[8]-acc[24] lea ebx, [ebp+edx] ; acc[40]+acc[56] sub ebp, edx ; acc[40]-acc[56] lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56] sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56]) lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56]) sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ;lea esi, [L_ACCUM+4] ; get addr of accum[0] sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+2*PITCH], dl ; output[2][0] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+2*PITCH+7], cl ; output[2][7] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+5*PITCH], bl ; output[5][0] = tmp3 mov ebx, [esi+100-128] ; get acc[25] ; ------------------------------------------------------------------------- ; 9 mov BYTE PTR [edi+5*PITCH+7], al ; output[5][7] = tmp4 mov eax, [esi+36-128] ; get acc[9] mov ebp, [esi+164-128] ; get acc[41] mov edx, [esi+228-128] ; get acc[57] lea ecx, [eax+ebx] ; acc[9]+acc[25] sub eax, ebx ; acc[9]-acc[25] lea ebx, [ebp+edx] ; acc[41]+acc[57] sub ebp, edx ; acc[41]-acc[57] lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57] sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57]) lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57]) sub eax, ebp ; tmp4 = 
acc[9]-acc[25] - (acc[41]-acc[57]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+2*PITCH+1], dl ; output[2][1] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+2*PITCH+6], cl ; output[2][6] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+5*PITCH+1], bl ; output[5][1] = tmp3 mov ebx, [esi+104-128] ; get acc[26] ; ------------------------------------------------------------------------- ; 10 mov BYTE PTR [edi+5*PITCH+6], al ; output[5][6] = tmp4 mov eax, [esi+40-128] ; get acc[10] mov ebp, [esi+168-128] ; get acc[42] mov edx, [esi+232-128] ; get acc[58] lea ecx, [eax+ebx] ; acc[10]+acc[26] sub eax, ebx ; acc[10]-acc[26] lea ebx, [ebp+edx] ; acc[42]+acc[58] sub ebp, edx ; acc[42]-acc[58] lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58] sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58]) lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58]) sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+2*PITCH+2], dl ; output[2][2] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+2*PITCH+5], cl ; output[2][5] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+5*PITCH+2], bl ; output[5][2] = tmp3 mov ebx, [esi+108-128] ; get acc[27] ; ------------------------------------------------------------------------- ; 11 mov BYTE PTR [edi+5*PITCH+5], al ; output[5][5] = tmp4 mov eax, [esi+44-128] ; get acc[11] mov ebp, 
[esi+172-128] ; get acc[43] mov edx, [esi+236-128] ; get acc[59] lea ecx, [eax+ebx] ; acc[11]+acc[27] sub eax, ebx ; acc[11]-acc[27] lea ebx, [ebp+edx] ; acc[43]+acc[59] sub ebp, edx ; acc[43]-acc[59] lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59] sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59]) lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59]) sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+2*PITCH+3], dl ; output[2][3] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+2*PITCH+4], cl ; output[2][4] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+5*PITCH+3], bl ; output[5][3] = tmp3 mov ebx, [esi+112-128] ; get acc[28] ; ------------------------------------------------------------------------- ; 12 mov BYTE PTR [edi+5*PITCH+4], al ; output[5][4] = tmp4 mov eax, [esi+48-128] ; get acc[12] mov ebp, [esi+176-128] ; get acc[44] mov edx, [esi+240-128] ; get acc[60] lea ecx, [eax+ebx] ; acc[12]+acc[28] sub eax, ebx ; acc[12]-acc[28] lea ebx, [ebp+edx] ; acc[44]+acc[60] sub ebp, edx ; acc[44]-acc[60] lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60] sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60]) lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60]) sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ;lea esi, [L_ACCUM+4] ; get addr of accum[0] sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+3*PITCH], dl ; 
output[3][0] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+3*PITCH+7], cl ; output[3][7] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+4*PITCH], bl ; output[4][0] = tmp3 mov ebx, [esi+116-128] ; get acc[29] ; ------------------------------------------------------------------------- ; 13 mov BYTE PTR [edi+4*PITCH+7], al ; output[4][7] = tmp4 mov eax, [esi+52-128] ; get acc[13] mov ebp, [esi+180-128] ; get acc[45] mov edx, [esi+244-128] ; get acc[61] lea ecx, [eax+ebx] ; acc[13]+acc[29] sub eax, ebx ; acc[13]-acc[29] lea ebx, [ebp+edx] ; acc[45]+acc[61] sub ebp, edx ; acc[45]-acc[61] lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61] sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61]) lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61]) sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+3*PITCH+1], dl ; output[3][1] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+3*PITCH+6], cl ; output[3][6] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+4*PITCH+1], bl ; output[4][1] = tmp3 mov ebx, [esi+120-128] ; get acc[30] ; ------------------------------------------------------------------------- ; 14 mov BYTE PTR [edi+4*PITCH+6], al ; output[4][6] = tmp4 mov eax, [esi+56-128] ; get acc[14] mov ebp, [esi+184-128] ; get acc[46] mov edx, [esi+248-128] ; get acc[62] lea ecx, [eax+ebx] ; acc[14]+acc[30] sub eax, ebx ; acc[14]-acc[30] lea ebx, [ebp+edx] ; acc[46]+acc[62] sub ebp, edx ; acc[46]-acc[62] lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62] sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62]) lea ebx, [eax+ebp] ; tmp3 = 
acc[14]-acc[30] + (acc[46]-acc[62]) sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+3*PITCH+2], dl ; output[3][2] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+3*PITCH+5], cl ; output[3][5] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+4*PITCH+2], bl ; output[4][2] = tmp3 mov ebx, [esi+124-128] ; get acc[31] ; ------------------------------------------------------------------------- ; 15 mov BYTE PTR [edi+4*PITCH+5], al ; output[4][5] = tmp4 mov eax, [esi+60-128] ; get acc[15] mov ebp, [esi+188-128] ; get acc[47] mov edx, [esi+252-128] ; get acc[63] lea ecx, [eax+ebx] ; acc[15]+acc[31] sub eax, ebx ; acc[15]-acc[31] lea ebx, [ebp+edx] ; acc[47]+acc[63] sub ebp, edx ; acc[47]-acc[63] lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63] sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63]) lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63]) sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63]) sar edx, SCALER ; tmp1 >> 13 lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra sar ecx, SCALER ; tmp2 >> 13 ; sar ebx, SCALER ; tmp3 >> 13 mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1] sar eax, SCALER ; tmp4 >> 13 mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2] mov BYTE PTR [edi+3*PITCH+3], dl ; output[3][3] = tmp1 mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3] mov BYTE PTR [edi+3*PITCH+4], cl ; output[3][4] = tmp2 mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4] mov BYTE PTR [edi+4*PITCH+3], bl ; output[4][3] = tmp3 mov BYTE PTR [edi+4*PITCH+4], al ; output[4][4] = tmp4 ret //////////////////////////////////////////////////////////////////////////// //assume parameters passed in by 
// registers
idct_bfly_inter:
    ; ----------------------------------------------------------------------
    ; idct_bfly_inter -- INTER-block butterfly stage.
    ;
    ; For each group k = 0..15 (r = k / 4, c = k % 4) four accumulators
    ;     a = acc[k], b = acc[k+16], p = acc[k+32], q = acc[k+48]
    ; are combined and written to the 8x8 DWORD matrix Intermediate
    ; (row stride 32 bytes):
    ;     tmp1 = ((a+b) + (p+q)) >> SCALER  -> Intermediate[r]  [c]
    ;     tmp2 = ((a+b) - (p+q)) >> SCALER  -> Intermediate[r]  [7-c]
    ;     tmp3 = ((a-b) + (p-q)) >> SCALER  -> Intermediate[7-r][c]
    ;     tmp4 = ((a-b) - (p-q)) >> SCALER  -> Intermediate[7-r][7-c]
    ; The shifts are arithmetic (sar).  SCALER is defined outside this
    ; routine; the original comments describe it as a shift by 13.
    ;
    ; Unlike the INTRA butterfly, this routine performs NO clamping: it does
    ; no clamp-table lookup and stores the shifted 32-bit values as-is.
    ; (Per the Rev 1.8 change note at the top of the file, INTER output is
    ; clipped later, in dxblkadd().)
    ;
    ; Inputs (read from the esp-relative local frame, not from registers):
    ;     [L_DESTBLOCK+4]  base address of Intermediate
    ;     L_ACCUM+4        address of the 64-DWORD accumulator array acc[]
    ;   NOTE(review): the extra +4 presumably skips the return address pushed
    ;   by the call that enters this routine (it ends in ret) -- confirm
    ;   against the call site.
    ;   NOTE(review): the comment preceding this label says parameters are
    ;   passed in registers, yet no general register is read here before it
    ;   is written -- confirm which routine that remark refers to.
    ;
    ; Clobbers: eax, ebx, ecx, edx, esi, edi, ebp and the arithmetic flags.
    ;
    ; Addressing: esi and edi are both biased by +128 so that every
    ; displacement used below lies in -128..+124.
    ;
    ; Scheduling: the instruction order is hand-interleaved.  Each group
    ; preloads acc[k+17] into ebx for the next group, and the tmp4 store of
    ; a group is issued as the first instruction of the following group.
    ; Do not reorder instructions without re-checking these dependencies.
    ;
    ; NOTE (from the original author):
    ;   Code assumes that Intermediate and accumulator arrays are aligned at
    ;   cache-line boundary.
    ; ----------------------------------------------------------------------

    ; ----------------------------------------------------------------------
    ; Group 0: acc[0], acc[16], acc[32], acc[48] -> rows 0/7, cols 0/7
    mov     edi, [L_DESTBLOCK+4]                ; edi = base address of Intermediate
    lea     esi, [L_ACCUM+4+128]                ; esi = &acc[0] biased by 128
    add     edi, 128                            ; bias edi by 128 as well
    nop                                         ; NOTE(review): presumably a scheduling filler -- confirm
    mov     ebx, [esi+64-128]                   ; ebx = acc[16]
    mov     eax, [esi-128]                      ; eax = acc[0]   (original note: bank conflict)
    ; Disabled output-line prefetches, kept from the original for reference:
    ; mov   edx, [edi-128]                      ; pre-fetch line 0; 4 to avoid bank conflict
    ; mov   ecx, [edi+1*32-128+4]               ; pre-fetch line 1
    ; mov   edx, [edi+2*32-128]                 ; pre-fetch line 2
    ; mov   ecx, [edi+3*32-128+4]               ; pre-fetch line 3
    ; mov   edx, [edi+4*32-128]                 ; pre-fetch line 4
    ; mov   ecx, [edi+5*32-128+4]               ; pre-fetch line 5
    ; mov   edx, [edi+6*32-128]                 ; pre-fetch line 6
    ; mov   ecx, [edi+7*32-128+4]               ; pre-fetch line 7
    mov     ebp, [esi+128-128]                  ; ebp = acc[32]
    lea     ecx, [eax+ebx]                      ; ecx = acc[0]+acc[16]
    mov     edx, [esi+192-128]                  ; edx = acc[48]
    sub     eax, ebx                            ; eax = acc[0]-acc[16]
    lea     ebx, [ebp+edx]                      ; ebx = acc[32]+acc[48]
    sub     ebp, edx                            ; ebp = acc[32]-acc[48]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[0]+acc[16]) + (acc[32]+acc[48])
    sub     ecx, ebx                            ; tmp2 = (acc[0]+acc[16]) - (acc[32]+acc[48])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[0]-acc[16]) + (acc[32]-acc[48])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[0]-acc[16]) - (acc[32]-acc[48])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi-128], edx            ; Intermediate[0][0] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+7*4-128], ecx        ; Intermediate[0][7] = tmp2
    mov     DWORD PTR [edi+7*32-128], ebx       ; Intermediate[7][0] = tmp3
    mov     ebx, [esi+68-128]                   ; preload acc[17] for group 1

    ; ----------------------------------------------------------------------
    ; Group 1: acc[1], acc[17], acc[33], acc[49] -> rows 0/7, cols 1/6
    mov     DWORD PTR [edi+7*32+7*4-128], eax   ; Intermediate[7][7] = tmp4 (deferred from group 0)
    mov     eax, [esi+4-128]                    ; eax = acc[1]
    mov     ebp, [esi+132-128]                  ; ebp = acc[33]
    lea     ecx, [eax+ebx]                      ; ecx = acc[1]+acc[17]
    mov     edx, [esi+196-128]                  ; edx = acc[49]
    sub     eax, ebx                            ; eax = acc[1]-acc[17]
    lea     ebx, [ebp+edx]                      ; ebx = acc[33]+acc[49]
    sub     ebp, edx                            ; ebp = acc[33]-acc[49]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[1]+acc[17]) + (acc[33]+acc[49])
    sub     ecx, ebx                            ; tmp2 = (acc[1]+acc[17]) - (acc[33]+acc[49])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[1]-acc[17]) + (acc[33]-acc[49])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[1]-acc[17]) - (acc[33]-acc[49])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+1*4-128], edx        ; Intermediate[0][1] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+6*4-128], ecx        ; Intermediate[0][6] = tmp2
    mov     DWORD PTR [edi+7*32+1*4-128], ebx   ; Intermediate[7][1] = tmp3
    mov     ebx, [esi+72-128]                   ; preload acc[18] for group 2

    ; ----------------------------------------------------------------------
    ; Group 2: acc[2], acc[18], acc[34], acc[50] -> rows 0/7, cols 2/5
    mov     DWORD PTR [edi+7*32+6*4-128], eax   ; Intermediate[7][6] = tmp4 (deferred from group 1)
    mov     eax, [esi+8-128]                    ; eax = acc[2]
    mov     ebp, [esi+136-128]                  ; ebp = acc[34]
    lea     ecx, [eax+ebx]                      ; ecx = acc[2]+acc[18]
    mov     edx, [esi+200-128]                  ; edx = acc[50]
    sub     eax, ebx                            ; eax = acc[2]-acc[18]
    lea     ebx, [ebp+edx]                      ; ebx = acc[34]+acc[50]
    sub     ebp, edx                            ; ebp = acc[34]-acc[50]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[2]+acc[18]) + (acc[34]+acc[50])
    sub     ecx, ebx                            ; tmp2 = (acc[2]+acc[18]) - (acc[34]+acc[50])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[2]-acc[18]) + (acc[34]-acc[50])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[2]-acc[18]) - (acc[34]-acc[50])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+2*4-128], edx        ; Intermediate[0][2] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+5*4-128], ecx        ; Intermediate[0][5] = tmp2
    mov     DWORD PTR [edi+7*32+2*4-128], ebx   ; Intermediate[7][2] = tmp3
    mov     ebx, [esi+76-128]                   ; preload acc[19] for group 3

    ; ----------------------------------------------------------------------
    ; Group 3: acc[3], acc[19], acc[35], acc[51] -> rows 0/7, cols 3/4
    mov     DWORD PTR [edi+7*32+5*4-128], eax   ; Intermediate[7][5] = tmp4 (deferred from group 2)
    mov     eax, [esi+12-128]                   ; eax = acc[3]
    mov     ebp, [esi+140-128]                  ; ebp = acc[35]
    lea     ecx, [eax+ebx]                      ; ecx = acc[3]+acc[19]
    mov     edx, [esi+204-128]                  ; edx = acc[51]
    sub     eax, ebx                            ; eax = acc[3]-acc[19]
    lea     ebx, [ebp+edx]                      ; ebx = acc[35]+acc[51]
    sub     ebp, edx                            ; ebp = acc[35]-acc[51]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[3]+acc[19]) + (acc[35]+acc[51])
    sub     ecx, ebx                            ; tmp2 = (acc[3]+acc[19]) - (acc[35]+acc[51])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[3]-acc[19]) + (acc[35]-acc[51])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[3]-acc[19]) - (acc[35]-acc[51])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+3*4-128], edx        ; Intermediate[0][3] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+4*4-128], ecx        ; Intermediate[0][4] = tmp2
    mov     DWORD PTR [edi+7*32+3*4-128], ebx   ; Intermediate[7][3] = tmp3
    mov     ebx, [esi+80-128]                   ; preload acc[20] for group 4

    ; ----------------------------------------------------------------------
    ; Group 4: acc[4], acc[20], acc[36], acc[52] -> rows 1/6, cols 0/7
    mov     DWORD PTR [edi+7*32+4*4-128], eax   ; Intermediate[7][4] = tmp4 (deferred from group 3)
    mov     eax, [esi+16-128]                   ; eax = acc[4]
    mov     ebp, [esi+144-128]                  ; ebp = acc[36]
    lea     ecx, [eax+ebx]                      ; ecx = acc[4]+acc[20]
    mov     edx, [esi+208-128]                  ; edx = acc[52]
    sub     eax, ebx                            ; eax = acc[4]-acc[20]
    lea     ebx, [ebp+edx]                      ; ebx = acc[36]+acc[52]
    sub     ebp, edx                            ; ebp = acc[36]-acc[52]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[4]+acc[20]) + (acc[36]+acc[52])
    sub     ecx, ebx                            ; tmp2 = (acc[4]+acc[20]) - (acc[36]+acc[52])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[4]-acc[20]) + (acc[36]-acc[52])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[4]-acc[20]) - (acc[36]-acc[52])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+32-128], edx         ; Intermediate[1][0] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+32+7*4-128], ecx     ; Intermediate[1][7] = tmp2
    mov     DWORD PTR [edi+6*32-128], ebx       ; Intermediate[6][0] = tmp3
    mov     ebx, [esi+84-128]                   ; preload acc[21] for group 5

    ; ----------------------------------------------------------------------
    ; Group 5: acc[5], acc[21], acc[37], acc[53] -> rows 1/6, cols 1/6
    mov     DWORD PTR [edi+6*32+7*4-128], eax   ; Intermediate[6][7] = tmp4 (deferred from group 4)
    mov     eax, [esi+20-128]                   ; eax = acc[5]
    mov     ebp, [esi+148-128]                  ; ebp = acc[37]
    lea     ecx, [eax+ebx]                      ; ecx = acc[5]+acc[21]
    mov     edx, [esi+212-128]                  ; edx = acc[53]
    sub     eax, ebx                            ; eax = acc[5]-acc[21]
    lea     ebx, [ebp+edx]                      ; ebx = acc[37]+acc[53]
    sub     ebp, edx                            ; ebp = acc[37]-acc[53]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[5]+acc[21]) + (acc[37]+acc[53])
    sub     ecx, ebx                            ; tmp2 = (acc[5]+acc[21]) - (acc[37]+acc[53])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[5]-acc[21]) + (acc[37]-acc[53])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[5]-acc[21]) - (acc[37]-acc[53])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+32+1*4-128], edx     ; Intermediate[1][1] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+32+6*4-128], ecx     ; Intermediate[1][6] = tmp2
    mov     DWORD PTR [edi+6*32+1*4-128], ebx   ; Intermediate[6][1] = tmp3
    mov     ebx, [esi+88-128]                   ; preload acc[22] for group 6

    ; ----------------------------------------------------------------------
    ; Group 6: acc[6], acc[22], acc[38], acc[54] -> rows 1/6, cols 2/5
    mov     DWORD PTR [edi+6*32+6*4-128], eax   ; Intermediate[6][6] = tmp4 (deferred from group 5)
    mov     eax, [esi+24-128]                   ; eax = acc[6]   (original note: bank conflict)
    mov     ebp, [esi+152-128]                  ; ebp = acc[38]
    lea     ecx, [eax+ebx]                      ; ecx = acc[6]+acc[22]
    mov     edx, [esi+216-128]                  ; edx = acc[54]
    sub     eax, ebx                            ; eax = acc[6]-acc[22]
    lea     ebx, [ebp+edx]                      ; ebx = acc[38]+acc[54]
    sub     ebp, edx                            ; ebp = acc[38]-acc[54]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[6]+acc[22]) + (acc[38]+acc[54])
    sub     ecx, ebx                            ; tmp2 = (acc[6]+acc[22]) - (acc[38]+acc[54])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[6]-acc[22]) + (acc[38]-acc[54])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[6]-acc[22]) - (acc[38]-acc[54])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+32+2*4-128], edx     ; Intermediate[1][2] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+32+5*4-128], ecx     ; Intermediate[1][5] = tmp2
    mov     DWORD PTR [edi+6*32+2*4-128], ebx   ; Intermediate[6][2] = tmp3
    mov     ebx, [esi+92-128]                   ; preload acc[23] for group 7

    ; ----------------------------------------------------------------------
    ; Group 7: acc[7], acc[23], acc[39], acc[55] -> rows 1/6, cols 3/4
    mov     DWORD PTR [edi+6*32+5*4-128], eax   ; Intermediate[6][5] = tmp4 (deferred from group 6)
    mov     eax, [esi+28-128]                   ; eax = acc[7]
    mov     ebp, [esi+156-128]                  ; ebp = acc[39]
    lea     ecx, [eax+ebx]                      ; ecx = acc[7]+acc[23]
    mov     edx, [esi+220-128]                  ; edx = acc[55]
    sub     eax, ebx                            ; eax = acc[7]-acc[23]
    lea     ebx, [ebp+edx]                      ; ebx = acc[39]+acc[55]
    sub     ebp, edx                            ; ebp = acc[39]-acc[55]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[7]+acc[23]) + (acc[39]+acc[55])
    sub     ecx, ebx                            ; tmp2 = (acc[7]+acc[23]) - (acc[39]+acc[55])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[7]-acc[23]) + (acc[39]-acc[55])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[7]-acc[23]) - (acc[39]-acc[55])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+32+3*4-128], edx     ; Intermediate[1][3] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+32+4*4-128], ecx     ; Intermediate[1][4] = tmp2
    mov     DWORD PTR [edi+6*32+3*4-128], ebx   ; Intermediate[6][3] = tmp3
    mov     ebx, [esi+96-128]                   ; preload acc[24] for group 8

    ; ----------------------------------------------------------------------
    ; Group 8: acc[8], acc[24], acc[40], acc[56] -> rows 2/5, cols 0/7
    mov     DWORD PTR [edi+6*32+4*4-128], eax   ; Intermediate[6][4] = tmp4 (deferred from group 7)
    mov     eax, [esi+32-128]                   ; eax = acc[8]
    mov     ebp, [esi+160-128]                  ; ebp = acc[40]
    lea     ecx, [eax+ebx]                      ; ecx = acc[8]+acc[24]
    mov     edx, [esi+224-128]                  ; edx = acc[56]
    sub     eax, ebx                            ; eax = acc[8]-acc[24]
    lea     ebx, [ebp+edx]                      ; ebx = acc[40]+acc[56]
    sub     ebp, edx                            ; ebp = acc[40]-acc[56]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[8]+acc[24]) + (acc[40]+acc[56])
    sub     ecx, ebx                            ; tmp2 = (acc[8]+acc[24]) - (acc[40]+acc[56])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[8]-acc[24]) + (acc[40]-acc[56])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[8]-acc[24]) - (acc[40]-acc[56])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+2*32-128], edx       ; Intermediate[2][0] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+2*32+7*4-128], ecx   ; Intermediate[2][7] = tmp2
    mov     DWORD PTR [edi+5*32-128], ebx       ; Intermediate[5][0] = tmp3
    mov     ebx, [esi+100-128]                  ; preload acc[25] for group 9

    ; ----------------------------------------------------------------------
    ; Group 9: acc[9], acc[25], acc[41], acc[57] -> rows 2/5, cols 1/6
    mov     DWORD PTR [edi+5*32+7*4-128], eax   ; Intermediate[5][7] = tmp4 (deferred from group 8)
    mov     eax, [esi+36-128]                   ; eax = acc[9]
    mov     ebp, [esi+164-128]                  ; ebp = acc[41]
    lea     ecx, [eax+ebx]                      ; ecx = acc[9]+acc[25]
    mov     edx, [esi+228-128]                  ; edx = acc[57]
    sub     eax, ebx                            ; eax = acc[9]-acc[25]
    lea     ebx, [ebp+edx]                      ; ebx = acc[41]+acc[57]
    sub     ebp, edx                            ; ebp = acc[41]-acc[57]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[9]+acc[25]) + (acc[41]+acc[57])
    sub     ecx, ebx                            ; tmp2 = (acc[9]+acc[25]) - (acc[41]+acc[57])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[9]-acc[25]) + (acc[41]-acc[57])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[9]-acc[25]) - (acc[41]-acc[57])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+2*32+1*4-128], edx   ; Intermediate[2][1] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+2*32+6*4-128], ecx   ; Intermediate[2][6] = tmp2
    mov     DWORD PTR [edi+5*32+1*4-128], ebx   ; Intermediate[5][1] = tmp3
    mov     ebx, [esi+104-128]                  ; preload acc[26] for group 10

    ; ----------------------------------------------------------------------
    ; Group 10: acc[10], acc[26], acc[42], acc[58] -> rows 2/5, cols 2/5
    mov     DWORD PTR [edi+5*32+6*4-128], eax   ; Intermediate[5][6] = tmp4 (deferred from group 9)
    mov     eax, [esi+40-128]                   ; eax = acc[10]
    mov     ebp, [esi+168-128]                  ; ebp = acc[42]
    lea     ecx, [eax+ebx]                      ; ecx = acc[10]+acc[26]
    mov     edx, [esi+232-128]                  ; edx = acc[58]
    sub     eax, ebx                            ; eax = acc[10]-acc[26]
    lea     ebx, [ebp+edx]                      ; ebx = acc[42]+acc[58]
    sub     ebp, edx                            ; ebp = acc[42]-acc[58]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[10]+acc[26]) + (acc[42]+acc[58])
    sub     ecx, ebx                            ; tmp2 = (acc[10]+acc[26]) - (acc[42]+acc[58])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[10]-acc[26]) + (acc[42]-acc[58])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[10]-acc[26]) - (acc[42]-acc[58])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+2*32+2*4-128], edx   ; Intermediate[2][2] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+2*32+5*4-128], ecx   ; Intermediate[2][5] = tmp2
    mov     DWORD PTR [edi+5*32+2*4-128], ebx   ; Intermediate[5][2] = tmp3
    mov     ebx, [esi+108-128]                  ; preload acc[27] for group 11

    ; ----------------------------------------------------------------------
    ; Group 11: acc[11], acc[27], acc[43], acc[59] -> rows 2/5, cols 3/4
    mov     DWORD PTR [edi+5*32+5*4-128], eax   ; Intermediate[5][5] = tmp4 (deferred from group 10)
    mov     eax, [esi+44-128]                   ; eax = acc[11]
    mov     ebp, [esi+172-128]                  ; ebp = acc[43]
    lea     ecx, [eax+ebx]                      ; ecx = acc[11]+acc[27]
    mov     edx, [esi+236-128]                  ; edx = acc[59]
    sub     eax, ebx                            ; eax = acc[11]-acc[27]
    lea     ebx, [ebp+edx]                      ; ebx = acc[43]+acc[59]
    sub     ebp, edx                            ; ebp = acc[43]-acc[59]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[11]+acc[27]) + (acc[43]+acc[59])
    sub     ecx, ebx                            ; tmp2 = (acc[11]+acc[27]) - (acc[43]+acc[59])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[11]-acc[27]) + (acc[43]-acc[59])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[11]-acc[27]) - (acc[43]-acc[59])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+2*32+3*4-128], edx   ; Intermediate[2][3] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+2*32+4*4-128], ecx   ; Intermediate[2][4] = tmp2
    mov     DWORD PTR [edi+5*32+3*4-128], ebx   ; Intermediate[5][3] = tmp3
    mov     ebx, [esi+112-128]                  ; preload acc[28] for group 12

    ; ----------------------------------------------------------------------
    ; Group 12: acc[12], acc[28], acc[44], acc[60] -> rows 3/4, cols 0/7
    mov     DWORD PTR [edi+5*32+4*4-128], eax   ; Intermediate[5][4] = tmp4 (deferred from group 11)
    mov     eax, [esi+48-128]                   ; eax = acc[12]  (original note: bank conflict)
    mov     ebp, [esi+176-128]                  ; ebp = acc[44]
    lea     ecx, [eax+ebx]                      ; ecx = acc[12]+acc[28]
    mov     edx, [esi+240-128]                  ; edx = acc[60]
    sub     eax, ebx                            ; eax = acc[12]-acc[28]
    lea     ebx, [ebp+edx]                      ; ebx = acc[44]+acc[60]
    sub     ebp, edx                            ; ebp = acc[44]-acc[60]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[12]+acc[28]) + (acc[44]+acc[60])
    sub     ecx, ebx                            ; tmp2 = (acc[12]+acc[28]) - (acc[44]+acc[60])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[12]-acc[28]) + (acc[44]-acc[60])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[12]-acc[28]) - (acc[44]-acc[60])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+3*32-128], edx       ; Intermediate[3][0] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+3*32+7*4-128], ecx   ; Intermediate[3][7] = tmp2
    mov     DWORD PTR [edi+4*32-128], ebx       ; Intermediate[4][0] = tmp3
    mov     ebx, [esi+116-128]                  ; preload acc[29] for group 13

    ; ----------------------------------------------------------------------
    ; Group 13: acc[13], acc[29], acc[45], acc[61] -> rows 3/4, cols 1/6
    mov     DWORD PTR [edi+4*32+7*4-128], eax   ; Intermediate[4][7] = tmp4 (deferred from group 12)
    mov     eax, [esi+52-128]                   ; eax = acc[13]
    mov     ebp, [esi+180-128]                  ; ebp = acc[45]
    lea     ecx, [eax+ebx]                      ; ecx = acc[13]+acc[29]
    mov     edx, [esi+244-128]                  ; edx = acc[61]
    sub     eax, ebx                            ; eax = acc[13]-acc[29]
    lea     ebx, [ebp+edx]                      ; ebx = acc[45]+acc[61]
    sub     ebp, edx                            ; ebp = acc[45]-acc[61]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[13]+acc[29]) + (acc[45]+acc[61])
    sub     ecx, ebx                            ; tmp2 = (acc[13]+acc[29]) - (acc[45]+acc[61])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[13]-acc[29]) + (acc[45]-acc[61])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[13]-acc[29]) - (acc[45]-acc[61])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+3*32+1*4-128], edx   ; Intermediate[3][1] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+3*32+6*4-128], ecx   ; Intermediate[3][6] = tmp2
    mov     DWORD PTR [edi+4*32+1*4-128], ebx   ; Intermediate[4][1] = tmp3
    mov     ebx, [esi+120-128]                  ; preload acc[30] for group 14

    ; ----------------------------------------------------------------------
    ; Group 14: acc[14], acc[30], acc[46], acc[62] -> rows 3/4, cols 2/5
    mov     DWORD PTR [edi+4*32+6*4-128], eax   ; Intermediate[4][6] = tmp4 (deferred from group 13)
    mov     eax, [esi+56-128]                   ; eax = acc[14]  (original note: bank conflict)
    mov     ebp, [esi+184-128]                  ; ebp = acc[46]
    lea     ecx, [eax+ebx]                      ; ecx = acc[14]+acc[30]
    mov     edx, [esi+248-128]                  ; edx = acc[62]
    sub     eax, ebx                            ; eax = acc[14]-acc[30]
    lea     ebx, [ebp+edx]                      ; ebx = acc[46]+acc[62]
    sub     ebp, edx                            ; ebp = acc[46]-acc[62]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[14]+acc[30]) + (acc[46]+acc[62])
    sub     ecx, ebx                            ; tmp2 = (acc[14]+acc[30]) - (acc[46]+acc[62])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[14]-acc[30]) + (acc[46]-acc[62])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[14]-acc[30]) - (acc[46]-acc[62])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+3*32+2*4-128], edx   ; Intermediate[3][2] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+3*32+5*4-128], ecx   ; Intermediate[3][5] = tmp2
    mov     DWORD PTR [edi+4*32+2*4-128], ebx   ; Intermediate[4][2] = tmp3
    mov     ebx, [esi+124-128]                  ; preload acc[31] for group 15

    ; ----------------------------------------------------------------------
    ; Group 15: acc[15], acc[31], acc[47], acc[63] -> rows 3/4, cols 3/4
    mov     DWORD PTR [edi+4*32+5*4-128], eax   ; Intermediate[4][5] = tmp4 (deferred from group 14)
    mov     eax, [esi+60-128]                   ; eax = acc[15]
    mov     ebp, [esi+188-128]                  ; ebp = acc[47]
    lea     ecx, [eax+ebx]                      ; ecx = acc[15]+acc[31]
    mov     edx, [esi+252-128]                  ; edx = acc[63]
    sub     eax, ebx                            ; eax = acc[15]-acc[31]
    lea     ebx, [ebp+edx]                      ; ebx = acc[47]+acc[63]
    sub     ebp, edx                            ; ebp = acc[47]-acc[63]
    lea     edx, [ecx+ebx]                      ; tmp1 = (acc[15]+acc[31]) + (acc[47]+acc[63])
    sub     ecx, ebx                            ; tmp2 = (acc[15]+acc[31]) - (acc[47]+acc[63])
    sar     edx, SCALER                         ; tmp1 >>= SCALER
    lea     ebx, [eax+ebp]                      ; tmp3 = (acc[15]-acc[31]) + (acc[47]-acc[63])
    sar     ecx, SCALER                         ; tmp2 >>= SCALER
    sub     eax, ebp                            ; tmp4 = (acc[15]-acc[31]) - (acc[47]-acc[63])
    sar     ebx, SCALER                         ; tmp3 >>= SCALER
    mov     DWORD PTR [edi+3*32+3*4-128], edx   ; Intermediate[3][3] = tmp1
    sar     eax, SCALER                         ; tmp4 >>= SCALER
    mov     DWORD PTR [edi+3*32+4*4-128], ecx   ; Intermediate[3][4] = tmp2
    mov     DWORD PTR [edi+4*32+3*4-128], ebx   ; Intermediate[4][3] = tmp3
    mov     DWORD PTR [edi+4*32+4*4-128], eax   ; Intermediate[4][4] = tmp4 (last store; nothing left to defer)
    ret                                         ; return to the caller inside the same asm block

  } //end of asm
}
#pragma code_seg()