Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1267 lines
67 KiB

// version 003; everything except 1) segment
//
/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
//////////////////////////////////////////////////////////////////////////
// $Author: AGUPTA2 $
// $Date: 25 Oct 1996 13:32:28 $
// $Archive: S:\h26x\src\dec\d3idct.cpv $
// $Header: S:\h26x\src\dec\d3idct.cpv 1.11 25 Oct 1996 13:32:28 AGUPTA2 $
// $Log: S:\h26x\src\dec\d3idct.cpv $
//
// Rev 1.11 25 Oct 1996 13:32:28 AGUPTA2
// Re-scheduled butterfky code; re-arranged local var declarations.
//
// Rev 1.10 30 Aug 1996 08:39:56 KLILLEVO
// added C version of block edge filter, and changed the bias in
// ClampTbl[] from 128 to CLAMP_BIAS (defined to 128)
// The C version of the block edge filter takes up way too much CPU time
// relative to the rest of the decode time (4 ms for QCIF and 16 ms
// for CIF on a P120, so this needs to coded in assembly)
//
// Rev 1.9 17 Jul 1996 15:33:18 AGUPTA2
// Increased the size of clamping table ClampTbl to 128+256+128.
//
// Rev 1.8 08 Mar 1996 16:46:20 AGUPTA2
// Added pragma code_seg. Rolled the initialization code. Got rid of most
// of 32-bit displacements in instructions. Aligned frequently executed loops
// at 4-byte boundary. Made changes to reflect new size of MapMatrix. Removed
// nop instructions. Deleted code that prefetches output lines in case of
// INTRA blocks. Use ClampTbl instead of ClipPixIntra. Do not clip output
// of INTER blocks; clipping is done in dxblkadd().
//
//
// Rev 1.7 27 Dec 1995 14:36:06 RMCKENZX
// Added copyright notice
//
// Rev 1.6 09 Dec 1995 17:33:20 RMCKENZX
// Re-checked in module to support decoder re-architecture (thru PB Frames)
//
// Rev 1.4 30 Nov 1995 18:02:14 CZHU
// Save and restore register before and after idct_acc
//
// Rev 1.1 27 Nov 1995 13:13:28 CZHU
//
//
// Rev 1.0 27 Nov 1995 13:08:24 CZHU
// Initial revision.
//
//Block level decoding for H.26x decoder
#include "precomp.h"
/////////////////////////////////////////////////////////////////////////
// Decode each none-empty block
// Input: lpInst: decoder instance,
// lpSrc: input bitstream,
// lpBlockAction:
// the pointer to the block action stream structure
// bitsread: number of bits in the buffer already,
/////////////////////////////////////////////////////////////////////////
// local variable definitions
#define FRAMEPOINTER esp
//////////////////////////////////////////////////////////////
// L_ACCUM MUST BE LAST 256 BYTES OF A PAGE
/////////////////////////////////////////////////////////////
#define L_PRODUCT FRAMEPOINTER + 0 // 20 DWORD
#define L_INPUT_INTER L_PRODUCT + 20*4 // DWORD
#define L_esi L_INPUT_INTER + 1*4 // DWORD
#define L_NO_COEFF L_esi + 1*4 // DWORD
#define L_DESTBLOCK L_NO_COEFF + 1*4 // DWORD
#define L_LOOPCOUNTER L_DESTBLOCK + 1*4 // DWORD
#define L_STASHESP L_LOOPCOUNTER + 1*4 // DWORD
#define L_dummy L_STASHESP + 1*4 // 6 DWORDS
#define L_ACCUM L_dummy + 6*4 // 64 DWORD
#define LOCALSIZE (96*4) // 96 DWORDS;multiple of cache line size
////////////////////////////////////////////////////////////////////////////////
// Input:
// pIQ_INDEX, pointer to pointer for Inverse quantization and index
// for the current block.
// No_Coeff, A 32 bit number indicate block types, etc.
// 0--63, inter block, number of coeff
// 64--127 64+ intra block, number of coeff
// pIntraBuf, Buffer pointer for intra blocks.
//
// pInterBuf, Buffer pointer for inter blocks.
//
//
// return:
//
//////////////////////////////////////////////////////////////////////////////////
#pragma code_seg("IACODE2")
__declspec(naked)
U32 DecodeBlock_IDCT ( U32 pIQ_INDEX,
U32 No_Coeff,
U32 pIntraBuf,
U32 pInterBuf)
{
__asm
{
////////////////////////////////////////////////////////////////
// DON'T CHANGE LOCAL DECLARATIONS OR STACK POINTER ADJUSTMENT
// CODE WITHOUT TALKING TO ATUL
////////////////////////////////////////////////////////////////
push ebp // save callers frame pointer
mov ebp, esp // make parameters accessible
push esi // assumed preserved
push edi
push ebx
mov eax, pInterBuf
mov edx, esp // Save old ESP in edx
and esp, -4096 // align at page boundary
xor esi, esi // loop init
sub esp, LOCALSIZE // last 96 DWORDS of page
lea edi, [L_ACCUM]
mov ebx, 64 // loop init
mov [L_STASHESP], edx // Save old esp
mov edx, No_Coeff
mov [L_INPUT_INTER], eax
mov eax, ROUNDER // loop init
;
/////////////////////////////////////////////////////////////////
// There is no point in pre-loading the cache. That is because
// after the first block it is likely to be in the cache.
//
loop_for_init:
mov [edi], eax
mov [edi+4], eax
mov [edi+ebx], esi
mov [edi+ebx+4], esi
mov [edi+ebx+8], esi
mov [edi+ebx+12], esi
mov [edi+ebx+16], esi
mov [edi+ebx+20], esi
add edi, 8
add ebx, 16
cmp ebx, 192
jl loop_for_init
/////////////////////////////////////////////////////////////////////
// end of new init code
//end of IDCT init.
cmp edx, 65
jg intra_block
mov ebx, pInterBuf
jmp pre_acc_loop
intra_block:
mov ebx, pIntraBuf
sub edx, 65
// register:
// ebp: loop counter
// ebx: inverse quant
// ecx: index [0,63]
pre_acc_loop:
mov esi, pIQ_INDEX
mov [L_DESTBLOCK], ebx
mov [L_esi], esi
ALIGN 4
acc_loop:
mov ebx,[esi+edx*8-8] //Invserse Quant
mov ecx,[esi+edx*8-4] //Coeff index
mov [L_NO_COEFF], edx
call idct_acc
mov esi, [L_esi]
mov edx, [L_NO_COEFF]
dec edx
jnz acc_loop
mov edx, [L_DESTBLOCK]
mov ecx, [L_INPUT_INTER]
cmp edx, ecx
jnz call_intra_bfly
call idct_bfly_inter
mov esp, [L_STASHESP] // free locals
add eax, edi
pop ebx
pop edi
pop esi
pop ebp
ret
call_intra_bfly:
call idct_bfly_intra
mov esp, [L_STASHESP] // free locals
add eax, edi
pop ebx
pop edi
pop esi
pop ebp
ret
///////////////////////////////////////////////////////////////
// assume parameter passed in by registers
// ebx, inverse quant
// ecx, index [0,63]
idct_acc:
; For every non-zero coefficient:
; LoopCounter, on local stack, has index
; ecx = index (0-63)
; ebx = non-zero input
; Note i = index
;
and ecx, 03fh ; Chad added to prevent GPF
mov [L_LOOPCOUNTER+4], ecx ; Store Loop counter
xor edx, edx ; zero out for byte read, use as dword
mov esi, ecx ; move index to esi
lea eax, Unique ; eax = Address of Unique[0]
mov ebp, ecx ; move index to ebp
shl esi, 3 ; index*8
add ecx, ecx ; index*2
add esi, ecx ; index*10
lea ecx, KernelCoeff ; get KernelCoeff[0][0]
lea edi, [L_PRODUCT+4] ; edi = address of product[0]
mov dl, [eax+ebp] ; get Unique[i]
lea esi, [ecx+4*esi] ; address of KernelCoeff[i][0]
mov ebp, edx ; ebp = Unique[i]
lea eax, [edi+edx*4] ; eax = address of product[totalU]
nop
; ----------------------------------------------------------------------
; Register usage
; eax = addr of product[Unique[i]]
; ebx = input[i]
; ecx = 0, -product[x]
; edx = KernelCoeff[i][x], product[x]= KernelCoeff[i][x] * input[i]
; ebp = x
; edi = addr of product[0]
; esi = addr of KernelCoeff[i][x]
ALIGN 4
loop_for_x:
xor ecx, ecx
mov edx, [esi+ebp*4-4] ; read KernelCoeff[i][x]
imul edx, ebx ; KernelCoeff[i][x] * input[i]
mov [edi+ebp*4-4], edx ; product[x] = result of imul
sub ecx, edx
mov [eax+ebp*4-4], ecx ; product[totalU+x] = -product[x]
dec ebp ; decrement x
jnz loop_for_x
; ----------------------------------------------------------------------
; Register usage
; eax = MapMatrix[0][0]
; ebx = PClass[0], accum[xxx]
; ecx = LoopCounter, addr of MapMatrix[i][0]
; edx = product[0], accum[PClass[i][0-15]]
; ebp = addr of accum[0], product[MapMatrix[i][0-15]]
; edi = addr of product[0]
; esi = PClass[i], address of accum[PClass[i]]
mov ecx, [L_LOOPCOUNTER+4] ; get i
and ecx, 0ffh ; Chad added to prevent GPF
lea ebx, PClass ; get addr of PClass[0]
mov esi, ecx
shl ecx, 4
lea eax, MapMatrix ; get addr of MapMatrix[0][0]
xor edx, edx
nop
mov dl, [ebx+esi] ; get PClass[i]
lea ecx, [eax+1*ecx] ; get addr of MapMatrix[i][0]
shl edx, 2 ; esi*4
lea esi, [L_ACCUM+4] ; get addr of accum[0]
; ----------------------------------------------------------------------
xor eax, eax ; get MapMatrix[i][0]
add esi, edx ; esi = address of accum[PClass[i]]
mov al, [ecx]
mov ebx, [esi] ; get accum[PClass[i]]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[0]]
mov al, [ecx+1] ; get pNKernel->matrix[1]
add ebx, ebp ; accum[pNKernel->PClass] += product[
; pNKernel->matrix[0]]
mov edx, [esi+4] ; get accum[1+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[1]]
mov al, [ecx+2] ; get pNKernel->matrix[2]
add edx, ebp ; accum[1+pNkernel->PClass] += product[
; pNKernel->matrix[1]]
mov [esi], ebx ; store accum[pNKernel->PClass] += product[
; pNKernel->matrix[0]]
mov [esi+4], edx ; store accum[1+pNKernel->PClass] +=
; product[pNKernel->matrix[1]]
mov ebx, [esi+8] ; get accum[2+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[2]]
mov al, [ecx+3] ; get pNKernel->matrix[3]
add ebx, ebp ; accum[2+pNKernel->PClass] += product[
; pNKernel->matrix[2]]
mov edx, [esi+12] ; get accum[3+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[3]]
mov al, [ecx+4] ; get pNKernel->matrix[4]
add edx, ebp ; accum[3+pNKernel->PClass] += product[
; pNKernel->matrix[3]]
mov [esi+8], ebx ; store accum[2+pNKernel->PClass] +=
; product[pNKernel->matrix[2]]
mov [esi+12], edx ; store accum[3+pNKernel->PClass] +=
; product[pNKernel->matrix[3]]
; ----------------------------------------------------------------------
mov ebx, [esi+16] ; get accum[4+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[4]]
mov al, [ecx+5] ; get pNKernel->matrix[5]
add ebx, ebp ; accum[4+pNKernel->PClass] += product[
; pNKernel->matrix[4]]
mov edx, [esi+20] ; get accum[5+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[5]]
mov al, [ecx+6] ; get pNKernel->matrix[6]
add edx, ebp ; accum[5+pNkernel->PClass] += product[
; pNKernel->matrix[5]]
mov [esi+16], ebx ; store accum[4+pNKernel->PClass] +=
; product[pNKernel->matrix[4]]
mov [esi+20], edx ; store accum[5+pNKernel->PClass] +=
; product[pNKernel->matrix[5]]
mov ebx, [esi+24] ; get accum[6+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[6]]
mov al, [ecx+7] ; get pNKernel->matrix[7]
add ebx, ebp
mov edx, [esi+28] ; get accum[7+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[7]]
mov al, [ecx+8] ; get pNKernel->matrix[8]
add edx, ebp ; accum[7+pNKernel->PClass] += product[
; pNKernel->matrix[7]]
mov [esi+24], ebx ; store accum[6+pNKernel->PClass] +=
; product[pNKernel->matrix[6]]
mov [esi+28], edx ; store accum[7+pNKernel->PClass] +=
; product[pNKernel->matrix[7]]
; ----------------------------------------------------------------------
mov ebx, [esi+32] ; get accum[8+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[8]]
mov al, [ecx+9] ; get pNKernel->matrix[9]
add ebx, ebp ; accum[8+pNKernel->PClass] += product[
; pNKernel->matrix[8]]
mov edx, [esi+36] ; get accum[9+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[9]]
mov al, [ecx+10] ; get pNKernel->matrix[10]
add edx, ebp ; accum[9+pNkernel->PClass] += product[
; pNKernel->matrix[9]]
mov [esi+32], ebx ; store accum[8+pNKernel->PClass] +=
; product[pNKernel->matrix[8]]
mov [esi+36], edx ; store accum[9+pNKernel->PClass] +=
; product[pNKernel->matrix[9]]
mov ebx, [esi+40] ; get accum[10+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[10]]
mov al, [ecx+11] ; get pNKernel->matrix[11]
add ebx, ebp
mov edx, [esi+44] ; get accum[11+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[11]]
; product[pNKernel->matrix[11]]
mov al, [ecx+12] ; get pNKernel->matrix[12]
add edx, ebp ; accum[11+pNKernel->PClass] += product[
; pNKernel->matrix[11]]
mov [esi+40], ebx ; store accum[10+pNKernel->PClass] +=
; product[pNKernel->matrix[10]]
mov [esi+44], edx ; store accum[11+pNKernel->PClass] +=
; product[pNKernel->matrix[11]]
; ----------------------------------------------------------------------
mov ebx, [esi+48] ; get accum[12+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[12]]
mov al, [ecx+13] ; get pNKernel->matrix[13]
add ebx, ebp ; accum[12+pNKernel->PClass] += product[
; pNKernel->matrix[12]]
mov edx, [esi+52] ; get accum[13+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[13]]
mov al, [ecx+14] ; get pNKernel->matrix[14]
add edx, ebp ; accum[13+pNkernel->PClass] += product[
; pNKernel->matrix[13]]
mov [esi+48], ebx ; store accum[pNKernel->PClass] += product[
; pNKernel->matrix[13]]
mov [esi+52], edx ; store accum[13+pNKernel->PClass] +=
; product[pNKernel->matrix[13]]
mov ebx, [esi+56] ; get accum[14+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[14]]
mov al, [ecx+15] ; get pNKernel->matrix[15]
add ebx, ebp
mov edx, [esi+60] ; get accum[15+pNKernel->PClass]
mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[15]]
mov [esi+56], ebx ; store accum[14+pNKernel->PClass] +=
; product[pNKernel->matrix[14]]
add edx, ebp ; accum[15+pNKernel->PClass] += product[
; pNKernel->matrix[15]]
mov [esi+60], edx ; store accum[15+pNKernel->PClass] +=
; product[pNKernel->matrix[15]]
ret
////////////////////////////////////////////////////////////////////////////
//assume parameters passed in by registers
idct_bfly_intra:
; ----------------------------------------------------------------------
; INTRA ONLY Butterfly and clamp
; Uses all registers.
; Uses all accumulators[64], accum
; Uses ClipPixIntra[2048] of DWORDS, ClipPixIntra
; Writes to Output matrix of BYTES, OutputCoeff
;
; Process 4 outputs per group, 0-15
; 0
lea esi, [L_ACCUM+4] ; get addr of accum[0]
mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of OutputCoeff
add esi, 128
nop
mov eax, [esi-128] ; get acc[0]
mov ebx, [esi+64-128] ; get acc[16]
mov ebp, [esi+128-128] ; get acc[32]
mov edx, [esi+192-128] ; get acc[48]
lea ecx, [eax+ebx] ; acc[0]+acc[16]
sub eax, ebx ; acc[0]-acc[16]
lea ebx, [ebp+edx] ; acc[32]+acc[48]
sub ebp, edx ; acc[32]-acc[48]
lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;lea esi, [L_ACCUM+4] ; get addr of accum[0]
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi], dl ; output[0][0] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+7], cl ; output[0][7] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+7*PITCH], bl ; output[7][0] = tmp3
mov ebx, [esi+68-128] ; get acc[17]
; -------------------------------------------------------------------------
; 1
mov BYTE PTR [edi+7*PITCH+7], al ; output[7][7] = tmp4
mov eax, [esi+4-128] ; get acc[1]
mov ebp, [esi+132-128] ; get acc[33]
mov edx, [esi+196-128] ; get acc[49]
lea ecx, [eax+ebx] ; acc[1]+acc[17]
sub eax, ebx ; acc[1]-acc[17]
lea ebx, [ebp+edx] ; acc[33]+acc[49]
sub ebp, edx ; acc[33]-acc[49]
lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+1], dl ; output[0][1] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+6], cl ; output[0][6] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+7*PITCH+1], bl ; output[7][1] = tmp3
mov ebx, [esi+72-128] ; get acc[18]
; -------------------------------------------------------------------------
; 2
mov BYTE PTR [edi+7*PITCH+6], al ; output[7][6] = tmp4
mov eax, [esi+8-128] ; get acc[2]
mov ebp, [esi+136-128] ; get acc[34]
mov edx, [esi+200-128] ; get acc[50]
lea ecx, [eax+ebx] ; acc[2]+acc[18]
sub eax, ebx ; acc[2]-acc[18]
lea ebx, [ebp+edx] ; acc[34]+acc[50]
sub ebp, edx ; acc[34]-acc[50]
nop
nop
lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+2], dl ; output[0][2] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+5], cl ; output[0][5] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+7*PITCH+2], bl ; output[7][2] = tmp3
mov ebx, [esi+76-128] ; get acc[19]
; -------------------------------------------------------------------------
; 3
mov BYTE PTR [edi+7*PITCH+5], al ; output[7][5] = tmp4
mov eax, [esi+12-128] ; get acc[3]
mov ebp, [esi+140-128] ; get acc[35]
mov edx, [esi+204-128] ; get acc[51]
lea ecx, [eax+ebx] ; acc[3]+acc[19]
sub eax, ebx ; acc[3]-acc[19]
lea ebx, [ebp+edx] ; acc[35]+acc[51]
sub ebp, edx ; acc[35]-acc[51]
lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+3], dl ; output[0][3] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+4], cl ; output[0][4] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+7*PITCH+3], bl ; output[7][3] = tmp3
mov ebx, [esi+80-128] ; get acc[20]
; -------------------------------------------------------------------------
; 4
mov BYTE PTR [edi+7*PITCH+4], al ; output[7][4] = tmp4
mov eax, [esi+16-128] ; get acc[4]
mov ebp, [esi+144-128] ; get acc[36]
mov edx, [esi+208-128] ; get acc[52]
lea ecx, [eax+ebx] ; acc[4]+acc[20]
sub eax, ebx ; acc[4]-acc[20]
lea ebx, [ebp+edx] ; acc[36]+acc[52]
sub ebp, edx ; acc[36]-acc[52]
lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;lea esi, [L_ACCUM+4] ; get addr of accum[0]
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+PITCH], dl ; output[1][0] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+PITCH+7], cl ; output[1][7] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+6*PITCH], bl ; output[6][0] = tmp3
mov ebx, [esi+84-128] ; get acc[21]
; -------------------------------------------------------------------------
; 5
mov BYTE PTR [edi+6*PITCH+7], al ; output[6][7] = tmp4
mov eax, [esi+20-128] ; get acc[5]
mov ebp, [esi+148-128] ; get acc[37]
mov edx, [esi+212-128] ; get acc[53]
lea ecx, [eax+ebx] ; acc[5]+acc[21]
sub eax, ebx ; acc[5]-acc[21]
lea ebx, [ebp+edx] ; acc[37]+acc[53]
sub ebp, edx ; acc[37]-acc[53]
lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+PITCH+1], dl ; output[1][1] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+PITCH+6], cl ; output[1][6] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+6*PITCH+1], bl ; output[6][1] = tmp3
mov ebx, [esi+88-128] ; get acc[22]
; -------------------------------------------------------------------------
; 6
mov BYTE PTR [edi+6*PITCH+6], al ; output[6][6] = tmp4
mov eax, [esi+24-128] ; get acc[6]
mov ebp, [esi+152-128] ; get acc[38]
mov edx, [esi+216-128] ; get acc[54]
lea ecx, [eax+ebx] ; acc[6]+acc[22]
sub eax, ebx ; acc[6]-acc[22]
lea ebx, [ebp+edx] ; acc[38]+acc[54]
sub ebp, edx ; acc[38]-acc[54]
lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+PITCH+2], dl ; output[1][2] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+PITCH+5], cl ; output[1][5] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+6*PITCH+2], bl ; output[6][2] = tmp3
mov ebx, [esi+92-128] ; get acc[23]
; -------------------------------------------------------------------------
; 7
mov BYTE PTR [edi+6*PITCH+5], al ; output[6][5] = tmp4
mov eax, [esi+28-128] ; get acc[7]
mov ebp, [esi+156-128] ; get acc[39]
mov edx, [esi+220-128] ; get acc[55]
lea ecx, [eax+ebx] ; acc[7]+acc[23]
sub eax, ebx ; acc[7]-acc[23]
lea ebx, [ebp+edx] ; acc[39]+acc[55]
sub ebp, edx ; acc[39]-acc[55]
lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+PITCH+3], dl ; output[1][3] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+PITCH+4], cl ; output[1][4] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+6*PITCH+3], bl ; output[6][3] = tmp3
mov ebx, [esi+96-128] ; get acc[24]
; -------------------------------------------------------------------------
; 8
mov BYTE PTR [edi+6*PITCH+4], al ; output[6][4] = tmp4
mov eax, [esi+32-128] ; get acc[8]
mov ebp, [esi+160-128] ; get acc[40]
mov edx, [esi+224-128] ; get acc[56]
lea ecx, [eax+ebx] ; acc[8]+acc[24]
sub eax, ebx ; acc[8]-acc[24]
lea ebx, [ebp+edx] ; acc[40]+acc[56]
sub ebp, edx ; acc[40]-acc[56]
lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56]
sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56])
lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56])
sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;lea esi, [L_ACCUM+4] ; get addr of accum[0]
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+2*PITCH], dl ; output[2][0] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+2*PITCH+7], cl ; output[2][7] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+5*PITCH], bl ; output[5][0] = tmp3
mov ebx, [esi+100-128] ; get acc[25]
; -------------------------------------------------------------------------
; 9
mov BYTE PTR [edi+5*PITCH+7], al ; output[5][7] = tmp4
mov eax, [esi+36-128] ; get acc[9]
mov ebp, [esi+164-128] ; get acc[41]
mov edx, [esi+228-128] ; get acc[57]
lea ecx, [eax+ebx] ; acc[9]+acc[25]
sub eax, ebx ; acc[9]-acc[25]
lea ebx, [ebp+edx] ; acc[41]+acc[57]
sub ebp, edx ; acc[41]-acc[57]
lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57]
sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57])
lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57])
sub eax, ebp ; tmp4 = acc[9]-acc[25] - (acc[41]-acc[57])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+2*PITCH+1], dl ; output[2][1] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+2*PITCH+6], cl ; output[2][6] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+5*PITCH+1], bl ; output[5][1] = tmp3
mov ebx, [esi+104-128] ; get acc[26]
; -------------------------------------------------------------------------
; 10
mov BYTE PTR [edi+5*PITCH+6], al ; output[5][6] = tmp4
mov eax, [esi+40-128] ; get acc[10]
mov ebp, [esi+168-128] ; get acc[42]
mov edx, [esi+232-128] ; get acc[58]
lea ecx, [eax+ebx] ; acc[10]+acc[26]
sub eax, ebx ; acc[10]-acc[26]
lea ebx, [ebp+edx] ; acc[42]+acc[58]
sub ebp, edx ; acc[42]-acc[58]
lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58]
sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58])
lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58])
sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+2*PITCH+2], dl ; output[2][2] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+2*PITCH+5], cl ; output[2][5] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+5*PITCH+2], bl ; output[5][2] = tmp3
mov ebx, [esi+108-128] ; get acc[27]
; -------------------------------------------------------------------------
; 11
mov BYTE PTR [edi+5*PITCH+5], al ; output[5][5] = tmp4
mov eax, [esi+44-128] ; get acc[11]
mov ebp, [esi+172-128] ; get acc[43]
mov edx, [esi+236-128] ; get acc[59]
lea ecx, [eax+ebx] ; acc[11]+acc[27]
sub eax, ebx ; acc[11]-acc[27]
lea ebx, [ebp+edx] ; acc[43]+acc[59]
sub ebp, edx ; acc[43]-acc[59]
lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59]
sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59])
lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59])
sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+2*PITCH+3], dl ; output[2][3] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+2*PITCH+4], cl ; output[2][4] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+5*PITCH+3], bl ; output[5][3] = tmp3
mov ebx, [esi+112-128] ; get acc[28]
; -------------------------------------------------------------------------
; 12
mov BYTE PTR [edi+5*PITCH+4], al ; output[5][4] = tmp4
mov eax, [esi+48-128] ; get acc[12]
mov ebp, [esi+176-128] ; get acc[44]
mov edx, [esi+240-128] ; get acc[60]
lea ecx, [eax+ebx] ; acc[12]+acc[28]
sub eax, ebx ; acc[12]-acc[28]
lea ebx, [ebp+edx] ; acc[44]+acc[60]
sub ebp, edx ; acc[44]-acc[60]
lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60]
sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60])
lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60])
sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;lea esi, [L_ACCUM+4] ; get addr of accum[0]
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+3*PITCH], dl ; output[3][0] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+3*PITCH+7], cl ; output[3][7] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+4*PITCH], bl ; output[4][0] = tmp3
mov ebx, [esi+116-128] ; get acc[29]
; -------------------------------------------------------------------------
; 13
mov BYTE PTR [edi+4*PITCH+7], al ; output[4][7] = tmp4
mov eax, [esi+52-128] ; get acc[13]
mov ebp, [esi+180-128] ; get acc[45]
mov edx, [esi+244-128] ; get acc[61]
lea ecx, [eax+ebx] ; acc[13]+acc[29]
sub eax, ebx ; acc[13]-acc[29]
lea ebx, [ebp+edx] ; acc[45]+acc[61]
sub ebp, edx ; acc[45]-acc[61]
lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61]
sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61])
lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61])
sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+3*PITCH+1], dl ; output[3][1] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+3*PITCH+6], cl ; output[3][6] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+4*PITCH+1], bl ; output[4][1] = tmp3
mov ebx, [esi+120-128] ; get acc[30]
; -------------------------------------------------------------------------
; 14
mov BYTE PTR [edi+4*PITCH+6], al ; output[4][6] = tmp4
mov eax, [esi+56-128] ; get acc[14]
mov ebp, [esi+184-128] ; get acc[46]
mov edx, [esi+248-128] ; get acc[62]
lea ecx, [eax+ebx] ; acc[14]+acc[30]
sub eax, ebx ; acc[14]-acc[30]
lea ebx, [ebp+edx] ; acc[46]+acc[62]
sub ebp, edx ; acc[46]-acc[62]
lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62]
sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62])
lea ebx, [eax+ebp] ; tmp3 = acc[14]-acc[30] + (acc[46]-acc[62])
sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+3*PITCH+2], dl ; output[3][2] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+3*PITCH+5], cl ; output[3][5] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+4*PITCH+2], bl ; output[4][2] = tmp3
mov ebx, [esi+124-128] ; get acc[31]
; -------------------------------------------------------------------------
; 15
mov BYTE PTR [edi+4*PITCH+5], al ; output[4][5] = tmp4
mov eax, [esi+60-128] ; get acc[15]
mov ebp, [esi+188-128] ; get acc[47]
mov edx, [esi+252-128] ; get acc[63]
lea ecx, [eax+ebx] ; acc[15]+acc[31]
sub eax, ebx ; acc[15]-acc[31]
lea ebx, [ebp+edx] ; acc[47]+acc[63]
sub ebp, edx ; acc[47]-acc[63]
lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63]
sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63])
lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63])
sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63])
sar edx, SCALER ; tmp1 >> 13
lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
sar ecx, SCALER ; tmp2 >> 13
;
sar ebx, SCALER ; tmp3 >> 13
mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
sar eax, SCALER ; tmp4 >> 13
mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
mov BYTE PTR [edi+3*PITCH+3], dl ; output[3][3] = tmp1
mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
mov BYTE PTR [edi+3*PITCH+4], cl ; output[3][4] = tmp2
mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
mov BYTE PTR [edi+4*PITCH+3], bl ; output[4][3] = tmp3
mov BYTE PTR [edi+4*PITCH+4], al ; output[4][4] = tmp4
ret
////////////////////////////////////////////////////////////////////////////
//assume parameters passed in by registers
idct_bfly_inter:
; ----------------------------------------------------------------------
; INTER ONLY Butterfly and clamp
; Uses all registers.
; Uses all accumulators[64], accum
; Uses ClipPixIntra[2048] of DWORDS, ClipPixIntra
; Writes to Intermediate matrix [8][8] of DWORDS, Intermediate
; NOTE:
; Code assumes that Intermediate and accumulator arrays are aligned at
; cache-line boundary
; Process 4 outputs per group, 0-15
; 0
mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of Intermediate
lea esi, [L_ACCUM+4+128] ; get addr of accum[0] biased by 128
add edi, 128
nop
mov ebx, [esi+64-128] ; get acc[16]
mov eax, [esi-128] ; get acc[0] bank conflict
; mov edx, [edi-128] ; pre-fetch line 0; 4 to avoid bank conflict
; mov ecx, [edi+1*32-128+4] ; pre-fetch line 1
; mov edx, [edi+2*32-128] ; pre-fetch line 2
; mov ecx, [edi+3*32-128+4] ; pre-fetch line 3
; mov edx, [edi+4*32-128] ; pre-fetch line 4
; mov ecx, [edi+5*32-128+4] ; pre-fetch line 5
; mov edx, [edi+6*32-128] ; pre-fetch line 6
; mov ecx, [edi+7*32-128+4] ; pre-fetch line 7
mov ebp, [esi+128-128] ; get acc[32]
lea ecx, [eax+ebx] ; acc[0]+acc[16]
mov edx, [esi+192-128] ; get acc[48]
sub eax, ebx ; acc[0]-acc[16]
lea ebx, [ebp+edx] ; acc[32]+acc[48]
sub ebp, edx ; acc[32]-acc[48]
lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi-128], edx ; Intermediate[0][0] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+7*4-128], ecx ; Intermediate[0][7] = tmp2
mov DWORD PTR [edi+7*32-128], ebx ; Intermediate[7][0] = tmp3
mov ebx, [esi+68-128] ; get acc[17]
; -------------------------------------------------------------------------
; 1
mov DWORD PTR [edi+7*32+7*4-128], eax ; Intermediate[7][7] = tmp4
mov eax, [esi+4-128] ; get acc[1]
mov ebp, [esi+132-128] ; get acc[33]
lea ecx, [eax+ebx] ; acc[1]+acc[17]
mov edx, [esi+196-128] ; get acc[49]
sub eax, ebx ; acc[1]-acc[17]
lea ebx, [ebp+edx] ; acc[33]+acc[49]
sub ebp, edx ; acc[33]-acc[49]
lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+1*4-128], edx ; Intermediate[0][1] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+6*4-128], ecx ; Intermediate[0][6] = tmp2
mov DWORD PTR [edi+7*32+1*4-128], ebx ; Intermediate[7][1] = tmp3
mov ebx, [esi+72-128] ; get acc[18]
; -------------------------------------------------------------------------
; 2
mov DWORD PTR [edi+7*32+6*4-128], eax ; Intermediate[7][6] = tmp4
mov eax, [esi+8-128] ; get acc[2]
mov ebp, [esi+136-128] ; get acc[34]
lea ecx, [eax+ebx] ; acc[2]+acc[18]
mov edx, [esi+200-128] ; get acc[50]
sub eax, ebx ; acc[2]-acc[18]
lea ebx, [ebp+edx] ; acc[34]+acc[50]
sub ebp, edx ; acc[34]-acc[50]
lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+2*4-128], edx ; Intermediate[0][2] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+5*4-128], ecx ; Intermediate[0][5] = tmp2
mov DWORD PTR [edi+7*32+2*4-128], ebx ; Intermediate[7][2] = tmp3
mov ebx, [esi+76-128] ; get acc[19]
; -------------------------------------------------------------------------
; 3
mov DWORD PTR [edi+7*32+5*4-128], eax ; Intermediate[7][5] = tmp4
mov eax, [esi+12-128] ; get acc[3]
mov ebp, [esi+140-128] ; get acc[35]
lea ecx, [eax+ebx] ; acc[3]+acc[19]
mov edx, [esi+204-128] ; get acc[51]
sub eax, ebx ; acc[3]-acc[19]
lea ebx, [ebp+edx] ; acc[35]+acc[51]
sub ebp, edx ; acc[35]-acc[51]
lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+3*4-128], edx ; Intermediate[0][3] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+4*4-128], ecx ; Intermediate[0][4] = tmp2
mov DWORD PTR [edi+7*32+3*4-128], ebx ; Intermediate[7][3] = tmp3
mov ebx, [esi+80-128] ; get acc[20]
; -------------------------------------------------------------------------
; 4
mov DWORD PTR [edi+7*32+4*4-128], eax ; Intermediate[7][4] = tmp4
mov eax, [esi+16-128] ; get acc[4]
mov ebp, [esi+144-128] ; get acc[36]
lea ecx, [eax+ebx] ; acc[4]+acc[20]
mov edx, [esi+208-128] ; get acc[52]
sub eax, ebx ; acc[4]-acc[20]
lea ebx, [ebp+edx] ; acc[36]+acc[52]
sub ebp, edx ; acc[36]-acc[52]
lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+32-128], edx ; Intermediate[1][0] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+32+7*4-128], ecx ; Intermediate[1][7] = tmp2
mov DWORD PTR [edi+6*32-128], ebx ; Intermediate[6][0] = tmp3
mov ebx, [esi+84-128] ; get acc[21]
; -------------------------------------------------------------------------
; 5
mov DWORD PTR [edi+6*32+7*4-128], eax ; Intermediate[6][7] = tmp4
mov eax, [esi+20-128] ; get acc[5]
mov ebp, [esi+148-128] ; get acc[37]
lea ecx, [eax+ebx] ; acc[5]+acc[21]
mov edx, [esi+212-128] ; get acc[53]
sub eax, ebx ; acc[5]-acc[21]
lea ebx, [ebp+edx] ; acc[37]+acc[53]
sub ebp, edx ; acc[37]-acc[53]
lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+32+1*4-128], edx ; Intermediate[1][1] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+32+6*4-128], ecx ; Intermediate[1][6] = tmp2
mov DWORD PTR [edi+6*32+1*4-128], ebx ; Intermediate[6][1] = tmp3
mov ebx, [esi+88-128] ; get acc[22]
; -------------------------------------------------------------------------
; 6
mov DWORD PTR [edi+6*32+6*4-128], eax ; Intermediate[6][6] = tmp4
mov eax, [esi+24-128] ; get acc[6] Bank conflict
mov ebp, [esi+152-128] ; get acc[38]
lea ecx, [eax+ebx] ; acc[6]+acc[22]
mov edx, [esi+216-128] ; get acc[54]
sub eax, ebx ; acc[6]-acc[22]
lea ebx, [ebp+edx] ; acc[38]+acc[54]
sub ebp, edx ; acc[38]-acc[54]
lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+32+2*4-128], edx ; Intermediate[1][2] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+32+5*4-128], ecx ; Intermediate[1][5] = tmp2
mov DWORD PTR [edi+6*32+2*4-128], ebx ; Intermediate[6][2] = tmp3
mov ebx, [esi+92-128] ; get acc[23]
; -------------------------------------------------------------------------
; 7
mov DWORD PTR [edi+6*32+5*4-128], eax ; Intermediate[6][5] = tmp4
mov eax, [esi+28-128] ; get acc[7]
mov ebp, [esi+156-128] ; get acc[39]
lea ecx, [eax+ebx] ; acc[7]+acc[23]
mov edx, [esi+220-128] ; get acc[55]
sub eax, ebx ; acc[7]-acc[23]
lea ebx, [ebp+edx] ; acc[39]+acc[55]
sub ebp, edx ; acc[39]-acc[55]
lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+32+3*4-128], edx ; Intermediate[1][3] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+32+4*4-128], ecx ; Intermediate[1][4] = tmp2
mov DWORD PTR [edi+6*32+3*4-128], ebx ; Intermediate[6][3] = tmp3
mov ebx, [esi+96-128] ; get acc[24]
; -------------------------------------------------------------------------
; 8
mov DWORD PTR [edi+6*32+4*4-128], eax ; Intermediate[6][4] = tmp4
mov eax, [esi+32-128] ; get acc[8]
mov ebp, [esi+160-128] ; get acc[40]
lea ecx, [eax+ebx] ; acc[8]+acc[24]
mov edx, [esi+224-128] ; get acc[56]
sub eax, ebx ; acc[8]-acc[24]
lea ebx, [ebp+edx] ; acc[40]+acc[56]
sub ebp, edx ; acc[40]-acc[56]
lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56]
sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+2*32-128], edx ; Intermediate[2][0] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+2*32+7*4-128], ecx ; Intermediate[2][7] = tmp2
mov DWORD PTR [edi+5*32-128], ebx ; Intermediate[5][0] = tmp3
mov ebx, [esi+100-128] ; get acc[25]
; -------------------------------------------------------------------------
; 9
mov DWORD PTR [edi+5*32+7*4-128], eax ; Intermediate[5][7] = tmp4
mov eax, [esi+36-128] ; get acc[9]
mov ebp, [esi+164-128] ; get acc[41]
lea ecx, [eax+ebx] ; acc[9]+acc[25]
mov edx, [esi+228-128] ; get acc[57]
sub eax, ebx ; acc[9]-acc[25]
lea ebx, [ebp+edx] ; acc[41]+acc[57]
sub ebp, edx ; acc[41]-acc[57]
lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57]
sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[9]-acc[25] - (acc[41]-acc[57])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+2*32+1*4-128], edx ; Intermediate[2][1] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+2*32+6*4-128], ecx ; Intermediate[2][6] = tmp2
mov DWORD PTR [edi+5*32+1*4-128], ebx ; Intermediate[5][1] = tmp3
mov ebx, [esi+104-128] ; get acc[26]
; -------------------------------------------------------------------------
; 10
mov DWORD PTR [edi+5*32+6*4-128], eax ; Intermediate[5][6] = tmp4
mov eax, [esi+40-128] ; get acc[10]
mov ebp, [esi+168-128] ; get acc[42]
lea ecx, [eax+ebx] ; acc[10]+acc[26]
mov edx, [esi+232-128] ; get acc[58]
sub eax, ebx ; acc[10]-acc[26]
lea ebx, [ebp+edx] ; acc[42]+acc[58]
sub ebp, edx ; acc[42]-acc[58]
lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58]
sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+2*32+2*4-128], edx ; Intermediate[2][2] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+2*32+5*4-128], ecx ; Intermediate[2][5] = tmp2
mov DWORD PTR [edi+5*32+2*4-128], ebx ; Intermediate[5][2] = tmp3
mov ebx, [esi+108-128] ; get acc[27]
; -------------------------------------------------------------------------
; 11
mov DWORD PTR [edi+5*32+5*4-128], eax ; Intermediate[5][5] = tmp4
mov eax, [esi+44-128] ; get acc[11]
mov ebp, [esi+172-128] ; get acc[43]
lea ecx, [eax+ebx] ; acc[11]+acc[27]
mov edx, [esi+236-128] ; get acc[59]
sub eax, ebx ; acc[11]-acc[27]
lea ebx, [ebp+edx] ; acc[43]+acc[59]
sub ebp, edx ; acc[43]-acc[59]
lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59]
sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+2*32+3*4-128], edx ; Intermediate[2][3] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+2*32+4*4-128], ecx ; Intermediate[2][4] = tmp2
mov DWORD PTR [edi+5*32+3*4-128], ebx ; Intermediate[5][3] = tmp3
mov ebx, [esi+112-128] ; get acc[28]
; -------------------------------------------------------------------------
; 12
mov DWORD PTR [edi+5*32+4*4-128], eax ; Intermediate[5][4] = tmp4
mov eax, [esi+48-128] ; get acc[12] Bank conflict
mov ebp, [esi+176-128] ; get acc[44]
lea ecx, [eax+ebx] ; acc[12]+acc[28]
mov edx, [esi+240-128] ; get acc[60]
sub eax, ebx ; acc[12]-acc[28]
lea ebx, [ebp+edx] ; acc[44]+acc[60]
sub ebp, edx ; acc[44]-acc[60]
lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60]
sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+3*32-128], edx ; Intermediate[3][0] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+3*32+7*4-128], ecx ; Intermediate[3][7] = tmp2
mov DWORD PTR [edi+4*32-128], ebx ; Intermediate[4][0] = tmp3
mov ebx, [esi+116-128] ; get acc[29]
; -------------------------------------------------------------------------
; 13
mov DWORD PTR [edi+4*32+7*4-128], eax ; Intermediate[4][7] = tmp4
mov eax, [esi+52-128] ; get acc[13]
mov ebp, [esi+180-128] ; get acc[45]
lea ecx, [eax+ebx] ; acc[13]+acc[29]
mov edx, [esi+244-128] ; get acc[61]
sub eax, ebx ; acc[13]-acc[29]
lea ebx, [ebp+edx] ; acc[45]+acc[61]
sub ebp, edx ; acc[45]-acc[61]
lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61]
sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+3*32+1*4-128], edx ; Intermediate[3][1] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+3*32+6*4-128], ecx ; Intermediate[3][6] = tmp2
mov DWORD PTR [edi+4*32+1*4-128], ebx ; Intermediate[4][1] = tmp3
mov ebx, [esi+120-128] ; get acc[30]
; -------------------------------------------------------------------------
; 14
mov DWORD PTR [edi+4*32+6*4-128], eax ; Intermediate[4][6] = tmp4
mov eax, [esi+56-128] ; get acc[14] Bank conflict
mov ebp, [esi+184-128] ; get acc[46]
lea ecx, [eax+ebx] ; acc[14]+acc[30]
mov edx, [esi+248-128] ; get acc[62]
sub eax, ebx ; acc[14]-acc[30]
lea ebx, [ebp+edx] ; acc[46]+acc[62]
sub ebp, edx ; acc[46]-acc[62]
lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62]
sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[14]-acc[30] + (acc[46]-acc[62])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+3*32+2*4-128], edx ; Intermediate[3][2] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+3*32+5*4-128], ecx ; Intermediate[3][5] = tmp2
mov DWORD PTR [edi+4*32+2*4-128], ebx ; Intermediate[4][2] = tmp3
mov ebx, [esi+124-128] ; get acc[31]
; -------------------------------------------------------------------------
; 15
mov DWORD PTR [edi+4*32+5*4-128], eax ; Intermediate[4][5] = tmp4
mov eax, [esi+60-128] ; get acc[15]
mov ebp, [esi+188-128] ; get acc[47]
lea ecx, [eax+ebx] ; acc[15]+acc[31]
mov edx, [esi+252-128] ; get acc[63]
sub eax, ebx ; acc[15]-acc[31]
lea ebx, [ebp+edx] ; acc[47]+acc[63]
sub ebp, edx ; acc[47]-acc[63]
lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63]
sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63])
sar edx, SCALER ; tmp1 >> 13
lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63])
sar ecx, SCALER ; tmp2 >> 13
sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63])
sar ebx, SCALER ; tmp3 >> 13
mov DWORD PTR [edi+3*32+3*4-128], edx ; Intermediate[3][3] = tmp1
sar eax, SCALER ; tmp4 >> 13
mov DWORD PTR [edi+3*32+4*4-128], ecx ; Intermediate[3][4] = tmp2
mov DWORD PTR [edi+4*32+3*4-128], ebx ; Intermediate[4][3] = tmp3
mov DWORD PTR [edi+4*32+4*4-128], eax ; Intermediate[4][4] = tmp4
ret
} //end of asm
}
#pragma code_seg()