|
|
/* *************************************************************************
** INTEL Corporation Proprietary Information ** ** This listing is supplied under the terms of a license ** agreement with INTEL Corporation and may not be copied ** nor disclosed except in accordance with the terms of ** that agreement. ** ** Copyright (c) 1995 Intel Corporation. ** All Rights Reserved. ** ** ************************************************************************* */
//////////////////////////////////////////////////////////////////////////
// $Author: AGUPTA2 $
// $Date: 22 Mar 1996 17:23:16 $
// $Archive: S:\h26x\src\dec\d3bvriq.cpv $
// $Header: S:\h26x\src\dec\d3bvriq.cpv 1.7 22 Mar 1996 17:23:16 AGUPTA2 $
// $Log: S:\h26x\src\dec\d3bvriq.cpv $
//
// Rev 1.7 22 Mar 1996 17:23:16 AGUPTA2
// Minor interface change to accomodate MMX rtns. Now the interface is the
// same for MMX and IA.
//
// Rev 1.6 08 Mar 1996 16:46:10 AGUPTA2
// Added pragma code_seg.
//
//
// Rev 1.5 15 Feb 1996 14:54:08 RMCKENZX
// Gutted and re-wrote routine, optimizing for performance
// for the p5. Added clamping to -2048...+2047 to escape code
// portion.
//
// Rev 1.4 27 Dec 1995 14:36:00 RMCKENZX
// Added copyright notice
//
// Rev 1.3 09 Dec 1995 17:35:20 RMCKENZX
// Re-checked in module to support decoder re-architecture (thru PB frames)
//
// Rev 1.0 27 Nov 1995 14:36:46 CZHU
// Initial revision.
//
// Rev 1.28 03 Nov 1995 16:28:50 CZHU
// Cleaning up and added more comments
//
// Rev 1.27 31 Oct 1995 10:27:20 CZHU
// Added error checking for total run value.
//
// Rev 1.26 19 Sep 1995 10:45:12 CZHU
//
// Improved pairing and cleaned up
//
// Rev 1.25 18 Sep 1995 10:20:28 CZHU
// Fixed bugs in handling escape codes for INTER blocks w.r.t. run.
//
// Rev 1.24 15 Sep 1995 09:35:30 CZHU
// fixed bugs in run cumulation for inter
//
// Rev 1.23 14 Sep 1995 10:13:32 CZHU
//
// Initialize cumulated run for the INTER blocks.
//
// Rev 1.22 12 Sep 1995 17:36:06 AKASAI
//
// Fixed bug in addressing to Intermediate when changed from writing
// BYTES to DWORDS. Inter Butterfly only had the problem.
//
// Rev 1.21 12 Sep 1995 13:37:58 AKASAI
// Added Butterfly Inter code. Also added optimizations to pre-fetch
// accumulators and "output" cache lines.
//
// Rev 1.20 11 Sep 1995 16:41:32 CZHU
// Adjust target block address: write to Target if INTRA, write to tempory sto
//
// Rev 1.19 11 Sep 1995 14:30:32 CZHU
// Seperate Butterfly for inter and intra, put place holder for inter blocks
//
// Rev 1.18 08 Sep 1995 11:49:00 CZHU
// Added support for P frames, fixed bugs related to INTRADC's presence.
//
// Rev 1.17 28 Aug 1995 14:51:22 CZHU
// Improve pairing and clean up
//
// Rev 1.16 24 Aug 1995 15:36:24 CZHU
//
// Fixed bugs handling the escape code followed by 22bits fixed length code
//
// Rev 1.15 23 Aug 1995 14:53:32 AKASAI
// Changed butterfly writes to increment by bytes and take a PITCH.
//
// Rev 1.14 23 Aug 1995 11:58:46 CZHU
// Added signed extended inverse quant before calling idct. and others
//
// Rev 1.13 22 Aug 1995 17:38:28 CZHU
// Calls the idct accumulation for each symbol and butterfly at the end.
//
// Rev 1.12 21 Aug 1995 14:39:58 CZHU
//
// Added IDCT initialization code and stubs for accumulation and butterfly.
// Also added register saving and restoration before and after accumulation
//
// Rev 1.11 18 Aug 1995 17:03:32 CZHU
// Added comments and clean up for integration with IDCT
//
// Rev 1.10 18 Aug 1995 15:01:52 CZHU
// Fixed bugs in handling escape codes using byte oriented reading approach
//
// Rev 1.9 16 Aug 1995 14:24:22 CZHU
// Bug fixes for the integration with bitstream parsing. Also changed from DWO
// reading to byte oriented reading.
//
// Rev 1.8 15 Aug 1995 15:07:42 CZHU
// Fixed the stack so that the parameters have been passed in correctly.
//
// Rev 1.7 14 Aug 1995 16:39:02 DBRUCKS
// changed pPBlock to pCurBlock
//
// Rev 1.6 11 Aug 1995 16:08:12 CZHU
// removed local varables in C
//
// Rev 1.5 11 Aug 1995 15:51:26 CZHU
//
// Readjust local varables on the stack. Clear ECX upfront.
//
// Rev 1.4 11 Aug 1995 15:14:32 DBRUCKS
// variable name changes
//
// Rev 1.3 11 Aug 1995 13:37:26 CZHU
//
// Adjust to the joint optimation of IDCT, IQ, RLE, and ZZ.
// Also added place holders for IDCT.
//
// Rev 1.2 11 Aug 1995 10:30:26 CZHU
// Changed the functions parameters, and added codes to short-curcuit IDCT bef
//
// Rev 1.1 03 Aug 1995 14:39:04 CZHU
//
// further optimization.
//
// Rev 1.0 02 Aug 1995 15:20:02 CZHU
// Initial revision.
//
// Rev 1.1 02 Aug 1995 10:21:12 CZHU
// Added asm codes for VLD of TCOEFF, inverse quantization, run-length decode.
//
//--------------------------------------------------------------------------
//
// d3xbvriq.cpp
//
// Description:
// This routine performs run length decoding and inverse quantization
// of transform coefficients for one block.
// MMx version.
//
// Routines:
// VLD_RLD_IQ_Block
//
// Inputs (dwords on the stack, listed in C argument order):
// lpBlockAction pointer to Block action stream for current blk.
//
// lpSrc The input bitstream.
//
// uBitsread Number of bits already read.
//
// pN Pointer to number of coefficients read.
//
// pIQ_INDEX Pointer to coefficients and indices.
//
// Returns:
// 0 on bit stream error, otherwise total number of bits read
// (including number read prior to call).
//
// Note:
// The structure of gTAB_TCOEFF_MAJOR is as follows:
// bits name: description
// ---- ----- -----------
// 25-18 bits: number of bitstream bits used
// 17 last: flag for last coefficient
// 16-9 run: number of preceding 0 coefficients plus 1
// 8-2 level: absolute value of coefficient
// 1 sign: sign of coefficient
// 0 hit: 1 = major table miss, 0 = major table hit
//
// The structure of gTAB_TCOEFF_MINOR is the same, right shifted by 1 bit.
// A gTAB_TCOEFF_MAJOR value of 00000001h indicates the escape code.
//
//--------------------------------------------------------------------------
#include "precomp.h"
// local variable definitions
// The locals have no stack space of their own: each one reuses the slot of
// an incoming parameter (named at the end of its line), so that parameter
// must be read before the local is first written.
// All offsets are relative to esp after the four register pushes done at
// function entry.
#define L_Quantizer esp+20 // quantizer; reuses the P_BlockAction slot
#define L_Quantizer64 esp+24 // 64*quantizer; reuses the P_src slot
#define L_Bits esp+28 // bit offset; same slot (and initial value) as P_bits
#define L_CumRun esp+36 // cumulative run; reuses the P_dst slot
// stack use (registers are pushed in the order ebp, esi, edi, ebx, so the
// last one pushed sits at the lowest address)
// ebx esp+0
// edi esp+4
// esi esp+8
// ebp esp+12
// return address esp+16
// input parameters
#define P_BlockAction esp+20 // later overwritten by L_Quantizer
#define P_src esp+24 // later overwritten by L_Quantizer64
#define P_bits esp+28 // shared with L_Bits
#define P_num esp+32 // never overwritten; read when the block is finished
#define P_dst esp+36 // later overwritten by L_CumRun
#pragma code_seg("IACODE1")
//--------------------------------------------------------------------------
// U32 VLD_RLD_IQ_Block(lpBlockAction, lpSrc, uBitsread, pN, pIQ_INDEX)
//
// Variable-length decodes, run-length decodes and inverse quantizes the
// transform coefficients of one block.
//
// In:   lpBlockAction  byte [+0] = block type (0=INTRA_DC, 1=INTRA,
//                      2 or larger = no INTRADC), byte [+3] = quantizer
//       lpSrc          bitstream base pointer
//       uBitsread      bit offset into lpSrc at which decoding starts
//       pN             receives the number of coefficients decoded
//       pIQ_INDEX      output list; one pair of dwords per coefficient:
//                      [+0] = signed dequantized value,
//                      [+4] = gTAB_ZZ_RUN[cumulative run] (0 for INTRADC)
// Out:  eax = total bits consumed (including uBitsread), or 0 on a
//       bitstream error.  *pN is NOT written on the error path.
//
// Naked function: it saves and restores ebp, esi, edi and ebx itself,
// freely uses eax, ecx and edx, and returns with a plain "ret", leaving the
// arguments on the stack.  Stack slots of incoming parameters are reused as
// locals (see the L_* / P_* defines), so the caller's copies of
// lpBlockAction, lpSrc, uBitsread and pIQ_INDEX on the stack are destroyed.
//
// The bitstream is fetched with unaligned dword loads at lpSrc + (bits>>3),
// so up to 3 bytes beyond the last byte actually decoded may be read.
//
// The tables gTAB_TCOEFF_MAJOR, gTAB_TCOEFF_MINOR, gTAB_INVERSE_Q and
// gTAB_ZZ_RUN are defined outside this function.
//--------------------------------------------------------------------------
extern "C" __declspec(naked) U32 VLD_RLD_IQ_Block(T_BlkAction *lpBlockAction, U8 *lpSrc, U32 uBitsread, U32 *pN, U32 *pIQ_INDEX)
{
    __asm {
        // save registers
        push    ebp
        push    esi
        push    edi
        push    ebx

        //
        // initialize
        // make sure we read in the P_src and P_dst pointers before we
        // overwrite them with L_Quantizer64 and L_CumRun.
        //
        // Output Registers:
        //   dl  = block type ([P_BlockAction])
        //   esi = bitstream source pointer (P_src)
        //   edi = coefficient destination pointer (P_dst)
        //   ebp = coefficient counter (init to 0)
        //
        // Locals initialized on Stack: (these overwrite input parameters)
        //   local var       clobbers         initial value
        //   ---------------------------------------------------
        //   L_Quantizer     P_BlockAction    input quantizer
        //   L_Quantizer64   P_src            64 * input quantizer
        //   L_CumRun        P_dst            -1
        //
        xor     ebp, ebp                    // init coefficient counter to 0
        xor     eax, eax                    // zero eax so "mov al" yields a clean dword
        mov     ecx, [P_BlockAction]        // ecx = block action pointer
        mov     ebx, -1                     // beginning cumulative run value
        mov     esi, [P_src]                // esi = bitstream source pointer
        mov     edi, [P_dst]                // edi = coefficient pointer
        mov     al, [ecx+3]                 // al = Quantizer
        mov     [L_CumRun], ebx             // init cumulative run to -1
        mov     [L_Quantizer], eax          // save original quantizer
        mov     dl, [ecx]                   // block type in dl
        shl     eax, 6                      // 64 * Quantizer
        mov     ecx, [L_Bits]               // ecx = L_Bits
        mov     ebx, ecx                    // ebx = L_Bits
        mov     [L_Quantizer64], eax        // save 64*Quantizer for this block
        shr     ebx, 3                      // byte offset for input
        and     ecx, 7                      // shift value
        cmp     dl, 1                       // check the block type for INTRA
        ja      get_next_coefficient        // if type 2 or larger, no INTRADC

        //
        // Decode INTRADC
        //
        // uses dword load & bswap to achieve big endian ordering.
        // prior code prepares ebx, cl, and dl as follows:
        //   ebx = L_Bits>>3
        //   cl  = L_Bits&7
        //   dl  = BlockType (0=INTRA_DC, 1=INTRA, 2=INTER, etc.)
        //
        mov     eax, [esi+ebx]              // *** PROBABLE MALALIGNMENT ***
        inc     ebp                         // one coefficient decoded
        bswap   eax                         // big endian order
        shl     eax, cl                     // left justify bitstream buffer
        shr     eax, 21                     // top 11 bits to the bottom
        mov     ecx, [L_Bits]               // ecx = L_Bits
        and     eax, 07f8h                  // drop low 3 bits: eax = 8 * (8-bit code)
        add     ecx, 8                      // bits used += 8 for INTRADC
        cmp     eax, 07f8h                  // check for 11111111 codeword
        jne     store_intradc
        mov     eax, 0400h                  // 11111111 decodes to 400h = 1024
store_intradc:
        mov     [L_Bits], ecx               // update bits used
        xor     ebx, ebx
        mov     [L_CumRun], ebx             // cumulative run starts at zero after DC
        mov     [edi], eax                  // save decoded DC coefficient
        mov     [edi+4], ebx                // save 0 index
        mov     ebx, ecx                    // ebx = L_Bits
        shr     ebx, 3                      // byte offset for input
        add     edi, 8                      // update coefficient pointer
        // check for last
        test    dl, dl                      // check for INTRA-DC (block type=0)
        jz      finish                      // if only the INTRADC present

        //
        // Get Next Coefficient
        //
        // prior code prepares ebx and ecx as follows:
        //   ebx = L_Bits>>3
        //   ecx = L_Bits (only the low 3 bits are used)
        //
get_next_coefficient:
        // use dword load & bswap to achieve big endian ordering
        mov     eax, [esi+ebx]              // *** PROBABLE MALALIGNMENT ***
        and     ecx, 7                      // shift value
        bswap   eax                         // big endian order
        shl     eax, cl                     // left justify buffer
        // do table lookups
        mov     ebx, eax                    // ebx for major table
        mov     ecx, eax                    // ecx for minor table
        shr     ebx, 24                     // major table index = top 8 bits
        shr     ecx, 17                     // minor table byte offset, with garbage
        mov     ebx, [gTAB_TCOEFF_MAJOR+4*ebx] // get the major table value
        shr     ebx, 1                      // hit flag -> carry, rest -> minor layout
        jnc     decode_event                // carry clear: major table hit
        and     ecx, 0ffch                  // mask off garbage for minor table
        test    ebx, ebx                    // escape code value was 0x00000001
        jz      escape_code                 // escape is flagged by the major table
        mov     ebx, [gTAB_TCOEFF_MINOR+ecx] // use minor table

        //
        // input is ebx = event, in the minor-table layout (major layout
        // shifted right by 1).  See file header for the meaning of its fields.
        // Now we decode the event, extracting the run, value, last.
        // The table value moves to ecx and is shifted downward as portions
        // are extracted.
        //
decode_event:
        mov     ecx, ebx                    // ecx = table value
        and     ebx, 0ffh                   // ebx = 2*abs(level) + sign
        shr     ecx, 8                      // run to bottom
        mov     edx, [L_Quantizer64]        // edx = 64*quant
        mov     ax, [gTAB_INVERSE_Q+edx+2*ebx] // ax = dequantized value (I16)
        mov     ebx, ecx                    // ebx = table value >> 8
        shl     eax, 16                     // shift value until sign bit is on top
        and     ebx, 0ffh                   // ebx = run + 1
        sar     eax, 16                     // arithmetic shift extends value's sign
        mov     edx, [L_CumRun]             // edx = (old) cumulative run
        add     edx, ebx                    // cumulative run += run + 1
        cmp     edx, 03fh                   // check run for bitstream error
        jg      error
        // Store only after the run check has passed, so that a corrupt
        // stream can never write past the 64th entry of the output list.
        mov     [edi], eax                  // save coefficient's signed value
        mov     [L_CumRun], edx             // update the cumulative run
        inc     ebp                         // increment number of coefficients read
        mov     edx, [gTAB_ZZ_RUN+4*edx]    // edx = index of the current coefficient
        mov     ebx, ecx                    // ebx: bit 8 = last flag
        mov     [edi+4], edx                // save coefficient's index
        add     edi, 8                      // increment coefficient pointer
        shr     ecx, 9                      // ecx = bits decoded
        mov     edx, [L_Bits]               // edx = L_Bits
        add     ecx, edx                    // L_Bits += bits decoded
        mov     edx, ebx                    // edx: bit 8 = last flag
        mov     [L_Bits], ecx               // update L_Bits
        mov     ebx, ecx                    // ebx = L_Bits
        shr     ebx, 3                      // byte offset for bitstream load
        test    edx, 100h                   // check for last
        jz      get_next_coefficient

finish:
        mov     ecx, [P_num]                // pointer to number of coefficients read
        mov     eax, [L_Bits]               // return total bits used
        pop     ebx
        pop     edi
        mov     [ecx], ebp                  // store number of coefficients read
        pop     esi
        pop     ebp
        ret

        //
        // process escape code separately
        //
        // After left justification the buffer holds, from the top:
        //   bits 31-25 escape prefix, bit 24 last, bits 23-18 run,
        //   bits 17-10 signed 8-bit level (22 bits in total).
        //
        // we have the following 4 cases to compute the reconstructed value
        // depending on the sign of L=level and the parity of Q=quantizer:
        //
        //              L pos         L neg
        //   Q even   2QL+(Q-1)     2QL-(Q-1)
        //   Q odd    2QL+(Q)       2QL-(Q)
        //
        // The Q or Q-1 term is formed as (Q-1) | 1.
        // Its sign is applied with a mask (=0 or =-1) taken from the sign
        // bit of Q*L: term = (term ^ mask) - mask, which negates the term
        // when the mask is -1 and leaves it unchanged when the mask is 0.
        //
        // Register usages:
        //   eax starts with bitstream, later L, finally result
        //   ebx starts with Q, later is the signed Q or Q-1 term
        //   ecx starts with mask, later the bitstream (to extract run)
        //   edx bitstream (kept for the last-flag test)
        //
escape_code:
        mov     edx, eax                    // edx = bitstream buffer
        shl     eax, 14                     // signed 8-bit level to top
        sar     eax, 24                     // eax = L (signed level)
        mov     ebx, [L_Quantizer]
        test    eax, 7fh                    // low 7 bits zero: L = 0 or L = -128
        jz      error                       // both are invalid codes
        imul    eax, ebx                    // eax = Q*L
        dec     ebx                         // term = Q-1
        mov     ecx, eax                    // mask = QL
        or      ebx, 1                      // term = Q-1 if Q even, else = Q
        sar     ecx, 31                     // mask = -1 if QL negative, else = 0
        xor     ebx, ecx                    // term = ~Q[-1] if mask set, else = Q[-1]
        add     eax, eax                    // result = 2*Q*L
        sub     ebx, ecx                    // term = -(Q[-1]) if mask set, else = Q[-1]
        mov     ecx, edx                    // bitstream to ecx to get run
        add     eax, ebx                    // result = 2QL +- Q[-1]
        // now clip to -2048 ... +2047 (12 bits: 0xfffff800 <= res <= 0x000007ff)
        cmp     eax, -2048
        jge     clamp_high
        mov     eax, -2048
        jmp     clamp_done
clamp_high:
        cmp     eax, +2047
        jle     clamp_done
        mov     eax, 2047
clamp_done:
        // update run and compute index
        shr     ecx, 18                     // run to bottom
        mov     ebx, [L_CumRun]             // ebx = old total run
        and     ecx, 3fh                    // keep the 6-bit run
        inc     ebx                         // old run ++
        add     ebx, ecx                    // ebx = new cumulative run
        cmp     ebx, 03fh                   // check run for bitstream error
        jg      error
        // Store only after the run check has passed (see decode_event).
        mov     [edi], eax                  // save coefficient's signed value
        mov     [L_CumRun], ebx             // update the cumulative run
        mov     ecx, [L_Bits]               // ecx = number of bits used
        mov     ebx, [gTAB_ZZ_RUN+4*ebx]    // ebx = index of the current coefficient
        add     ecx, 22                     // escape code uses 22 bits
        mov     [edi+4], ebx                // save coefficient's index
        add     edi, 8                      // increment coefficient pointer
        mov     [L_Bits], ecx               // update number of bits used
        mov     ebx, ecx                    // ebx = L_Bits
        shr     ebx, 3                      // byte offset for bitstream load
        inc     ebp                         // increment number of coefficients read
        test    edx, 01000000h              // check last bit (bit 24)
        jz      get_next_coefficient
        jmp     finish

error:
        pop     ebx
        pop     edi
        pop     esi
        pop     ebp
        xor     eax, eax                    // zero bits used indicates ERROR
        ret
    }
}
#pragma code_seg()
|