You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
548 lines
16 KiB
548 lines
16 KiB
/* *************************************************************************
|
|
** INTEL Corporation Proprietary Information
|
|
**
|
|
** This listing is supplied under the terms of a license
|
|
** agreement with INTEL Corporation and may not be copied
|
|
** nor disclosed except in accordance with the terms of
|
|
** that agreement.
|
|
**
|
|
** Copyright (c) 1995 Intel Corporation.
|
|
** All Rights Reserved.
|
|
**
|
|
** *************************************************************************
|
|
*/
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// $Author: AGUPTA2 $
|
|
// $Date: 22 Mar 1996 17:23:16 $
|
|
// $Archive: S:\h26x\src\dec\d3bvriq.cpv $
|
|
// $Header: S:\h26x\src\dec\d3bvriq.cpv 1.7 22 Mar 1996 17:23:16 AGUPTA2 $
|
|
// $Log: S:\h26x\src\dec\d3bvriq.cpv $
|
|
//
|
|
// Rev 1.7 22 Mar 1996 17:23:16 AGUPTA2
|
|
// Minor interface change to accommodate MMX rtns. Now the interface is the
|
|
// same for MMX and IA.
|
|
//
|
|
// Rev 1.6 08 Mar 1996 16:46:10 AGUPTA2
|
|
// Added pragma code_seg.
|
|
//
|
|
//
|
|
// Rev 1.5 15 Feb 1996 14:54:08 RMCKENZX
|
|
// Gutted and re-wrote routine, optimizing for performance
|
|
// for the p5. Added clamping to -2048...+2047 to escape code
|
|
// portion.
|
|
//
|
|
// Rev 1.4 27 Dec 1995 14:36:00 RMCKENZX
|
|
// Added copyright notice
|
|
//
|
|
// Rev 1.3 09 Dec 1995 17:35:20 RMCKENZX
|
|
// Re-checked in module to support decoder re-architecture (thru PB frames)
|
|
//
|
|
// Rev 1.0 27 Nov 1995 14:36:46 CZHU
|
|
// Initial revision.
|
|
//
|
|
// Rev 1.28 03 Nov 1995 16:28:50 CZHU
|
|
// Cleaning up and added more comments
|
|
//
|
|
// Rev 1.27 31 Oct 1995 10:27:20 CZHU
|
|
// Added error checking for total run value.
|
|
//
|
|
// Rev 1.26 19 Sep 1995 10:45:12 CZHU
|
|
//
|
|
// Improved pairing and cleaned up
|
|
//
|
|
// Rev 1.25 18 Sep 1995 10:20:28 CZHU
|
|
// Fixed bugs in handling escape codes for INTER blocks w.r.t. run.
|
|
//
|
|
// Rev 1.24 15 Sep 1995 09:35:30 CZHU
|
|
// fixed bugs in run cumulation for inter
|
|
//
|
|
// Rev 1.23 14 Sep 1995 10:13:32 CZHU
|
|
//
|
|
// Initialize cumulated run for the INTER blocks.
|
|
//
|
|
// Rev 1.22 12 Sep 1995 17:36:06 AKASAI
|
|
//
|
|
// Fixed bug in addressing to Intermediate when changed from writing
|
|
// BYTES to DWORDS. Inter Butterfly only had the problem.
|
|
//
|
|
// Rev 1.21 12 Sep 1995 13:37:58 AKASAI
|
|
// Added Butterfly Inter code. Also added optimizations to pre-fetch
|
|
// accumulators and "output" cache lines.
|
|
//
|
|
// Rev 1.20 11 Sep 1995 16:41:32 CZHU
|
|
// Adjust target block address: write to Target if INTRA, write to tempory sto
|
|
//
|
|
// Rev 1.19 11 Sep 1995 14:30:32 CZHU
|
|
// Separate Butterfly for inter and intra, put place holder for inter blocks
|
|
//
|
|
// Rev 1.18 08 Sep 1995 11:49:00 CZHU
|
|
// Added support for P frames, fixed bugs related to INTRADC's presence.
|
|
//
|
|
// Rev 1.17 28 Aug 1995 14:51:22 CZHU
|
|
// Improve pairing and clean up
|
|
//
|
|
// Rev 1.16 24 Aug 1995 15:36:24 CZHU
|
|
//
|
|
// Fixed bugs handling the escape code followed by 22bits fixed length code
|
|
//
|
|
// Rev 1.15 23 Aug 1995 14:53:32 AKASAI
|
|
// Changed butterfly writes to increment by bytes and take a PITCH.
|
|
//
|
|
// Rev 1.14 23 Aug 1995 11:58:46 CZHU
|
|
// Added signed extended inverse quant before calling idct. and others
|
|
//
|
|
// Rev 1.13 22 Aug 1995 17:38:28 CZHU
|
|
// Calls the idct accumulation for each symbol and butterfly at the end.
|
|
//
|
|
// Rev 1.12 21 Aug 1995 14:39:58 CZHU
|
|
//
|
|
// Added IDCT initialization code and stubs for accumulation and butterfly.
|
|
// Also added register saving and restoration before and after accumulation
|
|
//
|
|
// Rev 1.11 18 Aug 1995 17:03:32 CZHU
|
|
// Added comments and clean up for integration with IDCT
|
|
//
|
|
// Rev 1.10 18 Aug 1995 15:01:52 CZHU
|
|
// Fixed bugs in handling escape codes using byte oriented reading approach
|
|
//
|
|
// Rev 1.9 16 Aug 1995 14:24:22 CZHU
|
|
// Bug fixes for the integration with bitstream parsing. Also changed from DWO
|
|
// reading to byte oriented reading.
|
|
//
|
|
// Rev 1.8 15 Aug 1995 15:07:42 CZHU
|
|
// Fixed the stack so that the parameters have been passed in correctly.
|
|
//
|
|
// Rev 1.7 14 Aug 1995 16:39:02 DBRUCKS
|
|
// changed pPBlock to pCurBlock
|
|
//
|
|
// Rev 1.6 11 Aug 1995 16:08:12 CZHU
|
|
// removed local variables in C
|
|
//
|
|
// Rev 1.5 11 Aug 1995 15:51:26 CZHU
|
|
//
|
|
// Readjust local variables on the stack. Clear ECX upfront.
|
|
//
|
|
// Rev 1.4 11 Aug 1995 15:14:32 DBRUCKS
|
|
// variable name changes
|
|
//
|
|
// Rev 1.3 11 Aug 1995 13:37:26 CZHU
|
|
//
|
|
// Adjust to the joint optimization of IDCT, IQ, RLE, and ZZ.
|
|
// Also added place holders for IDCT.
|
|
//
|
|
// Rev 1.2 11 Aug 1995 10:30:26 CZHU
|
|
// Changed the functions parameters, and added codes to short-curcuit IDCT bef
|
|
//
|
|
// Rev 1.1 03 Aug 1995 14:39:04 CZHU
|
|
//
|
|
// further optimization.
|
|
//
|
|
// Rev 1.0 02 Aug 1995 15:20:02 CZHU
|
|
// Initial revision.
|
|
//
|
|
// Rev 1.1 02 Aug 1995 10:21:12 CZHU
|
|
// Added asm codes for VLD of TCOEFF, inverse quantization, run-length decode.
|
|
//
|
|
|
|
|
|
//--------------------------------------------------------------------------
|
|
//
|
|
// d3xbvriq.cpp
|
|
//
|
|
// Description:
|
|
// This routine performs run length decoding and inverse quantization
|
|
// of transform coefficients for one block.
|
|
// MMx version.
|
|
//
|
|
// Routines:
|
|
// VLD_RLD_IQ_Block
|
|
//
|
|
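// Inputs (dwords pushed onto stack by caller; described by role below -- the
// actual argument order is lpBlockAction, lpSrc, uBitsread, pN, pIQ_INDEX):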
|
|
// lpBlockAction pointer to Block action stream for current blk.
|
|
//
|
|
// lpSrc The input bitstream.
|
|
//
|
|
// uBitsread Number of bits already read.
|
|
//
|
|
// pIQ_INDEX Pointer to coefficients and indices.
|
|
//
|
|
// pN Pointer to number of coefficients read.
|
|
//
|
|
// Returns:
|
|
// 0 on bit stream error, otherwise total number of bits read
|
|
// (including number read prior to call).
|
|
//
|
|
// Note:
|
|
// The structure of gTAB_TCOEFF_MAJOR is as follows:
|
|
// bits name: description
|
|
// ---- ----- -----------
|
|
// 25-18 bits: number of bitstream bits used
|
|
// 17 last: flag for last coefficient
|
|
// 16-9 run: number of preceding 0 coefficients plus 1
|
|
// 8-2 level: absolute value of coefficient
|
|
// 1 sign: sign of coefficient
|
|
// 0 hit: 1 = major table miss, 0 = major table hit
|
|
//
|
|
// The structure of gTAB_TCOEFF_MINOR is the same, right shifted by 1 bit.
|
|
// A gTAB_TCOEFF_MAJOR value of 00000001h indicates the escape code.
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
#include "precomp.h"
|
|
|
|
//--------------------------------------------------------------------------
// Stack-slot names used by VLD_RLD_IQ_Block.
//
// The routine is a naked function: it sets up no frame pointer and
// addresses everything relative to esp.  Its prologue pushes ebp, esi,
// edi and ebx (in that order), so while the body runs the stack looks
// like this:
//
//      esp+0      saved ebx   (pushed last, popped first)
//      esp+4      saved edi
//      esp+8      saved esi
//      esp+12     saved ebp
//      esp+16     return address
//      esp+20     argument 1  lpBlockAction   (later: L_Quantizer)
//      esp+24     argument 2  lpSrc           (later: L_Quantizer64)
//      esp+28     argument 3  uBitsread       (also:  L_Bits)
//      esp+32     argument 4  pN
//      esp+36     argument 5  pIQ_INDEX       (later: L_CumRun)
//
// The routine allocates no locals of its own.  Instead it recycles the
// incoming argument slots once their original contents have been copied
// into registers, which is why every L_* macro below aliases a P_* macro.
// Each macro expands to a bare "esp+N" expression and is meant to be used
// inside brackets in the __asm block, e.g. [P_src].
//--------------------------------------------------------------------------

// input parameters (valid on entry, after the four register pushes)
#define P_BlockAction   esp+20  // block action pointer; slot becomes L_Quantizer
#define P_src           esp+24  // bitstream pointer;    slot becomes L_Quantizer64
#define P_bits          esp+28  // bits already read;    same slot as L_Bits
#define P_num           esp+32  // where to store the coefficient count; never reused
#define P_dst           esp+36  // coefficient/index output; slot becomes L_CumRun

// local variables (these overwrite the parameter slots named on the right)
#define L_Quantizer     esp+20  // quantizer                overwrites P_BlockAction
#define L_Quantizer64   esp+24  // 64*quantizer             overwrites P_src
#define L_Bits          esp+28  // running bit offset       shares    P_bits
#define L_CumRun        esp+36  // cumulative run           overwrites P_dst
|
|
|
|
|
|
#pragma code_seg("IACODE1")
|
|
//--------------------------------------------------------------------------
// VLD_RLD_IQ_Block
//
// Decodes the transform coefficients of one block from the bitstream:
// variable-length decode, run-length decode and inverse quantization.
//
// Parameters (all dwords on the stack, addressed through the P_* macros):
//   lpBlockAction  block action entry.  Byte 0 is the block type
//                  (0 = INTRA-DC only, 1 = INTRA, 2 and up = no INTRADC),
//                  byte 3 is the quantizer.
//   lpSrc          start of the bitstream.
//   uBitsread      number of bits of lpSrc already consumed.
//   pN             receives the number of coefficients decoded.
//   pIQ_INDEX      output array of dword pairs: [0] = signed dequantized
//                  value, [1] = index looked up in gTAB_ZZ_RUN (0 for the
//                  INTRADC term).
//
// Returns (eax):
//   Total number of bits consumed, including uBitsread, or 0 if the
//   bitstream is invalid (cumulative run beyond 63, or an escape code
//   whose level field is 00h or 80h).  *pN is NOT written on error.
//
// Notes:
//   * Naked function: it saves/restores ebp, esi, edi, ebx itself and
//     returns with a plain "ret", so the caller removes the arguments.
//   * The argument slots for lpBlockAction, lpSrc, uBitsread and pIQ_INDEX
//     are overwritten with local state (see the L_* macros).
//   * The bitstream is read one unaligned dword at a time from
//     lpSrc + (bits >> 3).
//     NOTE(review): this can touch up to 3 bytes beyond the last byte that
//     is actually decoded -- confirm callers pad the bitstream buffer.
//   * A coefficient is written to the output only after its cumulative run
//     has been validated, so a rejected coefficient never occupies (or
//     overruns into) an output slot.
//     NOTE(review): the capacity of pIQ_INDEX is not visible here; the run
//     check limits the routine to 64 stored pairs provided every table
//     entry has a non-zero run field -- confirm against the tables.
//   * Instructions are kept in the two-at-a-time groups of the original
//     schedule (presumably Pentium U/V pairing); the "NOT PAIRABLE"/"AGI"
//     remarks are the original author's annotations.
//--------------------------------------------------------------------------
extern "C" __declspec(naked)
U32 VLD_RLD_IQ_Block(T_BlkAction *lpBlockAction,
                     U8          *lpSrc,
                     U32          uBitsread,
                     U32         *pN,
                     U32         *pIQ_INDEX)
{
  __asm {

    // ---- prologue: save the registers the routine uses -------------------
    push    ebp
    push    esi
    push    edi
    push    ebx

    //
    // ---- initialize -------------------------------------------------------
    // P_src and P_dst must be loaded into registers BEFORE L_Quantizer64
    // and L_CumRun are stored, because those locals live in the same
    // stack slots.
    //
    // Registers after this section:
    //    dl  = block type ([P_BlockAction] byte 0)
    //    esi = bitstream source pointer (P_src)
    //    edi = coefficient destination pointer (P_dst)
    //    ebp = coefficient counter (0)
    //    ebx = L_Bits >> 3   (byte offset of the next bitstream load)
    //    ecx = L_Bits & 7    (bit offset inside that byte)
    //
    // Locals initialized on the stack:
    //    L_Quantizer   = quantizer        (replaces lpBlockAction)
    //    L_Quantizer64 = 64 * quantizer   (replaces lpSrc)
    //    L_CumRun      = -1               (replaces pIQ_INDEX)
    //
    xor     ebp, ebp                    // coefficient counter = 0
    xor     eax, eax                    // clear eax so "mov al" yields a dword

    mov     ecx, [P_BlockAction]        // ecx = block action pointer
    mov     ebx, -1                     // starting cumulative run

    mov     esi, [P_src]                // esi = bitstream source pointer
    mov     edi, [P_dst]                // edi = coefficient output pointer

    mov     al, [ecx+3]                 // eax = quantizer
    mov     [L_CumRun], ebx             // cumulative run = -1

    mov     [L_Quantizer], eax          // keep the plain quantizer (escape path)
    mov     dl, [ecx]                   // dl = block type

    shl     eax, 6                      // eax = 64 * quantizer
    mov     ecx, [L_Bits]               // ecx = bits consumed so far

    mov     ebx, ecx                    // ebx = bits consumed so far
    mov     [L_Quantizer64], eax        // byte offset of this quantizer's
                                        // row in gTAB_INVERSE_Q

    shr     ebx, 3                      // ebx = byte offset into bitstream
    and     ecx, 7                      // ecx = bit offset (shift count)

    cmp     dl, 1                       // block types 0 and 1 carry an INTRADC
    ja      get_next_coefficient        // type 2 or larger: no INTRADC

    //
    // ---- decode INTRADC ---------------------------------------------------
    // Entry: ebx = L_Bits>>3, cl = L_Bits&7, dl = block type.
    // A dword load followed by bswap gives big-endian bit order.  The 8-bit
    // code is extracted already multiplied by 8 (mask 07f8h); the code
    // 11111111 is replaced by 0400h (= 128 * 8).
    //
    mov     eax, [esi+ebx]              // *** PROBABLE MALALIGNMENT ***
    inc     ebp                         // one coefficient decoded

    bswap   eax                         // big endian order
                                        // *** NOT PAIRABLE ***

    shl     eax, cl                     // left justify bitstream buffer
                                        // *** NOT PAIRABLE ***
                                        // *** 4 CYCLES ***

    shr     eax, 21                     // top 11 bits to the bottom
    mov     ecx, [L_Bits]               // ecx = L_Bits

    and     eax, 07f8h                  // drop low 3 bits: eax = 8 * code
    add     ecx, 8                      // INTRADC uses 8 bits

    cmp     eax, 07f8h                  // code 11111111 ?
    jne     skipa

    mov     eax, 0400h                  // 11111111 decodes to 400h = 1024

skipa:
    mov     [L_Bits], ecx               // update bits used
    xor     ebx, ebx

    mov     [L_CumRun], ebx             // cumulative run restarts at 0 after DC
    mov     [edi], eax                  // store DC value

    mov     [edi+4], ebx                // store DC index (0)
    mov     ebx, ecx                    // ebx = L_Bits

    shr     ebx, 3                      // byte offset of next bitstream load
    add     edi, 8                      // advance output pointer

    test    dl, dl                      // block type 0 = INTRADC only
    jz      finish

    //
    // ---- get next coefficient ---------------------------------------------
    // Entry: ebx = L_Bits>>3, low 3 bits of ecx = L_Bits&7.
    //
get_next_coefficient:
    mov     eax, [esi+ebx]              // *** PROBABLE MALALIGNMENT ***
    and     ecx, 7                      // shift value

    bswap   eax                         // big endian order
                                        // *** NOT PAIRABLE ***

    shl     eax, cl                     // left justify buffer
                                        // *** NOT PAIRABLE ***
                                        // *** 4 CYCLES ***

    // table lookups: top 8 bits index the major table; on a miss, bits
    // 28..19 of the buffer (as a dword offset) index the minor table
    mov     ebx, eax                    // ebx for major table
    mov     ecx, eax                    // ecx for minor table

    shr     ebx, 24                     // major table index

    shr     ecx, 17                     // minor table offset, still with garbage
    mov     ebx, [gTAB_TCOEFF_MAJOR+4*ebx]  // major table value
                                        // ** AGI **

    shr     ebx, 1                      // bit 0 -> carry: 1 = major table miss
    jnc     skipb                       // major hit: ebx is the event

    and     ecx, 0ffch                  // mask off garbage for minor table
    test    ebx, ebx                    // major value 00000001h = escape code

    jz      escape_code

    mov     ebx, [gTAB_TCOEFF_MINOR+ecx]    // minor table value

    //
    // ---- decode a table event ---------------------------------------------
    // Entry: ebx = event (major value >> 1, or minor value):
    //    bits 24-17  number of bitstream bits used
    //    bit  16     last-coefficient flag
    //    bits 15-8   run + 1
    //    bits  7-0   2*abs(level) + sign
    // The event is copied to ecx and shifted down as fields are extracted.
    //
skipb:
    mov     ecx, ebx                    // ecx = event
    and     ebx, 0ffh                   // ebx = 2*abs(level) + sign

    shr     ecx, 8                      // run field to the bottom
    mov     edx, [L_Quantizer64]        // edx = 64*quant

                                        // ** PREFIX DELAY **
                                        // ** AGI **
    mov     ax, [gTAB_INVERSE_Q+edx+2*ebx]  // ax = dequantized value (I16)
    mov     ebx, ecx                    // ebx = event >> 8

    shl     eax, 16                     // discard stale upper half of eax ...
    and     ebx, 0ffh                   // ebx = run + 1

    sar     eax, 16                     // ... and sign-extend the I16 value
    mov     edx, [L_CumRun]             // edx = old cumulative run

    add     edx, ebx                    // cumulative run += run + 1

    cmp     edx, 03fh                   // position beyond 63 = bitstream error
    jg      error                       // (checked before anything is stored)

    mov     [edi], eax                  // store coefficient's signed value
    mov     [L_CumRun], edx             // update the cumulative run

    inc     ebp                         // one more coefficient
    mov     edx, [gTAB_ZZ_RUN+4*edx]    // edx = index of this coefficient

    mov     ebx, ecx                    // ebx: bit 8 = last flag
    mov     [edi+4], edx                // store coefficient's index

    add     edi, 8                      // advance output pointer

    shr     ecx, 9                      // ecx = bits used by this code
    mov     edx, [L_Bits]               // edx = L_Bits

    add     ecx, edx                    // ecx = updated L_Bits
    mov     edx, ebx                    // edx: bit 8 = last flag

    mov     [L_Bits], ecx               // update L_Bits
    mov     ebx, ecx                    // ebx = L_Bits

    shr     ebx, 3                      // byte offset of next bitstream load
    test    edx, 100h                   // last coefficient ?

    jz      get_next_coefficient

    //
    // ---- normal exit ------------------------------------------------------
    // P_num and L_Bits are read before the pops, while the esp-relative
    // offsets are still valid.
    //
finish:
    mov     ecx, [P_num]                // ecx = pN
    mov     eax, [L_Bits]               // return value = total bits used

    pop     ebx
    pop     edi

    mov     [ecx], ebp                  // *pN = number of coefficients
    pop     esi

    pop     ebp
    ret

    //
    // ---- escape code ------------------------------------------------------
    // Entry: eax = left-justified bitstream buffer, laid out as
    //    bits 31-25  escape prefix
    //    bit  24     last flag
    //    bits 23-18  run
    //    bits 17-10  signed 8-bit level L
    // for a total of 22 bits.
    //
    // Reconstructed value, with Q = quantizer:
    //
    //                   L pos          L neg
    //    Q even       2QL+(Q-1)      2QL-(Q-1)
    //    Q odd        2QL+(Q)        2QL-(Q)
    //
    // term = (Q-1) | 1 gives Q-1 for even Q and Q for odd Q.
    // mask = (Q*L) >> 31 (arithmetic) is -1 when Q*L is negative, else 0;
    // (term ^ mask) - mask negates the term exactly when the mask is -1.
    // The result is then clamped to -2048 ... +2047.
    //
    // Register usage:
    //    eax  bitstream, then L, then Q*L, finally the result
    //    ebx  Q, then the signed term; later the cumulative run / index
    //    ecx  mask, then the bitstream copy (for run), then L_Bits
    //    edx  untouched copy of the bitstream (run and last flag)
    //
escape_code:
    mov     edx, eax                    // edx = bitstream buffer

    shl     eax, 14                     // signed 8-bit level to the top

    sar     eax, 24                     // eax = L (sign extended)
    mov     ebx, [L_Quantizer]          // ebx = Q

    test    eax, 7fh                    // low 7 bits clear: L is 00h or 80h,
    jz      error                       // both rejected as invalid

    imul    eax, ebx                    // eax = Q*L
                                        // *** NOT PAIRABLE ***
                                        // *** 10 cycles ***

    dec     ebx                         // term = Q-1
    mov     ecx, eax                    // mask = Q*L

    or      ebx, 1                      // term = Q-1 if Q even, else Q
    sar     ecx, 31                     // mask = -1 if Q*L negative, else 0

    xor     ebx, ecx                    // term = ~term if negative
    add     eax, eax                    // result = 2*Q*L

    sub     ebx, ecx                    // term = -term if negative
    mov     ecx, edx                    // bitstream to ecx to extract the run

    add     eax, ebx                    // result = 2QL +- term

    // clip to -2048 ... +2047 (12 bits: 0xfffff800 <= res <= 0x000007ff)
    cmp     eax, -2048
    jge     skip1

    mov     eax, -2048
    jmp     skip2

skip1:
    cmp     eax, +2047
    jle     skip2

    mov     eax, 2047

skip2:
    // update run and compute index
    shr     ecx, 18                     // run field to the bottom
    mov     ebx, [L_CumRun]             // ebx = old cumulative run

    and     ecx, 3fh                    // keep the 6-bit run
    inc     ebx                         // + 1 for this coefficient

    add     ebx, ecx                    // ebx = new cumulative run

    cmp     ebx, 03fh                   // position beyond 63 = bitstream error
    jg      error                       // (checked before anything is stored)

    mov     [edi], eax                  // store coefficient's signed value
    mov     [L_CumRun], ebx             // update the cumulative run

    mov     ecx, [L_Bits]               // ecx = number of bits used
    mov     ebx, [gTAB_ZZ_RUN+4*ebx]    // ebx = index of this coefficient

    add     ecx, 22                     // escape code uses 22 bits
    mov     [edi+4], ebx                // store coefficient's index

    add     edi, 8                      // advance output pointer
    mov     [L_Bits], ecx               // update number of bits used

    mov     ebx, ecx                    // ebx = L_Bits

    shr     ebx, 3                      // byte offset of next bitstream load
    inc     ebp                         // one more coefficient

    test    edx, 01000000h              // last flag (bit 24 of the buffer)
    jz      get_next_coefficient

    jmp     finish

    //
    // ---- error exit -------------------------------------------------------
    // Restore registers and return 0.  *pN is left untouched.
    //
error:
    pop     ebx
    pop     edi

    pop     esi
    pop     ebp

    xor     eax, eax                    // zero bits used indicates ERROR
    ret

  }

}
|
|
#pragma code_seg()
|