windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/d3bvriq.cpp

/* *************************************************************************
**    INTEL Corporation Proprietary Information
**
**    This listing is supplied under the terms of a license
**    agreement with INTEL Corporation and may not be copied
**    nor disclosed except in accordance with the terms of
**    that agreement.
**
**    Copyright (c) 1995 Intel Corporation.
**    All Rights Reserved.
**
** *************************************************************************
*/

//////////////////////////////////////////////////////////////////////////
// $Author:   AGUPTA2  $
// $Date:   22 Mar 1996 17:23:16  $
// $Archive:   S:\h26x\src\dec\d3bvriq.cpv  $
// $Header:   S:\h26x\src\dec\d3bvriq.cpv   1.7   22 Mar 1996 17:23:16   AGUPTA2  $
// $Log:   S:\h26x\src\dec\d3bvriq.cpv  $
// 
//    Rev 1.7   22 Mar 1996 17:23:16   AGUPTA2
// Minor interface change to accomodate MMX rtns.  Now the interface is the
// same for MMX and IA.
// 
//    Rev 1.6   08 Mar 1996 16:46:10   AGUPTA2
// Added pragma code_seg.
// 
// 
//    Rev 1.5   15 Feb 1996 14:54:08   RMCKENZX
// Gutted and re-wrote routine, optimizing for performance
// for the p5.  Added clamping to -2048...+2047 to escape code
// portion.
// 
//    Rev 1.4   27 Dec 1995 14:36:00   RMCKENZX
// Added copyright notice
// 
//    Rev 1.3   09 Dec 1995 17:35:20   RMCKENZX
// Re-checked in module to support decoder re-architecture (thru PB frames)
// 
//    Rev 1.0   27 Nov 1995 14:36:46   CZHU
// Initial revision.
// 
//    Rev 1.28   03 Nov 1995 16:28:50   CZHU
// Cleaning up and added more comments
// 
//    Rev 1.27   31 Oct 1995 10:27:20   CZHU
// Added error checking for total run value.
// 
//    Rev 1.26   19 Sep 1995 10:45:12   CZHU
// 
// Improved pairing and cleaned up
// 
//    Rev 1.25   18 Sep 1995 10:20:28   CZHU
// Fixed bugs in handling escape codes for INTER blocks w.r.t. run.
// 
//    Rev 1.24   15 Sep 1995 09:35:30   CZHU
// fixed bugs in run cumulation for inter
// 
//    Rev 1.23   14 Sep 1995 10:13:32   CZHU
// 
// Initialize cumulated run for the INTER blocks.
// 
//    Rev 1.22   12 Sep 1995 17:36:06   AKASAI
// 
// Fixed bug in addressing to Intermediate when changed from writing
// BYTES to DWORDS.  Inter Butterfly only had the problem.
// 
//    Rev 1.21   12 Sep 1995 13:37:58   AKASAI
// Added Butterfly Inter code.  Also added optimizations to pre-fetch
// accumulators and "output" cache lines.
// 
//    Rev 1.20   11 Sep 1995 16:41:32   CZHU
// Adjust target block address: write to Target if INTRA, write to tempory sto
// 
//    Rev 1.19   11 Sep 1995 14:30:32   CZHU
// Seperate Butterfly for inter and intra, put place holder for inter blocks
// 
//    Rev 1.18   08 Sep 1995 11:49:00   CZHU
// Added support for P frames, fixed bugs related to INTRADC's presence.
// 
//    Rev 1.17   28 Aug 1995 14:51:22   CZHU
// Improve pairing and clean up
// 
//    Rev 1.16   24 Aug 1995 15:36:24   CZHU
// 
// Fixed bugs handling the escape code followed by 22bits fixed length code
// 
//    Rev 1.15   23 Aug 1995 14:53:32   AKASAI
// Changed butterfly writes to increment by bytes and take a PITCH.
// 
//    Rev 1.14   23 Aug 1995 11:58:46   CZHU
// Added signed extended inverse quant before calling idct. and others 
// 
//    Rev 1.13   22 Aug 1995 17:38:28   CZHU
// Calls the idct accumulation for each symbol and butterfly at the end.
// 
//    Rev 1.12   21 Aug 1995 14:39:58   CZHU
// 
// Added IDCT initialization code and stubs for accumulation and butterfly.
// Also added register saving and restoration before and after accumulation
// 
//    Rev 1.11   18 Aug 1995 17:03:32   CZHU
// Added comments and clean up for integration with IDCT
// 
//    Rev 1.10   18 Aug 1995 15:01:52   CZHU
// Fixed bugs in handling escape codes using byte oriented reading approach
// 
//    Rev 1.9   16 Aug 1995 14:24:22   CZHU
// Bug fixes for the integration with bitstream parsing. Also changed from DWO
// reading to byte oriented reading.
// 
//    Rev 1.8   15 Aug 1995 15:07:42   CZHU
// Fixed the stack so that the parameters have been passed in correctly.
// 
//    Rev 1.7   14 Aug 1995 16:39:02   DBRUCKS
// changed pPBlock to pCurBlock
// 
//    Rev 1.6   11 Aug 1995 16:08:12   CZHU
// removed local varables in C
// 
//    Rev 1.5   11 Aug 1995 15:51:26   CZHU
// 
// Readjust local varables on the stack. Clear ECX upfront.
// 
//    Rev 1.4   11 Aug 1995 15:14:32   DBRUCKS
// variable name changes
// 
//    Rev 1.3   11 Aug 1995 13:37:26   CZHU
// 
// Adjust to the joint optimation of IDCT, IQ, RLE, and ZZ.
// Also added place holders for IDCT.
// 
//    Rev 1.2   11 Aug 1995 10:30:26   CZHU
// Changed the functions parameters, and added codes to short-curcuit IDCT bef
// 
//    Rev 1.1   03 Aug 1995 14:39:04   CZHU
// 
// further optimization.
// 
//    Rev 1.0   02 Aug 1995 15:20:02   CZHU
// Initial revision.
// 
//    Rev 1.1   02 Aug 1995 10:21:12   CZHU
// Added asm codes for VLD of TCOEFF, inverse quantization, run-length decode.
// 


//--------------------------------------------------------------------------
//
//  d3bvriq.cpp
//
//  Description:
//    This routine performs variable length decoding, run length decoding
//    and inverse quantization of transform coefficients for one block.
//	 IA (integer) version: the code below uses no MMX instructions.
//
//  Routines:
//    VLD_RLD_IQ_Block
//
//  Inputs (dwords pushed onto stack by caller; note that in the actual
//  prototype pN is the 4th argument and pIQ_INDEX the 5th):
//    lpBlockAction  pointer to Block action stream for current blk.
//
//	 lpSrc			The input bitstream.
//
//	 uBitsread		Number of bits already read.
//
//    pIQ_INDEX		Pointer to coefficients and indices (5th argument).
//
//    pN				Pointer to number of coefficients read (4th argument).
//
//  Returns:
//    0 				on bit stream error, otherwise total number of bits read
//					(including number read prior to call).
//
//  Note: 
//			The structure of gTAB_TCOEFF_MAJOR is as follows:
//				bits		name:		description
//				----		-----		-----------
//				25-18		bits:		number of bitstream bits used
//				17			last:		flag for last coefficient
//				16-9		run:		number of preceding 0 coefficients plus 1
//				8-2			level:		absolute value of coefficient
//				1			sign:		sign of coefficient
//				0			hit:		1 = major table miss, 0 = major table hit
//
//			The structure of gTAB_TCOEFF_MINOR is the same, right shifted by 1 bit. 
//			A gTAB_TCOEFF_MAJOR value of 00000001h indicates the escape code.
//
//--------------------------------------------------------------------------

#include "precomp.h"

// Stack-slot names used by VLD_RLD_IQ_Block.
//
// All offsets are relative to esp AFTER the routine has pushed its four
// saved registers, so the return address sits at esp+16 and the first
// argument at esp+20.  The routine allocates no stack space of its own:
// the "local variables" below simply reuse argument slots once the
// original argument value has been read (the aliased argument is named in
// the right-hand column).  The P_num slot (esp+32) has no local alias.

// local variable definitions
#define L_Quantizer		esp+20		// quantizer		P_BlockAction
#define L_Quantizer64	esp+24		// 64*quantizer		P_src
#define L_Bits      	esp+28		// bit offset		P_bits
#define L_CumRun		esp+36		// cumulative run	P_dst

// stack use (push order is ebp, esi, edi, ebx, so ebx ends up on top)
//	ebx					esp+0
//	edi					esp+4
//	esi					esp+8
//	ebp					esp+12
//	return address		esp+16

// input parameters
#define P_BlockAction 	esp+20		// L_Quantizer
#define P_src			esp+24		// L_Quantizer64
#define P_bits			esp+28		// L_Bits
#define P_num			esp+32		//
#define P_dst			esp+36		// L_CumRun


#pragma code_seg("IACODE1")
//--------------------------------------------------------------------------
//  VLD_RLD_IQ_Block
//
//  Decodes the transform coefficients of one block from the bitstream:
//  variable length decode, run length decode and inverse quantization.
//
//  Arguments (all read from the stack, see the P_xxx offsets):
//    lpBlockAction	byte 0 = block type, byte 3 = quantizer.
//    lpSrc			base of the bitstream; it is addressed as
//					lpSrc + (bit offset >> 3) using unaligned dword loads.
//    uBitsread		bit offset at which decoding starts.
//    pN				receives the number of coefficients decoded
//					(written only on success).
//    pIQ_INDEX		output array of dword pairs: [0] = signed
//					dequantized value, [1] = gTAB_ZZ_RUN[cumulative run].
//					NOTE(review): presumably sized for 64 pairs -- confirm
//					against the caller.
//
//  Returns (eax):
//    updated bit offset (uBitsread + bits consumed), or 0 on a bitstream
//    error (cumulative run > 63, or an escape code whose level is 0 or -128).
//
//  ebp, esi, edi and ebx are saved and restored; eax, ecx and edx are
//  clobbered.  The routine ends in a plain ret, so the arguments are left
//  on the stack for the caller.
//
//  A coefficient is stored to the output array only AFTER its cumulative
//  run has been validated, so a rejected coefficient never writes beyond
//  the slot of the last valid one.
//
//  Instructions indented by one extra space are the second half of an
//  intended Pentium instruction pair.
//--------------------------------------------------------------------------
extern "C" __declspec(naked)
U32 VLD_RLD_IQ_Block(T_BlkAction *lpBlockAction,
                     U8  *lpSrc, 
                     U32 uBitsread,
                     U32 *pN,
                     U32 *pIQ_INDEX)
{		
	__asm {

// save registers
	push	ebp
	 push	esi 
	push	edi			
	 push	ebx

//
// initialize
//	make sure we read in the P_src and P_dst pointers before we
//	overwrite them with L_Quantizer64 and L_CumRun.
//
//	Output Registers:
//		 dl = block type ([P_BlockAction])
//		esi = bitstream source pointer (P_src)
//		edi = coefficient destination pointer (P_dst)
//		ebp = coefficent counter (init to 0)
//		ebx = L_Bits>>3 (byte offset into the bitstream)
//		ecx = L_Bits&7  (bit offset inside that byte)
//
//	Locals initialized on Stack: (these overwrite indicated input parameters) 
//		local var		clobbers  		initial value
//		---------------------------------------------------
//		L_Quantizer		P_BlockAction	input quantizer
//		L_Quantizer64	P_src			64 * input quantizer
//		L_CumRun 		P_dst			-1
//
	xor 	ebp, ebp						// init coefficient counter to 0
 	 xor 	eax, eax						// zero eax so the byte load below is zero extended

	mov 	ecx, [P_BlockAction]        	// ecx = block action pointer
	 mov 	ebx, -1							// beginning cumulative run value

	mov 	esi, [P_src]  					// esi = bitstream source pointer
	 mov 	edi, [P_dst]					// edi = coefficient pointer

	mov 	al, [ecx+3]						// al = Quantizer
	 mov 	[L_CumRun], ebx					// init cumulative run to -1

	mov 	[L_Quantizer], eax				// save original quantizer
	 mov 	dl, [ecx]						// block type in dl

	shl 	eax, 6							// 64 * Quantizer
 	 mov 	ecx, [L_Bits]					// ecx = L_Bits

	mov 	ebx, ecx						// ebx = L_Bits
	 mov 	[L_Quantizer64], eax				// save 64*Quantizer for this block

	shr 	ebx, 3							// offset for input
	 and 	ecx, 7							// shift value

	cmp 	dl, 1							// check the block type for INTRA
	 ja 	get_next_coefficient			// if type 2 or larger, no INTRADC
	 
//
// Decode INTRADC
//
//	uses dword load & bitswap to achieve big endian ordering.
//	prior codes prepares ebx, cl, and dl as follows:
//		ebx = L_Bits>>3
//		cl  = L_Bits&7
//		dl  = BlockType (0=INTRA_DC, 1=INTRA, 2=INTER, etc.)
//
//	The 8-bit code is stored multiplied by 8, except that the code
//	11111111 is stored as 400h.  The DC coefficient always gets index 0.
//
	mov 	eax, [esi+ebx]					// *** PROBABLE MALALIGNMENT ***
	 inc 	ebp								// one coefficient decoded

	bswap	eax								// big endian order
											// *** NOT PAIRABLE ***

	shl 	eax, cl							// left justify bitstream buffer
											// *** NOT PAIRABLE ***
											// *** 4 CYCLES ***

	shr 	eax, 21							// top 11 bits to the bottom
 	 mov 	ecx, [L_Bits]					// ecx = L_Bits

	and 	eax, 07f8h						// mask last 3 bits: eax = 8 * (8-bit code)
	 add 	ecx, 8							// bits used += 8 for INTRADC

	cmp 	eax, 07f8h						// check for 11111111 codeword
	 jne 	skipa

	mov 	eax, 0400h						// 11111111 decodes to 400h = 1024 

skipa:
	mov 	[L_Bits], ecx					//  update bits used
	 xor 	ebx, ebx

	mov 	[L_CumRun], ebx					// save total run (starts with zero)
	 mov 	[edi], eax						// save decoded DC coefficient

	mov 	[edi+4], ebx					// save 0 index
	 mov 	ebx, ecx						// ebx = L_Bits

	shr 	ebx, 3							// offset for input
	 add 	edi, 8							// update coefficient pointer

//  check for last
	test 	dl, dl							// check for INTRA-DC (block type=0)
	 jz		finish							// if only the INTRADC present


//
// Get Next Coefficient
//
//	prior codes prepares ebx and ecx as follows:
//		ebx = L_Bits>>3
//		ecx = L_Bits (only the low 3 bits are used)
//

get_next_coefficient:
//  use dword load & bitswap to achieve big endian ordering
	mov 	eax, [esi+ebx]					// *** PROBABLE MALALIGNMENT ***
	 and 	ecx, 7							// shift value

	bswap	eax								// big endian order
											// *** NOT PAIRABLE ***

	shl 	eax, cl							// left justify buffer
											// *** NOT PAIRABLE ***
											// *** 4 CYCLES ***
 	
//  do table lookups
	mov 	ebx, eax						// ebx for major table
	 mov 	ecx, eax						// ecx for minor table

	shr 	ebx, 24							// major table lookup: index = top 8 bits

	shr 	ecx, 17							// minor table lookup in bits with garbage
	 mov 	ebx, [gTAB_TCOEFF_MAJOR+4*ebx]	// get the major table value
											// ** AGI **

	shr 	ebx, 1							// shift the hit/miss flag (bit 0) into carry
	 jnc 	skipb							// if hit major

	and 	ecx, 0ffch						// mask off garbage for minor table
	 test 	ebx, ebx						// escape code value was 0x00000001

	jz 		escape_code						// handle escape by major table.

	mov 	ebx, [gTAB_TCOEFF_MINOR+ecx]	// use minor table
											 
//
//  input is ebx = event.  See function header for the meaning of its fields
//  (here already shifted right by one bit):
//		bits 24-17 = bits used, 16 = last, 15-8 = run+1, 7-0 = 2*|level|+sign
//  now we decode the event, extracting the run, value, last.
//  The table value moves to ecx and is shifted downward as portions
//  are extracted to ebx. 
//
skipb:	
	mov 	ecx, ebx						// ecx = table value
	 and 	ebx, 0ffh						// ebx = 2*abs(level) + sign

	shr 	ecx, 8							// run to bottom
	 mov 	edx, [L_Quantizer64]			// edx = 64*quant

											//  ** PREFIX DELAY **
											//  ** AGI **
	mov 	ax, [gTAB_INVERSE_Q+edx+2*ebx]	// ax = dequantized value (I16)
	 mov 	ebx, ecx						// ebx = table value

	shl 	eax, 16							// shift value until sign bit is on top
	 and 	ebx, 0ffh						// ebx = run + 1

	sar 	eax, 16							// arithmetic shift extends value's sign
	 mov 	edx, [L_CumRun]					// edx = (old) cumulative run

	add 	edx, ebx						// cumulative run += run + 1

	cmp 	edx, 03fh						// check run for bitstream error
	 jg 	error

//  the run is valid: only now touch the output array, so that a rejected
//  coefficient never writes past the last valid slot.
	mov 	[edi], eax						// save coefficient's signed value
	 mov 	[L_CumRun], edx					// update the cumulative run

	inc 	ebp								// increment number of coefficients read

											//  ** AGI **
	mov 	edx, [gTAB_ZZ_RUN+4*edx]		// edx = index of the current coefficient
 	 mov 	ebx, ecx						// ebx:  bit 8 = last flag

	mov 	[edi+4], edx					// save coefficient's index
	 add 	edi, 8							// increment coefficient pointer

	shr 	ecx, 9							// ecx = bits decoded
 	 mov 	edx, [L_Bits]					// edx = L_Bits

	add 	ecx, edx						// L_Bits += bits decoded
	 mov 	edx, ebx						// edx:  bit 8 = last flag

	mov 	[L_Bits], ecx					// update L_Bits
	 mov 	ebx, ecx						// ebx = L_Bits

	shr 	ebx, 3							// offset for bitstream load
	 test	edx, 100h						// check for last

	jz  	get_next_coefficient	 	
			

//
// Successful exit: store the coefficient count through P_num and return
// the updated bit offset in eax.
//
finish:
	mov 	ecx, [P_num]   					// pointer to number of coeffients read
	 mov 	eax, [L_Bits]					// return total bits used

	pop 	ebx								
	 pop 	edi

	mov 	[ecx], ebp						// store number of coefficients read
	 pop 	esi

	pop 	ebp
	 ret


//
// process escape code separately
//
//	Layout of the left justified bitstream dword for an escape (22 bits):
//		bits 31-25 escape prefix, 24 last, 23-18 run, 17-10 signed level
//
//	we have the following 4 cases to compute the reconstructed value
//	depending on the sign of L=level and the parity of Q=quantizer:
//
//				L pos		L neg
//	Q even		2QL+(Q-1)	2QL-(Q-1)
//	Q odd		2QL+(Q)		2QL-(Q)
//
//	The Q or Q-1 term is formed as (Q-1) | 1:  Q-1 is already odd when
//	Q is even, and setting bit 0 of the even Q-1 gives Q when Q is odd.
//	The + or - on this term is gotten from a mask (=0 or =-1) formed
//	from the sign bit of Q*L:  term = (term ^ mask) - mask.
//	This will negate the term when L is negative and leave
//	it unchanged when L is positive.
//	The result is then clamped to -2048 ... +2047.
//	
//	Register usages:
//		eax		starts with bitstream, later L, finally result
//		ebx		starts with Q, later is the Q or Q-1 term
//		ecx		starts with mask, later bitstream (for the run)
//		edx		bitstream
//
escape_code:								
	mov 	edx, eax						// edx = bitstream buffer

	shl 	eax, 14							// signed 8-bit level to top

	sar 	eax, 24							// eax = L (signed level)
	 mov 	ebx, [L_Quantizer]

	test	eax, 7fh						// test for invalid codes (level 0 or -128)
	 jz  	error

	imul	eax, ebx						// eax = Q*L
											// *** NOT PAIRABLE ***
											// *** 10 cycles ***

	dec 	ebx								// term = Q-1
	 mov 	ecx, eax						// mask = QL

	or  	ebx, 1							// term = Q-1 if Q even, else = Q
	 sar 	ecx, 31							// mask = -1 if L neg, else = 0

	xor 	ebx, ecx						// term = ~Q[-1] if L neg, else = Q[-1]
	 add 	eax, eax						// result = 2*Q*L

	sub 	ebx, ecx						// term = -(Q[-1]) if L neg, else = Q[-1]
	 mov 	ecx, edx						// bitstream to ecx to get run

	add 	eax, ebx						// result = 2QL +- Q[-1]

//  now clip to -2048 ... +2047 (12 bits:  0xfffff800 <= res <= 0x000007ff)
	cmp 	eax, -2048
	 jge 	skip1

	mov 	eax, -2048
	 jmp	skip2

skip1:
	cmp 	eax, +2047
	 jle  	skip2

	mov 	eax, 2047

skip2:
//  update run and compute index

	shr 	ecx, 18							// run to bottom
 	 mov 	ebx, [L_CumRun]					// ebx = old total run

	and 	ecx, 3fh						// mask off bottom 6 bits for run
	 inc 	ebx								// old run ++

	add 	ebx, ecx						// ebx = new cumulative run

	cmp 	ebx, 03fh						// check run for bitstream error
	 jg 	error

//  the run is valid: only now touch the output array (see table path).
	mov 	[edi], eax						// save coefficient's signed value
  	 mov 	[L_CumRun], ebx					// update the cumulative run

	mov 	ecx, [L_Bits]					// ecx = number of bits used

	mov 	ebx, [gTAB_ZZ_RUN+4*ebx]		// ebx = index of the current coefficient
	add 	ecx, 22							// escape code uses 22 bits

	mov 	[edi+4], ebx					// save coefficient's index
	 add 	edi, 8							// increment coefficient pointer

	mov 	[L_Bits], ecx					// update number of bits used
 	 mov 	ebx, ecx						// ebx = L_Bits

	shr 	ebx, 3							// offset for bitstream load
	 inc 	ebp								// increment number of coefficients read

	test 	edx, 01000000h					// check last bit (bit 24 of the bitstream dword)
	 jz  	get_next_coefficient	 	

	jmp 	finish

				
//
// Bitstream error exit: restore registers and return 0.  The coefficient
// count is NOT written through P_num on this path.
//
error:
	pop		ebx								
	 pop 	edi

	pop		esi
	 pop 	ebp

	xor 	eax, eax						// zero bits used indicates ERROR
	 ret

 }

}
#pragma code_seg()