windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/d3bvriq.cpp


								/* *************************************************************************

								**    INTEL Corporation Proprietary Information

								**

								**    This listing is supplied under the terms of a license

								**    agreement with INTEL Corporation and may not be copied

								**    nor disclosed except in accordance with the terms of

								**    that agreement.

								**

								**    Copyright (c) 1995 Intel Corporation.

								**    All Rights Reserved.

								**

								** *************************************************************************

								*/


								//////////////////////////////////////////////////////////////////////////

								// $Author:   AGUPTA2  $

								// $Date:   22 Mar 1996 17:23:16  $

								// $Archive:   S:\h26x\src\dec\d3bvriq.cpv  $

								// $Header:   S:\h26x\src\dec\d3bvriq.cpv   1.7   22 Mar 1996 17:23:16   AGUPTA2  $

								// $Log:   S:\h26x\src\dec\d3bvriq.cpv  $

								//

								//    Rev 1.7   22 Mar 1996 17:23:16   AGUPTA2

								// Minor interface change to accommodate MMX rtns.  Now the interface is the

								// same for MMX and IA.

								//

								//    Rev 1.6   08 Mar 1996 16:46:10   AGUPTA2

								// Added pragma code_seg.

								//

								//

								//    Rev 1.5   15 Feb 1996 14:54:08   RMCKENZX

								// Gutted and re-wrote routine, optimizing for performance

								// for the p5.  Added clamping to -2048...+2047 to escape code

								// portion.

								//

								//    Rev 1.4   27 Dec 1995 14:36:00   RMCKENZX

								// Added copyright notice

								//

								//    Rev 1.3   09 Dec 1995 17:35:20   RMCKENZX

								// Re-checked in module to support decoder re-architecture (thru PB frames)

								//

								//    Rev 1.0   27 Nov 1995 14:36:46   CZHU

								// Initial revision.

								//

								//    Rev 1.28   03 Nov 1995 16:28:50   CZHU

								// Cleaning up and added more comments

								//

								//    Rev 1.27   31 Oct 1995 10:27:20   CZHU

								// Added error checking for total run value.

								//

								//    Rev 1.26   19 Sep 1995 10:45:12   CZHU

								//

								// Improved pairing and cleaned up

								//

								//    Rev 1.25   18 Sep 1995 10:20:28   CZHU

								// Fixed bugs in handling escape codes for INTER blocks w.r.t. run.

								//

								//    Rev 1.24   15 Sep 1995 09:35:30   CZHU

								// fixed bugs in run cumulation for inter

								//

								//    Rev 1.23   14 Sep 1995 10:13:32   CZHU

								//

								// Initialize cumulated run for the INTER blocks.

								//

								//    Rev 1.22   12 Sep 1995 17:36:06   AKASAI

								//

								// Fixed bug in addressing to Intermediate when changed from writing

								// BYTES to DWORDS.  Inter Butterfly only had the problem.

								//

								//    Rev 1.21   12 Sep 1995 13:37:58   AKASAI

								// Added Butterfly Inter code.  Also added optimizations to pre-fetch

								// accumulators and "output" cache lines.

								//

								//    Rev 1.20   11 Sep 1995 16:41:32   CZHU

								// Adjust target block address: write to Target if INTRA, write to tempory sto

								//

								//    Rev 1.19   11 Sep 1995 14:30:32   CZHU

								// Separate Butterfly for inter and intra, put place holder for inter blocks

								//

								//    Rev 1.18   08 Sep 1995 11:49:00   CZHU

								// Added support for P frames, fixed bugs related to INTRADC's presence.

								//

								//    Rev 1.17   28 Aug 1995 14:51:22   CZHU

								// Improve pairing and clean up

								//

								//    Rev 1.16   24 Aug 1995 15:36:24   CZHU

								//

								// Fixed bugs handling the escape code followed by 22bits fixed length code

								//

								//    Rev 1.15   23 Aug 1995 14:53:32   AKASAI

								// Changed butterfly writes to increment by bytes and take a PITCH.

								//

								//    Rev 1.14   23 Aug 1995 11:58:46   CZHU

								// Added signed extended inverse quant before calling idct. and others

								//

								//    Rev 1.13   22 Aug 1995 17:38:28   CZHU

								// Calls the idct accumulation for each symbol and butterfly at the end.

								//

								//    Rev 1.12   21 Aug 1995 14:39:58   CZHU

								//

								// Added IDCT initialization code and stubs for accumulation and butterfly.

								// Also added register saving and restoration before and after accumulation

								//

								//    Rev 1.11   18 Aug 1995 17:03:32   CZHU

								// Added comments and clean up for integration with IDCT

								//

								//    Rev 1.10   18 Aug 1995 15:01:52   CZHU

								// Fixed bugs in handling escape codes using byte oriented reading approach

								//

								//    Rev 1.9   16 Aug 1995 14:24:22   CZHU

								// Bug fixes for the integration with bitstream parsing. Also changed from DWO

								// reading to byte oriented reading.

								//

								//    Rev 1.8   15 Aug 1995 15:07:42   CZHU

								// Fixed the stack so that the parameters have been passed in correctly.

								//

								//    Rev 1.7   14 Aug 1995 16:39:02   DBRUCKS

								// changed pPBlock to pCurBlock

								//

								//    Rev 1.6   11 Aug 1995 16:08:12   CZHU

								// removed local varables in C

								//

								//    Rev 1.5   11 Aug 1995 15:51:26   CZHU

								//

								// Readjust local varables on the stack. Clear ECX upfront.

								//

								//    Rev 1.4   11 Aug 1995 15:14:32   DBRUCKS

								// variable name changes

								//

								//    Rev 1.3   11 Aug 1995 13:37:26   CZHU

								//

								// Adjust to the joint optimation of IDCT, IQ, RLE, and ZZ.

								// Also added place holders for IDCT.

								//

								//    Rev 1.2   11 Aug 1995 10:30:26   CZHU

								// Changed the functions parameters, and added codes to short-curcuit IDCT bef

								//

								//    Rev 1.1   03 Aug 1995 14:39:04   CZHU

								//

								// further optimization.

								//

								//    Rev 1.0   02 Aug 1995 15:20:02   CZHU

								// Initial revision.

								//

								//    Rev 1.1   02 Aug 1995 10:21:12   CZHU

								// Added asm codes for VLD of TCOEFF, inverse quantization, run-length decode.

								//


								//--------------------------------------------------------------------------

								//

								//  d3bvriq.cpp

								//

								//  Description:

								//    This routine performs run length decoding and inverse quantization

								//    of transform coefficients for one block.

								//	 MMx version.

								//

								//  Routines:

								//    VLD_RLD_IQ_Block

								//

								//  Inputs (dwords pushed onto stack by caller, listed in parameter order):

								//    lpBlockAction  pointer to Block action stream for current blk.

								//

								//	 lpSrc			The input bitstream.

								//

								//	 uBitsread		Number of bits already read.

								//

								//    pN				Pointer to number of coefficients read.

								//

								//    pIQ_INDEX		Pointer to coefficients and indices.

								//

								//  Returns:

								//    0 				on bit stream error, otherwise total number of bits read

								//					(including number read prior to call).

								//

								//  Note:

								//			The structure of gTAB_TCOEFF_MAJOR is as follows:

								//				bits		name:		description

								//				----		-----		-----------

								//				25-18		bits:		number of bitstream bits used

								//				17			last:		flag for last coefficient

								//				16-9		run:		number of preceding 0 coefficients plus 1

								//				8-2			level:		absolute value of coefficient

								//				1			sign:		sign of coefficient

								//				0			hit:		1 = major table miss, 0 = major table hit

								//

								//			The structure of gTAB_TCOEFF_MINOR is the same, right shifted by 1 bit.

								//			A gTAB_TCOEFF_MAJOR value of 00000001h indicates the escape code.

								//

								//--------------------------------------------------------------------------


								#include "precomp.h"


								// local variable definitions

								// All offsets are relative to esp after the four register pushes made

								// at function entry.  Each local reuses the stack slot of the input

								// parameter named in the right-hand column, so that parameter must be

								// read before the local is first written.

								#define L_Quantizer		esp+20		// quantizer		P_BlockAction

								#define L_Quantizer64	esp+24		// 64*quantizer		P_src

								#define L_Bits      	esp+28		// bit offset		P_bits

								#define L_CumRun		esp+36		// cumulative run	P_dst


								// stack use (registers are pushed in the order ebp, esi, edi, ebx,

								// so the last one pushed sits at the lowest address)

								//	ebx					esp+0

								//	edi					esp+4

								//	esi					esp+8

								//	ebp					esp+12

								//	return address		esp+16


								// input parameters (in the order of the C parameter list)

								#define P_BlockAction 	esp+20		// lpBlockAction;	slot reused by L_Quantizer

								#define P_src			esp+24		// lpSrc;			slot reused by L_Quantizer64

								#define P_bits			esp+28		// uBitsread;		slot shared with L_Bits

								#define P_num			esp+32		// pN;				not reused as a local

								#define P_dst			esp+36		// pIQ_INDEX;		slot reused by L_CumRun


								#pragma code_seg("IACODE1")

								//

								// VLD_RLD_IQ_Block

								//

								//	Decodes the transform coefficient events of one block from the

								//	bitstream, inverse quantizes them, and writes one pair of dwords

								//	(signed value, coefficient index) per coefficient to pIQ_INDEX.

								//

								//	lpBlockAction	byte 0 = block type, byte 3 = quantizer

								//	lpSrc			bitstream base pointer (bits are consumed big endian)

								//	uBitsread		bit offset into lpSrc at which decoding starts

								//	pN				receives the number of pairs written (success only)

								//	pIQ_INDEX		output array of (value, index) dword pairs

								//

								//	Returns the updated bit offset, or 0 on a bitstream error.  On error

								//	*pN is not written.

								//

								//	This is a naked function: it saves and restores ebp, esi, edi and ebx

								//	itself, addresses everything relative to esp, reuses the incoming

								//	argument slots as locals (the caller's argument area is overwritten),

								//	and returns with a plain ret.

								//

								//	A coefficient's value is stored only after its cumulative run has been

								//	validated against 63, so a stream that runs past the end of the block

								//	is rejected without writing anything for the offending coefficient.

								//

								extern "C" __declspec(naked)

								U32 VLD_RLD_IQ_Block(T_BlkAction *lpBlockAction,

								                     U8  *lpSrc,

								                     U32 uBitsread,

								                     U32 *pN,

								                     U32 *pIQ_INDEX)

								{

									__asm {


								// save registers

								//	afterwards: ebx=[esp+0], edi=[esp+4], esi=[esp+8], ebp=[esp+12],

								//	return address=[esp+16], first argument=[esp+20]

									push	ebp

									 push	esi

									push	edi

									 push	ebx


								//

								// initialize

								//	make sure we read in the P_src and P_dst pointers before we

								//	overwrite them with L_Quantizer64 and L_CumRun.

								//

								//	Output Registers:

								//		 dl = block type ([P_BlockAction])

								//		esi = bitstream source pointer (P_src)

								//		edi = coefficient destination pointer (P_dst)

								//		ebp = coefficient counter (init to 0)

								//		ebx = L_Bits>>3 (byte offset of the next bitstream load)

								//		ecx = L_Bits&7  (bit offset within that byte)

								//

								//	Locals initialized on Stack: (these overwrite indicated input parameters)

								//		local var		clobbers  		initial value

								//		---------------------------------------------------

								//		L_Quantizer		P_BlockAction	input quantizer

								//		L_Quantizer64	P_src			64 * input quantizer

								//		L_CumRun 		P_dst			-1

								//

									xor 	ebp, ebp						// init coefficient counter to 0

									 xor 	eax, eax						// zero eax so the byte load below is zero extended


									mov 	ecx, [P_BlockAction]        	// ecx = block action pointer

									 mov 	ebx, -1							// beginning cumulative run value


									mov 	esi, [P_src]  					// esi = bitstream source pointer

									 mov 	edi, [P_dst]					// edi = coefficient pointer


									mov 	al, [ecx+3]						// al = Quantizer

									 mov 	[L_CumRun], ebx					// init cumulative run to -1


									mov 	[L_Quantizer], eax				// save original quantizer

									 mov 	dl, [ecx]						// block type in dl


									shl 	eax, 6							// 64 * Quantizer

									 mov 	ecx, [L_Bits]					// ecx = L_Bits


									mov 	ebx, ecx						// ebx = L_Bits

									 mov 	[L_Quantizer64], eax				// save 64*Quantizer for this block


									shr 	ebx, 3							// offset for input

									 and 	ecx, 7							// shift value


									cmp 	dl, 1							// check the block type for INTRA

									 ja 	get_next_coefficient			// if type 2 or larger, no INTRADC


								//

								// Decode INTRADC

								//

								//	uses dword load & bitswap to achieve big endian ordering.

								//	prior codes prepares ebx, cl, and dl as follows:

								//		ebx = L_Bits>>3

								//		cl  = L_Bits&7

								//		dl  = BlockType (0=INTRA_DC, 1=INTRA, 2=INTER, etc.)

								//

								//	The 8-bit code is kept multiplied by 8 (it lands in bits 10..3 of

								//	eax); the code 11111111 is replaced by 400h.

								//

									mov 	eax, [esi+ebx]					// *** PROBABLE MALALIGNMENT ***

									 inc 	ebp								// one coefficient decoded


									bswap	eax								// big endian order

																			// *** NOT PAIRABLE ***


									shl 	eax, cl							// left justify bitstream buffer

																			// *** NOT PAIRABLE ***

																			// *** 4 CYCLES ***


									shr 	eax, 21							// top 11 bits to the bottom

									 mov 	ecx, [L_Bits]					// ecx = L_Bits


									and 	eax, 07f8h						// clear low 3 bits: eax = 8 * (8-bit code)

									 add 	ecx, 8							// bits used += 8 for INTRADC


									cmp 	eax, 07f8h						// check for 11111111 codeword

									 jne 	skipa


									mov 	eax, 0400h						// 11111111 decodes to 400h = 1024


								skipa:

									mov 	[L_Bits], ecx					//  update bits used

									 xor 	ebx, ebx


									mov 	[L_CumRun], ebx					// save total run (starts with zero)

									 mov 	[edi], eax						// save decoded DC coefficient


									mov 	[edi+4], ebx					// save 0 index

									 mov 	ebx, ecx						// ebx = L_Bits


									shr 	ebx, 3							// offset for input

									 add 	edi, 8							// update coefficient pointer


								//  check for last

									test 	dl, dl							// check for INTRA-DC (block type=0)

									 jz		finish							// if only the INTRADC present


								//

								// Get Next Coefficient

								//

								//	prior codes prepares ebx and ecx as follows:

								//		ebx = L_Bits>>3

								//		ecx = L_Bits (only the low 3 bits are used)

								//


								get_next_coefficient:

								//  use dword load & bitswap to achieve big endian ordering

									mov 	eax, [esi+ebx]					// *** PROBABLE MALALIGNMENT ***

									 and 	ecx, 7							// shift value


									bswap	eax								// big endian order

																			// *** NOT PAIRABLE ***


									shl 	eax, cl							// left justify buffer

																			// *** NOT PAIRABLE ***

																			// *** 4 CYCLES ***


								//  do table lookups

									mov 	ebx, eax						// ebx for major table

									 mov 	ecx, eax						// ecx for minor table


									shr 	ebx, 24							// major table lookup: index = top 8 bits


									shr 	ecx, 17							// minor table lookup in bits with garbage

									 mov 	ebx, [gTAB_TCOEFF_MAJOR+4*ebx]	// get the major table value

																			// ** AGI **


									shr 	ebx, 1							// hit flag (bit 0) into carry

									 jnc 	skipb							// if hit major


									and 	ecx, 0ffch						// mask off garbage for minor table

									 test 	ebx, ebx						// escape code value was 0x00000001


									jz 		escape_code						// handle escape by major table.


									mov 	ebx, [gTAB_TCOEFF_MINOR+ecx]	// use minor table


								//

								//  input is ebx = event.  See function header for the meaning of its fields

								//  now we decode the event, extracting the run, value, last.

								//  The table value moves to ecx and is shifted downward as portions

								//  are extracted to ebx.

								//

								//  NOTE(review): forward progress of this loop and the bound of 64

								//  stored coefficients both rely on every table entry that reaches this

								//  point having run+1 >= 1 and a nonzero bit count.  The tables are

								//  defined elsewhere -- confirm their entries for invalid codes.

								//

								skipb:

									mov 	ecx, ebx						// ecx = table value

									 and 	ebx, 0ffh						// ebx = 2*abs(level) + sign


									shr 	ecx, 8							// run to bottom

									 mov 	edx, [L_Quantizer64]			// edx = 64*quant


																			//  ** PREFIX DELAY **

																			//  ** AGI **

									mov 	ax, [gTAB_INVERSE_Q+edx+2*ebx]	// ax = dequantized value (I16)

									 mov 	ebx, ecx						// ebx = table value


									shl 	eax, 16							// shift value until sign bit is on top

									 and 	ebx, 0ffh						// ebx = run + 1


									sar 	eax, 16							// arithmetic shift extends value's sign

									 mov 	edx, [L_CumRun]					// edx = (old) cumulative run


									add 	edx, ebx						// cumulative run += run + 1


									cmp 	edx, 03fh						// check run for bitstream error

									 jg 	error							// reject before anything is stored


									mov 	[L_CumRun], edx					// update the cumulative run

									 mov 	[edi], eax						// save coefficient's signed value (run is valid)


									inc 	ebp								// increment number of coefficients read

									 mov 	ebx, ecx						// ebx:  bit 8 = last flag


									mov 	edx, [gTAB_ZZ_RUN+4*edx]		// edx = index of the current coefficient


									mov 	[edi+4], edx					// save coefficient's index

									 add 	edi, 8							// increment coefficient pointer


									shr 	ecx, 9							// ecx = bits decoded

									 mov 	edx, [L_Bits]					// edx = L_Bits


									add 	ecx, edx						// L_Bits += bits decoded

									 mov 	edx, ebx						// edx:  bit 8 = last flag


									mov 	[L_Bits], ecx					// update L_Bits

									 mov 	ebx, ecx						// ebx = L_Bits


									shr 	ebx, 3							// offset for bitstream load

									 test	edx, 100h						// check for last


									jz  	get_next_coefficient


								//

								// Successful exit

								//

								//	P_num and L_Bits are read while esp still points at the saved

								//	registers, and ebp is stored before it is restored.

								//

								finish:

									mov 	ecx, [P_num]   					// pointer to number of coefficients read

									 mov 	eax, [L_Bits]					// return total bits used


									pop 	ebx

									 pop 	edi


									mov 	[ecx], ebp						// store number of coefficients read

									 pop 	esi


									pop 	ebp

									 ret


								//

								// process escape code separately

								//

								//	With the buffer left justified, the escape event occupies 22 bits:

								//		bits 31-25	escape prefix

								//		bit  24		last flag

								//		bits 23-18	run

								//		bits 17-10	signed 8-bit level

								//

								//	we have the following 4 cases to compute the reconstructed value

								//	depending on the sign of L=level and the parity of Q=quantizer:

								//

								//				L pos		L neg

								//	Q even		2QL+(Q-1)	2QL-(Q-1)

								//	Q odd		2QL+(Q)		2QL-(Q)

								//

								//	The Q or Q-1 term is formed as (Q-1) | 1.

								//	The + or - on this term is gotten from a mask (=0 or =-1) formed

								//	from the sign bit of Q*L:  term = (term xor mask) - mask.

								//	This will negate the term when L is negative and leave

								//	it unchanged when L is positive.

								//

								//	Register usages:

								//		eax		starts with bitstream, later L, finally result

								//		ebx		starts with Q, later is the signed Q or Q-1 term

								//		ecx		starts with mask, later bitstream copy used to extract run

								//		edx		bitstream (kept for the last flag test)

								//

								escape_code:

									mov 	edx, eax						// edx = bitstream buffer


									shl 	eax, 14							// signed 8-bit level to top


									sar 	eax, 24							// eax = L (signed level)

									 mov 	ebx, [L_Quantizer]


									test	eax, 7fh						// low 7 bits all zero: level is 0 or -128

									 jz  	error							// both are invalid codes


									imul	eax, ebx						// eax = Q*L

																			// *** NOT PAIRABLE ***

																			// *** 10 cycles ***


									dec 	ebx								// term = Q-1

									 mov 	ecx, eax						// mask = QL


									or  	ebx, 1							// term = Q-1 if Q even, else = Q

									 sar 	ecx, 31							// mask = -1 if L neg, else = 0


									xor 	ebx, ecx						// term = ~Q[-1] if L neg, else = Q[-1]

									 add 	eax, eax						// result = 2*Q*L


									sub 	ebx, ecx						// term = -(Q[-1]) if L neg, else = Q[-1]

									 mov 	ecx, edx						// bitstream to ecx to get run


									add 	eax, ebx						// result = 2QL +- Q[-1]


								//  now clip to -2048 ... +2047 (12 bits:  0xfffff800 <= res <= 0x000007ff)

									cmp 	eax, -2048

									 jge 	skip1


									mov 	eax, -2048

									 jmp	skip2


								skip1:

									cmp 	eax, +2047

									 jle  	skip2


									mov 	eax, 2047


								skip2:

								//  update run and compute index


									shr 	ecx, 18							// run to bottom

									 mov 	ebx, [L_CumRun]					// ebx = old total run


									and 	ecx, 3fh						// keep the 6-bit run field

									 inc 	ebx								// old run ++


									add 	ebx, ecx						// ebx = new cumulative run


									cmp 	ebx, 03fh						// check run for bitstream error

									 jg 	error							// reject before anything is stored


									mov 	[L_CumRun], ebx					// update the cumulative run

									 mov 	[edi], eax						// save coefficient's signed value (run is valid)


									mov 	ecx, [L_Bits]					// ecx = number of bits used


									mov 	ebx, [gTAB_ZZ_RUN+4*ebx]		// ebx = index of the current coefficient

									add 	ecx, 22							// escape code uses 22 bits


									mov 	[edi+4], ebx					// save coefficient's index

									 add 	edi, 8							// increment coefficient pointer


									mov 	[L_Bits], ecx					// update number of bits used

									 mov 	ebx, ecx						// ebx = L_Bits


									shr 	ebx, 3							// offset for bitstream load

									 inc 	ebp								// increment number of coefficients read


									test 	edx, 01000000h					// check last bit (bit 24 of the buffer)

									 jz  	get_next_coefficient


									jmp 	finish


								//

								// Error exit:  restore registers and return 0.  *pN is not written.

								//

								error:

									pop		ebx

									 pop 	edi


									pop		esi

									 pop 	ebp


									xor 	eax, eax						// zero bits used indicates ERROR

									 ret


								 }


								}

								#pragma code_seg()