windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/e3rgb16.cpp


								/* *************************************************************************

								**    INTEL Corporation Proprietary Information

								**

								**    This listing is supplied under the terms of a license

								**    agreement with INTEL Corporation and may not be copied

								**    nor disclosed except in accordance with the terms of

								**    that agreement.

								**

								**    Copyright (c) 1995 Intel Corporation.

								**    All Rights Reserved.

								**

								** *************************************************************************

								*/


								#include "precomp.h"


								#if defined(H263P) || defined(USE_BILINEAR_MSH26X) // {


								//

								// For the P5 versions, the strategy is to compute the Y value for an odd RGB value

								// followed by computing the Y value for the corresponding even RGB value. The registers

								// are then set with the proper values to compute U and V values for the even RGB

								// value. This avoids repeating the shifting and masking needed to extract the Red,

								// Green and Blue components.

								//

								// Only the 555 version of RGB16 input color conversion is provided. To generate

								// other versions, use the following table.

								//

								//   number             shift              mask

								//                     B, G, R

								//   ------          -----------       ----------------

								//    555              2, 3, 8         0x7C, 0x7C, 0x7C

								//    664              3, 3, 9         0x78, 0x7E, 0x7E

								//    565              2, 4, 9         0x7C, 0x7E, 0x7C

								//    655              2, 3, 9         0x7C, 0x7C, 0x7E

								//

								// Only 555 falls under BI_RGB. The others are specified using the

								// BI_BITFIELDS compression specification. For BI_BITFIELDS, call

								// Build16bitModeID to get the actual bitfield number. This routine requires the

								// three array elements in the bmiColors field of a BITMAPINFO object.

								//

								/*****************************************************************************

								 *

								 *  H26X_BGR16555toYUV12()

								 *

								 *  Convert from 16-bit RGB (5:5:5) to YUV12 (YCrCb 4:2:0) and copy to destination

								 *  memory with the pitch given by the pitch parameter. Each input pixel is a

								 *  16-bit word with B in bits 0-4, G in bits 5-9 and R in bits 10-14.

								 *

								 */


								#if 0 // { 0


								// C reference version of the RGB16 (5:5:5) to YUV12 input color converter.

								// It is compiled out by the surrounding "#if 0" and documents the algorithm.

								//

								// lpbiInput    - bitmap header of the input image; not referenced directly in

								//                this body (NOTE(review): presumably consumed by

								//                C_RGB_COLOR_CONVERT_INIT - confirm against the macro).

								// OutputWidth  - number of luma pixels produced per row; the inner loop steps

								//                four pixels at a time.

								// OutputHeight - not referenced directly in this body.

								// lpInput      - not referenced directly in this body.

								// YPlane       - destination for luma bytes.

								// UPlane       - destination for U bytes, written only on even values of k.

								// VPlane       - destination for V bytes, written only on even values of k.

								// pitch        - byte distance between consecutive rows of the Y plane.

								//

								// The names i, j, k, t, tm, pnext, mark, LumaIters, stretch, pyprev, pyspace,

								// pynext, BackTwoLines, byte_ypitch_adj and byte_uvpitch_adj are not declared

								// here. NOTE(review): presumably they are declared and initialized by

								// C_RGB_COLOR_CONVERT_INIT - confirm against the macro definition.

								void C_H26X_BGR16555toYUV12(

									LPBITMAPINFOHEADER	lpbiInput,

									WORD OutputWidth,

									WORD OutputHeight,

								    U8 *lpInput,

									U8 *YPlane,

									U8 *UPlane,

									U8 *VPlane,

									const int pitch)

								{

									// t1/t2 keep the packed Y/U table sums of the low pixel of each input

									// word so that U can be taken from them after the Y byte is stored.

									int t1, t2;

									// tm1/tm2 hold the two input words read per inner-loop iteration.

									int tm1, tm2;


									C_RGB_COLOR_CONVERT_INIT


									for ( j = 0; j < LumaIters; j++) {


										for (k = 0; k < mark; k++) {

											for (i = OutputWidth; i > 0; i-=4, YPlane+=4) {

												// Each word read through pnext is decoded as two pixels: the

												// low one with shifts <<2, >>3, >>8 and the high one with

												// shifts >>14, >>19, >>24, each masked with 0x7C to form the

												// B, G and R table indices.

												tm1 = *pnext++;

												t1 = (BYUV[(tm1<<2)&0x7C].YU +

													 GYUV[(tm1>>3)&0x7C].YU +

													 RYUV[(tm1>>8)&0x7C].YU);

												*(YPlane) = (U8)((t1>>SHIFT_WIDTH)+8);

												t = (BYUV[(tm1>>14)&0x7C].YU +

													 GYUV[(tm1>>19)&0x7C].YU +

													 RYUV[(tm1>>24)&0x7C].YU);

												*(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);

												tm2 = *pnext++;

												t2 = (BYUV[(tm2<<2)&0x7C].YU +

													 GYUV[(tm2>>3)&0x7C].YU +

													 RYUV[(tm2>>8)&0x7C].YU);

												*(YPlane+2) = (U8)((t2>>SHIFT_WIDTH)+8);

												t = (BYUV[(tm2>>14)&0x7C].YU +

													 GYUV[(tm2>>19)&0x7C].YU +

													 RYUV[(tm2>>24)&0x7C].YU);

												*(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);

												// Chroma is produced only on even k, and only from the low

												// pixel of each word (t1 and t2).

												if (0 == (k&1)) {

													// U is the top byte of the packed YU sum, biased by 64

													// (0x40000000 is 64 << 24). Two U bytes are stored with

													// one 16-bit write: t1 in the low byte, t2 in the high.

													*(U16 *)UPlane = ((t1+0x40000000)>>24) |	(((t2+0x40000000)>>16)&0xFF00);

													t1 = (RYUV[(tm1>>8)&0x7C].V +

														 GYUV[(tm1>>3)&0x7C].V +

														 BYUV[(tm1<<2)&0x7C].V);

													t2 = (RYUV[(tm2>>8)&0x7C].V +

														 GYUV[(tm2>>3)&0x7C].V +

														 BYUV[(tm2<<2)&0x7C].V);

													// V is taken from bits 8-15 of the .V sum, biased by 64

													// (0x4000 is 64 << 8), and packed the same way as U.

													*(U16 *)VPlane = ((t1+0x4000)>>8) | ((t2+0x4000)&0xFF00);

													UPlane += 2; VPlane += 2;

												}

											}

											// The next two cases are mutually exclusive.

											// If there is a width_diff there cannot be a stretch and

											// if there is a stretch, there cannot be a width_diff.

											C_WIDTH_FILL

											// On the first row of every pass after the first, fill the row at

											// pyspace with the per-byte average of the rows at pyprev and

											// pynext. Masking with 0xFEFEFEFE before the shift keeps the low

											// bit of each byte from spilling into its neighbour.

											if (stretch && (0 == k) && j) {

												for (i = OutputWidth; i > 0; i -= 8) {

													tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);

													tm += ((*pynext++ & 0xFEFEFEFE) >> 1);

													*pyspace++ = tm;

													tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);

													tm += ((*pynext++ & 0xFEFEFEFE) >> 1);

													*pyspace++ = tm;

												}

											}

											pnext += BackTwoLines;

											YPlane += byte_ypitch_adj;

											// Increment after even lines.

											if(0 == (k&1)) {

												UPlane += byte_uvpitch_adj;

												VPlane += byte_uvpitch_adj;

											}

										} // end of for k

										// When stretching, leave one output row free (pyspace) between the

										// last converted row (pyprev) and the next row to be converted

										// (pynext); it is filled in later.

										if (stretch) {

											pyprev = (U32 *)(YPlane - pitch);

											pyspace = (U32 *)YPlane;

											pynext = (U32 *)(YPlane += pitch);

										}

									} // end of for j

									// The next two cases are mutually exclusive.

									// If there is a height_diff there cannot be a stretch and

									// if there is a stretch, there cannot be a height_diff.

									C_HEIGHT_FILL

									// The last reserved row has no following row to average with, so it

									// is filled with a copy of the row before it.

									if (stretch) {

										for (i = OutputWidth; i > 0; i -= 4) {

											*pyspace++ = *pyprev++;

										}

									}

								} // end of C_H26X_BGR16555toYUV12


								#endif // } 0


								/*****************************************************************************
								 *
								 *  P5_H26X_BGR16555toYUV12()
								 *
								 *  Hand-scheduled x86 (MSVC inline assembler) version of the RGB16 (5:5:5)
								 *  to YUV12 input color converter. The "// n" markers inside the loops
								 *  number the instruction pairs of the original schedule.
								 *
								 *  Parameters (all read from the stack, see the frame layout below):
								 *    lpbiInput    - input bitmap header; biWidth, biHeight and biBitCount
								 *                   are read from it.
								 *    OutputWidth  - luma pixels produced per row (read as a 16-bit value).
								 *    OutputHeight - luma rows requested (read as a 16-bit value).
								 *    lpInput      - base address of the input pixels.
								 *    YPlane       - destination for luma bytes.
								 *    UPlane       - destination for U bytes (written on even k only).
								 *    VPlane       - destination for V bytes (written on even k only).
								 *    pitch        - byte distance between rows of the output planes.
								 *
								 *  Register and stack conventions:
								 *    - The function is naked: it saves ebp, ebx, esi and edi itself,
								 *      reserves LOCALSIZE bytes of locals addressed from esp, and leaves
								 *      with a plain "ret", so the arguments stay on the stack for the
								 *      caller to remove.
								 *    - eax, ecx and edx are used as scratch and are not preserved.
								 *    - BYUV, GYUV and RYUV are lookup tables defined outside this block;
								 *      their .YU and .V members are read with an index scaled by 8.
								 *
								 *  Loop-count assumptions visible in the code:
								 *    - The pixel loop subtracts 4 from the width and ends on zero, so
								 *      OutputWidth must be a positive multiple of 4.
								 *    - The row-averaging loop subtracts 8 and ends on zero, so OutputWidth
								 *      must be a multiple of 8 when the stretch path is taken.
								 *    - The chroma fill loops subtract 2 from height_diff and end on zero,
								 *      so height_diff must be even.
								 *
								 */
								__declspec(naked)
								void P5_H26X_BGR16555toYUV12(
									LPBITMAPINFOHEADER	lpbiInput,
									WORD OutputWidth,
									WORD OutputHeight,
									U8 *lpInput,
									U8 *YPlane,
									U8 *UPlane,
									U8 *VPlane,
									const int pitch)
								{
								// Permanent (callee-save) registers - ebx, esi, edi, ebp
								// Temporary (caller-save) registers - eax, ecx, edx
								//
								// Stack frame layout (offsets from esp after the prologue)
								//	| pitch				|  +136
								//	| VPlane			|  +132
								//	| UPlane			|  +128
								//	| YPlane			|  +124
								//	| lpInput			|  +120
								//	| OutputHeight		|  +116
								//	| OutputWidth		|  +112
								//	| lpbiInput			|  +108
								//	----------------------------
								//	| return addr		|  +104
								//	| saved ebp			|  +100
								//	| saved ebx			|  + 96
								//	| saved esi			|  + 92
								//	| saved edi			|  + 88

								//  | output_width		|  + 84
								//  | pyprev			|  + 80
								//  | pyspace			|  + 76
								//  | pynext	        |  + 72
								//  | puvprev			|  + 68
								//  | puvspace			|  + 64
								//	| i					|  + 60
								//	| j					|  + 56
								//	| k					|  + 52
								//	| BackTwoLines		|  + 48
								//	| widthx16			|  + 44
								//	| heightx16			|  + 40
								//	| width_diff		|  + 36
								//	| height_diff		|  + 32
								//	| width_adj			|  + 28
								//	| height_adj		|  + 24
								//	| stretch			|  + 20
								//	| aspect			|  + 16
								//	| LumaIters			|  + 12
								//	| mark				|  +  8
								//	| byte_ypitch_adj	|  +  4
								//	| byte_uvpitch_adj	|  +  0

								#define LOCALSIZE			 88

								#define PITCH_PARM			136
								#define VPLANE				132
								#define UPLANE				128
								#define YPLANE				124
								#define LP_INPUT			120
								#define OUTPUT_HEIGHT_WORD	116
								#define OUTPUT_WIDTH_WORD	112
								#define LPBI_INPUT			108

								#define	OUTPUT_WIDTH		 84
								#define	PYPREV				 80
								#define	PYSPACE				 76
								#define	PYNEXT				 72
								#define	PUVPREV				 68
								#define	PUVSPACE			 64
								#define LOOP_I				 60
								#define LOOP_J				 56
								#define LOOP_K				 52
								#define BACK_TWO_LINES		 48
								#define WIDTHX16			 44
								#define HEIGHTX16			 40
								#define WIDTH_DIFF			 36
								#define HEIGHT_DIFF			 32
								#define WIDTH_ADJ			 28
								#define HEIGHT_ADJ			 24
								#define STRETCH				 20
								#define ASPECT				 16
								#define LUMA_ITERS			 12
								#define MARK				  8
								#define BYTE_YPITCH_ADJ		  4
								#define BYTE_UVPITCH_ADJ	  0

									_asm {

								// Prologue: save the callee-save registers and reserve the locals.
									push	ebp
									push	ebx
									push	esi
									push	edi
									sub		esp, LOCALSIZE

								//	int width_diff = 0
								//	int height_diff = 0
								//	int width_adj = 0
								//	int height_adj = 0
								//	int stretch = 0
								//	int aspect = 0

									xor		eax, eax
									mov		[esp + WIDTH_DIFF], eax
									mov		[esp + HEIGHT_DIFF], eax
									mov		[esp + WIDTH_ADJ], eax
									mov		[esp + HEIGHT_ADJ], eax
									mov		[esp + STRETCH], eax
									mov		[esp + ASPECT], eax

								//	int LumaIters = 1

									inc		eax
									mov		[esp + LUMA_ITERS], eax

								//	int mark = OutputHeight
								//	int output_width = OutputWidth
								//	int byte_ypitch_adj = pitch - OutputWidth
								//	int byte_uvpitch_adj = pitch - (OutputWidth >> 1)
								// The WORD arguments are loaded into bx on top of a zeroed ebx, which
								// zero-extends them to 32 bits.

									xor		ebx, ebx
									mov		bx, [esp + OUTPUT_HEIGHT_WORD]
									mov		[esp + MARK], ebx
									mov		bx, [esp + OUTPUT_WIDTH_WORD]
									mov		[esp + OUTPUT_WIDTH], ebx
									mov		ecx, [esp + PITCH_PARM]
									mov		edx, ecx
									sub		ecx, ebx
									mov		[esp + BYTE_YPITCH_ADJ], ecx
									shr		ebx, 1
									sub		edx, ebx
									mov		[esp + BYTE_UVPITCH_ADJ], edx

								//	if (lpbiInput->biHeight > OutputHeight)
								// ebx holds lpbiInput from here until the main loop starts.

									mov		ebx, [esp + LPBI_INPUT]
									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
									xor		edx, edx
									mov		dx, [esp + OUTPUT_HEIGHT_WORD]
									cmp		ecx, edx
									jle		Lno_stretch

								//		for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) LumaIters += 4
								// The loop must end on "i > 0" (jg), not on "i != 0": with an
								// OutputHeight that is not a multiple of 48 the counter steps over
								// zero, and a jnz test would keep counting through the whole 32-bit
								// range. For multiples of 48 both tests give the same LumaIters.

									xor		ecx, ecx
								Lrepeat48:
									lea		ecx, [ecx + 4]
									sub		edx, 48
									jg		Lrepeat48
									mov		[esp + LUMA_ITERS], ecx

								//		aspect = LumaIters

									mov		[esp + ASPECT], ecx

								//		width_adj = (lpbiInput->biWidth - OutputWidth) >> 1
								//		width_adj *= lpbiInput->biBitCount
								//		width_adj >>= 3

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
									mov		edx, [esp + OUTPUT_WIDTH]
									sub		ecx, edx
									shr		ecx, 1
									xor		edx, edx
									mov		dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
									imul	ecx, edx
									shr		ecx, 3
									mov		[esp + WIDTH_ADJ], ecx

								//		height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
									xor		edx, edx
									mov		dx, [esp + OUTPUT_HEIGHT_WORD]
									sub		ecx, edx
									add		ecx, [esp + ASPECT]
									shr		ecx, 1
									mov		[esp + HEIGHT_ADJ], ecx

								//		stretch = 1
								//		mark = 11
								// Each pass of the j loop then converts 11 rows and reserves one row
								// that is filled by averaging its neighbours.

									mov		ecx, 1
									mov		edx, 11
									mov		[esp + STRETCH], ecx
									mov		[esp + MARK], edx
									jmp		Lif_done

								Lno_stretch:

								//		widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF
								//		width_diff = widthx16 - OutputWidth

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
									add		ecx, 00FH
									and		ecx, 0FFFFFFF0H
									mov		[esp + WIDTHX16], ecx
									mov		edx, [esp + OUTPUT_WIDTH]
									sub		ecx, edx
									mov		[esp + WIDTH_DIFF], ecx

								//		byte_ypitch_adj -= width_diff

									mov		edx, [esp + BYTE_YPITCH_ADJ]
									sub		edx, ecx
									mov		[esp + BYTE_YPITCH_ADJ], edx

								//		byte_uvpitch_adj -= (width_diff >> 1)

									mov		edx, [esp + BYTE_UVPITCH_ADJ]
									shr		ecx, 1
									sub		edx, ecx
									mov		[esp + BYTE_UVPITCH_ADJ], edx

								//		heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF
								//		height_diff = heightx16 - OutputHeight

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
									add		ecx, 00FH
									and		ecx, 0FFFFFFF0H
									mov		[esp + HEIGHTX16], ecx
									xor		edx, edx
									mov		dx, [esp + OUTPUT_HEIGHT_WORD]
									sub		ecx, edx
									mov		[esp + HEIGHT_DIFF], ecx

								Lif_done:

								//	BackTwoLines = -(lpbiInput->biWidth + OutputWidth);
								//	BackTwoLines *= lpbiInput->biBitCount
								//	BackTwoLines >>= 3
								// The value is negative, so the final shift is arithmetic (sar).

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
									mov		edx, [esp + OUTPUT_WIDTH]
									add		ecx, edx
									neg		ecx
									xor		edx, edx
									mov		dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
									imul	ecx, edx
									sar		ecx, 3
									mov		[esp + BACK_TWO_LINES], ecx

								//	pnext =	(U32 *)(lpInput +
								//				(((lpbiInput->biWidth * lpbiInput->biBitCount) >> 3)) *
								//					((OutputHeight - aspect - 1) + height_adj)) +
								//				width_adj)
								// assign (esi, pnext)

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
									xor		edx, edx
									mov		dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
									imul	ecx, edx
									shr		ecx, 3
									xor		edx, edx
									mov		dx, [esp + OUTPUT_HEIGHT_WORD]
									sub		edx, [esp + ASPECT]
									dec		edx
									add		edx, [esp + HEIGHT_ADJ]
									imul	ecx, edx
									add		ecx, [esp + WIDTH_ADJ]
									add		ecx, [esp + LP_INPUT]
									mov		esi, ecx

								// assign (edi, YPlane)
									mov		edi, [esp + YPLANE]
								// for (j = 0; j < LumaIters; j++)
									xor		eax, eax
									mov		[esp + LOOP_J], eax
								// for (k = 0; k < mark; k++)
								L4:
									xor		eax, eax
									mov		[esp + LOOP_K], eax
								// for (i = OutputWidth; i > 0; i -= 4, pnext += 2)
								// Each pass reads two dwords (four pixels) and advances esi by 8 bytes.
								L5:
									mov		eax, [esp + OUTPUT_WIDTH]
								// This jump is here to make sure the following loop starts on the U pipe
									jmp		L6
								L6:
								// tm1 = pnext[0]                       (two 5:5:5 pixels in one dword)
								// t = ( BYUV[(tm1>>14)&0x7C].YU +      (pixel in the high half)
								//       GYUV[(tm1>>19)&0x7C].YU +
								//       RYUV[(tm1>>24)&0x7C].YU )
								// *(YPlane+1) = (U8)((t>>8)+8)
								// t1 = ( BYUV[(tm1<< 2)&0x7C].YU +     (pixel in the low half)
								//       GYUV[(tm1>> 3)&0x7C].YU +
								//       RYUV[(tm1>> 8)&0x7C].YU )
								// *YPlane = (U8)((t1>>8)+8)
								// assign(eax: B2/Y1/Y2/U)
								// assign(ebx: B1/V)
								// assign(ecx: G2/G1)
								// assign(edx: R2/R1)
								// assign(ebp: B1)
								// 1
									mov		ebx, [esi]
									mov		[esp + LOOP_I], eax
								// 2
									mov		eax, ebx
									mov		ecx, ebx
								// 3
									shr		eax, 14
									mov		edx, ebx
								// 4
									shr		ecx, 19
									and		eax, 0x7C
								// 5
									shr		edx, 24
									and		ecx, 0x7C
								// 6
									mov		eax, [BYUV+eax*8].YU
									and		edx, 0x7C
								// 7
									add		eax, [GYUV+ecx*8].YU
										mov		ecx, ebx
								// 8
									add		eax, [RYUV+edx*8].YU
										mov		edx, ebx
								// 9
									sar		eax, 8
										and		ebx, 0x1F
								// 10
										shl		ebx, 2
									add		eax, 8
								// 11
										shr		ecx, 3
									mov		[edi + 1], al
								// 12
										shr		edx, 8
										and		ecx, 0x7C
								// 13
										mov		eax, [BYUV+ebx*8].YU
										and		edx, 0x7C
								// 14
										add		eax, [GYUV+ecx*8].YU
										mov		ebp, ebx
								// 15
										add		eax, [RYUV+edx*8].YU
										lea		edi, [edi + 4]
								// 16
										sar		eax, 8
										mov		ebx, [esp + LOOP_K]
								// 17
										add		eax, 8
										and		ebx, 1
								// 18
								// edi was already advanced by 4, so [edi - 4] is the old YPlane[0].
								// The mov does not change the flags set by "and ebx, 1".
										mov		[edi - 4], al
										jnz		L9a

								// Even row: produce chroma from the low pixel just converted.
								// At this point, ebp: B1, ecx: G1, edx: R1, eax: (t1>>8)+8
								// *UPlane++ = low byte of ((eax>>16)+64)
								// t   = ( RYUV[R1].V +
								//         GYUV[G1].V +
								//         BYUV[B1].V )
								// *VPlane++ = (U8)((t>>8)+64)
								// 19
									mov		ebx, [RYUV+edx*8].V
									mov		edx, [esp + UPLANE]
								// 20
									sar		eax, 16
									add		ebx, [GYUV+ecx*8].V
								// 21
									add		eax, 64
									add		ebx, [BYUV+ebp*8].V
								// 22
									mov		[edx], al
									inc		edx
								// 23
									mov		[esp + UPLANE], edx
									mov		edx, [esp + VPLANE]
								// 24
									sar		ebx, 8
									inc		edx
								// 25
									add		ebx, 64
									mov		[esp + VPLANE], edx
								// 26
									mov		[edx - 1], bl
									nop

								L9a:
								// tm2 = pnext[1]
								// t = ( BYUV[(tm2>>14)&0x7C].YU +      (pixel in the high half)
								//       GYUV[(tm2>>19)&0x7C].YU +
								//       RYUV[(tm2>>24)&0x7C].YU )
								// *(YPlane+3) = (U8)((t>>8)+8)
								// t2 = ( BYUV[(tm2<< 2)&0x7C].YU +     (pixel in the low half)
								//       GYUV[(tm2>> 3)&0x7C].YU +
								//       RYUV[(tm2>> 8)&0x7C].YU )
								// *(YPlane+2) = (U8)((t2>>8)+8)
								// YPlane += 4   (already done at pair 15, hence the negative offsets)
								// assign(eax: B2/Y1/Y2/U)
								// assign(ebx: B1/V)
								// assign(ecx: G2/G1)
								// assign(edx: R2/R1)
								// assign(ebp: B1)
								// 27
									mov		eax, [esi + 4]
									lea		esi, [esi + 8]
								// 28
									mov		ebx, eax
									mov		ecx, eax
								// 29
									shr		eax, 14
									mov		edx, ebx
								// 30
									shr		ecx, 19
									and		eax, 0x7C
								// 31
									shr		edx, 24
									and		ecx, 0x7C
								// 32
									mov		eax, [BYUV+eax*8].YU
									and		edx, 0x7C
								// 33
									add		eax, [GYUV+ecx*8].YU
										mov		ecx, ebx
								// 34
									add		eax, [RYUV+edx*8].YU
										mov		edx, ebx
								// 35
									sar		eax, 8
										and		ebx, 0x1F
								// 36
										shl		ebx, 2
									add		eax, 8
								// 37
										shr		ecx, 3
									mov		[edi - 1], al
								// 38
										shr		edx, 8
										and		ecx, 0x7C
								// 39
										mov		eax, [BYUV+ebx*8].YU
										and		edx, 0x7C
								// 40
										add		eax, [GYUV+ecx*8].YU
										mov		ebp, ebx
								// 41
										add		eax, [RYUV+edx*8].YU
										nop
								// 42
										sar		eax, 8
										mov		ebx, [esp + LOOP_K]
								// 43
										add		eax, 8
										and		ebx, 1
								// 44
										mov		[edi - 2], al
										jnz		L9

								// Even row: produce chroma from the low pixel of the second dword.
								// At this point, ebp: B1, ecx: G1, edx: R1, eax: (t2>>8)+8
								// *UPlane++ = low byte of ((eax>>16)+64)
								// t   = ( RYUV[R1].V +
								//         GYUV[G1].V +
								//         BYUV[B1].V )
								// *VPlane++ = (U8)((t>>8)+64)
								// 45
									mov		ebx, [RYUV+edx*8].V
									mov		edx, [esp + UPLANE]
								// 46
									sar		eax, 16
									add		ebx, [GYUV+ecx*8].V
								// 47
									add		eax, 64
									add		ebx, [BYUV+ebp*8].V
								// 48
									mov		[edx], al
									inc		edx
								// 49
									mov		[esp + UPLANE], edx
									mov		edx, [esp + VPLANE]
								// 50
									sar		ebx, 8
									inc		edx
								// 51
									add		ebx, 64
									mov		[esp + VPLANE], edx
								// 52
									mov		[edx - 1], bl
									nop

								L9:
								// 53
									mov		eax, [esp + LOOP_I]
									nop
								// 54
								// eax is stored back into LOOP_I at the top of L6.
									sub		eax, 4
									jnz		L6

								// Assembler version of C_WIDTH_DIFF
								// if (width_diff)
								// edx keeps width_diff for the pointer updates further down.
									mov		eax, [esp + WIDTH_DIFF]
									mov		edx, eax
									test	eax, eax
									jz		Lno_width_diff
								// tm = (*(YPlane-1)) << 24
								// tm |= (tm>>8) | (tm>>16) | (tm>>24)
								// Only bl is loaded; the shift by 24 discards the stale upper bits of ebx.
									mov		bl, [edi - 1]
									shl		ebx, 24
									mov		ecx, ebx
									shr		ebx, 8
									or		ecx, ebx
									shr		ebx, 8
									or		ecx, ebx
									shr		ebx, 8
									or		ecx, ebx
								// *(U32 *)YPlane = tm
									mov		[edi], ecx
								// if ((width_diff-4) > 0)
									sub		eax, 4
									jz		Lupdate_YPlane
								// *(U32 *)(YPlane + 4) = tm
									mov		[edi + 4], ecx
									sub		eax, 4
								// if ((width_diff-8) > 0)
									jz		Lupdate_YPlane
								// *(U32 *)(YPlane + 8) = tm
									mov		[edi + 8], ecx
								Lupdate_YPlane:
								// YPlane += width_diff
									lea		edi, [edi + edx]
								///if (0 == (k&1))
									mov		eax, [esp + LOOP_K]
									test	eax, 1
									jnz		Lno_width_diff
								// t8u = *(UPlane-1)
								// t8v = *(VPlane-1)
								// *UPlane++ = t8u
								// *UPlane++ = t8u
								// *VPlane++ = t8v
								// *VPlane++ = t8v
								// cl carries t8u and ch carries t8v; ebp counts down width_diff.
									mov		ebp, edx
									mov		eax, [esp + UPLANE]
									mov		ebx, [esp + VPLANE]
									mov		cl, [eax - 1]
									mov		ch, [ebx - 1]
									mov		[eax], cl
									mov		[eax + 1], cl
									mov		[ebx], ch
									mov		[ebx + 1], ch
								// if ((width_diff-4) > 0)
									sub		ebp, 4
									jz		Lupdate_UVPlane
								// *UPlane++ = t8u
								// *UPlane++ = t8u
								// *VPlane++ = t8v
								// *VPlane++ = t8v
									mov		[eax + 2], cl
									mov		[eax + 3], cl
									mov		[ebx + 2], ch
									mov		[ebx + 3], ch
								// if ((width_diff-8) > 0)
									sub		ebp, 4
									jz		Lupdate_UVPlane
								// *UPlane++ = t8u
								// *UPlane++ = t8u
								// *VPlane++ = t8v
								// *VPlane++ = t8v
									mov		[eax + 4], cl
									mov		[eax + 5], cl
									mov		[ebx + 4], ch
									mov		[ebx + 5], ch
								Lupdate_UVPlane:
								// UPlane += width_diff >> 1
								// VPlane += width_diff >> 1
									shr		edx, 1
									lea		eax, [eax + edx]
									mov		[esp + UPLANE], eax
									lea		ebx, [ebx + edx]
									mov		[esp + VPLANE], ebx
								Lno_width_diff:

								// if (stretch && (0 == k) && j)
									mov		eax, [esp + STRETCH]
									test	eax, eax
									jz		L14
									mov		eax, [esp + LOOP_K]
									test	eax, eax
									jnz		L14
									mov		eax, [esp + LOOP_J]
									test	eax, eax
									jz		L14

								// spill YPlane ptr
								// The YPlane argument slot is reused as the spill location.
									mov		[esp + YPLANE], edi
									nop

								// for (i = OutputWidth; i > 0; i -= 8)
								// assign (ebx, pyprev)
								// assign (ecx, t)
								// assign (edx, pynext)
								// assign (edi, pyspace)
								// assign (ebp, i)

								// make sure offsets are such that there are no bank conflicts here
									mov		ebx, [esp + PYPREV]
									mov		edi, [esp + PYSPACE]

									mov		edx, [esp + PYNEXT]
									mov		ebp, [esp + OUTPUT_WIDTH]

								// t = (*pyprev++ & 0xFEFEFEFE) >> 1
								// t += (*pynext++ & 0xFEFEFEFE) >> 1
								// *pyspace++ = t
								// t = (*pyprev++ & 0xFEFEFEFE) >> 1
								// t += (*pynext++ & 0xFEFEFEFE) >> 1
								// *pyspace++ = t
								// Shifting first and masking with 0x7F7F7F7F gives the same value as
								// masking with 0xFEFEFEFE and then shifting; both forms are used below.
								L15:
								// 1
									mov		eax, [ebx]
									lea		ebx, [ebx + 4]
								// 2
									mov		ecx, [edx]
									lea		edx, [edx + 4]
								// 3
									shr		ecx, 1
									and		eax, 0xFEFEFEFE
								// 4
									shr		eax, 1
									and		ecx, 0x7F7F7F7F
								// 5
									add		eax, ecx
									mov		ecx, [ebx]
								// 6
									shr		ecx, 1
									mov		[edi], eax
								// 7
									mov		eax, [edx]
									and		ecx, 0x7F7F7F7F
								// 8
									shr		eax, 1
									lea		edi, [edi + 4]
								// 9
									and		eax, 0x7F7F7F7F
									lea		ebx, [ebx + 4]
								// 10
									lea		edx, [edx + 4]
									add		eax, ecx
								// 11
									mov		[edi], eax
									lea		edi, [edi + 4]
								// 12
									sub		ebp, 8
									jnz		L15
								// kill (ebx, pyprev)
								// kill (ecx, t)
								// kill (edx, pynext)
								// kill (edi, pyspace)
								// kill (ebp, i)

								// restore YPlane
									mov		edi, [esp + YPLANE]

								// pnext += BackTwoLines
								L14:
									add		esi, [esp + BACK_TWO_LINES]
								// YPlane += byte_ypitch_adj;
									add		edi, [esp + BYTE_YPITCH_ADJ]
								// if(0 == (k&1))
									mov		eax, [esp + LOOP_K]
									and		eax, 1
									jnz		L16
								// UPlane += byte_uvpitch_adj;
								// VPlane += byte_uvpitch_adj;
									mov		eax, [esp + BYTE_UVPITCH_ADJ]
									add		[esp + UPLANE], eax
									add		[esp + VPLANE], eax

								L16:
								// k++; repeat while k < mark
									inc		DWORD PTR [esp + LOOP_K]
									mov		eax, [esp + LOOP_K]
									cmp		eax, [esp + MARK]
									jl		L5

								// if (stretch)
									cmp		DWORD PTR [esp + STRETCH], 0
									je		L17
								// pyprev = YPlane - pitch
									mov		eax, edi
									sub		eax, [esp + PITCH_PARM]
									mov		[esp + PYPREV], eax
								// pyspace = YPlane
									mov		[esp + PYSPACE], edi
								// pynext = (YPlane += pitch)
									add		edi, [esp + PITCH_PARM]
									mov		[esp + PYNEXT], edi

								L17:
								// j++; repeat while j < LumaIters
									inc		DWORD PTR [esp + LOOP_J]
									mov		eax, [esp + LOOP_J]
									cmp		eax, [esp + LUMA_ITERS]
									jl		L4

								// kill (esi, pnext)
								// kill (edi, YPlane)

								// ASM version of C_HEIGHT_FILL
								// if (height_diff)
									mov		eax, [esp + HEIGHT_DIFF]
									test	eax, eax
									jz		Lno_height_diff

								// pyspace = (U32 *)YPlane
									mov		esi, edi
								// pyprev =  (U32 *)(YPlane - pitch)
									sub		esi, [esp + PITCH_PARM]
								// for (j = height_diff; j > 0; j--)
								Lheight_yfill_loop:
									mov		ebx, [esp + WIDTHX16]
								// for (i = widthx16; i>0; i -=4)
								Lheight_yfill_row:
								// *pyspace++ = *pyprev++
									mov		ecx, [esi]
									lea		esi, [esi + 4]
									mov		[edi], ecx
									lea		edi, [edi + 4]
									sub		ebx, 4
									jnz		Lheight_yfill_row
								// pyspace += byte_ypitch_adj
								// pyprev  += byte_ypitch_adj
									add		esi, [esp + BYTE_YPITCH_ADJ]
									add		edi, [esp + BYTE_YPITCH_ADJ]
									dec		eax
									jnz		Lheight_yfill_loop

									mov		eax, [esp + HEIGHT_DIFF]
									mov		edi, [esp + UPLANE]
								// puvspace = (U32 *)UPlane
									mov		esi, edi
								// puvprev =  (U32 *)(UPlane - pitch)
									sub		esi, [esp + PITCH_PARM]
								// for (j = height_diff; j > 0; j -= 2)
								Lheight_ufill_loop:
									mov		ebx, [esp + WIDTHX16]
								// for (i = widthx16; i>0; i -= 8)
								Lheight_ufill_row:
								// *puvspace++ = *puvprev++
									mov		ecx, [esi]
									mov		[edi], ecx
									lea		esi, [esi + 4]
									lea		edi, [edi + 4]
									sub		ebx, 8
									jnz		Lheight_ufill_row
								// puvspace += byte_uvpitch_adj
								// puvprev  += byte_uvpitch_adj
									add		esi, [esp + BYTE_UVPITCH_ADJ]
									add		edi, [esp + BYTE_UVPITCH_ADJ]
									sub		eax, 2
									jnz		Lheight_ufill_loop

									mov		eax, [esp + HEIGHT_DIFF]
									mov		edi, [esp + VPLANE]
								// puvspace = (U32 *)VPlane
									mov		esi, edi
								// puvprev =  (U32 *)(VPlane - pitch)
									sub		esi, [esp + PITCH_PARM]
								// for (j = height_diff; j > 0; j -= 2)
								Lheight_vfill_loop:
									mov		ebx, [esp + WIDTHX16]
								// for (i = widthx16; i>0; i -= 8)
								Lheight_vfill_row:
								// *puvspace++ = *puvprev++
									mov		ecx, [esi]
									mov		[edi], ecx
									lea		esi, [esi + 4]
									lea		edi, [edi + 4]
									sub		ebx, 8
									jnz		Lheight_vfill_row
								// puvspace += byte_uvpitch_adj
								// puvprev  += byte_uvpitch_adj
									add		esi, [esp + BYTE_UVPITCH_ADJ]
									add		edi, [esp + BYTE_UVPITCH_ADJ]
									sub		eax, 2
									jnz		Lheight_vfill_loop
								Lno_height_diff:

								// if (stretch)
								// pyprev is loaded before the test; it is only dereferenced when
								// stretch is set, in which case it was written at the end of the k loop.
									mov		esi, [esp + PYPREV]
									cmp		DWORD PTR [esp + STRETCH], 0
									je		L19

								// Fill the last reserved row with a copy of the row before it.
								// for (i = OutputWidth; i > 0; i -= 4)
								// assign (esi, pyprev)
								// assign (edi, pyspace)
								// assign (ebp, i)
									mov		ebp, [esp + OUTPUT_WIDTH]
									 mov	edi, [esp + PYSPACE]
								L18:
									mov		ecx, [esi]
									 lea	esi, [esi + 4]
									mov		[edi], ecx
									 lea	edi, [edi + 4]
									sub		ebp, 4
									 jnz	L18
								// kill (esi, pyprev)
								// kill (edi, pyspace)
								// kill (ebp, i)

								// Epilogue: release the locals and restore the callee-save registers.
								L19:
									add		esp, LOCALSIZE
									pop		edi
									pop		esi
									pop		ebx
									pop		ebp
									ret

									}
								}

								// The frame-offset names are local to this routine; remove them again.
								#undef	LOCALSIZE

								#undef	PITCH_PARM
								#undef	VPLANE
								#undef	UPLANE
								#undef	YPLANE
								#undef	LP_INPUT
								#undef	OUTPUT_HEIGHT_WORD
								#undef	OUTPUT_WIDTH_WORD
								#undef	LPBI_INPUT

								#undef	OUTPUT_WIDTH
								#undef	PYPREV
								#undef	PYSPACE
								#undef	PYNEXT
								#undef	PUVPREV
								#undef	PUVSPACE
								#undef	LOOP_I
								#undef	LOOP_J
								#undef	LOOP_K
								#undef	BACK_TWO_LINES
								#undef	WIDTHX16
								#undef	HEIGHTX16
								#undef	WIDTH_DIFF
								#undef	HEIGHT_DIFF
								#undef	WIDTH_ADJ
								#undef	HEIGHT_ADJ
								#undef	STRETCH
								#undef	ASPECT
								#undef	LUMA_ITERS
								#undef	MARK
								#undef	BYTE_YPITCH_ADJ
								#undef	BYTE_UVPITCH_ADJ


								#endif // } H263P