windows-server-2003/enduser/netmeeting/av/codecs/intel/h261/excolcnv.cpp


								/* *************************************************************************

								**    INTEL Corporation Proprietary Information

								**

								**    This listing is supplied under the terms of a license

								**    agreement with INTEL Corporation and may not be copied

								**    nor disclosed except in accordance with the terms of

								**    that agreement.

								**

								**    Copyright (c) 1995 Intel Corporation.

								**    All Rights Reserved.

								**

								** *************************************************************************

								*/


								////////////////////////////////////////////////////////////////////////////

								//

								// $Author:   MDUDA  $

								// $Date:   21 Nov 1996 17:33:56  $

								// $Archive:   S:\h26x\src\enc\excolcnv.cpv  $

								// $Header:   S:\h26x\src\enc\excolcnv.cpv   1.45   21 Nov 1996 17:33:56   MDUDA  $

								// $Log:   S:\h26x\src\enc\excolcnv.cpv  $

								//

								//    Rev 1.45   21 Nov 1996 17:33:56   MDUDA

								// Added more non-compressed YUV12 support (RGB16 and RGB24).

								// Also rewrote IA_YUV12toEncYUV12 to be more readable.

								//

								//    Rev 1.44   31 Oct 1996 10:05:48   KLILLEVO

								// changed from DBOUT to DbgLog

								//

								//    Rev 1.43   22 Oct 1996 16:44:22   MDUDA

								// Added IA support for YUY2 input color conversion and cleaned up C version.

								// Now using IA version.

								//

								//    Rev 1.42   18 Oct 1996 14:31:32   MDUDA

								//

								// Added a C-version of YUY2 input color conversion.

								//

								//    Rev 1.41   11 Oct 1996 16:04:50   MDUDA

								// Using new RGB to YUV lookup tables.

								//

								//    Rev 1.40   03 Oct 1996 10:43:58   AGUPTA2

								// Got rid of segment directives; made tables read-only.

								//

								//    Rev 1.39   13 Sep 1996 13:34:04   MDUDA

								// Fixed YVU9 bug where input = output frame size was not colored

								// (U and V planes) properly.

								//

								//    Rev 1.38   11 Sep 1996 15:45:06   MDUDA

								// Modified RGB look-up tables and added C_H26X_YUV12toEncYUV12 and

								// IA_H26X_YUV12toEncYUV12.

								//

								//    Rev 1.37   03 Sep 1996 14:54:46   MDUDA

								// Fixed problem causing VC++ 4.1 internal compiler error. Replaced

								// inline assembler constructs such as [ebx.biWidth] with

								// (LPBITMAPINFOHEADER)[ebx].biWidth.

								//

								//    Rev 1.36   29 Aug 1996 16:31:14   MDUDA

								// Added Pentium assembler versions for all RGB conversion routines.

								// Also, rewrote YVU9 support to allow input frame sizes other

								// than 160x120 and 240x180.

								//

								//    Rev 1.35   16 Aug 1996 12:17:48   MDUDA

								// Fixed bug where U and V values in the BGR converters were treated as unsigned

								// values. Also did some general cleanup of BGR converters in preparation for

								// doing Pentium assembler version.

								//

								//    Rev 1.34   13 Aug 1996 10:35:38   MDUDA

								// Added support for RGB4. Generalized RGB LUT support for 4-bit and

								// 8-bit pixels into a single routine.

								//

								//    Rev 1.33   09 Aug 1996 09:45:02   MDUDA

								// Added support for RGB16 format on input. This is for the color

								// Quick Cam. Also, generalized RGB16 for other bit combinations.

								// However, these can only be specified under BI_BITFIELDS format.

								//

								//    Rev 1.32   02 Aug 1996 13:44:48   MDUDA

								// modified H26X_BGR24toYUV12 to crop and stretch 240x180 and 160x120

								// frames

								//

								//    Rev 1.31   01 Aug 1996 14:03:50   MDUDA

								//

								// Optimized H26X_YVU9toYUV12 by rewriting function in assembler code. Used in

								// _asm. Also re-arranged functions so that colorCnvtFrame is at the end of

								// the file.

								//

								//    Rev 1.30   22 Jul 1996 13:28:22   BECHOLS

								// Added a CLUT8 to YUV12 color convertor (CC). This CC crops and stretches

								// either the 240x180 or the 160x120 image size to produce QCIF and SubQCIF

								// image sizes respectively.

								//

								//    Rev 1.29   11 Jul 1996 15:47:02   MDUDA

								//

								// Modified H263_YVU9toYUV12 to create subQCIF and QCIF from

								// 160x120 and 240x180 images, respectively. To fit the new

								// formats, the original images are cropped and stretched using a

								// dither pattern for the color planes.

								//

								//    Rev 1.28   14 May 1996 12:04:08   KLILLEVO

								// changed RGB->YUV color conversion to use the inverse

								// of the output YUV->RGB conversion instead of the conversion

								// "recommended by the CCIR". Compression performance for RGB

								// input was significantly improved (33% less bits for same

								// fixed QP)

								//

								//    Rev 1.27   04 May 1996 21:55:20   BECHOLS

								// For RGB24 to YVU12 conversion, I unrolled the inner loop by 8 and changed

								// the writes to DWORD vs. BYTE writes.  This resulted in a 30% reduction in

								// the execution time.

								//

								//    Rev 1.26   10 Apr 1996 16:44:14   RHAZRA

								// Fixed a bug in 320x240 mode for the H26X_YUV12toEncYUV12() function.

								// DWORD should be and-ed with 0x7f7f7f7f and not 0x7f7f7f.

								//

								//    Rev 1.25   27 Mar 1996 15:10:08   SCDAY

								// Optimized H26X_YUV12toEncYUV12 'C' code to read/write DWORDs

								//

								//    Rev 1.24   08 Jan 1996 17:46:14   unknown

								//

								// Correct logic on bIs320x240 check

								//

								//    Rev 1.23   05 Jan 1996 17:34:38   RMCKENZX

								// corrected chroma pad value to 0x40 to achieve black padding

								//

								//    Rev 1.22   05 Jan 1996 17:29:46   RMCKENZX

								// Added code to pad out 320x240 stills to 352x288

								// full CIF images.

								//

								//    Rev 1.21   04 Jan 1996 18:37:20   TRGARDOS

								// Added code to permit 320x240 input and then set a boolean

								// bIs320x240.

								//

								//    Rev 1.20   02 Jan 1996 17:09:04   TRGARDOS

								// Moved colorCnvFrame into this file and made the

								// color convertor functions static.

								//

								//    Rev 1.19   27 Dec 1995 15:32:56   RMCKENZX

								// Added copyright notice

								//

								//    Rev 1.18   06 Dec 1995 09:35:42   TRGARDOS

								// Added Brian's fix to the input color convertor to avoid

								// overflow of the chars.

								//

								//    Rev 1.17   27 Nov 1995 16:09:04   TRGARDOS

								// Removed two unused variables to get rid of compiler warnings.

								//

								//    Rev 1.16   30 Oct 1995 14:34:12   TRGARDOS

								// Fixed 240x180 to center clip.

								//

								//    Rev 1.15   30 Oct 1995 12:03:16   TRGARDOS

								// Added color convertor support for YUV9 240x180.

								//

								//    Rev 1.14   28 Oct 1995 15:39:28   TRGARDOS

								// Fixed color conversion problem from YVU9 to YVU12.

								//

								//    Rev 1.13   12 Oct 1995 17:40:12   TRGARDOS

								// Fixed YUV12 input color convertor.

								//

								//    Rev 1.12   12 Oct 1995 12:04:16   TRGARDOS

								// Changed some variable names in YUV12 convertor.

								//

								//    Rev 1.11   10 Oct 1995 16:34:12   TRGARDOS

								// Added YUV12 input support.

								//

								//    Rev 1.10   28 Sep 1995 17:02:36   DBRUCKS

								// fix colorIn to not swap left to right

								//

								//    Rev 1.9   15 Sep 1995 16:37:38   TRGARDOS

								//

								//

								//    Rev 1.8   13 Sep 1995 17:09:22   TRGARDOS

								//

								// Finished adding encoder support for YVU9 160x120 frames.

								//

								//    Rev 1.7   11 Sep 1995 11:14:06   DBRUCKS

								// add h261 ifdef

								//

								//    Rev 1.6   07 Sep 1995 09:27:54   TRGARDOS

								// Added YVU9 to YVU12 color convertor.

								//

								//    Rev 1.5   05 Sep 1995 15:50:46   TRGARDOS

								// Added color back in to convertors.

								//

								//    Rev 1.4   01 Sep 1995 17:51:42   TRGARDOS

								// Fixed bugs in color converter.

								//

								//    Rev 1.3   01 Sep 1995 10:13:42   TRGARDOS

								// Debugging bit stream errors.

								//

								//    Rev 1.2   30 Aug 1995 12:42:26   TRGARDOS

								// Fixed bugs in intra AC coef VLC coding.

								//

								//    Rev 1.1   02 Aug 1995 17:28:06   TRGARDOS

								//

								// Cleaned up stuff to get stub working under new

								// version control system.

								//

								//    Rev 1.0   31 Jul 1995 13:07:10   DBRUCKS

								// Initial revision.

								//

								//    Rev 1.0   17 Jul 1995 14:46:16   CZHU

								// Initial revision.

								//

								//    Rev 1.0   17 Jul 1995 14:14:22   CZHU

								// Initial revision.

								;////////////////////////////////////////////////////////////////////////////


								/*


								CCIR 601 Specifies a conversion from RGB to YCrCb. For

								what we call U and V, they are equivalent as

								U = Cb, V = Cr.


								From CCIR 601-2 Annex II, we can go from RGB with values

								in the range of 0-255, to YUV values in the same range

								by the equation:


								Y = (    77*R + 150*G +  29*B ) >> 8;

								V = ( (   131*R - 110*G -  21*B ) >> 8 ) + 128; 	// Cr

								U = ( ( (-44)*R -  87*G + 131*B ) >> 8 ) + 128;		// Cb


								Has now changed to the inverse of the YUV->RGB on the

								output, since the old version produced way too many bits.

								The new version is:


								Y = ( (   16836*R +  33056*G +  6416*B ) >> 16 ) + 16;

								V = ( (   28777*R -  24117*G -  4660*B ) >> 16 ) + 128; 	// Cr

								U = ( ( (-9726)*R -  19064*G + 28790*B ) >> 16 ) + 128;		// Cb


								*/


								#include "precomp.h"


								#if defined(_CODEC_STATS)


								// 2^32: weight applied to the high dword of the counter reading.
								static const double RDTSC_SHIFT_32 = 4294967296.0;

								//
								// PENTIUM_TIMER()
								//
								// Samples the processor time-stamp counter and scales the reading by
								// RDTSC_CLOCK_FREQ. Returns 0.0 when RDTSC_CLOCK_FREQ is zero.
								// NOTE(review): the result is presumably in milliseconds, which assumes
								// RDTSC_CLOCK_FREQ is expressed in Hz - confirm where that symbol is defined.
								//
								static double PENTIUM_TIMER()
								{
									unsigned long int tscLow, tscHigh;

									// The two emitted bytes (0F 31) are the RDTSC opcode; the reading is
									// picked up from edx:eax.
									__asm
									{
										_emit   0x0f
										_emit   0x31
										mov     tscLow, eax
										mov     tscHigh, edx
									}

									if (!RDTSC_CLOCK_FREQ) {
										return( 0.0 );
									}

									// Only the low 16 bits of the high dword take part in the result.
									const double lowPart = (double) tscLow;
									const double highPart = (double) (tscHigh & 0xFFFF);
									const double scaled = (lowPart + highPart * RDTSC_SHIFT_32) / RDTSC_CLOCK_FREQ;

									return( scaled * 1000.0 );
								}


								#endif


								// Set all local functions to "static", and then set it accordingly if

								// VTune statistics are to be collected. VTune doesn't recognize static functions

								// so we need some way to turn off the static attribute if VTune is to be run

								// on the executable. For now, simply use a define of _VTUNE to build the driver.

								// _STATIC is the storage-class prefix placed on this file's local functions.
								#if defined(_VTUNE)

								// Profiling build: expands to nothing, so the functions are not static.
								#define _STATIC

								#else

								// Normal build: local functions are static.
								#define _STATIC static

								#endif


								// These are the look-up tables for the RGB converters. They are 8 bytes/entry

								// to allow addressing via the scale by 8 indexed addressing mode. A pseudo-SIMD

								// arrangement is used in these tables. Since all R, G and B contributions to the

								// Y value are positive and fit in 15 bits, these are stored in the lower 16-bits

								// of the YU word. In some cases, the U contribution is negative so it is placed

								// in the upper 16 bits of the YU word. When a Y value is calculated, the U value

								// is calculated in parallel. The V contribution is negative in some cases, but it

								// gets its own word.


								// This is the code that was used to generate the tables.

								#if 0

								#define YRCoef   16836

								#define YGCoef   33056

								#define YBCoef    6416

								#define URCoef    9726

								#define UGCoef   19064

								#define UBCoef   28790

								#define VRCoef   28777

								#define VGCoef   24117

								#define VBCoef    4660


								#include <stdio.h>


								// Print one 128-entry table called `name`, one entry for every odd
								// component value n = 1, 3, ..., 255, two entries per output line.
								//   YU low half  = (yCoef*n)>>9
								//   YU high half = uSign * ((uCoef*n)>>9)
								//   V            = vSign * ((vCoef*n)>>9)
								// uSign and vSign are +1 or -1. The values are converted to unsigned before
								// shifting and printing so that negative contributions are neither
								// left-shifted as signed ints nor passed as signed ints to %x.
								static void print_table(const char *name,
								                        int yCoef, int uSign, int uCoef, int vSign, int vCoef)
								{
								  int n;

								  printf("struct YUV  %s[] = {\n", name);
								  for (n = 1; n < 256; n += 2) {
								    int y = (yCoef*n)>>9;
								    int u = uSign*((uCoef*n)>>9);
								    int v = vSign*((vCoef*n)>>9);

								    printf("{0x%.8x, 0x%.8x}, ",
								      (unsigned)y | ((unsigned)u<<16),
								      (unsigned)v);
								    // n takes the values 4i+1 and 4i+3; break the line after the second.
								    if (n & 2) {
								      printf("\n");
								    }
								  }
								  printf("};\n");
								}

								// Writes the struct declaration followed by the RYUV, GYUV and BYUV tables
								// to stdout.
								int main(void) {

								  printf("struct YUV {\n");
								  printf("  int YU;\n");
								  printf("  int V;\n");
								  printf("};\n\n");

								  // Red:   U contribution negative, V contribution positive.
								  print_table("RYUV", YRCoef, -1, URCoef, +1, VRCoef);
								  // Green: U and V contributions both negative.
								  print_table("GYUV", YGCoef, -1, UGCoef, -1, VGCoef);
								  // Blue:  U contribution positive, V contribution negative.
								  print_table("BYUV", YBCoef, +1, UBCoef, -1, VBCoef);

								  return 0;
								}

								#endif


								// One look-up table entry: the contribution of a single colour component
								// value to Y, U and V.
								struct YUV {

								  // Y contribution in the low 16 bits, U contribution in the high 16 bits.
								  int YU;

								  // V contribution (negative for some tables).
								  int V;

								};


								// RYUV: contributions of the red component, 128 entries. The entries follow
								// the formulas of the disabled generator in this file, one per odd value
								// n = 1, 3, ..., 255:
								//   YU low half  =   (YRCoef*n)>>9
								//   YU high half = -((URCoef*n)>>9)
								//   V            =   (VRCoef*n)>>9
								// The C reference converter indexes it with the 7-bit value (red >> 1).
								const struct YUV  RYUV[] = {

								{0xffee0020, 0x00000038}, {0xffc80062, 0x000000a8},

								{0xffa200a4, 0x00000119}, {0xff7c00e6, 0x00000189},

								{0xff560127, 0x000001f9}, {0xff300169, 0x0000026a},

								{0xff0a01ab, 0x000002da}, {0xfee401ed, 0x0000034b},

								{0xfebe022f, 0x000003bb}, {0xfe980270, 0x0000042b},

								{0xfe7202b2, 0x0000049c}, {0xfe4c02f4, 0x0000050c},

								{0xfe260336, 0x0000057d}, {0xfe000377, 0x000005ed},

								{0xfdda03b9, 0x0000065d}, {0xfdb403fb, 0x000006ce},

								{0xfd8e043d, 0x0000073e}, {0xfd68047e, 0x000007af},

								{0xfd4204c0, 0x0000081f}, {0xfd1c0502, 0x0000088f},

								{0xfcf60544, 0x00000900}, {0xfcd00585, 0x00000970},

								{0xfcaa05c7, 0x000009e1}, {0xfc840609, 0x00000a51},

								{0xfc5e064b, 0x00000ac2}, {0xfc38068d, 0x00000b32},

								{0xfc1206ce, 0x00000ba2}, {0xfbec0710, 0x00000c13},

								{0xfbc60752, 0x00000c83}, {0xfba00794, 0x00000cf4},

								{0xfb7a07d5, 0x00000d64}, {0xfb540817, 0x00000dd4},

								{0xfb2e0859, 0x00000e45}, {0xfb08089b, 0x00000eb5},

								{0xfae208dc, 0x00000f26}, {0xfabc091e, 0x00000f96},

								{0xfa960960, 0x00001006}, {0xfa7009a2, 0x00001077},

								{0xfa4a09e3, 0x000010e7}, {0xfa240a25, 0x00001158},

								{0xf9fe0a67, 0x000011c8}, {0xf9d80aa9, 0x00001239},

								{0xf9b20aeb, 0x000012a9}, {0xf98c0b2c, 0x00001319},

								{0xf9660b6e, 0x0000138a}, {0xf9400bb0, 0x000013fa},

								{0xf91a0bf2, 0x0000146b}, {0xf8f40c33, 0x000014db},

								{0xf8ce0c75, 0x0000154b}, {0xf8a80cb7, 0x000015bc},

								{0xf8820cf9, 0x0000162c}, {0xf85c0d3a, 0x0000169d},

								{0xf8360d7c, 0x0000170d}, {0xf8100dbe, 0x0000177d},

								{0xf7ea0e00, 0x000017ee}, {0xf7c40e41, 0x0000185e},

								{0xf79e0e83, 0x000018cf}, {0xf7780ec5, 0x0000193f},

								{0xf7520f07, 0x000019af}, {0xf72c0f49, 0x00001a20},

								{0xf7060f8a, 0x00001a90}, {0xf6e00fcc, 0x00001b01},

								{0xf6ba100e, 0x00001b71}, {0xf6941050, 0x00001be2},

								{0xf66e1091, 0x00001c52}, {0xf64810d3, 0x00001cc2},

								{0xf6221115, 0x00001d33}, {0xf5fc1157, 0x00001da3},

								{0xf5d61198, 0x00001e14}, {0xf5b011da, 0x00001e84},

								{0xf58a121c, 0x00001ef4}, {0xf564125e, 0x00001f65},

								{0xf53e12a0, 0x00001fd5}, {0xf51812e1, 0x00002046},

								{0xf4f21323, 0x000020b6}, {0xf4cc1365, 0x00002126},

								{0xf4a613a7, 0x00002197}, {0xf48013e8, 0x00002207},

								{0xf45a142a, 0x00002278}, {0xf434146c, 0x000022e8},

								{0xf40e14ae, 0x00002359}, {0xf3e814ef, 0x000023c9},

								{0xf3c21531, 0x00002439}, {0xf39c1573, 0x000024aa},

								{0xf37615b5, 0x0000251a}, {0xf35015f6, 0x0000258b},

								{0xf32a1638, 0x000025fb}, {0xf304167a, 0x0000266b},

								{0xf2de16bc, 0x000026dc}, {0xf2b816fe, 0x0000274c},

								{0xf292173f, 0x000027bd}, {0xf26c1781, 0x0000282d},

								{0xf24617c3, 0x0000289d}, {0xf2201805, 0x0000290e},

								{0xf1fa1846, 0x0000297e}, {0xf1d41888, 0x000029ef},

								{0xf1ae18ca, 0x00002a5f}, {0xf188190c, 0x00002acf},

								{0xf162194d, 0x00002b40}, {0xf13c198f, 0x00002bb0},

								{0xf11619d1, 0x00002c21}, {0xf0f01a13, 0x00002c91},

								{0xf0ca1a54, 0x00002d02}, {0xf0a41a96, 0x00002d72},

								{0xf07e1ad8, 0x00002de2}, {0xf0581b1a, 0x00002e53},

								{0xf0321b5c, 0x00002ec3}, {0xf00c1b9d, 0x00002f34},

								{0xefe61bdf, 0x00002fa4}, {0xefc01c21, 0x00003014},

								{0xef9a1c63, 0x00003085}, {0xef741ca4, 0x000030f5},

								{0xef4e1ce6, 0x00003166}, {0xef281d28, 0x000031d6},

								{0xef021d6a, 0x00003246}, {0xeedc1dab, 0x000032b7},

								{0xeeb61ded, 0x00003327}, {0xee901e2f, 0x00003398},

								{0xee6a1e71, 0x00003408}, {0xee441eb2, 0x00003479},

								{0xee1e1ef4, 0x000034e9}, {0xedf81f36, 0x00003559},

								{0xedd21f78, 0x000035ca}, {0xedac1fba, 0x0000363a},

								{0xed861ffb, 0x000036ab}, {0xed60203d, 0x0000371b},

								{0xed3a207f, 0x0000378b}, {0xed1420c1, 0x000037fc},

								};

								// GYUV: contributions of the green component, 128 entries. The entries
								// follow the formulas of the disabled generator in this file, one per odd
								// value n = 1, 3, ..., 255:
								//   YU low half  =   (YGCoef*n)>>9
								//   YU high half = -((UGCoef*n)>>9)
								//   V            = -((VGCoef*n)>>9)
								// The C reference converter indexes it with the 7-bit value (green >> 1).
								const struct YUV  GYUV[] = {

								{0xffdb0040, 0xffffffd1}, {0xff9100c1, 0xffffff73},

								{0xff460142, 0xffffff15}, {0xfefc01c3, 0xfffffeb7},

								{0xfeb10245, 0xfffffe59}, {0xfe6702c6, 0xfffffdfa},

								{0xfe1c0347, 0xfffffd9c}, {0xfdd203c8, 0xfffffd3e},

								{0xfd880449, 0xfffffce0}, {0xfd3d04ca, 0xfffffc82},

								{0xfcf3054b, 0xfffffc23}, {0xfca805cc, 0xfffffbc5},

								{0xfc5e064e, 0xfffffb67}, {0xfc1306cf, 0xfffffb09},

								{0xfbc90750, 0xfffffaaa}, {0xfb7e07d1, 0xfffffa4c},

								{0xfb340852, 0xfffff9ee}, {0xfae908d3, 0xfffff990},

								{0xfa9f0954, 0xfffff932}, {0xfa5409d5, 0xfffff8d3},

								{0xfa0a0a57, 0xfffff875}, {0xf9bf0ad8, 0xfffff817},

								{0xf9750b59, 0xfffff7b9}, {0xf92a0bda, 0xfffff75b},

								{0xf8e00c5b, 0xfffff6fc}, {0xf8960cdc, 0xfffff69e},

								{0xf84b0d5d, 0xfffff640}, {0xf8010dde, 0xfffff5e2},

								{0xf7b60e60, 0xfffff584}, {0xf76c0ee1, 0xfffff525},

								{0xf7210f62, 0xfffff4c7}, {0xf6d70fe3, 0xfffff469},

								{0xf68c1064, 0xfffff40b}, {0xf64210e5, 0xfffff3ad},

								{0xf5f71166, 0xfffff34e}, {0xf5ad11e7, 0xfffff2f0},

								{0xf5621269, 0xfffff292}, {0xf51812ea, 0xfffff234},

								{0xf4cd136b, 0xfffff1d6}, {0xf48313ec, 0xfffff177},

								{0xf439146d, 0xfffff119}, {0xf3ee14ee, 0xfffff0bb},

								{0xf3a4156f, 0xfffff05d}, {0xf35915f0, 0xffffeffe},

								{0xf30f1672, 0xffffefa0}, {0xf2c416f3, 0xffffef42},

								{0xf27a1774, 0xffffeee4}, {0xf22f17f5, 0xffffee86},

								{0xf1e51876, 0xffffee27}, {0xf19a18f7, 0xffffedc9},

								{0xf1501978, 0xffffed6b}, {0xf10519f9, 0xffffed0d},

								{0xf0bb1a7b, 0xffffecaf}, {0xf0701afc, 0xffffec50},

								{0xf0261b7d, 0xffffebf2}, {0xefdb1bfe, 0xffffeb94},

								{0xef911c7f, 0xffffeb36}, {0xef471d00, 0xffffead8},

								{0xeefc1d81, 0xffffea79}, {0xeeb21e02, 0xffffea1b},

								{0xee671e84, 0xffffe9bd}, {0xee1d1f05, 0xffffe95f},

								{0xedd21f86, 0xffffe901}, {0xed882007, 0xffffe8a2},

								{0xed3d2088, 0xffffe844}, {0xecf32109, 0xffffe7e6},

								{0xeca8218a, 0xffffe788}, {0xec5e220b, 0xffffe72a},

								{0xec13228d, 0xffffe6cb}, {0xebc9230e, 0xffffe66d},

								{0xeb7e238f, 0xffffe60f}, {0xeb342410, 0xffffe5b1},

								{0xeaea2491, 0xffffe552}, {0xea9f2512, 0xffffe4f4},

								{0xea552593, 0xffffe496}, {0xea0a2614, 0xffffe438},

								{0xe9c02696, 0xffffe3da}, {0xe9752717, 0xffffe37b},

								{0xe92b2798, 0xffffe31d}, {0xe8e02819, 0xffffe2bf},

								{0xe896289a, 0xffffe261}, {0xe84b291b, 0xffffe203},

								{0xe801299c, 0xffffe1a4}, {0xe7b62a1d, 0xffffe146},

								{0xe76c2a9f, 0xffffe0e8}, {0xe7212b20, 0xffffe08a},

								{0xe6d72ba1, 0xffffe02c}, {0xe68c2c22, 0xffffdfcd},

								{0xe6422ca3, 0xffffdf6f}, {0xe5f82d24, 0xffffdf11},

								{0xe5ad2da5, 0xffffdeb3}, {0xe5632e26, 0xffffde55},

								{0xe5182ea8, 0xffffddf6}, {0xe4ce2f29, 0xffffdd98},

								{0xe4832faa, 0xffffdd3a}, {0xe439302b, 0xffffdcdc},

								{0xe3ee30ac, 0xffffdc7e}, {0xe3a4312d, 0xffffdc1f},

								{0xe35931ae, 0xffffdbc1}, {0xe30f322f, 0xffffdb63},

								{0xe2c432b1, 0xffffdb05}, {0xe27a3332, 0xffffdaa6},

								{0xe22f33b3, 0xffffda48}, {0xe1e53434, 0xffffd9ea},

								{0xe19b34b5, 0xffffd98c}, {0xe1503536, 0xffffd92e},

								{0xe10635b7, 0xffffd8cf}, {0xe0bb3638, 0xffffd871},

								{0xe07136ba, 0xffffd813}, {0xe026373b, 0xffffd7b5},

								{0xdfdc37bc, 0xffffd757}, {0xdf91383d, 0xffffd6f8},

								{0xdf4738be, 0xffffd69a}, {0xdefc393f, 0xffffd63c},

								{0xdeb239c0, 0xffffd5de}, {0xde673a41, 0xffffd580},

								{0xde1d3ac3, 0xffffd521}, {0xddd23b44, 0xffffd4c3},

								{0xdd883bc5, 0xffffd465}, {0xdd3d3c46, 0xffffd407},

								{0xdcf33cc7, 0xffffd3a9}, {0xdca93d48, 0xffffd34a},

								{0xdc5e3dc9, 0xffffd2ec}, {0xdc143e4a, 0xffffd28e},

								{0xdbc93ecc, 0xffffd230}, {0xdb7f3f4d, 0xffffd1d2},

								{0xdb343fce, 0xffffd173}, {0xdaea404f, 0xffffd115},

								};

								// BYUV: contributions of the blue component, 128 entries. The entries follow
								// the formulas of the disabled generator in this file, one per odd value
								// n = 1, 3, ..., 255:
								//   YU low half  =   (YBCoef*n)>>9
								//   YU high half =   (UBCoef*n)>>9
								//   V            = -((VBCoef*n)>>9)
								// The C reference converter indexes it with the 7-bit value (blue >> 1).
								const struct YUV  BYUV[] = {

								{0x0038000c, 0xfffffff7}, {0x00a80025, 0xffffffe5},

								{0x0119003e, 0xffffffd3}, {0x01890057, 0xffffffc1},

								{0x01fa0070, 0xffffffaf}, {0x026a0089, 0xffffff9c},

								{0x02da00a2, 0xffffff8a}, {0x034b00bb, 0xffffff78},

								{0x03bb00d5, 0xffffff66}, {0x042c00ee, 0xffffff54},

								{0x049c0107, 0xffffff41}, {0x050d0120, 0xffffff2f},

								{0x057d0139, 0xffffff1d}, {0x05ee0152, 0xffffff0b},

								{0x065e016b, 0xfffffef9}, {0x06cf0184, 0xfffffee6},

								{0x073f019d, 0xfffffed4}, {0x07b001b6, 0xfffffec2},

								{0x082001cf, 0xfffffeb0}, {0x089001e8, 0xfffffe9e},

								{0x09010201, 0xfffffe8b}, {0x0971021a, 0xfffffe79},

								{0x09e20233, 0xfffffe67}, {0x0a52024c, 0xfffffe55},

								{0x0ac30266, 0xfffffe43}, {0x0b33027f, 0xfffffe30},

								{0x0ba40298, 0xfffffe1e}, {0x0c1402b1, 0xfffffe0c},

								{0x0c8502ca, 0xfffffdfa}, {0x0cf502e3, 0xfffffde8},

								{0x0d6602fc, 0xfffffdd5}, {0x0dd60315, 0xfffffdc3},

								{0x0e46032e, 0xfffffdb1}, {0x0eb70347, 0xfffffd9f},

								{0x0f270360, 0xfffffd8c}, {0x0f980379, 0xfffffd7a},

								{0x10080392, 0xfffffd68}, {0x107903ab, 0xfffffd56},

								{0x10e903c4, 0xfffffd44}, {0x115a03dd, 0xfffffd31},

								{0x11ca03f7, 0xfffffd1f}, {0x123b0410, 0xfffffd0d},

								{0x12ab0429, 0xfffffcfb}, {0x131c0442, 0xfffffce9},

								{0x138c045b, 0xfffffcd6}, {0x13fc0474, 0xfffffcc4},

								{0x146d048d, 0xfffffcb2}, {0x14dd04a6, 0xfffffca0},

								{0x154e04bf, 0xfffffc8e}, {0x15be04d8, 0xfffffc7b},

								{0x162f04f1, 0xfffffc69}, {0x169f050a, 0xfffffc57},

								{0x17100523, 0xfffffc45}, {0x1780053c, 0xfffffc33},

								{0x17f10555, 0xfffffc20}, {0x1861056e, 0xfffffc0e},

								{0x18d20588, 0xfffffbfc}, {0x194205a1, 0xfffffbea},

								{0x19b205ba, 0xfffffbd8}, {0x1a2305d3, 0xfffffbc5},

								{0x1a9305ec, 0xfffffbb3}, {0x1b040605, 0xfffffba1},

								{0x1b74061e, 0xfffffb8f}, {0x1be50637, 0xfffffb7d},

								{0x1c550650, 0xfffffb6a}, {0x1cc60669, 0xfffffb58},

								{0x1d360682, 0xfffffb46}, {0x1da7069b, 0xfffffb34},

								{0x1e1706b4, 0xfffffb22}, {0x1e8806cd, 0xfffffb0f},

								{0x1ef806e6, 0xfffffafd}, {0x1f6806ff, 0xfffffaeb},

								{0x1fd90719, 0xfffffad9}, {0x20490732, 0xfffffac7},

								{0x20ba074b, 0xfffffab4}, {0x212a0764, 0xfffffaa2},

								{0x219b077d, 0xfffffa90}, {0x220b0796, 0xfffffa7e},

								{0x227c07af, 0xfffffa6c}, {0x22ec07c8, 0xfffffa59},

								{0x235d07e1, 0xfffffa47}, {0x23cd07fa, 0xfffffa35},

								{0x243e0813, 0xfffffa23}, {0x24ae082c, 0xfffffa11},

								{0x251e0845, 0xfffff9fe}, {0x258f085e, 0xfffff9ec},

								{0x25ff0877, 0xfffff9da}, {0x26700890, 0xfffff9c8},

								{0x26e008aa, 0xfffff9b6}, {0x275108c3, 0xfffff9a3},

								{0x27c108dc, 0xfffff991}, {0x283208f5, 0xfffff97f},

								{0x28a2090e, 0xfffff96d}, {0x29130927, 0xfffff95b},

								{0x29830940, 0xfffff948}, {0x29f40959, 0xfffff936},

								{0x2a640972, 0xfffff924}, {0x2ad4098b, 0xfffff912},

								{0x2b4509a4, 0xfffff8ff}, {0x2bb509bd, 0xfffff8ed},

								{0x2c2609d6, 0xfffff8db}, {0x2c9609ef, 0xfffff8c9},

								{0x2d070a08, 0xfffff8b7}, {0x2d770a21, 0xfffff8a4},

								{0x2de80a3b, 0xfffff892}, {0x2e580a54, 0xfffff880},

								{0x2ec90a6d, 0xfffff86e}, {0x2f390a86, 0xfffff85c},

								{0x2faa0a9f, 0xfffff849}, {0x301a0ab8, 0xfffff837},

								{0x308a0ad1, 0xfffff825}, {0x30fb0aea, 0xfffff813},

								{0x316b0b03, 0xfffff801}, {0x31dc0b1c, 0xfffff7ee},

								{0x324c0b35, 0xfffff7dc}, {0x32bd0b4e, 0xfffff7ca},

								{0x332d0b67, 0xfffff7b8}, {0x339e0b80, 0xfffff7a6},

								{0x340e0b99, 0xfffff793}, {0x347f0bb2, 0xfffff781},

								{0x34ef0bcc, 0xfffff76f}, {0x35600be5, 0xfffff75d},

								{0x35d00bfe, 0xfffff74b}, {0x36400c17, 0xfffff738},

								{0x36b10c30, 0xfffff726}, {0x37210c49, 0xfffff714},

								{0x37920c62, 0xfffff702}, {0x38020c7b, 0xfffff6f0},

								};


								// SHIFT_WIDTH is the right shift the C reference converter applies to a sum
								// of look-up table values before adding the output offset, e.g.
								// (t>>SHIFT_WIDTH)+8 for Y and (t>>SHIFT_WIDTH)+64 for V.
								#define COEF_WIDTH   8

								#define SHIFT_WIDTH  COEF_WIDTH


								//

								// All of the RGB converters follow the template given below. The converters make

								// some assumptions about the frame size. All output frame sizes are assumed to

								// have a frame height that is a multiple of 48. Also, the output frame width

								// is assumed to be a multiple of 8. If the input frame size is equal

								// to the output frame size, no stretching or cropping is done. Otherwise, the

								// image is cropped and stretched for an 11:12 aspect ratio.

								//


								#if 0

								void rgb_color_converter() {

									for (j = 0; j < LumaIters; j++) {

										for (k = 0; k < mark; k++) {

											for (i = FrameWidth; i > 0; i -= m, pnext += n) {

												compute m Y values using look-up tables

												if (0 == (k&1)) {

													compute m/2 U,V values using look-up tables

												}

											}

											if ((0 == k) && j) {

												for (i = FrameWidth; i > 0; i -= 8) {

													t = *pyprev++ & 0xFEFEFEFE;

													t += *pynext++ & 0xFEFEFEFE;

													*pyspace++ = t;

													t = *pyprev++ & 0xFEFEFEFE;

													t += *pynext++ & 0xFEFEFEFE;

													*pyspace++ = t;

												}

											}

											pnext += iBackTwoLines;

											py += ypitch_adj;

											if (0 == (k&1)) {

												pu += uvpitch_adj;

												pv += uvpitch_adj;

											}

										}

										if (stretch) {

											pyprev = py - pitch;

											pyspace = py;

											pynext = py + pitch;

										}

									}

									if (stretch) {

										for (i = FrameWidth; i > 0; i -= 4) {

											*pyspace++ = *pyprev++;

										}

									}

								}

								#endif


								//

								// For the IA versions, the strategy is to compute the Y value for an odd RGB value

								// followed by computing the Y value for the corresponding even RGB value. The registers

								// are then set with the proper values to compute U and V values for the even RGB

								// value. This avoids repeating the shifting and masking needed to extract the Red,

								// Green and Blue components.

								//


								/*****************************************************************************

								 *

								 *  H26X_BGR24toYUV12()

								 *

								 *  Convert from BGR24 to YUV12 (YCrCb 4:2:0) and copy to destination memory

								 *  with pitch defined by the constant PITCH. The input data is stored in

								 *  the order B,G,R,B,G,R...

								 *

								 */


								#if 0

								// C reference version of the BGR24 -> YUV12 converter (compiled out by the
								// surrounding #if 0).
								//
								//   lpbiInput   - input bitmap header; only biWidth and biHeight are read.
								//   lpInput     - input pixels, 3 bytes per pixel, read as 32-bit words.
								//                 Lines are walked from the last one used towards the first.
								//   YPlane      - output luma plane, one byte per pixel.
								//   UPlane,
								//   VPlane      - output chroma planes; one sample is written for every
								//                 second pixel, on every second converted line.
								//   FrameWidth,
								//   FrameHeight - output frame size.
								//   pitch       - byte distance between consecutive lines of each plane.
								//
								// Luma is written as (table sum >> SHIFT_WIDTH) + 8 and chroma with an
								// offset of 64.
								// NOTE(review): the address arithmetic treats an input line as exactly
								// 3*biWidth bytes with no row padding - confirm against callers.
								_STATIC void C_H26X_BGR24toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * lpInput,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

									U32 *pnext, *pyprev, *pyspace, *pynext;

									U32 tm;

									int t;

									int i, j, k;

									int iBackTwoLines;

									int stretch, mark, aspect;

									int height_adj, width_adj;

									int LumaIters = 0;

									// Bytes to skip from the end of one written line to the start of the next.
									int ypitch_adj = pitch - FrameWidth;

									int uvpitch_adj = pitch - (FrameWidth >> 1);


									// This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).

									for (i = FrameHeight; i > 0; i -= 48) {

										LumaIters += 4;

									}

									// Left crop in bytes: half the width difference, times 3 bytes per pixel.
									width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;

									width_adj += (width_adj << 1);

									// When the widths differ, LumaIters fewer input lines are consumed than
									// FrameHeight.
									aspect = (width_adj ? LumaIters : 0);

									// Vertical crop: half of the input lines that are not consumed.
									height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;

									stretch = (height_adj ? 1 : 0);

									// Input lines converted per outer iteration: 11 when stretching, else 12.
									mark = 12 - stretch;

									// The input image is upside down - process the lines in reverse order.


									// Move from end of line N to beginning of line N-1

									// In U32 units: -3 * (biWidth + FrameWidth) / 4.
									iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 2);

									iBackTwoLines += (iBackTwoLines << 1);


									// Point to the beginning of the last line.

									pnext =	(U32 *)

												(lpInput +

												((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *

													((FrameHeight - aspect - 1) + height_adj)) +

												width_adj);


									for ( j = 0; j < LumaIters; j++) {


										for (k = 0; k < mark; k++) {


											// Each pass converts 4 pixels (12 bytes, 3 dwords). With the
											// B,G,R byte order and little-endian loads (assumed, x86) the
											// dwords hold, low byte first:
											//   pnext[0] = B0 G0 R0 B1
											//   pnext[1] = G1 R1 B2 G2
											//   pnext[2] = R2 B3 G3 R3
											// Every shift drops one extra bit so that the top 7 bits of a
											// component index the 128-entry tables.
											for (i = FrameWidth; i > 0; i -= 4, pnext += 3) {

												// Pixel 1 luma.
												tm = pnext[0];

												t = BYUV[tm>>25].YU;

												tm = pnext[1];

												t += (GYUV[(tm>>1)&0x7F].YU +

												      RYUV[(tm>>9)&0x7F].YU);

												*(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);

												// Pixel 0 luma; t also carries the U sum in its upper half.
												tm = pnext[0];

												t = (BYUV[(tm>>1)&0x7F].YU +

												     GYUV[(tm>>9)&0x7F].YU +

												     RYUV[(tm>>17)&0x7F].YU);

												*YPlane = (U8)((t>>SHIFT_WIDTH)+8);

												// Chroma for pixel 0, only on even k.
												if (0 == (k&1)) {

													// Top byte of the packed sum is the U result.
													*UPlane++ = (U8)((t>>24)+64);

													t = (RYUV[(tm>>17)&0x7F].V +

													     GYUV[(tm>>9)&0x7F].V +

													     BYUV[(tm>>1)&0x7F].V);

													*VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);

												}

												// Pixel 3 luma.
												tm = pnext[2];

												t = (BYUV[(tm>>9)&0x7F].YU +

												     GYUV[(tm>>17)&0x7F].YU +

												     RYUV[tm>>25].YU);

												*(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);

												// Pixel 2 luma; t again carries the U sum for the chroma step.
												tm = pnext[1];

												t = BYUV[(tm>>17)&0x7F].YU + GYUV[tm>>25].YU;

												tm = pnext[2];

												t += RYUV[(tm>>1)&0x7F].YU;

												*(YPlane+2) = (U8)((t>>SHIFT_WIDTH)+8);

												YPlane += 4;

												// Chroma for pixel 2, only on even k.
												if (0 == (k&1)) {

													*UPlane++ = (U8)((t>>24)+64);

													t = RYUV[(tm>>1)&0x7F].V;

													tm = pnext[1];

													t += GYUV[tm>>25].V + BYUV[(tm>>17)&0x7F].V;

													*VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);

												}

											}

											// After the first line of every group but the first, fill the
											// gap line left by the previous group with the average of the
											// lines above and below it. Each operand is halved bytewise
											// (mask, then shift) so the sum cannot carry between bytes.
											if (stretch && (0 == k) && j) {

												for (i = FrameWidth; i > 0; i -= 8) {

													tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);

													tm += ((*pynext++ & 0xFEFEFEFE) >> 1);

													*pyspace++ = tm;

													tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);

													tm += ((*pynext++ & 0xFEFEFEFE) >> 1);

													*pyspace++ = tm;

												}

											}

											pnext += iBackTwoLines;

											YPlane += ypitch_adj;

											// Increment after even lines.

											if(0 == (k&1)) {

												UPlane += uvpitch_adj;

												VPlane += uvpitch_adj;

											}

										} // end of for k

										// Leave one luma line empty after the group and remember its
										// neighbours; YPlane moves past the gap.
										if (stretch) {

											pyprev = (U32 *)(YPlane - pitch);

											pyspace = (U32 *)YPlane;

											pynext = (U32 *)(YPlane += pitch);

										}

									} // end of for j

									// The last gap line has no line below it: copy the line above.
									if (stretch) {

										for (i = FrameWidth; i > 0; i -= 4) {

											*pyspace++ = *pyprev++;

										}

									}

								} // end of C_H26X_BGR24toYUV12()

								#endif


								// IA_H26X_BGR24toYUV12
								//
								// Hand-scheduled x86 inline-assembly routine that converts a 24-bit BGR
								// image into separate Y, U and V planes using the BYUV / GYUV / RYUV
								// lookup tables (8-byte entries, each read through a combined .YU field
								// and a .V field).
								//
								// The function is declared naked: it pushes ebp/ebx/esi/edi itself,
								// reserves LOCALSIZE bytes of locals, addresses both parameters and locals
								// relative to esp, and restores everything before executing its own ret.
								//
								// Parameters (read from the stack at the offsets #defined below):
								//   lpbiInput               - bitmap header; only biWidth and biHeight are read.
								//   BGR24Image              - source pixels (called lpInput / LP_INPUT below).
								//   YPlane, UPlane, VPlane  - destination planes.
								//   FrameWidth, FrameHeight - output frame size in pixels.
								//   pitch                   - destination line pitch, applied to all three planes.
								//
								// Processing outline:
								//   - Source lines are visited from the highest address downward: pnext
								//     starts on a late line and iBackTwoLines is negative.
								//   - Each inner iteration consumes 12 source bytes (4 pixels) and stores
								//     4 Y bytes. On lines where k is even it also stores 2 U and 2 V bytes,
								//     derived from pixels 1 and 3 of the group.
								//   - When stretch is set, only mark (= 11) source lines are converted per
								//     group of 12 destination lines. One destination line is skipped and
								//     later filled with the byte-wise average of the lines above and below
								//     it; the last skipped line is filled with a copy of the line above.
								//
								// NOTE(review): the counting loops all terminate with jnz, so this code
								// presumably relies on FrameHeight being a multiple of 48 and FrameWidth
								// being a multiple of 4 (8 when stretch is set) - confirm against callers.

								__declspec(naked)

								_STATIC void IA_H26X_BGR24toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * BGR24Image,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout (offsets are from esp after the prologue below)

								//	| pitch			|  + 96

								//	| FrameHeight	|  + 92

								//	| FrameWidth	|  + 88

								//	| VPlane		|  + 84

								//	| UPlane		|  + 80

								//	| YPlane		|  + 76

								//	| lpInput		|  + 72

								//	| lpbiInput		|  + 68

								//	----------------------------

								//	| return addr	|  + 64

								//	| saved ebp		|  + 60

								//	| saved ebx		|  + 56

								//	| saved esi		|  + 52

								//	| saved edi		|  + 48


								//  | pyprev		|  + 44

								//  | pyspace       |  + 40

								//  | pynext        |  + 36

								//	| i				|  + 32

								//	| j				|  + 28

								//	| k				|  + 24

								//	| iBackTwoLines	|  + 20

								//	| stretch		|  + 16

								//	| mark			|  + 12

								//	| LumaIters		|  +  8

								//	| ypitch_adj	|  +  4

								//	| uvpitch_adj	|  +  0


								// Size in bytes of the local variable area (twelve dwords, offsets 0..44).

								#define LOCALSIZE			 48


								// Offsets of the incoming parameters.

								#define PITCH_PARM			 96

								#define FRAME_HEIGHT		 92

								#define FRAME_WIDTH			 88

								#define VPLANE				 84

								#define UPLANE				 80

								#define YPLANE				 76

								#define LP_INPUT			 72

								#define LPBI_INPUT			 68


								// Offsets of the local variables.

								#define PYPREV				 44

								#define PYSPACE				 40

								#define PYNEXT				 36

								#define LOOP_I				 32

								#define LOOP_J				 28

								#define LOOP_K				 24

								#define BACK_TWO_LINES		 20

								#define STRETCH				 16

								#define MARK				 12

								#define LUMA_ITERS			  8

								#define YPITCH_ADJ			  4

								#define UVPITCH_ADJ			  0


									_asm {


								// Prologue: save the callee-save registers and reserve the locals.

									push	ebp

									push 	ebx

									push 	esi

									push 	edi

									sub 	esp, LOCALSIZE


								// assign (ebx, lpbiInput)

									mov		ebx, [esp + LPBI_INPUT]

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (edx, pitch)

									mov		ebp, ecx

									shr		ebp, 1

									sub		edx, ebp

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (edx, LumaIters)

								// NOTE(review): the loop below exits on jnz (i == 0), not on i <= 0 as

								// the pseudo-code above says, so it only terminates promptly when

								// FrameHeight is a multiple of 48.

									xor		edx, edx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

									lea		edx, [edx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1

								// width_adj += width_adj << 1

								// (computed below as diff + (diff >> 1), where diff = biWidth - FrameWidth)

								// assign (esi, width_adj)

									mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		esi, [esp + FRAME_WIDTH]

									mov		eax, esi

									shr		eax, 1

									add		esi, eax

								// aspect = (width_adj ? LumaIters : 0)

								// assign (edi, aspect)

								// kill (edx, LumaIters)

									mov		[esp + LUMA_ITERS], edx

									xor		edi, edi

									test	esi, esi

									jz		L2

									mov		edi, edx

								// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (edx, height_adj)

								// NOTE(review): shr is an unsigned shift, so this presumably expects

								// biHeight >= FrameHeight - aspect - confirm.

								L2:

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		edx, [esp + FRAME_HEIGHT]

									add		edx, edi

									shr		edx, 1

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	edx, edx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

									mov		ebp, 12

									sub		ebp, eax

									mov		[esp + MARK], ebp

								// iBackTwoLines = -(lpbiInput->biWidth + FrameWidth)

								// iBackTwoLines += (iBackTwoLines << 1)

								// (i.e. three bytes per pixel; added to pnext after each converted line)

									mov		ebp, (LPBITMAPINFOHEADER)[ebx].biWidth

									add		ebp, [esp + FRAME_WIDTH]

									neg		ebp

									mov		eax, ebp

									shl		eax, 1

									add		ebp, eax

									mov		[esp + BACK_TWO_LINES], ebp

								// pnext = lpInput +

								//            ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *

								//             ((FrameHeight - aspect - 1) + height_adj)) +

								//            width_adj

								// kill (ebx, lpbiInput)

								// kill (ecx, FrameWidth)

								// kill (edx, height_adj)

								// kill (esi, width_adj)

								// kill (edi, aspect)

								// assign (esi, pnext)

								// The one-operand imul below leaves its product in edx:eax; only eax

								// (the low dword) is used afterwards.

									mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

									shl		eax, 1

									add		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

									mov		ebx, [esp + FRAME_HEIGHT]

									sub		ebx, edi

									dec		ebx

									add		ebx, edx

									imul	ebx

									add		esi, eax

									add		esi, [esp + LP_INPUT]

								// assign (edi, YPlane)

									mov		edi, [esp + YPLANE]

								// for (j = 0; j < LumaIters; j++)

									xor		eax, eax

									mov		[esp + LOOP_J], eax

								// for (k = 0; k < mark; k++)

								L4:

									xor		eax, eax

									mov		[esp + LOOP_K], eax

								// for (i = FrameWidth; i > 0; i -= 4, pnext += 12)

								L5:

									mov		eax, [esp + FRAME_WIDTH]

									mov		[esp + LOOP_I], eax

								// This jump is here to make sure the following loop starts in the U pipe

									jmp		L6

								L6:

								// Layout of the three source dwords holding four BGR pixels

								// (most significant byte on the left):

								//  ---------------------

								//  | B2 | R1 | G1 | B1 | pnext[0]

								//  ---------------------

								//  | G3 | B3 | R2 | G2 | pnext[1]

								//  ---------------------

								//  | R4 | G4 | B4 | R3 | pnext[2]

								//  ---------------------


								// t0 = pnext[0]

								// t1 = pnext[1]

								// t = ( BYUV[t0>>25].YU +

								//       GYUV[(t1>> 1)&0x7F].YU +

								//       RYUV[(t1>> 9)&0x7F].YU )

								// *(YPlane+1) = ((t>>8)+8)

								// t = ( BYUV[(t0>> 1)&0x7F].YU +

								//       GYUV[(t0>> 9)&0x7F].YU +

								//       RYUV[(t0>>17)&0x7F].YU )

								// *YPlane = ((t>>8)+8)

								// assign(eax: B2,Y1,Y2,U)

								// assign(ebx: B1,V)

								// assign(ecx: G2,G1)

								// assign(edx: R2,R1)

								// assign(ebp: B1)

								// The numbered comments below count instruction pairs.


								// 1

									mov 	eax, [esi]

									mov		ecx, [esi + 4]

								// 2

									mov 	ebx, eax

									mov 	edx, ecx

								// 3

									shr 	eax, 25

									and 	ecx, 0xFE

								// 4

									shr 	ecx, 1

									and 	edx, 0xFE00

								// 5

									shr 	edx, 9

										and		ebx, 0xFEFEFE

								// 6

									mov 	eax, [BYUV+eax*8].YU

									nop

								// 7

									add 	eax, [GYUV+ecx*8].YU

										mov		ecx,  ebx

								// 8

									add 	eax, [RYUV+edx*8].YU

										mov		edx,  ebx

								// 9  (adding 0x800 before the shift by 8 is the "+8" of the Y formula)

										and 	ebx, 0xFE

									add	eax,  0x800

								// 10

									sar 	eax, 8

									nop

								// 11

										shr 	ebx, 1

										nop

								// 12

										shr 	ecx, 9

									mov	 [edi + 1], al

								// 13

										shr		edx, 17

										and		ecx, 0x7F

								// 14

										mov		eax, [BYUV+ebx*8].YU

										and		edx, 0x7F

								// 15

										add	 	eax, [GYUV+ecx*8].YU

										mov		ebp, ebx

								// 16

										add		eax, [RYUV+edx*8].YU

										nop

								// 17

										sar		eax, 8

										mov 	ebx, [esp + LOOP_K]

								// 18

										add		eax, 8

										and		ebx, 1

								// 19  (mov does not change flags: jnz tests the result of "and ebx, 1",

								//      so the chroma code below is skipped when k is odd)

										mov 	[edi], al

										jnz		L9


								// At this point, ebp: B1, ecx: G1, edx: R1

								// t0 = pnext[0]

								// *UPlane++   = ((t>>24)+64)

								//   (eax was already shifted right by 8 for the Y store above, so only

								//    a further 16-bit shift is applied here)

								// t   = ( RYUV[(t0>>17)&0x7F].V +

								//         GYUV[(t0>> 9)&0x7F].V +

								//         BYUV[(t0>> 1)&0x7F].V )

								// *VPlane++ = ((t>>8)+64)


								// 20

									mov 	ebx, [RYUV+edx*8].V

									mov 	edx, [esp + UPLANE]

								// 21

									sar		eax, 16

									add 	ebx, [GYUV+ecx*8].V

								// 22

									add		eax, 64

									add 	ebx, [BYUV+ebp*8].V

								// 23

									mov		[edx], al

									inc		edx

								// 24

									mov 	[esp + UPLANE], edx

									mov 	edx, [esp + VPLANE]

								// 25

									sar 	ebx, 8

									inc		edx

								// 26

									add 	ebx, 64

									mov 	[esp + VPLANE], edx

								// 27  (edx was already incremented, hence the -1)

									mov		[edx - 1], bl

									nop


								L9:

								//  ---------------------

								//  | B2 | R1 | G1 | B1 | pnext[0]

								//  ---------------------

								//  | G3 | B3 | R2 | G2 | pnext[1]

								//  ---------------------

								//  | R4 | G4 | B4 | R3 | pnext[2]

								//  ---------------------


								// t1 = pnext[1]

								// t2 = pnext[2]

								// t = ( BYUV[(t2>> 9)&0x7F].YU +

								//       GYUV[(t2>>17)&0x7F].YU +

								//       RYUV[t2>>25].YU )

								// *(YPlane+3) = ((t>>8)+8)

								// t = ( BYUV[(t1>>17)&0x7F].YU +

								//       GYUV[t1>>25].YU +

								//       RYUV[(t2>> 1)&0x7F].YU )

								// *(YPlane+2) = ((t>>8)+8)

								// YPlane += 4

								// assign(eax: B4,Y3,Y4,U)

								// assign(ebx: R3,V)

								// assign(ecx: G4,G3)

								// assign(edx: R4/B3)

								// assign(ebp: R3)


								// 28

									mov		ebp, [esi + 4]

									mov 	ebx, [esi + 8]

								// 29

									mov 	eax, ebx

									mov 	ecx, ebx

								// 30

									shr		eax, 9

									mov		edx, ebx

								// 31

									shr 	ecx, 17

									and 	eax, 0x7F

								// 32

									shr 	edx, 25

									and		ecx, 0x7F

								// 33

									mov 	eax, [BYUV+eax*8].YU

									nop

								// 34

									add 	eax, [GYUV+ecx*8].YU

										and		ebx, 0xFE

								// 35

									add 	eax, [RYUV+edx*8].YU

										mov		ecx, ebp

								// 36

										shr		ebx, 1

									add	eax,  0x800

								// 37

									sar 	eax, 8

										mov		edx, ebp

								// 38

										shr		edx, 17

									mov	 [edi + 3], al

								// 39

										shr 	ecx, 25

										and		edx, 0x7F

								// 40

										mov		eax, [RYUV+ebx*8].YU

										mov		ebp, ebx

								// 41

										add	 	eax, [GYUV+ecx*8].YU

										nop

								// 42

										add		eax, [BYUV+edx*8].YU

										nop

								// 43

										sar		eax, 8

										mov 	ebx, [esp + LOOP_K]

								// 44

										add		eax, 8

										and		ebx, 1

								// 45  (jnz again tests "and ebx, 1": skip chroma when k is odd)

										mov 	[edi + 2], al

										jnz		L16


								// At this point, ebp: R3, ecx: G3, edx: B3

								// t1 = pnext[1]

								// t2 = pnext[2]

								// *UPlane++   = ((t>>24)+64)

								//   (as for pixel 1: eax was already shifted right by 8 for the Y store,

								//    so the code applies a further 16-bit shift)

								// t   = ( RYUV[(t2>> 1)&0x7F].V +

								//         GYUV[t1>>25].V +

								//         BYUV[(t1>>17)&0x7F].V )

								// *VPlane++ = ((t>>8)+64)


								// 46

									mov 	ebx, [BYUV+edx*8].V

									mov 	edx, [esp + UPLANE]

								// 47

									sar		eax, 16

									add 	ebx, [GYUV+ecx*8].V

								// 48

									add		eax, 64

									add 	ebx, [RYUV+ebp*8].V

								// 49

									mov		[edx], al

									inc		edx

								// 50

									mov 	[esp + UPLANE], edx

									mov 	edx, [esp + VPLANE]

								// 51

									sar 	ebx, 8

									inc		edx

								// 52

									add 	ebx, 64

									mov 	[esp + VPLANE], edx

								// 53

									mov		[edx - 1], bl

									nop

								L16:

								// End of the 4-pixel iteration: i -= 4, pnext += 12 bytes, YPlane += 4.

								// lea and mov leave the flags alone, so jnz tests the result of the sub.

								// 54

									mov		eax, [esp + LOOP_I]

									lea		esi, [esi + 12]

								// 55

									sub		eax, 4

									lea		edi, [edi + 4]

								// 56

									mov		[esp + LOOP_I], eax

									jnz		L6


								// if (stretch && (0 == k) && j)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L21

									mov		eax, [esp + LOOP_K]

									test	eax, eax

									jnz		L21

									mov 	eax, [esp + LOOP_J]

									test	eax, eax

									jz		L21


								// spill YPlane ptr (the YPLANE parameter slot is reused as scratch space)

									mov		[esp + YPLANE], edi

									nop


								// for (i = FrameWidth; i > 0; i -= 8)

								// assign (ebx, pyprev)

								// assign (ecx, t)

								// assign (edx, pynext)

								// assign (edi, pyspace)

								// assign (ebp, i)


								// make sure offsets are such that there are no bank conflicts here

									mov 	ebx, [esp + PYPREV]

									mov 	edi, [esp + PYSPACE]


									mov 	edx, [esp + PYNEXT]

									mov 	ebp, [esp + FRAME_WIDTH]


								// Fill the skipped line with the byte-wise average of the line above

								// (pyprev) and the line below (pynext), two dwords per iteration:

								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								// (where the code shifts first and then masks with 0x7F7F7F7F, the

								//  result is the same as masking with 0xFEFEFEFE and then shifting)

								L22:

								// 1

									mov		eax, [ebx]

									lea		ebx, [ebx + 4]

								// 2

									mov		ecx, [edx]

									lea		edx, [edx + 4]

								// 3

									shr		ecx, 1

									and		eax, 0xFEFEFEFE

								// 4

									shr		eax, 1

									and		ecx, 0x7F7F7F7F

								// 5

									add		eax, ecx

									mov		ecx, [ebx]

								// 6

									shr		ecx, 1

									mov		[edi], eax

								// 7

									mov		eax, [edx]

									and		ecx, 0x7F7F7F7F

								// 8

									shr		eax, 1

									lea		edi, [edi + 4]

								// 9

									and		eax, 0x7F7F7F7F

									lea		ebx, [ebx + 4]

								// 10

									lea		edx, [edx + 4]

									add		eax, ecx

								// 11

									mov		[edi], eax

									lea		edi, [edi + 4]

								// 12

									sub		ebp, 8

									jnz		L22

								// kill (ebx, pyprev)

								// kill (ecx, t)

								// kill (edx, pynext)

								// kill (edi, pyspace)

								// kill (ebp, i)


								// restore YPlane

									mov		edi, [esp + YPLANE]


								// pnext += iBackTwoLines

								L21:

									add		esi, [esp + BACK_TWO_LINES]

								// YPlane += ypitch_adj;

									add		edi, [esp + YPITCH_ADJ]

								// if(0 == (k&1))

									mov		eax, [esp + LOOP_K]

									and		eax, 1

									jnz		L23

								// UPlane += uvpitch_adj;

								// VPlane += uvpitch_adj;

									mov		eax, [esp + UVPITCH_ADJ]

									add		[esp + UPLANE], eax

									add		[esp + VPLANE], eax


								// k++; repeat while k < mark

								L23:

									inc		DWORD PTR [esp + LOOP_K]

									mov		eax, [esp + LOOP_K]

									cmp		eax, [esp + MARK]

									jl		L5


								// if (stretch)

									cmp		DWORD PTR [esp + STRETCH], 0

									je	 	L24

								// pyprev = YPlane - pitch

									mov		eax, edi

									sub		eax, [esp + PITCH_PARM]

									mov		[esp + PYPREV], eax

								// pyspace = YPlane

									mov		[esp + PYSPACE], edi

								// pynext = (YPlane += pitch)

									add		edi, [esp + PITCH_PARM]

									mov		[esp + PYNEXT], edi


								// j++; repeat while j < LumaIters

								L24:

									inc		DWORD PTR [esp + LOOP_J]

									mov		eax, [esp + LOOP_J]

									cmp		eax, [esp + LUMA_ITERS]

									jl		L4


								// kill (esi, pnext)

								// kill (edi, YPlane)

								// if (stretch)

								// (pyprev is loaded before the test; the value is only used when

								//  stretch is non-zero)

									mov		esi, [esp + PYPREV]

									cmp		DWORD PTR [esp + STRETCH], 0

									je		L26


								// The last skipped line has no line below it, so it is filled with a

								// copy of the line above:

								// for (i = FrameWidth; i > 0; i -= 4)

								//     *pyspace++ = *pyprev++

								// assign (esi, pyprev)

								// assign (edi, pyspace)

								// assign (ebp, i)

									mov		ebp, [esp + FRAME_WIDTH]

									 mov	edi, [esp + PYSPACE]

								L25:

									mov		ecx, [esi]

									 lea	esi, [esi + 4]

									mov		[edi], ecx

									 lea	edi, [edi + 4]

									sub		ebp, 4

									 jnz	L25

								// kill (esi, pyprev)

								// kill (edi, pyspace)

								// kill (ebp, i)


								// Epilogue: release the locals, restore the saved registers and return.

								L26:

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef PYPREV

								#undef PYSPACE

								#undef PYNEXT

								#undef LOOP_I

								#undef LOOP_J

								#undef LOOP_K

								#undef BACK_TWO_LINES

								#undef STRETCH

								#undef MARK

								#undef LUMA_ITERS

								#undef YPITCH_ADJ

								#undef UVPITCH_ADJ


								#if 0

								/*
								 * C_H26X_BGR16toYUV12
								 *
								 * Reference C implementation of the 16-bit BGR to planar YUV12 conversion.
								 *
								 *   lpbiInput   - bitmap header; biWidth and biHeight describe the source.
								 *   lpInput     - source pixels, two 16-bit pixels per 32-bit word.
								 *   YPlane      - destination luma plane.
								 *   UPlane      - destination U plane.
								 *   VPlane      - destination V plane.
								 *   FrameWidth  - width of the output frame in pixels.
								 *   FrameHeight - height of the output frame in pixels.
								 *   bitfield    - source pixel format; only 555 is enabled, any other
								 *                 value converts nothing.
								 *   pitch       - destination line pitch, used for all three planes.
								 *
								 * The source is read from its last line toward its first.  Every source
								 * word yields two Y samples; on even-numbered lines of a group it also
								 * yields one U and one V sample taken from the lower pixel of the word.
								 * When the source has to be stretched vertically, only 11 lines of each
								 * 12-line output group are converted; the remaining output line is the
								 * byte-wise average of its two neighbours (the last one is a plain copy
								 * of the line above it).
								 */
								_STATIC void C_H26X_BGR16toYUV12(
									LPBITMAPINFOHEADER lpbiInput,
									U8 * lpInput,
									U8 * YPlane,
									U8 * UPlane,
									U8 * VPlane,
									UN  FrameWidth,
									UN  FrameHeight,
									UN  bitfield,
									const int pitch)
								{
									/* Distance from the end of one output line to the start of the next. */
									int lumaSkip = pitch - FrameWidth;
									int chromaSkip = pitch - (FrameWidth >> 1);

									U32 *src;                           /* current source word (two pixels)   */
									U32 *rowAbove, *rowGap, *rowBelow;  /* lines used by the stretch filter   */
									U32 pair;                           /* source word / averaging scratch    */
									U32 hiB, hiG, hiR;                  /* table indices, upper 16-bit pixel  */
									U32 loB, loG, loR;                  /* table indices, lower 16-bit pixel  */
									int acc;
									int col, group, line, half;
									int evenLine;
									int rewind;
									int stretch, linesPerGroup, aspect;
									int width_adj, height_adj;
									int groups = 0;

									/* Count groups without dividing: four groups per 48 output lines. */
									col = FrameHeight;
									while (col > 0) {
										groups += 4;
										col -= 48;
									}

									width_adj = lpbiInput->biWidth - FrameWidth;
									aspect = 0;
									if (width_adj) {
										aspect = groups;
									}
									height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
									stretch = 0;
									if (height_adj) {
										stretch = 1;
									}
									linesPerGroup = 12 - stretch;

									/*
									 * The input image is upside down, so lines are consumed in reverse
									 * order.  'rewind' (in 32-bit words) moves from the end of line N to
									 * the beginning of line N-1.
									 */
									rewind = -((lpbiInput->biWidth + (int)FrameWidth) >> 1);

									/* Start at the beginning of the last line to be converted. */
									src = (U32 *)(lpInput +
												  ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj)) +
												   width_adj);

									for (group = 0; group < groups; group++) {

										for (line = 0; line < linesPerGroup; line++) {

											evenLine = (0 == (line & 1));

											for (col = FrameWidth; col > 0; col -= 2) {

												pair = *src++;

												/*
												 * Select the shift/mask pairs that turn each colour field
												 * into a 7-bit table index.  Formats that are not enabled
												 * produce no output for this word.
												 */
												switch (bitfield) {
													//    555              2, 3, 8         0x7C, 0x7C, 0x7C
													case 555:
														hiB = (pair >> 14) & 0x7C;
														hiG = (pair >> 19) & 0x7C;
														hiR = (pair >> 24) & 0x7C;
														loB = (pair << 2) & 0x7C;
														loG = (pair >> 3) & 0x7C;
														loR = (pair >> 8) & 0x7C;
														break;
								#if 0
								// Beware - untested code ahead
													//    664              3, 3, 9         0x78, 0x7E, 0x7E
													case 664:
														hiB = (pair >> 13) & 0x78;
														hiG = (pair >> 19) & 0x7E;
														hiR = (pair >> 25) & 0x7E;
														loB = (pair << 3) & 0x78;
														loG = (pair >> 3) & 0x7E;
														loR = (pair >> 9) & 0x7E;
														break;
													//    565              2, 4, 9         0x7C, 0x7E, 0x7C
													case 565:
														hiB = (pair >> 14) & 0x7C;
														hiG = (pair >> 20) & 0x7E;
														hiR = (pair >> 25) & 0x7C;
														loB = (pair << 2) & 0x7C;
														loG = (pair >> 4) & 0x7E;
														loR = (pair >> 9) & 0x7C;
														break;
													//    655              2, 3, 9         0x7C, 0x7C, 0x7E
													case 655:
														hiB = (pair >> 14) & 0x7C;
														hiG = (pair >> 19) & 0x7C;
														hiR = (pair >> 25) & 0x7E;
														loB = (pair << 2) & 0x7C;
														loG = (pair >> 3) & 0x7C;
														loR = (pair >> 9) & 0x7E;
														break;
								#endif
													default:
														continue;
												}

												/* Luma of the upper pixel goes to the second output byte. */
												acc = (BYUV[hiB].YU +
												       GYUV[hiG].YU +
												       RYUV[hiR].YU);
												YPlane[1] = (U8)((acc >> SHIFT_WIDTH) + 8);

												/* Luma of the lower pixel goes to the first output byte. */
												acc = (BYUV[loB].YU +
												       GYUV[loG].YU +
												       RYUV[loR].YU);
												YPlane[0] = (U8)((acc >> SHIFT_WIDTH) + 8);
												YPlane += 2;

												/* Chroma is sampled from the lower pixel, even lines only. */
												if (evenLine) {
													/* U is the top byte of the combined YU sum. */
													*UPlane++ = (U8)((acc >> 24) + 64);
													acc = (RYUV[loR].V +
													       GYUV[loG].V +
													       BYUV[loB].V);
													*VPlane++ = (U8)((acc >> SHIFT_WIDTH) + 64);
												}
											}

											/*
											 * After the first line of every group but the first, fill the
											 * line skipped at the end of the previous group with the average
											 * of the lines above and below it (four bytes at a time).
											 */
											if (stretch && (0 == line) && group) {
												for (col = FrameWidth; col > 0; col -= 8) {
													for (half = 0; half < 2; half++) {
														pair = ((*rowAbove++ & 0xFEFEFEFE) >> 1);
														pair += ((*rowBelow++ & 0xFEFEFEFE) >> 1);
														*rowGap++ = pair;
													}
												}
											}

											src += rewind;
											YPlane += lumaSkip;

											// The chroma planes only advance after even lines.
											if (evenLine) {
												UPlane += chromaSkip;
												VPlane += chromaSkip;
											}
										} // end of line loop

										/* Leave one output line free and remember its neighbours. */
										if (stretch) {
											rowAbove = (U32 *)(YPlane - pitch);
											rowGap = (U32 *)YPlane;
											YPlane += pitch;
											rowBelow = (U32 *)YPlane;
										}
									} // end of group loop

									/* The final free line has nothing below it: duplicate the line above. */
									if (stretch) {
										col = FrameWidth;
										while (col > 0) {
											*rowGap++ = *rowAbove++;
											col -= 4;
										}
									}
								} // end of C_H26X_BGR16toYUV12

								#endif


								__declspec(naked)

								_STATIC void IA_H26X_BGR16555toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * lpInput,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout

								//	| pitch			|  + 96

								//	| FrameHeight	|  + 92

								//	| FrameWidth	|  + 88

								//	| VPlane		|  + 84

								//	| UPlane		|  + 80

								//	| YPlane		|  + 76

								//	| lpInput		|  + 72

								//	| lpbiInput		|  + 68

								//	----------------------------

								//	| return addr	|  + 64

								//	| saved ebp		|  + 60

								//	| saved ebx		|  + 56

								//	| saved esi		|  + 52

								//	| saved edi		|  + 48


								//  | pyprev		|  + 44

								//  | pyspace       |  + 40

								//  | pynext        |  + 36

								//	| i				|  + 32

								//	| j				|  + 28

								//	| k				|  + 24

								//	| iBackTwoLines	|  + 20

								//	| stretch		|  + 16

								//	| mark			|  + 12

								//	| LumaIters		|  +  8

								//	| ypitch_adj	|  +  4

								//	| uvpitch_adj	|  +  0


								#define LOCALSIZE			 48


								#define PITCH_PARM			 96

								#define FRAME_HEIGHT		 92

								#define FRAME_WIDTH			 88

								#define VPLANE				 84

								#define UPLANE				 80

								#define YPLANE				 76

								#define LP_INPUT			 72

								#define LPBI_INPUT			 68


								#define PYPREV				 44

								#define PYSPACE				 40

								#define PYNEXT				 36

								#define LOOP_I				 32

								#define LOOP_J				 28

								#define LOOP_K				 24

								#define BACK_TWO_LINES		 20

								#define STRETCH				 16

								#define MARK				 12

								#define LUMA_ITERS			  8

								#define YPITCH_ADJ			  4

								#define UVPITCH_ADJ			  0


									_asm {


									push	ebp

									push 	ebx

									push 	esi

									push 	edi

									sub 	esp, LOCALSIZE


								// assign (ebx, lpbiInput)

									mov		ebx, [esp + LPBI_INPUT]

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (edx, pitch)

									mov		ebp, ecx

									shr		ebp, 1

									sub		edx, ebp

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (edx, LumaIters)

									xor		edx, edx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

									lea		edx, [edx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = lpbiInput->biWidth - FrameWidth

								// assign (esi, width_adj)

									mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		esi, [esp + FRAME_WIDTH]

								// aspect = (width_adj ? LumaIters : 0)

								// assign (edi, aspect)

								// kill (edx, LumaIters)

									mov		[esp + LUMA_ITERS], edx

									xor		edi, edi

									test	esi, esi

									jz		L2

									mov		edi, edx

								// height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (edx, height_adj)

								L2:

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		edx, [esp + FRAME_HEIGHT]

									add		edx, edi

									shr		edx, 1

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	edx, edx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

									mov		ebp, 12

									sub		ebp, eax

									mov		[esp + MARK], ebp

								// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)

									mov		ebp, (LPBITMAPINFOHEADER)[ebx].biWidth

									add		ebp, [esp + FRAME_WIDTH]

									shl		ebp, 1

									neg		ebp

									mov		[esp + BACK_TWO_LINES], ebp

								// pnext = lpInput +

								//            ((lpbiInput->biWidth << 1) *

								//             ((FrameHeight - aspect - 1) + height_adj)) +

								//            width_adj

								// kill (ebx, lpbiInput)

								// kill (ecx, FrameWidth)

								// kill (edx, height_adj)

								// kill (esi, width_adj)

								// kill (edi, aspect)

								// assign (esi, pnext)

									mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

									shl		eax, 1

									mov		ebx, [esp + FRAME_HEIGHT]

									sub		ebx, edi

									dec		ebx

									add		ebx, edx

									imul	ebx

									add		esi, eax

									add		esi, [esp + LP_INPUT]

								// assign (edi, YPlane)

									mov		edi, [esp + YPLANE]

								// for (j = 0; j < LumaIters; j++)

									xor		eax, eax

									mov		[esp + LOOP_J], eax

								// for (k = 0; k < mark; k++)

								L4:

									xor		eax, eax

									mov		[esp + LOOP_K], eax

								// for (i = FrameWidth; i > 0; i -= 2, pnext += 4)

								L5:

									mov		eax, [esp + FRAME_WIDTH]

									mov		[esp + LOOP_I], eax

								// This jump is here to make sure the following loop starts on the U pipe

									jmp		L6

								L6:

								// tm = pnext[0]

								// t = ( BYUV[(tm>>14)&0x7C].YU +

								//       GYUV[(tm>>19)&0x7C].YU +

								//       RYUV[(tm>>24)&0x7C].YU )

								// *(YPlane+1) = (U8)((t>>8)+8)

								// t = ( BYUV[(tm<< 2)&0x7C].YU +

								//       GYUV[(tm>> 8)&0x7C].YU +

								//       RYUV[(tm>>13)&0x7C].YU )

								// *YPlane = (U8)((t>>8)+8)

								// YPlane += 2

								// assign(eax: B2/Y1/Y2/U)

								// assign(ebx: B1/V)

								// assign(ecx: G2/G1)

								// assign(edx: R2/R1)

								// assign(ebp: B1)

								// 1

									mov 	eax, [esi]

									nop

								// 2

									mov 	ebx, eax

									mov 	ecx, eax

								// 3

									shr 	eax, 14

									mov 	edx, ebx

								// 4

									shr 	ecx, 19

									and 	eax, 0x7C

								// 5

									shr 	edx, 24

									and 	ecx, 0x7C

								// 6

									mov 	eax, [BYUV+eax*8].YU

									and 	edx, 0x7C

								// 7

									add 	eax, [GYUV+ecx*8].YU

										mov	ecx,  ebx

								// 8

									add 	eax, [RYUV+edx*8].YU

										mov	edx,  ebx

								// 9

									sar 	eax, 8

										and	ebx,  0x1F

								// 10

										shl 	ebx, 2

									add 	eax, 8

								// 11

										shr 	ecx, 3

									mov	 	[edi + 1], al

								// 12

										shr 	edx, 8

										and 	ecx, 0x7C

								// 13

										mov	 	eax, [BYUV+ebx*8].YU

										and	 	edx, 0x7C

								// 14

										add	 	eax, [GYUV+ecx*8].YU

										mov	 	ebp, ebx

								// 15

										add	 	eax, [RYUV+edx*8].YU

										nop

								// 16

										sar	 	eax, 8

										mov 	ebx, [esp + LOOP_K]

								// 17

										add	 	eax, 8

										and		ebx, 1

								// 18

										mov 	[edi], al

										jnz 	L9


								// At this point, ebp: B1, ecx: G1, edx: R1

								// *UPlane++   = (U8)((t>>24)+64)

								// t   = ( VBGR[(t>>13)&0x7C].VR +

								//         VBGR[(t>> 8)&0x7C].VG +

								//         VBGR[(t<< 2)&0x7C].VB )

								// *VPlane++ = (U8)((t>>8)+64)

								// 19

									mov 	ebx, [RYUV+edx*8].V

									mov 	edx, [esp + UPLANE]

								// 20

									sar		eax, 16

									add 	ebx, [GYUV+ecx*8].V

								// 21

									add		eax, 64

									add 	ebx, [BYUV+ebp*8].V

								// 22

									mov		[edx], al

									inc		edx

								// 23

									mov 	[esp + UPLANE], edx

									mov 	edx, [esp + VPLANE]

								// 24

									sar 	ebx, 8

									inc		edx

								// 25

									add 	ebx, 64

									mov 	[esp + VPLANE], edx

								// 26

									mov		[edx - 1], bl

									nop

								L9:

								// 27

									mov		eax, [esp + LOOP_I]

									lea		esi, [esi + 4]

								// 28

									sub		eax, 2

									lea		edi, [edi + 2]

								// 29

									mov		[esp + LOOP_I], eax

									jnz		L6


								// if (stretch && (0 == k) && j)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L14

									mov		eax, [esp + LOOP_K]

									test	eax, eax

									jnz		L14

									mov 	eax, [esp + LOOP_J]

									test	eax, eax

									jz		L14


								// spill YPlane ptr

									mov		[esp + YPLANE], edi

									nop


								// for (i = FrameWidth; i > 0; i -= 8)

								// assign (ebx, pyprev)

								// assign (ecx, t)

								// assign (edx, pynext)

								// assign (edi, pyspace)

								// assign (ebp, i)


								// make sure offsets are such that there are no bank conflicts here

									mov 	ebx, [esp + PYPREV]

									mov 	edi, [esp + PYSPACE]


									mov 	edx, [esp + PYNEXT]

									mov 	ebp, [esp + FRAME_WIDTH]


								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								L15:

								// 1

									mov		eax, [ebx]

									lea		ebx, [ebx + 4]

								// 2

									mov		ecx, [edx]

									lea		edx, [edx + 4]

								// 3

									shr		ecx, 1

									and		eax, 0xFEFEFEFE

								// 4

									shr		eax, 1

									and		ecx, 0x7F7F7F7F

								// 5

									add		eax, ecx

									mov		ecx, [ebx]

								// 6

									shr		ecx, 1

									mov		[edi], eax

								// 7

									mov		eax, [edx]

									and		ecx, 0x7F7F7F7F

								// 8

									shr		eax, 1

									lea		edi, [edi + 4]

								// 9

									and		eax, 0x7F7F7F7F

									lea		ebx, [ebx + 4]

								// 10

									lea		edx, [edx + 4]

									add		eax, ecx

								// 11

									mov		[edi], eax

									lea		edi, [edi + 4]

								// 12

									sub		ebp, 8

									jnz		L15

								// kill (ebx, pyprev)

								// kill (ecx, t)

								// kill (edx, pynext)

								// kill (edi, pyspace)

								// kill (ebp, i)


								// restore YPlane

									mov		edi, [esp + YPLANE]


								// pnext += iBackTwoLines

								L14:

									add		esi, [esp + BACK_TWO_LINES]

								// YPlane += ypitch_adj;

									add		edi, [esp + YPITCH_ADJ]

								// if(0 == (k&1))

									mov		eax, [esp + LOOP_K]

									and		eax, 1

									jnz		L16

								// UPlane += uvpitch_adj;

								// VPlane += uvpitch_adj;

									mov		eax, [esp + UVPITCH_ADJ]

									add		[esp + UPLANE], eax

									add		[esp + VPLANE], eax


								L16:

									inc		DWORD PTR [esp + LOOP_K]

									mov		eax, [esp + LOOP_K]

									cmp		eax, [esp + MARK]

									jl		L5


								// if (stretch)

									cmp		DWORD PTR [esp + STRETCH], 0

									je	 	L17

								// pyprev = YPlane - pitch

									mov		eax, edi

									sub		eax, [esp + PITCH_PARM]

									mov		[esp + PYPREV], eax

								// pyspace = YPlane

									mov		[esp + PYSPACE], edi

								// pynext = (YPlane += pitch)

									add		edi, [esp + PITCH_PARM]

									mov		[esp + PYNEXT], edi


								L17:

									inc		DWORD PTR [esp + LOOP_J]

									mov		eax, [esp + LOOP_J]

									cmp		eax, [esp + LUMA_ITERS]

									jl		L4


								// kill (esi, pnext)

								// kill (edi, YPlane)

								// if (stretch)

									mov		esi, [esp + PYPREV]

									cmp		DWORD PTR [esp + STRETCH], 0

									je		L19


								// for (i = FrameWidth; i > 0; i -= 4)

								// assign (esi, pyprev)

								// assign (edi, pyspace)

								// assign (ebp, i)

									mov		ebp, [esp + FRAME_WIDTH]

									 mov	edi, [esp + PYSPACE]

								L18:

									mov		ecx, [esi]

									 lea	esi, [esi + 4]

									mov		[edi], ecx

									 lea	edi, [edi + 4]

									sub		ebp, 4

									 jnz	L18

								// kill (esi, pyprev)

								// kill (edi, pyspace)

								// kill (ebp, i)


								L19:

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef PYPREV

								#undef PYSPACE

								#undef PYNEXT

								#undef LOOP_I

								#undef LOOP_J

								#undef LOOP_K

								#undef BACK_TWO_LINES

								#undef STRETCH

								#undef MARK

								#undef LUMA_ITERS

								#undef YPITCH_ADJ

								#undef UVPITCH_ADJ


								/*****************************************************************************

								 *

								 *  H26X_CLUTtoYUV12()

								 *

								 *  Convert from CLUT8/CLUT4 to YUV12 (YCrCb 4:2:0) and copy to destination memory

								 *  with the row pitch given by the pitch parameter.

								 *

								 *	This is needed to support the quickcam.

								 */


								#if 0

								// C reference implementation of the palettized (CLUT8 or CLUT4) to planar
								// YUV12 conversion.
								//
								//   lpbiInput    bitmap header; the RGBQUAD colour table is read from the
								//                bytes immediately following it
								//   lpInput      packed palette indices, read from the last line upwards
								//   YPlane       destination luma plane (one byte per pixel)
								//   UPlane       destination U plane (written on even lines, every 2nd pixel)
								//   VPlane       destination V plane (written on even lines, every 2nd pixel)
								//   FrameWidth   output width in pixels
								//   FrameHeight  output height in lines
								//   pixel_bits   bits per input pixel; asserted to be 8 or 4
								//   pitch        distance in bytes between consecutive lines of each plane
								//
								// When the input is taller than the lines consumed, only 11 of every 12
								// output lines are converted; the 12th is synthesised as the average of its
								// neighbours (the last one is a copy of the line above it).
								_STATIC void C_H26X_CLUTtoYUV12(
									LPBITMAPINFOHEADER lpbiInput,
									U8 * lpInput,
									U8 * YPlane,
									U8 * UPlane,
									U8 * VPlane,
									UN  FrameWidth,
									UN  FrameHeight,
									UN	pixel_bits,
									const int pitch)
								{
									RGBQUAD *palette = (RGBQUAD *)((U8 *)lpbiInput + sizeof(BITMAPINFOHEADER));
									RGBQUAD *entry;
									U32 *src, *lineAbove, *lineGap, *lineBelow;
									U32 lumaBits, chromaBits;
									int acc;
									int col, group, row, word, pel;
									int srcStride, srcRewind;
									int doStretch, rowsPerGroup, aspectRows;
									int xOffset, yOffset;
									int lumaShift, chromaShift;
									int indexMask, wordsPerStep, pelsPerWord;
									int groupCount = 0;
									int lumaSkip = (pitch - FrameWidth);
									int chromaSkip = (pitch - (FrameWidth >> 1));

									ASSERT((8 == pixel_bits) || (4 == pixel_bits));

									// Four groups per 48 output lines (FrameHeight/12) without a divide.
									col = FrameHeight;
									while (col > 0) {
										groupCount += 4;
										col -= 48;
									}

									// Centre the output window inside the input bitmap.
									xOffset = ((lpbiInput->biWidth - FrameWidth) >> 1);
									aspectRows = 0;
									if (xOffset) {
										aspectRows = groupCount;
									}
									yOffset = ((lpbiInput->biHeight - (FrameHeight - aspectRows)) >> 1);
									doStretch = (0 != yOffset);
									rowsPerGroup = 12 - doStretch;

									// srcRewind is counted in U32 units because src is a U32 pointer.
									srcStride = lpbiInput->biWidth;
									srcRewind = -((srcStride + (int)FrameWidth) >> 2);

									if (8 != pixel_bits) {
										// Two pixels per byte: every byte-based quantity is halved.
										lumaShift = 4;
										chromaShift = 8;
										indexMask = 0xF;
										wordsPerStep = 1;
										pelsPerWord = 8;
										xOffset >>= 1;
										srcStride >>= 1;
										srcRewind >>= 1;
									} else {
										lumaShift = 8;
										chromaShift = 16;
										indexMask = 0xFF;
										wordsPerStep = 2;
										pelsPerWord = 4;
									}

									// The input image is stored upside down, so start at the beginning of
									// its last used line and walk towards the first.
									src = (U32 *)(lpInput +
												  (srcStride * ((FrameHeight - aspectRows - 1) + yOffset)) + xOffset);

									for (group = 0; group < groupCount; group++) {

										for (row = 0; row < rowsPerGroup; row++) {

											// Chroma is produced for even lines only.
											const int chromaRow = (0 == (row & 1));

											// Each step converts eight pixels.
											for (col = FrameWidth; col > 0; col -= 8) {

												for (word = 0; word < wordsPerStep; word++) {
													lumaBits = *src++;
													if (4 == pixel_bits) {
														// Swap the nibbles of every byte so that the
														// left-hand pixel sits in the low nibble.
														lumaBits = ( ((lumaBits >> 4) & 0x0F0F0F0F) |
																	 ((lumaBits << 4) & 0xF0F0F0F0) );
													}
													chromaBits = lumaBits;

													// One luma sample per pixel held in the word.
													for (pel = 0; pel < pelsPerWord; pel++) {
														entry = &palette[lumaBits&indexMask];
														acc = ( BYUV[entry->rgbBlue>>1].YU +
																GYUV[entry->rgbGreen>>1].YU +
																RYUV[entry->rgbRed>>1].YU );
														*YPlane++ = (U8)((acc>>8)+8);
														lumaBits >>= lumaShift;
													}

													// One U and one V sample per pair of pixels.
													if (chromaRow) {
														for (pel = 0; pel < pelsPerWord; pel += 2) {
															entry = &palette[chromaBits&indexMask];
															acc = ( BYUV[entry->rgbBlue>>1].YU +
																	RYUV[entry->rgbRed>>1].YU +
																	GYUV[entry->rgbGreen>>1].YU );
															*UPlane++ = (U8)((acc>>24)+64);
															acc = ( RYUV[entry->rgbRed>>1].V +
																	GYUV[entry->rgbGreen>>1].V +
																	BYUV[entry->rgbBlue>>1].V );
															*VPlane++ = (U8)((acc>>8)+64);
															chromaBits >>= chromaShift;
														}
													}
												}
											}

											// The first line of every group after the first completes the
											// gap left by the previous group: fill it with the byte-wise
											// average of the lines above and below, four bytes at a time.
											if (doStretch && (0 == row) && group) {
												for (col = FrameWidth; col > 0; col -= 8) {
													for (word = 0; word < 2; word++) {
														lumaBits = ((*lineAbove++ & 0xFEFEFEFE) >> 1);
														lumaBits += ((*lineBelow++ & 0xFEFEFEFE) >> 1);
														*lineGap++ = lumaBits;
													}
												}
											}

											src += srcRewind;
											YPlane += lumaSkip;
											if (chromaRow) {
												UPlane += chromaSkip;
												VPlane += chromaSkip;
											}
										}

										// Leave one luma line empty and remember it together with its
										// neighbours so it can be filled in afterwards.
										if (doStretch) {
											lineAbove = (U32 *)(YPlane - pitch);
											lineGap = (U32 *)YPlane;
											YPlane += pitch;
											lineBelow = (U32 *)YPlane;
										}
									}

									// The final gap has no line below it: duplicate the line above.
									if (doStretch) {
										for (col = FrameWidth; col > 0; col -= 4) {
											*lineGap++ = *lineAbove++;
										}
									}
								} // end of C_H26X_CLUTtoYUV12()

								#endif


								// IA_H26X_CLUT8toYUV12
								//
								// Naked inline-assembly conversion of an 8-bit palettized image to planar
								// YUV12. The routine builds its own frame: it pushes ebp/ebx/esi/edi,
								// reserves LOCALSIZE bytes of locals and addresses both locals and the
								// caller's arguments relative to esp (offsets are the #defines below).
								//
								// Each pass of the inner loop (L6) reads two palette indices from the
								// input, looks the colours up in the table that follows the bitmap header,
								// and stores two Y bytes; on even values of k it also stores one U and one
								// V byte computed from the first of the two pixels. Input lines are walked
								// from the last towards the first. When stretch is set, one luma line per
								// group is left out and later filled with the average of its neighbours
								// (L15); the final such line is a copy of the line above it (L18).
								__declspec(naked)

								_STATIC void IA_H26X_CLUT8toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * lpInput,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout (offsets from esp after the prologue)

								//	| pitch			|  +100

								//	| FrameHeight	|  + 96

								//	| FrameWidth	|  + 92

								//	| VPlane		|  + 88

								//	| UPlane		|  + 84

								//	| YPlane		|  + 80

								//	| lpInput		|  + 76

								//	| lpbiInput		|  + 72

								//	----------------------------

								//	| return addr	|  + 68

								//	| saved ebp		|  + 64

								//	| saved ebx		|  + 60

								//	| saved esi		|  + 56

								//	| saved edi		|  + 52


								//	| pyprev		|  + 48

								//	| pyspace		|  + 44

								//	| pynext		|  + 40

								//	| i				|  + 36

								//	| j				|  + 32

								//	| k				|  + 28

								//	| iBackTwoLines	|  + 24

								//	| stretch		|  + 20

								//	| mark			|  + 16

								//	| LumaIters		|  + 12

								//	| lpCTable		|  +  8

								//	| ypitch_adj	|  +  4

								//	| uvpitch_adj	|  +  0


								#define LOCALSIZE			 52


								#define PITCH_PARM			100

								#define FRAME_HEIGHT		 96

								#define FRAME_WIDTH			 92

								#define VPLANE				 88

								#define UPLANE				 84

								#define YPLANE				 80

								#define LP_INPUT			 76

								#define LPBI_INPUT			 72


								#define PYPREV				 48

								#define PYSPACE				 44

								#define PYNEXT				 40

								#define LOOP_I				 36

								#define LOOP_J				 32

								#define LOOP_K				 28

								#define BACK_TWO_LINES		 24

								#define STRETCH				 20

								#define MARK				 16

								#define LUMA_ITERS			 12

								#define LPCTABLE			  8

								#define YPITCH_ADJ			  4

								#define UVPITCH_ADJ			  0


									_asm {


								// Prologue: save the callee-save registers and make room for the locals.

									push	ebp

									push 	ebx

									push 	esi

									push 	edi

									sub 	esp, LOCALSIZE


								// lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)

								// assign (ebx, lpbiInput)

									mov		eax, [esp + LPBI_INPUT]

									mov		ebx, eax

									add		eax, TYPE BITMAPINFOHEADER

									mov		[esp + LPCTABLE], eax

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (ecx, FrameWidth)

								// kill (edx, pitch)

									shr		ecx, 1

									sub		edx, ecx

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (ecx, LumaIters)

								// NOTE(review): unlike the "i > 0" test in the pseudo-code above, this
								// loop always runs at least once and leaves only when eax reaches exactly
								// zero, so it presumably relies on FrameHeight being a non-zero multiple
								// of 48 - confirm against the callers.

									xor		ecx, ecx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

									lea		ecx, [ecx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1

								// assign (edx, width_adj)

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		edx, [esp + FRAME_WIDTH]

									shr		edx, 1

								// aspect = (width_adj ? LumaIters : 0)

								// assign (esi, aspect)

								// kill (ecx, LumaIters)

									mov		[esp + LUMA_ITERS], ecx

									xor		esi, esi

									test	edx, edx

									jz		L2

									mov		esi, ecx

								// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (ecx, height_adj)

								L2:

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		ecx, [esp + FRAME_HEIGHT]

									add		ecx, esi

									shr		ecx, 1

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	ecx, ecx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

									mov		edi, 12

									sub		edi, eax

									mov		[esp + MARK], edi

								// iNextLine = lpbiInput->biWidth

								// kill (ebx, lpbiInput)

								// assign (ebx, iNextLine)

									mov		ebx, (LPBITMAPINFOHEADER)[ebx].biWidth

								// iBackTwoLines = -(iNextLine + FrameWidth)

								// (added to pnext after a line has been consumed, this steps back to
								// the start of the preceding input line)

									mov		edi, [esp + FRAME_WIDTH]

									add		edi, ebx

									neg		edi

									mov		[esp + BACK_TWO_LINES], edi

								// pnext = lpInput +

								//            (iNextLine*((FrameHeight-aspect-1) + height_adj)) +

								//            width_adj

								// kill (ebx, iNextLine)

								// kill (ecx, height_adj)

								// kill (edx, width_adj)

								// kill (esi, aspect)

								// assign (esi, pnext)

									mov		eax, [esp + FRAME_HEIGHT]

									sub		eax, esi

									dec		eax

									add		eax, ecx

									mov		esi, [esp + LP_INPUT]

									add		esi, edx

								// One-operand imul: edx:eax = eax * ebx. edx (width_adj) has already
								// been added to esi above, so overwriting it here is harmless.

									imul	ebx

									add		esi, eax

								// assign (edi, YPlane)

									mov		edi, [esp + YPLANE]

								// for (j = 0; j < LumaIters; j++)

									xor		eax, eax

									mov		[esp + LOOP_J], eax

								// for (k = 0; k < mark; k++)

								L4:

									xor		eax, eax

									mov		[esp + LOOP_K], eax

								// for (i = FrameWidth; i > 0; i -= 2, pnext += 2)

								L5:

									mov		eax, [esp + FRAME_WIDTH]

									mov		[esp + LOOP_I], eax

								// This jump is here to make sure the following loop starts on the U pipe

									jmp		L6

								L6:

								// lpCEntry = &lpCTable[*(pnext+1)]

								// t = (  BYUV[lpCEntry->rgbBlue>>1].YU +

								//        GYUV[lpCEntry->rgbGreen>>1].YU +

								//        RYUV[lpCEntry->rgbRed>>1].YU )

								// *(YPlane+1) = (U8)((t>>8)+8)

								// lpCEntry = &lpCTable[*pnext]

								// t = (  BYUV[lpCEntry->rgbBlue>>1].YU +

								//        GYUV[lpCEntry->rgbGreen>>1].YU +

								//        RYUV[lpCEntry->rgbRed>>1].YU )

								// *YPlane = (U8)((t>>8)+8)

								// YPlane += 2

								// *UPlane++ = (U8)((t>>24)+64)

								// t = (  RYUV[lpCEntry->rgbRed>>1].V +

								//        GYUV[lpCEntry->rgbGreen>>1].V +

								//        BYUV[lpCEntry->rgbBlue>>1].V )

								// *VPlane++ = (U8)((t>>8)+64)

								// assign (ebp: lpCEntry,B1)

								// assign (eax: P2,B2,Y2,Y1,U)

								// assign (ebx: B1,V)

								// assign (ecx: G2,G1)

								// assign (edx: R2,R1)

								// The numbered comments below mark instruction pairs (presumably one
								// per cycle); the nops appear to be pairing fillers. The extra-indented
								// instructions belong to the first pixel (*pnext) and are interleaved
								// with the work on the second pixel (*(pnext+1)).
								// A colour byte masked with 0xFE and scaled by 4 equals (colour>>1)*8,
								// i.e. the table index of the pseudo-code times 8 (presumably the size
								// of one table entry - confirm against the table declaration).

								// 1

									xor		eax, eax

									mov		ebp, [esp + LPCTABLE]

								// 2

									mov		al, [esi + 1]

									xor		ecx, ecx

								// 3

									lea		ebx, [ebp+eax*4]

									xor		edx, edx

								// 4

									mov		al, (LPRGBQUAD)[ebx].rgbBlue

									nop

								// 5

									mov		cl, (LPRGBQUAD)[ebx].rgbGreen

									and		al, 0xFE

								// 6

									mov		dl, (LPRGBQUAD)[ebx].rgbRed

									and		cl, 0xFE

								// 7

									mov		eax, [BYUV+eax*4].YU

									and		dl, 0xFE

								// 8

									add		eax, [GYUV+ecx*4].YU

										xor		ebx, ebx

								// 9

									add		eax, [RYUV+edx*4].YU

										mov		bl, [esi]

								// 10

									sar		eax, 8

										lea		ebp, [ebp+ebx*4]

								// 11

									add		eax, 8

									nop

								// 12

									mov		[edi + 1], al

										mov		bl, (LPRGBQUAD)[ebp].rgbBlue

								// 13

										mov		cl, (LPRGBQUAD)[ebp].rgbGreen

										and		bl, 0xFE

								// 14

										mov		dl, (LPRGBQUAD)[ebp].rgbRed

										and		cl, 0xFE

								// 15

										mov		eax, [BYUV+ebx*4].YU

										and		dl, 0xFE

								// 16

										add		eax, [GYUV+ecx*4].YU

										mov		ebp, ebx

								// 17

										add		eax, [RYUV+edx*4].YU

										nop

								// 18

										sar		eax, 8

									mov		ebx, [esp + LOOP_K]

								// 19

										add		eax, 8

									and		ebx, 1

								// 20

								// (the mov does not touch the flags; jnz tests the result of "and ebx, 1"
								// and skips the chroma output on odd k)

										mov		[edi], al

									jnz		L9

								// At this point, ebp: B1, ecx: G1, edx: R1 (masked colour bytes of the
								// first pixel) and eax still holds (t>>8)+8 for that pixel, so a further
								// shift by 16 brings the upper byte of the YU sum down for U.

								// 21

									mov 	ebx, [RYUV+edx*4].V

									mov 	edx, [esp + UPLANE]

								// 22

									sar		eax, 16

									add 	ebx, [GYUV+ecx*4].V

								// 23

									add		eax, 64

									add 	ebx, [BYUV+ebp*4].V

								// 24

									mov		[edx], al

									inc		edx

								// 25

									mov 	[esp + UPLANE], edx

									mov 	edx, [esp + VPLANE]

								// 26

									sar 	ebx, 8

									inc		edx

								// 27

									add 	ebx, 64

									mov 	[esp + VPLANE], edx

								// 28

									mov		[edx - 1], bl

									nop

								L9:

								// 29

									mov		eax, [esp + LOOP_I]

									lea		esi, [esi + 2]

								// 30

									sub		eax, 2

									lea		edi, [edi + 2]

								// 31

								// (lea and mov leave the flags alone; jnz tests the result of "sub eax, 2")

									mov		[esp + LOOP_I], eax

									jnz		L6


								// only esi (pnext) and edi (YPlane) are live at this point (after line loop)

								// if (stretch && (0 == k) && j)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L14

									mov		eax, [esp + LOOP_K]

									test	eax, eax

									jnz		L14

									mov 	eax, [esp + LOOP_J]

									test	eax, eax

									jz		L14


								// spill YPlane ptr (the YPlane argument slot is reused as scratch storage)

									mov		[esp + YPLANE], edi

									nop


								// for (i = FrameWidth; i > 0; i -= 8)

								// assign (ebx, pyprev)

								// assign (ecx, t)

								// assign (edx, pynext)

								// assign (edi, pyspace)

								// assign (ebp, i)


								// make sure offsets are such that there are no bank conflicts here

									mov 	ebx, [esp + PYPREV]

									mov 	edi, [esp + PYSPACE]


									mov 	edx, [esp + PYNEXT]

									mov 	ebp, [esp + FRAME_WIDTH]


								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								// ("shr x, 1" followed by "and x, 0x7F7F7F7F" gives the same result as
								// "(x & 0xFEFEFEFE) >> 1": each byte is halved without borrowing a bit
								// from its neighbour)

								L15:

								// 1

									mov		eax, [ebx]

									lea		ebx, [ebx + 4]

								// 2

									mov		ecx, [edx]

									lea		edx, [edx + 4]

								// 3

									shr		ecx, 1

									and		eax, 0xFEFEFEFE

								// 4

									shr		eax, 1

									and		ecx, 0x7F7F7F7F

								// 5

									add		eax, ecx

									mov		ecx, [ebx]

								// 6

									shr		ecx, 1

									mov		[edi], eax

								// 7

									mov		eax, [edx]

									and		ecx, 0x7F7F7F7F

								// 8

									shr		eax, 1

									lea		edi, [edi + 4]

								// 9

									and		eax, 0x7F7F7F7F

									lea		ebx, [ebx + 4]

								// 10

									lea		edx, [edx + 4]

									add		eax, ecx

								// 11

									mov		[edi], eax

									lea		edi, [edi + 4]

								// 12

									sub		ebp, 8

									jnz		L15

								// kill (ebx, pyprev)

								// kill (ecx, t)

								// kill (edx, pynext)

								// kill (edi, pyspace)

								// kill (ebp, i)


								// restore YPlane

									mov		edi, [esp + YPLANE]


								// pnext += iBackTwoLines

								L14:

									add		esi, [esp + BACK_TWO_LINES]

								// YPlane += ypitch_adj;

									add		edi, [esp + YPITCH_ADJ]

								// if(0 == (k&1))

									mov		eax, [esp + LOOP_K]

									and		eax, 1

									jnz		L16

								// UPlane += uvpitch_adj;

								// VPlane += uvpitch_adj;

									mov		eax, [esp + UVPITCH_ADJ]

									add		[esp + UPLANE], eax

									add		[esp + VPLANE], eax


								// k++; repeat the line loop while k < mark

								L16:

									inc		DWORD PTR [esp + LOOP_K]

									mov		eax, [esp + LOOP_K]

									cmp		eax, [esp + MARK]

									jl		L5


								// if (stretch)

									cmp		DWORD PTR [esp + STRETCH], 0

									je	 	L17

								// pyprev = YPlane - pitch

									mov		eax, edi

									sub		eax, [esp + PITCH_PARM]

									mov		[esp + PYPREV], eax

								// pyspace = YPlane

									mov		[esp + PYSPACE], edi

								// pynext = (YPlane += pitch)

									add		edi, [esp + PITCH_PARM]

									mov		[esp + PYNEXT], edi


								// j++; repeat the group loop while j < LumaIters

								L17:

									inc		DWORD PTR [esp + LOOP_J]

									mov		eax, [esp + LOOP_J]

									cmp		eax, [esp + LUMA_ITERS]

									jl		L4


								// kill (esi, pnext)

								// kill (edi, YPlane)

								// if (stretch)

									mov		esi, [esp + PYPREV]

									cmp		DWORD PTR [esp + STRETCH], 0

									je		L19


								// for (i = FrameWidth; i > 0; i -= 4)

								//     *pyspace++ = *pyprev++   (copy the line above into the last gap)

								// assign (esi, pyprev)

								// assign (edi, pyspace)

								// assign (ebp, i)

									mov		ebp, [esp + FRAME_WIDTH]

									 mov	edi, [esp + PYSPACE]

								L18:

									mov		ecx, [esi]

									 lea	esi, [esi + 4]

									mov		[edi], ecx

									 lea	edi, [edi + 4]

									sub		ebp, 4

									 jnz	L18

								// kill (esi, pyprev)

								// kill (edi, pyspace)

								// kill (ebp, i)


								// Epilogue: release the locals and restore the callee-save registers.

								L19:

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef PYPREV

								#undef PYSPACE

								#undef PYNEXT

								#undef LOOP_I

								#undef LOOP_J

								#undef LOOP_K

								#undef BACK_TWO_LINES

								#undef STRETCH

								#undef MARK

								#undef LUMA_ITERS

								#undef LPCTABLE

								#undef YPITCH_ADJ

								#undef UVPITCH_ADJ


								__declspec(naked)

								_STATIC void IA_H26X_CLUT4toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * lpInput,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout

								//	| pitch			|  +100

								//	| FrameHeight	|  + 96

								//	| FrameWidth	|  + 92

								//	| VPlane		|  + 88

								//	| UPlane		|  + 84

								//	| YPlane		|  + 80

								//	| lpInput		|  + 76

								//	| lpbiInput		|  + 72

								//	----------------------------

								//	| return addr	|  + 68

								//	| saved ebp		|  + 64

								//	| saved ebx		|  + 60

								//	| saved esi		|  + 56

								//	| saved edi		|  + 52


								//	| pyprev		|  + 48

								//	| pyspace		|  + 44

								//	| pynext		|  + 40

								//	| i				|  + 36

								//	| j				|  + 32

								//	| k				|  + 28

								//	| iBackTwoLines	|  + 24

								//	| stretch		|  + 20

								//	| mark			|  + 16

								//	| LumaIters		|  + 12

								//	| lpCTable		|  +  8

								//	| ypitch_adj	|  +  4

								//	| uvpitch_adj	|  +  0


								#define LOCALSIZE			 52


								#define PITCH_PARM			100

								#define FRAME_HEIGHT		 96

								#define FRAME_WIDTH			 92

								#define VPLANE				 88

								#define UPLANE				 84

								#define YPLANE				 80

								#define LP_INPUT			 76

								#define LPBI_INPUT			 72


								#define PYPREV				 48

								#define PYSPACE				 44

								#define PYNEXT				 40

								#define LOOP_I				 36

								#define LOOP_J				 32

								#define LOOP_K				 28

								#define BACK_TWO_LINES		 24

								#define STRETCH				 20

								#define MARK				 16

								#define LUMA_ITERS			 12

								#define LPCTABLE			  8

								#define YPITCH_ADJ			  4

								#define UVPITCH_ADJ			  0


									_asm {


									push	ebp

									push 	ebx

									push 	esi

									push 	edi

									sub 	esp, LOCALSIZE


								// lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)

								// assign (ebx, lpbiInput)

									mov		eax, [esp + LPBI_INPUT]

									mov		ebx, eax

									add		eax, TYPE BITMAPINFOHEADER

									mov		[esp + LPCTABLE], eax

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (ecx, FrameWidth)

								// kill (edx, pitch)

									shr		ecx, 1

									sub		edx, ecx

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (ecx, LumaIters)

									xor		ecx, ecx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

									lea		ecx, [ecx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = ((lpbiInput->biWidth - FrameWidth) >> 2

								// assign (edx, width_adj)

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		edx, [esp + FRAME_WIDTH]

									shr		edx, 2

								// aspect = (width_adj ? LumaIters : 0)

								// assign (esi, aspect)

								// kill (ecx, LumaIters)

									mov		[esp + LUMA_ITERS], ecx

									xor		esi, esi

									test	edx, edx

									jz		L2

									mov		esi, ecx

								// height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (ecx, height_adj)

								L2:

									mov		ecx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		ecx, [esp + FRAME_HEIGHT]

									add		ecx, esi

									shr		ecx, 1

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	ecx, ecx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

									mov		edi, 12

									sub		edi, eax

									mov		[esp + MARK], edi

								// iNextLine = lpbiInput->biWidth >> 1

								// kill (ebx, lpbiInput)

								// assign (ebx, iNextLine)

									mov		ebx, (LPBITMAPINFOHEADER)[ebx].biWidth

									shr		ebx, 1

								// iBackTwoLines = -(iNextline + (FrameWidth >> 1))

									mov		edi, [esp + FRAME_WIDTH]

									shr		edi, 1

									add		edi, ebx

									neg		edi

									mov		[esp + BACK_TWO_LINES], edi

								// pnext = lpInput+(iNextLine*((FrameHeight-aspect-1)+height_adj))+ width_adj

								// kill (ebx, iNextLine)

								// kill (ecx, height_adj)

								// kill (edx, width_adj)

								// kill (esi, aspect)

								// assign (esi, pnext)

									mov		eax, [esp + FRAME_HEIGHT]

									sub		eax, esi

									dec		eax

									add		eax, ecx

									mov		esi, [esp + LP_INPUT]

									add		esi, edx

									imul	ebx

									add		esi, eax

								// assign (edi, YPlane)

									mov		edi, [esp + YPLANE]

								// for (j = 0; j < LumaIters; j++)

									xor		eax, eax

									mov		[esp + LOOP_J], eax

								// for (k = 0; k < mark; k++)

								L4:

									xor		eax, eax

									mov		[esp + LOOP_K], eax

								// for (i = FrameWidth; i > 0; i -= 2, pnext++)

								L5:

									mov		eax, [esp + FRAME_WIDTH]

									mov		[esp + LOOP_I], eax

								// This jump is here to make sure the following loop starts on the U pipe

									jmp		L6

								L6:

								// lpCEntry = &lpCTable[*pnext&0xF]

								// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +

								//       GYUV[lpCEntry->rgbGreen>>1].YU +

								//       RYUV[lpCEntry->rgbRed>>1].YU )

								// *(YPlane+1) = (U8)((t>>8)+8)

								// lpCEntry = &lpCTable[(*pnext>>4)&0xF]

								// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +

								//       GYUV[lpCEntry->rgbGreen>>1].YU +

								//       RYUV[lpCEntry->rgbRed>>1].YU )

								// *YPlane = (U8)((t>>8)+8)

								// YPlane += 2

								// *UPlane++ = (U8)((t>>24)+64)

								// t = ( RYUV[lpCEntry->rgbRed>>1].V +

								//       GYUV[lpCEntry->rgbGreen>>1].V +

								//       BYUV[lpCEntry->rgbBlue>>1].V )

								// *VPlane++ = (U8)((t>>8)+64)

								// assign (ebp: lpCEntry,B1)

								// assign (eax: P2,B2,Y2,Y1,U)

								// assign (ebx: B1,V)

								// assign (ecx: G2,G1)

								// assign (edx: R2,R1)

								// 1

									mov		al, [esi]

									mov		ebp, [esp + LPCTABLE]

								// 2

									and		eax, 0xF

									xor		ecx, ecx

								// 3

									lea		ebx, [ebp+eax*4]

									xor		edx, edx

								// 4

									mov		al, (LPRGBQUAD)[ebx].rgbBlue

									nop

								// 5

									mov		cl, (LPRGBQUAD)[ebx].rgbGreen

									and		al, 0xFE

								// 6

									mov		dl, (LPRGBQUAD)[ebx].rgbRed

									and		cl, 0xFE

								// 7

									mov		eax, [BYUV+eax*4].YU

									and		dl, 0xFE

								// 8

									add		eax, [GYUV+ecx*4].YU

									mov		bl, [esi]

								// 9

									add		eax, [RYUV+edx*4].YU

									and		ebx, 0xF0

								//

									shr		ebx, 4

									nop

								// 10

									shr		eax, 8

									lea		ebp, [ebp+ebx*4]

								// 11

									add		eax, 8

									nop

								// 12

									mov		[edi + 1], al

									mov		bl, (LPRGBQUAD)[ebp].rgbBlue

								// 13

									mov		cl, (LPRGBQUAD)[ebp].rgbGreen

									and		bl, 0xFE

								// 14

									mov		dl, (LPRGBQUAD)[ebp].rgbRed

									and		cl, 0xFE

								// 15

									mov		eax, [BYUV+ebx*4].YU

									and		dl, 0xFE

								// 16

									add		eax, [GYUV+ecx*4].YU

									mov		ebp, ebx

								// 17

									add		eax, [RYUV+edx*4].YU

									nop

								// 18

									shr		eax, 8

									mov		ebx, [esp + LOOP_K]

								// 19

									add		eax, 8

									and		ebx, 1

								// 20

									mov		[edi], al

									jnz		L9

								// 21

									mov 	ebx, [RYUV+edx*4].V

									mov 	edx, [esp + UPLANE]

								// 22

									sar		eax, 16

									add 	ebx, [GYUV+ecx*4].V

								// 23

									add		eax, 64

									add 	ebx, [BYUV+ebp*4].V

								// 24

									mov		[edx], al

									inc		edx

								// 25

									mov 	[esp + UPLANE], edx

									mov 	edx, [esp + VPLANE]

								// 26

									sar 	ebx, 8

									inc		edx

								// 27

									add 	ebx, 64

									mov 	[esp + VPLANE], edx

								// 28

									mov		[edx - 1], bl

									nop

								L9:

								// 32

									mov		eax, [esp + LOOP_I]

									lea		esi, [esi + 1]

								// 33

									sub		eax, 2

									lea		edi, [edi + 2]

								// 34

									mov		[esp + LOOP_I], eax

									jnz		L6


								// only esi (pnext) is live at this point (after line loop)

								// if (stretch && (0 == k) && j)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L14

									mov		eax, [esp + LOOP_K]

									test	eax, eax

									jnz		L14

									mov 	eax, [esp + LOOP_J]

									test	eax, eax

									jz		L14


								// spill YPlane ptr

									mov		[esp + YPLANE], edi

									nop


								// for (i = FrameWidth; i > 0; i -= 8)

								// assign (ebx, pyprev)

								// assign (ecx, t)

								// assign (edx, pynext)

								// assign (edi, pyspace)

								// assign (ebp, i)


								// make sure offsets are such that there are no bank conflicts here

									mov 	ebx, [esp + PYPREV]

									mov 	edi, [esp + PYSPACE]


									mov 	edx, [esp + PYNEXT]

									mov 	ebp, [esp + FRAME_WIDTH]


								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								// t = (*pyprev++ & 0xFEFEFEFE) >> 1

								// t += (*pynext++ & 0xFEFEFEFE) >> 1

								// *pyspace++ = t

								L15:

								// 1

									mov		eax, [ebx]

									lea		ebx, [ebx + 4]

								// 2

									mov		ecx, [edx]

									lea		edx, [edx + 4]

								// 3

									shr		ecx, 1

									and		eax, 0xFEFEFEFE

								// 4

									shr		eax, 1

									and		ecx, 0x7F7F7F7F

								// 5

									add		eax, ecx

									mov		ecx, [ebx]

								// 6

									shr		ecx, 1

									mov		[edi], eax

								// 7

									mov		eax, [edx]

									and		ecx, 0x7F7F7F7F

								// 8

									shr		eax, 1

									lea		edi, [edi + 4]

								// 9

									and		eax, 0x7F7F7F7F

									lea		ebx, [ebx + 4]

								// 10

									lea		edx, [edx + 4]

									add		eax, ecx

								// 11

									mov		[edi], eax

									lea		edi, [edi + 4]

								// 12

									sub		ebp, 8

									jnz		L15

								// kill (ebx, pyprev)

								// kill (ecx, t)

								// kill (edx, pynext)

								// kill (edi, pyspace)

								// kill (ebp, i)


								// restore YPlane

									mov		edi, [esp + YPLANE]


								// pnext += iBackTwoLines

								L14:

									add		esi, [esp + BACK_TWO_LINES]

								// YPlane += ypitch_adj;

									add		edi, [esp + YPITCH_ADJ]

								// if(0 == (k&1))

									mov		eax, [esp + LOOP_K]

									and		eax, 1

									jnz		L16

								// UPlane += uvpitch_adj;

								// VPlane += uvpitch_adj;

									mov		eax, [esp + UVPITCH_ADJ]

									add		[esp + UPLANE], eax

									add		[esp + VPLANE], eax


								L16:

									inc		DWORD PTR [esp + LOOP_K]

									mov		eax, [esp + LOOP_K]

									cmp		eax, [esp + MARK]

									jl		L5


								// if (stretch)

									cmp		DWORD PTR [esp + STRETCH], 0

									je	 	L17

								// pyprev = YPlane - pitch

									mov		eax, edi

									sub		eax, [esp + PITCH_PARM]

									mov		[esp + PYPREV], eax

								// pyspace = YPlane

									mov		[esp + PYSPACE], edi

								// pynext = (YPlane += pitch)

									add		edi, [esp + PITCH_PARM]

									mov		[esp + PYNEXT], edi


								L17:

									inc		DWORD PTR [esp + LOOP_J]

									mov		eax, [esp + LOOP_J]

									cmp		eax, [esp + LUMA_ITERS]

									jl		L4


								// kill(esi, pnext)

								// if (stretch)

									mov		esi, [esp + PYPREV]

									cmp		DWORD PTR [esp + STRETCH], 0

									je		L19


								// for (i = FrameWidth; i > 0; i -= 4)

								// assign (esi, pyprev)

								// assign (edi, pyspace)

								// assign (ebp, i)

									mov		edi, [esp + PYSPACE]

									 mov	ebp, [esp + FRAME_WIDTH]

								L18:

									mov		ecx, [esi]

									 lea	esi, [esi + 4]

									mov		[edi], ecx

									 lea	edi, [edi + 4]

									sub		ebp, 4

									 jnz	L18

								// kill (esi, pyprev)

								// kill (edi, pyspace)

								// kill (ebp, i)


								L19:

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								// Retire the macros used by the routine that ends just above (its assembly

								// addresses its locals and parameters as [esp + NAME]).  Several of these

								// names (LOCALSIZE, PITCH_PARM, STRETCH, MARK, LOOP_J, LOOP_K, ...) are

								// #defined again with different values by the routines further down.

								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef PYPREV

								#undef PYSPACE

								#undef PYNEXT

								#undef LOOP_I

								#undef LOOP_J

								#undef LOOP_K

								#undef BACK_TWO_LINES

								#undef STRETCH

								#undef MARK

								#undef LUMA_ITERS

								#undef LPCTABLE

								#undef YPITCH_ADJ

								#undef UVPITCH_ADJ


								/***************************************************

								 * H26X_YVU9toYUV12()

								 *  Convert from YVU9 to YUV12

								 *  and copy to destination memory using the row pitch

								 *  passed in through the 'pitch' parameter.

								 *

								 * uv_plane_common()

								 *  Helper function to convert V and U plane information.

								 *  Since the process is similar for both planes, the

								 *  conversion code was included in this subroutine.

								 *

								 ***************************************************/


								#if 0

								// Packed-byte helpers for the C reference converters below.  Every macro
								// works on four (or eight) pixels at once, one pixel per byte.
								//
								// NOTE: arguments are expanded more than once; pass plain variables only.

								// lvalue = dword at 'addr' with every byte halved (low bit of each byte is
								// masked off first so nothing leaks into the neighbouring byte).
								#define READ_DWORD_AND_SHIFT(lvalue,addr) \
									((lvalue) = *((unsigned int *)(addr)), (lvalue) &= 0xFEFEFEFE, (lvalue) >>= 1)

								// Same as READ_DWORD_AND_SHIFT, but on eight bytes.
								#define READ_QWORD_AND_SHIFT(lvalue,addr) \
									((lvalue) = *((unsigned __int64 *)(addr)), (lvalue) &= 0xFEFEFEFEFEFEFEFE, (lvalue) >>= 1)

								// Store a 32-bit value at 'addr'.
								#define WRITE_DWORD(addr,value)  (*(unsigned int *)(addr) = (value))

								// Store a 64-bit value at 'addr'.
								#define WRITE_QWORD(addr,value)  (*(unsigned __int64 *)(addr) = (value))

								// result = per-byte (a + b) / 2.  Both inputs must already be 7-bit per byte
								// (e.g. produced by READ_DWORD_AND_SHIFT) so the byte sums cannot carry.
								#define AVERAGE_DWORDS(result,a,b) \
									((result) = (((a) + (b)) & 0xFEFEFEFE) >> 1)

								// Write bytes 0 and 1 of 'bits', each one doubled, to 'addr':
								//   b1 b1 b0 b0   (most significant byte first)
								#define DUP_LOWER_TWO_BYTES(addr,bits) \
									(*((unsigned int *)(addr)) = ((((bits) & 0xFFu) * 0x0101u) | \
									                              ((((bits) >> 8) & 0xFFu) * 0x01010000u)))

								// Write bytes 2 and 3 of 'bits', each one doubled, to 'addr':
								//   b3 b3 b2 b2   (most significant byte first)
								#define DUP_UPPER_TWO_BYTES(addr,bits) \
									(*((unsigned int *)(addr)) = (((((bits) >> 16) & 0xFFu) * 0x0101u) | \
									                              ((((bits) >> 24) & 0xFFu) * 0x01010000u)))


								// C reference for the chroma half of the YVU9 -> YUV12 conversion.
								//
								// Each group of four source bytes is halved per byte and every byte is written
								// twice, giving eight destination bytes.  Every source row yields two
								// destination rows: a straight copy, and the average of this source row with
								// the next one.
								//
								//   psrc             first source byte of the plane to convert
								//   Plane            first destination byte
								//   pitch            destination row pitch in bytes
								//   OutputFrameWidth bytes written per destination row (stepped 8 at a time)
								//   ChromaIters      number of outer iterations
								//   spitch_adj       bytes skipped at the end of each source row; a non-zero
								//                    value also selects "stretch" mode, in which an additional
								//                    interpolated row is appended to every outer iteration
								_STATIC void C_uv_plane_common(
									U8 *psrc,
									U8 *Plane,
									UN pitch,
									UN OutputFrameWidth,
									UN ChromaIters,
									UN spitch_adj) {

									const int stretch = (spitch_adj != 0);
									const int mark = 6 - stretch;
									const int dpitch_adj = pitch - OutputFrameWidth;
									U8 *pnext = psrc + (OutputFrameWidth>>1) + spitch_adj;
									U8 *row_copy = Plane;
									U8 *row_avg = Plane + pitch;
									int flag = stretch;
									int col, iter, line;
									UN cur, below;

									iter = ChromaIters;
									while (iter > 0) {
										// In stretch mode the row count alternates between mark and mark + 1.
										line = mark + (flag & 1);
										while (line > 0) {
											// Very last source row when not stretching: there is no row
											// below it, so average the row with itself.
											if (!stretch && (iter == 1) && (line == 1)) {
												pnext = psrc;
											}
											for (col = OutputFrameWidth; col > 0; col -= 8) {
												READ_DWORD_AND_SHIFT(cur, psrc);
												DUP_LOWER_TWO_BYTES(row_copy, cur);
												DUP_UPPER_TWO_BYTES((row_copy + 4), cur);
												READ_DWORD_AND_SHIFT(below, pnext);
												AVERAGE_DWORDS(cur, cur, below);
												DUP_LOWER_TWO_BYTES(row_avg, cur);
												DUP_UPPER_TWO_BYTES((row_avg + 4), cur);
												psrc += 4;
												pnext += 4;
												row_copy += 8;
												row_avg += 8;
											}
											psrc += spitch_adj;
											pnext += spitch_adj;
											row_copy = row_avg + dpitch_adj;
											row_avg = row_copy + pitch;
											line--;
										}
										if (stretch) {
											// Step back one source row and emit a single extra row that is
											// weighted towards the lower of the two source rows.
											psrc -= ((OutputFrameWidth>>1) + spitch_adj);
											pnext -= ((OutputFrameWidth>>1) + spitch_adj);
											row_avg = row_copy;
											for (col = OutputFrameWidth; col > 0; col -= 8) {
												READ_DWORD_AND_SHIFT(cur, psrc);
												READ_DWORD_AND_SHIFT(below, pnext);
												AVERAGE_DWORDS(cur, cur, below);
												AVERAGE_DWORDS(cur, cur, below);
												DUP_LOWER_TWO_BYTES(row_avg, cur);
												DUP_UPPER_TWO_BYTES((row_avg + 4), cur);
												psrc += 4;
												pnext += 4;
												row_avg += 8;
											}
											psrc += spitch_adj;
											pnext += spitch_adj;
											row_copy = row_avg + dpitch_adj;
											row_avg = row_copy + pitch;
											flag++;
										}
										iter--;
									}
								}


								// C reference for the YVU9 -> YUV12 input conversion.
								//
								// Luma is copied four bytes at a time with every byte halved.  When the input
								// is taller than the area that is copied (height_adj != 0), every group of
								// 'mark' copied rows is followed by one row interpolated from the two source
								// rows around the current position.  The two chroma planes are then handed to
								// C_uv_plane_common(): the plane right after the luma data is used for VPlane,
								// the plane after that for UPlane.
								//
								//   lpbiInput    input bitmap header (biWidth / biHeight are read)
								//   lpInput      input pixel data
								//   YPlane/UPlane/VPlane  destination planes
								//   FrameWidth   bytes written per luma row (stepped 4 at a time)
								//   FrameHeight  consumed in steps of 48 rows to derive the iteration count
								//   pitch        destination row pitch in bytes
								_STATIC void C_H26X_YVU9toYUV12(
									LPBITMAPINFOHEADER lpbiInput,
									U8 *lpInput,
									U8 *YPlane,
									U8 *UPlane,
									U8 *VPlane,
									UN FrameWidth,
									UN FrameHeight,
									const int pitch) {

									U8  *src_row, *row_above, *row_below;
									U8  *v_src, *u_src;
									int width_adj, height_adj;
									int stretch, mark, aspect;
									int iNextLine;
									int col, group, line, t;
									int LumaIters = 0;
									int ypitch_adj = pitch - FrameWidth;

									// Four iterations for every 48 rows of output.
									col = FrameHeight;
									while (col > 0) {
										LumaIters += 4;
										col -= 48;
									}

									width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
									aspect = (width_adj ? LumaIters : 0);
									height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
									stretch = (height_adj ? 1 : 0);
									mark = 12 - stretch;
									iNextLine = width_adj << 1;

									// Start inside the input so that the copied window is centred.
									src_row = lpInput + (lpbiInput->biWidth * height_adj) + width_adj;

									for (group = LumaIters; group > 0; group--) {
										for (line = mark; line > 0; line--) {
											for (col = FrameWidth; col > 0; col -= 4) {
												*(U32 *)YPlane = (*(U32 *)src_row & 0xFEFEFEFE) >> 1;
												YPlane += 4;
												src_row += 4;
											}
											src_row += iNextLine;
											YPlane += ypitch_adj;
										}
										if (!stretch) {
											continue;
										}
										// Extra row: mean of the (halved) rows above and below.
										row_above = src_row - lpbiInput->biWidth;
										row_below = src_row;
										for (col = FrameWidth; col > 0; col -= 4) {
											*(U32 *)YPlane =
												( ( ((*(U32 *)row_above & 0xFEFEFEFE) >> 1) +
											        ((*(U32 *)row_below & 0xFEFEFEFE) >> 1) ) & 0xFEFEFEFE ) >> 1;
											YPlane += 4;
											row_above += 4;
											row_below += 4;
										}
										YPlane += ypitch_adj;
									}

									// Chroma planes follow the luma data; each is a quarter of the luma
									// size in both directions.
									v_src = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight);
									u_src = v_src + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2));
									t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2);
									v_src += t;
									u_src += t;
									C_uv_plane_common(u_src,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
									C_uv_plane_common(v_src,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
								}

								#endif


								// IA_uv_plane_common - x86 assembly version of the C_uv_plane_common()

								// reference kept under "#if 0" above; the comments inside the routine quote

								// the C statement each instruction group implements.

								//

								// Every four source bytes are halved per byte ((x & 0xFEFEFEFE) >> 1) and each

								// byte is written twice, producing eight destination bytes.  Each pass of the

								// k loop writes two destination rows: a copy of the current source row and

								// the average of the current and the next source row.

								//

								//   psrc             first source byte to convert

								//   Plane            first destination byte

								//   pitch            destination row pitch in bytes

								//   OutputFrameWidth bytes written per destination row (counted down by 8)

								//   ChromaIters      number of passes through the outer (j) loop

								//   spitch_adj       added to the source pointers after every row; non-zero

								//                    also enables "stretch" mode (one extra interpolated

								//                    row per j pass, 5 or 6 k passes alternately)

								//

								// The function is naked: it builds its own frame, preserves ebp/ebx/esi/edi

								// and returns with a plain "ret", leaving the arguments for the caller to pop.

								//

								// NOTE(review): all loop exits use "jnz" on a counter, so they terminate only

								// when the counter reaches exactly zero (OutputFrameWidth a non-zero multiple

								// of 8, ChromaIters non-zero).  The C reference tests "> 0" instead.

								__declspec(naked)

								_STATIC void IA_uv_plane_common(

									U8 *psrc,

									U8 *Plane,

									UN pitch,

									UN OutputFrameWidth,

									UN ChromaIters,

									UN spitch_adj)


								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout

								//    | spitch_adj      |  + 64

								//    | ChromaIters     |  + 60

								//    | OutputFrameWidth|  + 56

								//    | pitch           |  + 52

								//    | Plane           |  + 48

								//    | psrc            |  + 44

								//  -----------------------------

								//    | return addr     |  + 40

								//    | saved ebp       |  + 36

								//    | saved ebx       |  + 32

								//    | saved esi       |  + 28

								//    | saved edi       |  + 24


								//    | dpitch_adj      |  + 20

								//    | stretch         |  + 16

								//    | mark            |  + 12

								//    | flag            |  +  8

								//    | j               |  +  4

								//    | k               |  +  0


								// Offsets below are relative to esp after the four pushes and the

								// "sub esp, LOCALSIZE" in the prologue.

								#define LOCALSIZE           24


								#define SPITCH_ADJ          64

								#define CHROMA_ITERS        60

								#define OUTPUT_FRAME_WIDTH  56

								#define PITCH_PARM          52

								#define PLANE               48

								#define PSRC                44


								#define DPITCH_ADJ          20

								#define STRETCH             16

								#define MARK                12

								#define FLAG                 8

								#define LOOP_J               4

								#define LOOP_K               0


									_asm {


									push	ebp

									push	ebx

									push	esi

									push	edi

									sub		esp, LOCALSIZE


								// pnext = psrc + (OutputFrameWidth>>1) + spitch_adj

								// pdest_copy = Plane

								// pdest_avg = Plane + pitch

								// assign (esi, psrc)

								// assign (ecx, pnext)

								// assign (edi, pdest_copy)

								// assign (edx, pdest_avg)

								// assign (ebp, i)

									mov		esi, [esp + PSRC]

									mov		ecx, esi

									mov		eax, [esp + OUTPUT_FRAME_WIDTH]

									shr		eax, 1

									add		eax, [esp + SPITCH_ADJ]

									add		ecx, eax

									mov		edi, [esp + PLANE]

									mov		edx, edi

									add		edx, [esp + PITCH_PARM]

								// dpitch_adj = pitch - OutputFrameWidth

									mov		eax, [esp + PITCH_PARM]

									sub		eax, [esp + OUTPUT_FRAME_WIDTH]

									mov		[esp + DPITCH_ADJ], eax

								// stretch = (spitch_adj ? 1 : 0)

									xor		ebx, ebx

									mov		eax, [esp + SPITCH_ADJ]

									test	eax, eax

									jz		L1

									inc		ebx

								L1:

									mov		[esp + STRETCH], ebx

								// mark = 6 - stretch

									mov		eax, 6

									sub		eax, ebx

									mov		[esp + MARK], eax

								// flag = stretch

									mov		DWORD PTR [esp + FLAG], ebx


								// for (j = ChromaIters; j > 0; j--)

									mov		eax, [esp + CHROMA_ITERS]

									mov		[esp + LOOP_J], eax

								L2:

								// for (k = mark + (flag & 1); k > 0; k--)

									mov		eax, [esp + FLAG]

									and		eax, 1

									add		eax, [esp + MARK]

									mov		[esp + LOOP_K], eax

								L3:

								// if (!stretch && (1 == j) && (1 == k))

								// (last pass of both down-counting loops: there is no following source

								//  row, so the current row is averaged with itself)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jnz		L4

									mov		eax, [esp + LOOP_J]

									cmp		eax, 1

									jne		L4

									mov		eax, [esp + LOOP_K]

									cmp		eax, 1

									jne		L4

								// pnext = psrc

									mov		ecx, esi

								L4:

								// for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4,

								//                                           pdest_copy += 8, pdest_avg += 8)

									mov		ebp, [esp + OUTPUT_FRAME_WIDTH]

								// Pentium pipeline scheduling has not been performed on the following loop code yet

								L5:

								// READ_DWORD_AND_SHIFT(t1,psrc)

									mov		eax, [esi]

									and		eax, 0xFEFEFEFE

									shr		eax, 1

								// DUP_LOWER_TWO_BYTES(pdest_copy,t1)

								// ebx is scratch: byte 1 of t1 is placed in bl/bh and shifted into the upper

								// half, then byte 0 fills bl/bh, giving b1 b1 b0 b0.

									mov		bl, ah

									mov		bh, ah

									shl		ebx, 16

									mov		bl, al

									mov		bh, al

									mov		[edi], ebx

								// DUP_UPPER_TWO_BYTES((pdest_copy+4),t1)

								// (shifting eax right by 16 brings bytes 2 and 3 into al/ah; t1 is lost)

									shr		eax, 16

									mov		bl, ah

									mov		bh, ah

									shl		ebx, 16

									mov		bl, al

									mov		bh, al

									mov		[edi+4], ebx

								// READ_DWORD_AND_SHIFT(t2,pnext)

								// AVERAGE_DWORDS(t1,t1,t2)

								// (t1 was destroyed above, so it is loaded from psrc again)

									mov		eax, [esi]

									and		eax, 0xFEFEFEFE

									shr		eax, 1

									mov		ebx, [ecx]

									and		ebx, 0xFEFEFEFE

									shr		ebx, 1

									add		eax, ebx

									and		eax, 0xFEFEFEFE

									shr		eax, 1

								// DUP_LOWER_TWO_BYTES(pdest_avg,t1)

									mov		bl, ah

									mov		bh, ah

									shl		ebx, 16

									mov		bl, al

									mov		bh, al

									mov		[edx], ebx

								// DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)

									shr		eax, 16

									mov		bl, ah

									mov		bh, ah

									shl		ebx, 16

									mov		bl, al

									mov		bh, al

									mov		[edx+4], ebx

								// end of i loop

								// (lea is used for the pointer updates, so the flags tested by jnz are

								//  those of "sub ebp, 8")

									lea		esi, [esi + 4]

									lea		ecx, [ecx + 4]

									lea		edi, [edi + 8]

									lea		edx, [edx + 8]

									sub		ebp, 8

									jnz		L5


								// psrc += spitch_adj

								// pnext += spitch_adj

								// pdest_copy = pdest_avg + dpitch_adj

								// pdest_avg = pdest_copy + pitch

									add		esi, [esp + SPITCH_ADJ]

									add		ecx, [esp + SPITCH_ADJ]

									mov		eax, edx

									add		eax, [esp + DPITCH_ADJ]

									mov		edi, eax

									mov		edx, edi

									add		edx, [esp + PITCH_PARM]

								// end of k loop

									dec		DWORD PTR [esp + LOOP_K]

									jnz		L3


								// if (stretch)

									cmp		DWORD PTR [esp + STRETCH], 0

									jz		L6


								// psrc -= ((OutputFrameWidth>>1)+spitch_adj)

								// pnext -= ((OutputFrameWidth>>1)+spitch_adj)

								// pdest_avg = pdest_copy

									mov		eax, [esp + OUTPUT_FRAME_WIDTH]

									shr		eax, 1

									add		eax, [esp + SPITCH_ADJ]

									sub		esi, eax

									sub		ecx, eax

									mov		edx, edi

								// for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4, pdest_avg += 8)

									mov		ebp, [esp + OUTPUT_FRAME_WIDTH]

								// Pentium pipeline scheduling has not been performed on the following loop code yet

								L7:

								// READ_DWORD_AND_SHIFT(t1,psrc)

									mov		eax, [esi]

									and		eax, 0xFEFEFEFE

									shr		eax, 1

								// READ_DWORD_AND_SHIFT(t2,pnext)

									mov		ebx, [ecx]

									and		ebx, 0xFEFEFEFE

									shr		ebx, 1

								// AVERAGE_DWORDS(t1,t1,t2)

								// AVERAGE_DWORDS(t1,t1,t2)

								// (averaging with t2 twice weights the result towards the pnext row)

									add		eax, ebx

									and		eax, 0xFEFEFEFE

									shr		eax, 1

									add		eax, ebx

									and		eax, 0xFEFEFEFE

									shr		eax, 1

								// DUP_LOWER_TWO_BYTES(pdest_avg,t1)

								// (t2 in ebx is no longer needed; ebx becomes scratch again)

									mov		bl, ah

									mov		bh, ah

									shl		ebx, 16

									mov		bl, al

									mov		bh, al

									mov		[edx], ebx

								// DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)

									shr		eax, 16

									mov		bl, ah

									mov		bh, ah

									shl		ebx, 16

									mov		bl, al

									mov		bh, al

									mov		[edx+4], ebx

								// end of i loop

									lea		esi, [esi + 4]

									lea		ecx, [ecx + 4]

									lea		edx, [edx + 8]

									sub		ebp, 8

									jnz		L7


								// psrc += spitch_adj

								// pnext += spitch_adj

								// pdest_copy = pdest_avg + dpitch_adj

								// pdest_avg = pdest_copy + pitch

								// flag++

									add		esi, [esp + SPITCH_ADJ]

									add		ecx, [esp + SPITCH_ADJ]

									mov		eax, edx

									add		eax, [esp + DPITCH_ADJ]

									mov		edi, eax

									mov		edx, edi

									add		edx, [esp + PITCH_PARM]

									inc		DWORD PTR [esp + FLAG]


								// end of j loop

								L6:

									dec		DWORD PTR [esp + LOOP_J]

									jnz		L2


								// epilogue: drop the locals and restore the callee-save registers in the

								// reverse order of the prologue pushes

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								// The offset names are reused with other values by the next routine.

								#undef LOCALSIZE


								#undef SPITCH_ADJ

								#undef CHROMA_ITERS

								#undef OUTPUT_FRAME_WIDTH

								#undef PITCH_PARM

								#undef PLANE

								#undef PSRC


								#undef DPITCH_ADJ

								#undef STRETCH

								#undef MARK

								#undef FLAG

								#undef LOOP_J

								#undef LOOP_K


								// IA_H26X_YVU9toYUV12 - x86 assembly version of the C_H26X_YVU9toYUV12()

								// reference kept under "#if 0" above; the comments inside the routine quote

								// the C statement each instruction group implements.

								//

								// Luma: FrameWidth bytes per row are copied four at a time from the input,

								// starting biWidth*height_adj + width_adj bytes into it, with every byte

								// halved ((x & 0xFEFEFEFE) >> 1).  When height_adj is non-zero ("stretch"),

								// each group of 11 copied rows is followed by one row built from the mean of

								// the halved source rows just above and at the current source position.

								// Chroma: the plane that follows the luma data is converted into VPlane and

								// the plane after it into UPlane, both by IA_uv_plane_common().

								//

								//   lpbiInput    input bitmap header; biWidth and biHeight are read

								//   lpInput      input pixel data

								//   YPlane, UPlane, VPlane   destination planes

								//   FrameWidth   bytes written per luma row (counted down by 4)

								//   FrameHeight  counted down by 48 to derive LumaIters (4 per step)

								//   pitch        destination row pitch in bytes

								//

								// The function is naked: it builds its own frame, preserves ebp/ebx/esi/edi

								// and returns with a plain "ret".

								//

								// NOTE(review): the loop exits use "jnz" on a counter, so they terminate only

								// when the counter reaches exactly zero (FrameHeight a non-zero multiple of

								// 48, FrameWidth a non-zero multiple of 4).  The C reference tests "> 0".

								__declspec(naked)

								_STATIC void IA_H26X_YVU9toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 *lpInput,

									U8 *YPlane,

									U8 *UPlane,

									U8 *VPlane,

									UN FrameWidth,

									UN FrameHeight,

									const int pitch)


								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout

								//    | pitch           |  + 88

								//    | FrameHeight     |  + 84

								//    | FrameWidth      |  + 80

								//    | VPlane          |  + 76

								//    | UPlane          |  + 72

								//    | YPlane          |  + 68

								//    | lpInput         |  + 64

								//    | lpbiInput       |  + 60

								//  -----------------------------

								//    | return addr     |  + 56

								//    | saved ebp       |  + 52

								//    | saved ebx       |  + 48

								//    | saved esi       |  + 44

								//    | saved edi       |  + 40


								//    | width_adj       |  + 36

								//    | height_adj      |  + 32

								//    | stretch         |  + 28

								//    | mark            |  + 24

								//    | iNextLine       |  + 20

								//    | j               |  + 16

								//    | k               |  + 12

								//    | LumaIters       |  +  8

								//    | ypitch_adj      |  +  4

								//    | uvpitch_adj     |  +  0


								// Offsets below are relative to esp after the four pushes and the

								// "sub esp, LOCALSIZE" in the prologue.

								#define LOCALSIZE        40


								#define PITCH_PARM       88

								#define FRAME_HEIGHT     84

								#define FRAME_WIDTH      80

								#define VPLANE           76

								#define UPLANE           72

								#define YPLANE           68

								#define LP_INPUT         64

								#define LPBI_INPUT       60


								#define WIDTH_ADJ        36

								#define HEIGHT_ADJ       32

								#define STRETCH          28

								#define MARK             24

								#define NEXT_LINE        20

								#define LOOP_J           16

								#define LOOP_K           12

								#define LUMA_ITERS        8

								#define YPITCH_ADJ        4

								#define UVPITCH_ADJ       0


									_asm {


										push	ebp

										push	ebx

										push	esi

										push	edi

										sub		esp, LOCALSIZE


								// assign (ebx, lpbiInput)

									mov		ebx, [esp + LPBI_INPUT]

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (edx, pitch)

								// NOTE(review): this slot is stored here but never read again in this routine.

									mov		ebp, ecx

									shr		ebp, 1

									sub		edx, ebp

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (edx, LumaIters)

									xor		edx, edx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

								// (lea leaves the flags alone; jnz tests the result of "sub eax, 48")

									lea		edx, [edx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1

								// assign (esi, width_adj)

									mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		esi, [esp + FRAME_WIDTH]

									shr		esi, 1

									mov		[esp + WIDTH_ADJ], esi

								// aspect = (width_adj ? LumaIters : 0)

								// assign (edi, aspect)

								// kill (edx, LumaIters)

									mov		[esp + LUMA_ITERS], edx

									xor		edi, edi

									test	esi, esi

									jz		L2

									mov		edi, edx

								// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (edx, height_adj)

								L2:

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		edx, [esp + FRAME_HEIGHT]

									add		edx, edi

									shr		edx, 1

									mov		[esp + HEIGHT_ADJ], edx

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	edx, edx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

									mov		ebp, 12

									sub		ebp, eax

									mov		[esp + MARK], ebp

								// iNextLine = width_adj << 1

									mov		ebp, esi

									shl		ebp, 1

									mov		[esp + NEXT_LINE], ebp

								// pnext = lpInput + (lpbiInput->biWidth * height_adj) + width_adj

								// kill (ebx, lpbiInput)

								// kill (ecx, FrameWidth)

								// kill (edx, height_adj)

								// kill (esi, width_adj)

								// kill (edi, aspect)

								// assign (esi, pnext)

								// (the one-operand imul leaves the product in edx:eax, which is why edx

								//  is listed as killed; esi still holds width_adj when the product and

								//  lpInput are added to it)

									mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

									mov		ebx, edx

									imul	ebx

									add		esi, eax

									add		esi, [esp + LP_INPUT]

								// assign (edi, YPlane)

									mov		edi, [esp + YPLANE]

								// for (j = LumaIters; j > 0; j--)

									mov		eax, [esp + LUMA_ITERS]

									mov		[esp + LOOP_J], eax

								// for (k = mark; k > 0; k--)

								L4:

									mov		eax, [esp + MARK]

									mov		[esp + LOOP_K], eax

								// for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, pnext += 4)

								// assign (ebp, i)

								L5:

									mov		ebp, [esp + FRAME_WIDTH]

								// This jump is here to make sure the following loop starts on the U pipe

									jmp		L6

								L6:

								// *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;

								// (the numbered comments count the intended instruction pairs; YPlane is

								//  advanced before the store, hence the [edi - 4] address)

								// 1

									mov		eax, [esi]

									lea		esi, [esi + 4]

								// 2

									and		eax, 0xFEFEFEFE

									lea		edi, [edi + 4]

								// 3

									shr		eax, 1

									sub		ebp, 4

								// 4

								// (mov does not change the flags, so jnz still sees the result of

								//  "sub ebp, 4")

									mov		[edi - 4], eax

									jnz		L6


								// pnext += iNextLine

								// YPlane += ypitch_adj

									add		esi, [esp + NEXT_LINE]

									add		edi, [esp + YPITCH_ADJ]


								// end of k loop

									mov		eax, [esp + LOOP_K]

									sub		eax, 1

									mov		[esp + LOOP_K], eax

									jnz		L5


								// if (stretch)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L7


								// plast = pnext - lpbiInput->biWidth

								// pbn = pnext

								// assign (ecx, plast)

								// assign (edx, pbn)

								// (pnext itself, in esi, is not advanced by the interpolated row)

									mov		ecx, esi

									mov		eax, [esp + LPBI_INPUT]

									sub		ecx, (LPBITMAPINFOHEADER)[eax].biWidth

									mov		edx, esi


								// for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, plast += 4, pbn += 4)

								// assign (ebp, i)

									mov		ebp, [esp + FRAME_WIDTH]

								// This jump is here just to make sure the loop code starts with the U pipe

									jmp		L8

								L8:

								// *(U32 *)YPlane =

								//  ( ( ((*(U32 *)plast & 0xFEFEFEFE) >> 1) +

								//      ((*(U32 *)pbn & 0xFEFEFEFE) >> 1) ) & 0xFEFEFEFE ) >> 1

								// (here the halving is done as shift-then-mask with 0x7F7F7F7F, which

								//  gives the same result as mask-with-0xFEFEFEFE-then-shift)

								// 1

									mov		eax, [ecx]

									lea		ecx, [ecx + 4]

								// 2

									shr		eax, 1

								// 3

									and		eax, 0x7F7F7F7F

									mov		ebx, [edx]

								// 4

									shr		ebx, 1

									lea		edi, [edi + 4]

								// 5

									and		ebx, 0x7F7F7F7F

								// 6

									add		eax, ebx

								// 7

									and		eax, 0xFEFEFEFE

								// 8

									shr		eax, 1

								// 9

									mov		[edi - 4], eax

									sub		ebp, 4

								// 10

								// (lea keeps the flags from "sub ebp, 4" intact for the jnz)

									lea		edx, [edx + 4]

									jnz		L8


								// YPlane += ypitch_adj

									add		edi, [esp + YPITCH_ADJ]


								L7:

								// end of the LumaIters loop

									dec		DWORD PTR [esp + LOOP_J]

									jnz		L4


								// pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight)

								// assign (esi, pvsrc)

									mov		eax, [esp + LPBI_INPUT]

									mov		ebx, (LPBITMAPINFOHEADER)[eax].biWidth

									mov		eax, (LPBITMAPINFOHEADER)[eax].biHeight

									imul	ebx

									add		eax, [esp + LP_INPUT]

									mov		esi, eax

								// pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2))

								// assign (edi, pusrc)

									mov		eax, [esp + LPBI_INPUT]

									mov		ecx, (LPBITMAPINFOHEADER)[eax].biWidth

									shr		ecx, 2

									mov		eax, (LPBITMAPINFOHEADER)[eax].biHeight

									shr		eax, 2

									imul	ecx

									add		eax, esi

									mov		edi, eax

								// t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2)

								// assign (eax, t)

									mov		eax, [esp + LPBI_INPUT]

									mov		eax, (LPBITMAPINFOHEADER)[eax].biWidth

									shr		eax, 2

									mov		ebx, [esp + HEIGHT_ADJ]

									shr		ebx, 2

									imul	ebx

									mov		ebx, [esp + WIDTH_ADJ]

									shr		ebx, 2

									add		eax, ebx

								// pvsrc += t

								// pusrc += t

									add		esi, eax

									add		edi, eax


								// uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)

								// (esp is copied to ebp first so the frame offsets stay valid while the

								//  arguments are pushed, last argument first; the caller removes the six

								//  dwords again with "lea esp, [esp + 24]")

									mov		ebp, esp

									mov		eax, [ebp + WIDTH_ADJ]

									shr		eax, 1

									push	eax

									mov		eax, [ebp + LUMA_ITERS]

									shr		eax, 1

									push	eax

									mov		eax, [ebp + FRAME_WIDTH]

									shr		eax, 1

									push	eax

									push	DWORD PTR [ebp + PITCH_PARM]

									push	DWORD PTR [ebp + UPLANE]

									push	edi

									call	IA_uv_plane_common

									lea		esp, [esp + 24]


								// uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)

								// (esi still holds pvsrc: IA_uv_plane_common saves and restores esi)

									mov		ebp, esp

									mov		eax, [ebp + WIDTH_ADJ]

									shr		eax, 1

									push	eax

									mov		eax, [ebp + LUMA_ITERS]

									shr		eax, 1

									push	eax

									mov		eax, [ebp + FRAME_WIDTH]

									shr		eax, 1

									push	eax

									push	DWORD PTR [ebp + PITCH_PARM]

									push	DWORD PTR [ebp + VPLANE]

									push	esi

									call	IA_uv_plane_common

									lea		esp, [esp + 24]


								// epilogue: drop the locals and restore the callee-save registers in the

								// reverse order of the prologue pushes

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								// The offset names are reused with other values by the next routine.

								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef WIDTH_ADJ

								#undef HEIGHT_ADJ

								#undef STRETCH

								#undef MARK

								#undef NEXT_LINE

								#undef LOOP_J

								#undef LOOP_K

								#undef LUMA_ITERS

								#undef YPITCH_ADJ

								#undef UVPITCH_ADJ


								/***************************************************

								 * H26X_YUV12toEncYUV12()

								 *  Copy YUV12 data to encoder memory at the

								 *  appropriate location. It is assumed that the input

								 *  data is stored as rows of Y, followed by rows of U,

								 *  then rows of V.

								 *

								 ***************************************************/


								#if 0

								// Copy one plane for C_H26X_YUV12toEncYUV12().
								//
								// Reads in_height rows of in_width bytes from 'src' (eight bytes per step),
								// halving every byte, and pads each row with pad_width bytes of 'fill'.
								// pad_height further rows of out_width bytes are then written with 'fill'
								// only.  After every row 'dst' is advanced by pitch_adj.  Returns the source
								// position just past the data that was consumed.
								_STATIC U32 *C_H26X_YUV12_copy_plane(
									U32 *src,
									U8 *dst,
									int in_width,
									int in_height,
									int pad_width,
									int pad_height,
									int out_width,
									int pitch_adj,
									U32 fill) {

									int x, y;

									for (y = in_height; y > 0; y--) {
										for (x = in_width; x > 0; x -= 8) {
											*(U32 *)dst = (*src++ >> 1) & 0x7F7F7F7F;
											dst += 4;
											*(U32 *)dst = (*src++ >> 1) & 0x7F7F7F7F;
											dst += 4;
										}
										for (x = pad_width; x > 0; x -= 8) {
											*(U32 *)dst = fill;
											dst += 4;
											*(U32 *)dst = fill;
											dst += 4;
										}
										dst += pitch_adj;
									}
									for (y = pad_height; y > 0; y--) {
										for (x = out_width; x > 0; x -= 8) {
											*(U32 *)dst = fill;
											dst += 4;
											*(U32 *)dst = fill;
											dst += 4;
										}
										dst += pitch_adj;
									}
									return src;
								}

								// C reference for copying YUV12 input into the encoder's planes.
								//
								// The input is read sequentially: all Y rows, then all U rows, then all V
								// rows.  Every byte is halved on the way.  Where the input is smaller than
								// FrameWidth x FrameHeight the remainder is padded: luma with 0, chroma with
								// 0x40.
								//
								//   lpbiInput    input bitmap header (biWidth / biHeight are read)
								//   lpInput      input pixel data
								//   YPlane/UPlane/VPlane  destination planes
								//   FrameWidth, FrameHeight   size of the destination luma area
								//   pitch        destination row pitch in bytes
								_STATIC void C_H26X_YUV12toEncYUV12(
									LPBITMAPINFOHEADER lpbiInput,
									U8 *lpInput,
									U8 *YPlane,
									U8 *UPlane,
									U8 *VPlane,
									UN FrameWidth,
									UN FrameHeight,
									const int pitch) {

									const int in_height = lpbiInput->biHeight;
									const int in_width = lpbiInput->biWidth;
									const int pad_height = FrameHeight - in_height;
									const int pad_width = FrameWidth - in_width;
									const int luma_skip = pitch - FrameWidth;
									const int chroma_skip = pitch - (FrameWidth >> 1);
									U32 *pnext = (U32 *)lpInput;

									pnext = C_H26X_YUV12_copy_plane(pnext, YPlane,
										in_width, in_height, pad_width, pad_height,
										FrameWidth, luma_skip, 0);

									pnext = C_H26X_YUV12_copy_plane(pnext, UPlane,
										in_width >> 1, in_height >> 1, pad_width >> 1, pad_height >> 1,
										FrameWidth >> 1, chroma_skip, 0x40404040);

									(void)C_H26X_YUV12_copy_plane(pnext, VPlane,
										in_width >> 1, in_height >> 1, pad_width >> 1, pad_height >> 1,
										FrameWidth >> 1, chroma_skip, 0x40404040);

								}

								#endif


								// IA_H26X_YUV12toEncYUV12
								//
								// Hand-written x86 routine that copies a planar YUV12 input image into the
								// Y, U and V output planes, halving every sample byte on the way (each
								// 32-bit word is shifted right by one and bit 7 of every byte is cleared).
								//
								//   lpbiInput   - header of the input image; biWidth and biHeight are read.
								//   lpInput     - start of the input Y plane.  The input U plane is taken to
								//                 start biWidth * biHeight bytes later and the input V plane
								//                 a further (biWidth * biHeight) / 4 bytes after that.
								//   YPlane,
								//   UPlane,
								//   VPlane      - start of the three output planes.
								//   FrameWidth,
								//   FrameHeight - size of the output frame in luma samples.
								//   pitch       - byte distance between successive rows of each output plane.
								//
								// If the input is larger than the frame it is cropped: only FrameWidth
								// bytes of each Y row (FrameWidth / 2 for U and V) and FrameHeight rows
								// (FrameHeight / 2 for U and V) are copied.  If it is smaller, the rest of
								// each output row and the remaining output rows are filled with 0 in the
								// Y plane and 0x40 in the U and V planes.
								//
								// Every inner loop handles 8 bytes per pass and exits only when its counter
								// reaches exactly zero (sub ..., 8 / jnz), so the copy and padding widths
								// have to be multiples of 8.  The copy loops are entered without a zero
								// check, so the row and column counts they use must be non-zero.
								//
								// The function is naked: it saves ebp, ebx, esi and edi itself, reserves
								// LOCALSIZE bytes of locals and ends with a plain "ret", which leaves the
								// arguments on the stack.
								__declspec(naked)

								_STATIC void IA_H26X_YUV12toEncYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 *lpInput,

									U8 *YPlane,

									U8 *UPlane,

									U8 *VPlane,

									UN FrameWidth,

									UN FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout (offsets are from esp once the prologue has run)

								//    | pitch           |  + 92

								//    | FrameHeight     |  + 88

								//    | FrameWidth      |  + 84

								//    | VPlane          |  + 80

								//    | UPlane          |  + 76

								//    | YPlane          |  + 72

								//    | lpInput         |  + 68

								//    | lpbiInput       |  + 64

								//  -----------------------------

								//    | return addr     |  + 60

								//    | saved ebp       |  + 56

								//    | saved ebx       |  + 52

								//    | saved esi       |  + 48

								//    | saved edi       |  + 44


								//    | ypitch_adj      |  + 40

								//    | yinput_height   |  + 36

								//    | yinput_width    |  + 32

								//    | yheight_diff    |  + 28

								//    | ywidth_diff     |  + 24

								//    | uvpitch_adj     |  + 20

								//    | uvoutput_width  |  + 16

								//    | uvinput_height  |  + 12

								//    | uvinput_width   |  +  8

								//    | uvheight_diff   |  +  4

								//    | uvwidth_diff    |  +  0


								// Size in bytes of the local-variable area (the eleven dwords listed above).
								#define LOCALSIZE        44


								// esp-relative offsets of the incoming arguments.
								#define PITCH_PARM       92

								#define FRAME_HEIGHT     88

								#define FRAME_WIDTH      84

								#define VPLANE           80

								#define UPLANE           76

								#define YPLANE           72

								#define LP_INPUT         68

								#define LPBI_INPUT       64


								// esp-relative offsets of the locals.
								#define YPITCH_ADJ       40

								#define YINPUT_HEIGHT    36

								#define YINPUT_WIDTH     32

								#define YHEIGHT_DIFF     28

								#define YWIDTH_DIFF      24

								#define UVPITCH_ADJ      20

								#define UVOUTPUT_WIDTH   16

								#define UVINPUT_HEIGHT   12

								#define UVINPUT_WIDTH     8

								#define UVHEIGHT_DIFF     4

								#define UVWIDTH_DIFF      0


									_asm {


								// Prologue: save the callee-save registers and reserve the locals.
										push	ebp

										push	ebx

										push	esi

										push	edi

										sub		esp, LOCALSIZE


								// NOTE(review): ebx is reloaded with lpbiInput below before this value is
								// read, so this load of FrameHeight has no effect.
										mov		ebx, [esp + FRAME_HEIGHT]

										mov		ecx, [esp + FRAME_WIDTH]

										mov		edx, [esp + PITCH_PARM]

								// ypitch_adj = pitch - FrameWidth

										mov		eax, edx

										sub		eax, ecx

										mov		[esp + YPITCH_ADJ], eax

								// uvoutput_width = FrameWidth >> 1

										mov		ebp, ecx

										shr		ebp, 1

										mov		[esp + UVOUTPUT_WIDTH], ebp

								// uvpitch_adj = pitch - (FrameWidth >> 1)

										sub		edx, ebp

										mov		[esp + UVPITCH_ADJ], edx

								// yinput_height = lpbiInput->biHeight

								// uvinput_height = yinput_height >> 1

								// yinput_width = lpbiInput->biWidth

								// uvinput_width = yinput_width >> 1

										mov		ebx, [esp + LPBI_INPUT]

										mov		eax, (LPBITMAPINFOHEADER)[ebx].biHeight

										mov		[esp + YINPUT_HEIGHT], eax

										shr		eax, 1

										mov		[esp + UVINPUT_HEIGHT], eax

										mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

										mov		[esp + YINPUT_WIDTH], eax

										shr		eax, 1

										mov		[esp + UVINPUT_WIDTH], eax

								// yheight_diff = FrameHeight - yinput_height

								// uvheight_diff = yheight_diff >> 1;

								// If the input is taller than the frame (negative difference) the copy
								// heights are clamped to the frame and no padding rows are produced.

										mov		eax, [esp + FRAME_HEIGHT]

										mov		ebx, eax

										sub		eax, [esp + YINPUT_HEIGHT]

										jns		NoCrop0

										xor		eax, eax

										mov		[esp + YINPUT_HEIGHT], ebx

										shr		ebx, 1

										mov		[esp + UVINPUT_HEIGHT], ebx

								NoCrop0:

										mov		[esp + YHEIGHT_DIFF], eax

										shr		eax, 1

										mov		[esp + UVHEIGHT_DIFF], eax

								// ywidth_diff = FrameWidth - yinput_width

								// uvwidth_diff = ywidth_diff >> 1;

								// ebx becomes the number of input bytes to skip at the end of every Y row:
								// 0 when the input fits, (biWidth - FrameWidth) when it is wider than the
								// frame.  In the wide case the copy widths are clamped to the frame and no
								// padding columns are produced.

										mov		eax, [esp + FRAME_WIDTH]

										xor		ebx, ebx

										sub		eax, [esp + YINPUT_WIDTH]

										jns		NoCrop1

										mov		eax, [esp + FRAME_WIDTH]

										mov		ebx, [esp + YINPUT_WIDTH]

										sub		ebx, eax

										mov		[esp + YINPUT_WIDTH], eax

										shr		eax, 1

										mov		[esp + UVINPUT_WIDTH], eax

										xor		eax, eax

								NoCrop1:

										mov		[esp + YWIDTH_DIFF], eax

										shr		eax, 1

										mov		[esp + UVWIDTH_DIFF], eax

								// assign (esi, lpInput)

										mov		esi, [esp + LP_INPUT]


								// assign (edi, YPlane)

										mov		edi, [esp + YPLANE]

								// ---------------------------------------------------------------
								// Y plane
								// ---------------------------------------------------------------

								// for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj)

								// assign (ecx, j)

										mov		ecx, [esp + YINPUT_HEIGHT]

								L1:

								// for (i = yinput_width; i > 0; i -= 8)

								// assign (ebp, i)

										mov		ebp, [esp + YINPUT_WIDTH]

								L2:

								// *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4

								// *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4

								// The second dword is masked with 0xFEFEFEFE before the shift instead of
								// 0x7F7F7F7F after it; both orders give the same result.
								// The jnz at the end of the loop tests the flags left by "sub ebp, 8";
								// the mov and lea instructions in between do not modify the flags.

								// 1

										mov		eax, [esi]

										mov		edx, [esi + 4]

								// 2

										shr		eax, 1

										and		edx, 0xFEFEFEFE

								// 3

										shr		edx, 1

										and		eax, 0x7F7F7F7F

								// 4

										lea		esi, [esi + 8]

										mov		[edi], eax

								// 5

										sub		ebp, 8

										mov		[edi + 4], edx

								// 6

										lea		edi, [edi + 8]

										jnz		L2

								// for (i = ywidth_diff; i > 0; i -= 8)

								//    *(U32 *)YPlane = 0; YPlane += 4;

								//    *(U32 *)YPlane = 0; YPlane += 4;

								// assign (ebp, i)

										mov		ebp, [esp + YWIDTH_DIFF]

										test	ebp, ebp

										jz		L3

								L4:

								// 1

										xor		eax, eax

										sub		ebp, 8

								// 2

										mov		[edi], eax

										mov		[edi + 4], eax

								// 3

										lea		edi, [edi + 8]

										jnz		L4

								// j--, YPlane += ypitch_adj

								// The input pointer also steps over the ebx cropped bytes of this row.

								L3:

										mov		eax, [esp + YPITCH_ADJ]

										add		edi, eax

										add		esi, ebx

										dec		ecx

										jnz		L1


								// for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj)

								// assign (ecx, j)

										mov		ecx, [esp + YHEIGHT_DIFF]

										test	ecx, ecx

										jz		L7

								L5:

								// for (i = FrameWidth; i > 0; i -= 8)

								//    *(U32 *)YPlane = 0; YPlane += 4;

								//    *(U32 *)YPlane = 0; YPlane += 4;

								// assign (ebp, i)

										mov		ebp, [esp + FRAME_WIDTH]

								L6:

								// 1

										xor		eax, eax

										sub		ebp, 8

								// 2

										mov		[edi], eax

										mov		[edi + 4], eax

								// 3

										lea		edi, [edi + 8]

										jnz		L6

								// j--, YPlane += ypitch_adj

										mov		eax, [esp + YPITCH_ADJ]

										add		edi, eax

										dec		ecx

										jnz		L5


								// ---------------------------------------------------------------
								// U plane
								// ---------------------------------------------------------------

								L7:

								// recompute start of input U plane (lpInput + biHeight * biWidth), using
								// the header dimensions rather than the possibly clamped copy sizes

										mov		edx, [esp + LPBI_INPUT]

										mov		eax, (LPBITMAPINFOHEADER)[edx].biHeight

										mov		ecx, (LPBITMAPINFOHEADER)[edx].biWidth

										imul	eax, ecx

								// assign (esi, lpInput)

										mov		esi, [esp + LP_INPUT]

										add		esi, eax

								// assign (edi, UPlane)

										mov		edi, [esp + UPLANE]

								// Halve the per-row input skip for the chroma rows.  ebx keeps this
								// halved value for the V plane as well.
										shr		ebx, 1

								// for (j = uvinput_height; j > 0; j--, UPlane += uvpitch_adj)

								// assign (ecx, j)

										mov		ecx, [esp + UVINPUT_HEIGHT]

								L8:

								// for (i = uvinput_width; i > 0; i -= 8)

								// assign (ebp, i)

										mov		ebp, [esp + UVINPUT_WIDTH]

								L9:

								// *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4

								// *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4

								// 1

										mov		eax, [esi]

										mov		edx, [esi + 4]

								// 2

										shr		eax, 1

										and		edx, 0xFEFEFEFE

								// 3

										shr		edx, 1

										and		eax, 0x7F7F7F7F

								// 4

										lea		esi, [esi + 8]

										mov		[edi], eax

								// 5

										sub		ebp, 8

										mov		[edi + 4], edx

								// 6

										lea		edi, [edi + 8]

										jnz		L9

								// for (i = uvwidth_diff; i > 0; i -= 8)

								//    *(U32 *)UPlane = 0x40404040; UPlane += 4;

								//    *(U32 *)UPlane = 0x40404040; UPlane += 4;

								// assign (ebp, i)

										mov		ebp, [esp + UVWIDTH_DIFF]

										test	ebp, ebp

										jz		L11

								L10:

								// 1

										mov		eax, 040404040H

										sub		ebp, 8

								// 2

										mov		[edi], eax

										mov		[edi + 4], eax

								// 3

										lea		edi, [edi + 8]

										jnz		L10

								// j--, UPlane += uvpitch_adj

								L11:

										mov		eax, [esp + UVPITCH_ADJ]

										add		edi, eax

										add		esi, ebx

										dec		ecx

										jnz		L8


								// for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj)

								// assign (ecx, j)

										mov		ecx, [esp + UVHEIGHT_DIFF]

										test	ecx, ecx

										jz		L14

								L12:

								// for (i = uvoutput_width; i > 0; i -= 8)

								//    *(U32 *)UPlane = 0x40404040; UPlane += 4;

								//    *(U32 *)UPlane = 0x40404040; UPlane += 4;

								// assign (ebp, i)

										mov		ebp, [esp + UVOUTPUT_WIDTH]

								L13:

								// 1

										mov		eax, 040404040H

										sub		ebp, 8

								// 2

										mov		[edi], eax

										mov		[edi + 4], eax

								// 3

										lea		edi, [edi + 8]

										jnz		L13

								// j--, UPlane += uvpitch_adj

										mov		eax, [esp + UVPITCH_ADJ]

										add		edi, eax

										dec		ecx

										jnz		L12


								// ---------------------------------------------------------------
								// V plane
								// ---------------------------------------------------------------

								L14:

								// recompute start of input V plane:
								// lpInput + biHeight * biWidth + (biHeight * biWidth) / 4

										mov		edx, [esp + LPBI_INPUT]

										mov		eax, (LPBITMAPINFOHEADER)[edx].biHeight

										mov		ecx, (LPBITMAPINFOHEADER)[edx].biWidth

										imul	eax, ecx

								// assign (esi, lpInput)

										mov		esi, [esp + LP_INPUT]

										add		esi, eax

										shr		eax, 2

										add		esi, eax

								// assign (edi, VPlane)

										mov		edi, [esp + VPLANE]

								// for (j = uvinput_height; j > 0; j--, VPlane += uvpitch_adj)

								// assign (ecx, j)

										mov		ecx, [esp + UVINPUT_HEIGHT]

								L15:

								// for (i = uvinput_width; i > 0; i -= 8)

								// assign (ebp, i)

										mov		ebp, [esp + UVINPUT_WIDTH]

								L16:

								// *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4

								// *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4

								// 1

										mov		eax, [esi]

										mov		edx, [esi + 4]

								// 2

										shr		eax, 1

										and		edx, 0xFEFEFEFE

								// 3

										shr		edx, 1

										and		eax, 0x7F7F7F7F

								// 4

										lea		esi, [esi + 8]

										mov		[edi], eax

								// 5

										sub		ebp, 8

										mov		[edi + 4], edx

								// 6

										lea		edi, [edi + 8]

										jnz		L16

								// for (i = uvwidth_diff; i > 0; i -= 8)

								//    *(U32 *)VPlane = 0x40404040; VPlane += 4;

								//    *(U32 *)VPlane = 0x40404040; VPlane += 4;

								// assign (ebp, i)

										mov		ebp, [esp + UVWIDTH_DIFF]

										test	ebp, ebp

										jz		L18

								L17:

								// 1

										mov		eax, 040404040H

										sub		ebp, 8

								// 2

										mov		[edi], eax

										mov		[edi + 4], eax

								// 3

										lea		edi, [edi + 8]

										jnz		L17

								// j--, VPlane += uvpitch_adj

								L18:

										mov		eax, [esp + UVPITCH_ADJ]

										add		edi, eax

										add		esi, ebx

										dec		ecx

										jnz		L15


								// for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj)

								// assign (ecx, j)

										mov		ecx, [esp + UVHEIGHT_DIFF]

										test	ecx, ecx

										jz		L21

								L19:

								// for (i = uvoutput_width; i > 0; i -= 8)

								//    *(U32 *)VPlane = 0x40404040; VPlane += 4;

								//    *(U32 *)VPlane = 0x40404040; VPlane += 4;

								// assign (ebp, i)

										mov		ebp, [esp + UVOUTPUT_WIDTH]

								L20:

								// 1

										mov		eax, 040404040H

										sub		ebp, 8

								// 2

										mov		[edi], eax

										mov		[edi + 4], eax

								// 3

										lea		edi, [edi + 8]

										jnz		L20

								// j--, VPlane += uvpitch_adj

										mov		eax, [esp + UVPITCH_ADJ]

										add		edi, eax

										dec		ecx

										jnz		L19


								// Epilogue: release the locals and restore the callee-save registers in
								// the reverse of the order in which they were pushed.

								L21:

										add		esp, LOCALSIZE

										pop		edi

										pop		esi

										pop		ebx

										pop		ebp

										ret

									}

								}


								// Remove the stack-frame offset macros used by IA_H26X_YUV12toEncYUV12 so
								// that the routines that follow can define their own frame layout under the
								// same names.
								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef YPITCH_ADJ

								#undef YINPUT_HEIGHT

								#undef YINPUT_WIDTH

								#undef YHEIGHT_DIFF

								#undef YWIDTH_DIFF

								#undef UVPITCH_ADJ

								#undef UVOUTPUT_WIDTH

								#undef UVINPUT_HEIGHT

								#undef UVINPUT_WIDTH

								#undef UVHEIGHT_DIFF

								#undef UVWIDTH_DIFF


								#if 0
								// C_H26X_YUY2toYUV12 (compiled out)
								//
								// Plain C conversion of a packed YUY2 image (bytes Y0 U0 Y1 V0 ...) into
								// separate Y, U and V planes.  Every sample is halved.
								//
								// The input is walked one line at a time from the computed starting line
								// towards lower addresses.  Output rows are produced in groups: each group
								// emits `rows_per_group` copied rows and, when `interpolate` is set, one
								// extra luma row that is the average of two neighbouring input lines.
								// U and V are written only for even rows of a group, each value being the
								// average of the sample on the current line and the sample one input line
								// lower in memory.
								//
								//   lpbiInput   - input header; biWidth and biHeight are read.
								//   lpInput     - start of the packed input image.
								//   YPlane, UPlane, VPlane - output planes, rows `pitch` bytes apart.
								//   FrameWidth, FrameHeight - output frame size in luma samples.
								void C_H26X_YUY2toYUV12(
									LPBITMAPINFOHEADER lpbiInput,
									U8 *lpInput,
									U8 *YPlane,
									U8 *UPlane,
									U8 *VPlane,
									UN FrameWidth,
									UN FrameHeight,
									const int pitch)
								{
									U8 *src, *prior, *current, *line_end;
									int extra_cols, extra_rows;
									int interpolate, rows_per_group, aspect_rows;
									int rewind;
									int group, row, n;
									int groups = 0;
									int y_skip = pitch - FrameWidth;            // end of Y row -> next Y row
									int uv_skip = pitch - (FrameWidth >> 1);    // end of U/V row -> next U/V row
									int other_line = -(lpbiInput->biWidth << 1); // one input line back, in bytes

									// Four groups for every 48 output rows (rounded up).
									row = FrameHeight;
									while (row > 0) {
										groups += 4;
										row -= 48;
									}

									extra_cols = lpbiInput->biWidth - FrameWidth;
									if (extra_cols) {
										aspect_rows = groups;
									} else {
										aspect_rows = 0;
									}
									extra_rows = (lpbiInput->biHeight - (FrameHeight - aspect_rows)) >> 1;
									interpolate = (extra_rows != 0);
									rows_per_group = 12 - interpolate;

									// After a line has been consumed (FrameWidth * 2 bytes), this moves the
									// pointer to the start of the preceding input line.
									rewind = -((lpbiInput->biWidth + (int)FrameWidth) << 1);

									// First line to convert.
									src = lpInput + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect_rows - 1) + extra_rows))
											        + extra_cols;

									for (group = 0; group < groups; group++) {
										for (row = 0; row < rows_per_group; row++) {
											const int chroma_row = (0 == (row & 1));

											line_end = src + (FrameWidth << 1);
											while (src < line_end) {
												// Eight luma samples sit at the even byte offsets.
												for (n = 0; n < 8; n++) {
													YPlane[n] = src[n << 1] >> 1;
												}
												if (chroma_row) {
													// U at offsets 1, 5, 9, 13.
													for (n = 0; n < 4; n++) {
														UPlane[n] = ((src[(n << 2) + 1] >> 1) +
														             (src[(n << 2) + 1 + other_line] >> 1)) >> 1;
													}
													// V at offsets 3, 7, 11, 15.
													for (n = 0; n < 4; n++) {
														VPlane[n] = ((src[(n << 2) + 3] >> 1) +
														             (src[(n << 2) + 3 + other_line] >> 1)) >> 1;
													}
													UPlane += 4;
													VPlane += 4;
												}
												src += 16;
												YPlane += 8;
											}

											src += rewind;
											YPlane += y_skip;
											if (chroma_row) {
												UPlane += uv_skip;
												VPlane += uv_skip;
											}
										}

										if (interpolate) {
											// Extra luma row: average of the next line to be converted and
											// the line before it in memory.  src itself is not advanced.
											prior = src - (lpbiInput->biWidth << 1);
											current = src;
											line_end = current + (FrameWidth << 1);
											while (current < line_end) {
												for (n = 0; n < 4; n++) {
													YPlane[n] = ((prior[n << 1] >> 1) + (current[n << 1] >> 1)) >> 1;
												}
												YPlane += 4;
												prior += 8;
												current += 8;
											}
											YPlane += y_skip;
										}
									}
								}
								#endif


								// IA_H26X_YUY2toYUV12
								//
								// Hand-written x86 conversion of a packed YUY2 image (byte order
								// Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...) into separate Y, U and V planes.  Every
								// sample is halved (shift right by one, bit 7 of each byte cleared).
								//
								//   lpbiInput   - input header; biWidth and biHeight are read.
								//   BGR24Image  - despite its name this is the packed input image; it is the
								//                 argument read through the LP_INPUT slot.
								//   YPlane, UPlane, VPlane - output planes, rows `pitch` bytes apart.
								//   FrameWidth, FrameHeight - output frame size in luma samples.
								//
								// The input is walked one line at a time from the computed starting line
								// towards lower addresses.  Output is produced in LumaIters groups; each
								// group emits `mark` rows (12, or 11 when stretch is set) and, when stretch
								// is set, one additional luma row that averages two neighbouring input
								// lines.  U and V are written only on even rows of a group and are taken
								// from the current line alone.
								//
								// Preconditions visible in the code:
								//   - the LumaIters loop ends only when FrameHeight reaches exactly zero in
								//     steps of 48, so FrameHeight must be a positive multiple of 48;
								//   - each line loop consumes 16 input bytes (8 luma samples) per pass.
								//
								// The function is naked: it saves ebp, ebx, esi and edi itself, reserves
								// LOCALSIZE bytes of locals and ends with a plain "ret", which leaves the
								// arguments on the stack.  The UPlane and VPlane argument slots are reused
								// as scratch storage while the stretch row is generated.
								__declspec(naked)

								_STATIC void IA_H26X_YUY2toYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * BGR24Image,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout (offsets are from esp once the prologue has run)

								//    | pitch           |  + 96

								//    | FrameHeight     |  + 92

								//    | FrameWidth      |  + 88

								//    | VPlane          |  + 84

								//    | UPlane          |  + 80

								//    | YPlane          |  + 76

								//    | lpInput         |  + 72   (declared as BGR24Image)

								//    | lpbiInput       |  + 68

								//  -----------------------------

								//    | return addr     |  + 64

								//    | saved ebp       |  + 60

								//    | saved ebx       |  + 56

								//    | saved esi       |  + 52

								//    | saved edi       |  + 48


								//    | pyprev          |  + 44   (not referenced by the code below)

								//    | pyspace         |  + 40   (not referenced by the code below)

								//    | pynext          |  + 36   (not referenced by the code below)

								//    | peol            |  + 32

								//    | j               |  + 28

								//    | k               |  + 24

								//    | iBackTwoLines   |  + 20

								//    | stretch         |  + 16

								//    | mark            |  + 12

								//    | LumaIters       |  +  8

								//    | ypitch_adj      |  +  4

								//    | uvpitch_adj     |  +  0


								// Size in bytes of the local-variable area (the twelve dwords listed above).
								#define LOCALSIZE			 48


								// esp-relative offsets of the incoming arguments.
								#define PITCH_PARM			 96

								#define FRAME_HEIGHT		 92

								#define FRAME_WIDTH			 88

								#define VPLANE				 84

								#define UPLANE				 80

								#define YPLANE				 76

								#define LP_INPUT			 72

								#define LPBI_INPUT			 68


								// esp-relative offsets of the locals.
								#define PYPREV				 44

								#define PYSPACE				 40

								#define PYNEXT				 36

								#define PEOL				 32

								#define LOOP_J				 28

								#define LOOP_K				 24

								#define BACK_TWO_LINES		 20

								#define STRETCH				 16

								#define MARK				 12

								#define LUMA_ITERS			  8

								#define YPITCH_ADJ			  4

								#define UVPITCH_ADJ			  0


									_asm {


								// Prologue: save the callee-save registers and reserve the locals.
									push	ebp

									push	ebx

									push	esi

									push	edi

									sub		esp, LOCALSIZE


								// assign (ebx, lpbiInput)

									mov		ebx, [esp + LPBI_INPUT]

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (edx, pitch)

									mov		ebp, ecx

									shr		ebp, 1

									sub		edx, ebp

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (edx, LumaIters)

								// The loop body runs before the test and stops only on an exact zero, so
								// FrameHeight has to be a positive multiple of 48.  The lea does not alter
								// the flags; the jnz tests the result of the sub.

									xor		edx, edx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

									lea		edx, [edx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = lpbiInput->biWidth - FrameWidth;

								// assign (esi, width_adj)

									mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		esi, [esp + FRAME_WIDTH]

								// aspect = (width_adj ? LumaIters : 0)

								// assign (edi, aspect)

								// kill (edx, LumaIters)

									mov		[esp + LUMA_ITERS], edx

									xor		edi, edi

									test	esi, esi

									jz		L2

									mov		edi, edx

								// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (edx, height_adj)

								// NOTE(review): the shift is a logical (unsigned) one, so a negative
								// difference would become a large positive height_adj.

								L2:

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		edx, [esp + FRAME_HEIGHT]

									add		edx, edi

									shr		edx, 1

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	edx, edx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

									mov		ebp, 12

									sub		ebp, eax

									mov		[esp + MARK], ebp

								// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)

								// Added after a line has been consumed (FrameWidth * 2 bytes), this moves
								// the input pointer to the start of the preceding input line.

									mov		ebp, (LPBITMAPINFOHEADER)[ebx].biWidth

									add		ebp, [esp + FRAME_WIDTH]

									shl		ebp, 1

									neg		ebp

									mov		[esp + BACK_TWO_LINES], ebp

								// pnext = lpInput +

								//            ((lpbiInput->biWidth << 1) *

								//             ((FrameHeight - aspect - 1) + height_adj)) +

								//            width_adj

								// kill (ebx, lpbiInput)

								// kill (ecx, FrameWidth)

								// kill (edx, height_adj)

								// kill (esi, width_adj)

								// kill (edi, aspect)

								// assign (esi, pnext)

								// The one-operand imul leaves its product in edx:eax, so edx (height_adj)
								// is overwritten here; only the low half in eax is used.

									mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

									shl		eax, 1

									mov		ebx, [esp + FRAME_HEIGHT]

									sub		ebx, edi

									dec		ebx

									add		ebx, edx

									imul	ebx

									add		esi, eax

									add		esi, [esp + LP_INPUT]

								// assign (edi, YPlane)

								// assign (edx, UPlane)

								// assign (ebp, VPlane)

									mov		edi, [esp + YPLANE]

									mov		edx, [esp + UPLANE]

									mov		ebp, [esp + VPLANE]

								// for (j = 0; j < LumaIters; j++)

									xor		eax, eax

									mov		[esp + LOOP_J], eax

								L4:

								// for (k = 0; k < mark; k++)

									xor		eax, eax

									mov		[esp + LOOP_K], eax

								L5:

								// for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8)

									mov		ecx, [esp + FRAME_WIDTH]

									shl		ecx, 1

									add		ecx, esi

									mov		[esp + PEOL], ecx

								// if (0 == (k & 1)) {

								// Even k falls through to L7 (Y, U and V); odd k jumps to L6 (Y only).

									mov		eax, [esp + LOOP_K]

									test	eax, 1

									jnz		L6

								// *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1

								// *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1

								// *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1

								// *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1

								// *(UPlane+0) = *(pnext+ 1) >> 1; *(UPlane+1) = *(pnext+ 5) >> 1

								// *(UPlane+2) = *(pnext+ 9) >> 1; *(UPlane+3) = *(pnext+13) >> 1

								// *(VPlane+0) = *(pnext+ 3) >> 1; *(VPlane+1) = *(pnext+ 7) >> 1

								// *(VPlane+2) = *(pnext+11) >> 1; *(VPlane+3) = *(pnext+15) >> 1

								// or graphically

								//         *************************************************************************************************

								// Values  * Y 0 * U 0 * Y 1 * V 0 * Y 2 * U 1 * Y 3 * V 1 * Y 4 * U 2 * Y 5 * V 2 * Y 6 * U 3 * Y 7 * V 3 *

								//         *************************************************************************************************

								// Y Offsets: 0, 2, 4, 6, 8, 10, 12, 14

								// U Offsets: 1, 5, 9, 13

								// V Offsets: 3, 7, 11, 15

								// Register usage:

								// eax - accumulate Y values

								// ebx - accumulate U values

								// ecx - accumulate V values (peol is reloaded from the stack each pass)

								// esi - ptr to interleaved (Y U Y V byte order) input

								// edi - ptr for writing Y values

								// edx - ptr for writing U values

								// ebp - ptr for writing V values

								// The loop-closing "jl" is a signed comparison of two addresses.

								L7:

								; 1

									mov		al, [esi+4]			; Y2

									mov		bl, [esi+9]			; U2

								; 2

									mov		ah, [esi+6]			; Y3

									mov		bh, [esi+13]		; U3

								; 3

									shl		eax, 16

									mov		cl, [esi+11]		; V2

								; 4

									shl		ebx, 16

									mov		ch, [esi+15]		; V3

								; 5

									shl		ecx, 16

									mov		al, [esi]			; Y0

								; 6

									mov		bh, [esi+5]			; U1

									mov		ah, [esi+2]			; Y1

								; 7

									shr		eax, 1

									mov		bl, [esi+1]			; U0

								; 8

									shr		ebx, 1

									mov		ch, [esi+7]			; V1

								; 9

									and		eax, 07F7F7F7FH

									mov		cl, [esi+3]			; V0

								; 10

									shr		ecx, 1

									and		ebx, 07F7F7F7FH

								; 11

									mov		[edi], eax

									and		ecx, 07F7F7F7FH

								; 12

									mov		al, [esi+12]		; Y6

									mov		[edx], ebx

								; 13

									mov		ah, [esi+14]		; Y7

									mov		[ebp], ecx

								; 14

									shl		eax, 16

									mov		ecx, [esp + PEOL]

								; 15

									mov		al, [esi+8]			; Y4

									lea		edi, [edi+8]

								; 16

									mov		ah, [esi+10]		; Y5

									lea		edx, [edx+4]

								; 17

									shr		eax, 1

									lea		ebp, [ebp+4]

								; 18

									and		eax, 07F7F7F7FH

									lea		esi, [esi+16]

								; 19

									mov		[edi-4], eax

									cmp		esi, ecx

								; 20

									jl		L7


									jmp		L8

								// } else {

								// *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1

								// *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1

								// *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1

								// *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1

								// }

								// Register usage:

								// eax, ebx - accumulate Y values

								// ecx - peol (still holds the value computed at L5)

								// esi - ptr to interleaved (Y U Y V byte order) input

								// edi - ptr for writing Y values

								L6:

								; 1

									mov		al, [esi+4]			; Y2

									mov		bl, [esi+12]		; Y6

								; 2

									mov		ah, [esi+6]			; Y3

									mov		bh, [esi+14]		; Y7

								; 3

									shl		eax, 16

									lea		edi, [edi+8]

								; 4

									shl		ebx, 16

									mov		al, [esi]			; Y0

								; 5

									mov		ah, [esi+2]			; Y1

									mov		bh, [esi+10]		; Y5

								; 6

									shr		eax, 1

									mov		bl, [esi+8]			; Y4

								; 7

									shr		ebx, 1

									and		eax, 07F7F7F7FH

								; 8

									mov		[edi-8], eax

									and		ebx, 07F7F7F7FH

								; 9

									mov		[edi-8+4], ebx

									lea		esi, [esi+16]

								; 10

									cmp		esi, ecx

									jl		L6

								L8:

								// pnext += iBackTwoLines

									add		esi, [esp + BACK_TWO_LINES]

								// YPlane += ypitch_adj

									add		edi, [esp + YPITCH_ADJ]

								// if (0 == (k&1))

									mov		eax, [esp + LOOP_K]

									test	eax, 1

									jnz		L9

								// UPlane += uvpitch_adj

									add		edx, [esp + UVPITCH_ADJ]

								// VPlane += uvpitch_adj

									add		ebp, [esp + UVPITCH_ADJ]

								// k++, continue while k < mark

								L9:

									mov		eax, [esp + LOOP_K]

									inc		eax

									mov		[esp + LOOP_K], eax

									cmp		eax, [esp + MARK]

									jl		L5

								// if (stretch)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L10

								// Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average.

								// The argument slots on the stack are used as the save area.

									mov		[esp + UPLANE], edx

									mov		[esp + VPLANE], ebp

								// plast = pnext - (lpbiInput->biWidth << 1)

								// assign (plast, edx)

									mov		edx, esi

									mov		eax, [esp + LPBI_INPUT]

									mov		eax, (LPBITMAPINFOHEADER)[eax].biWidth

									shl		eax, 1

									sub		edx, eax

								// pbn = pnext

								// assign (pbn, ebp)

									mov		ebp, esi

								// for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8)

									mov		ecx, [esp + FRAME_WIDTH]

									shl		ecx, 1

									add		ecx, ebp

								//     *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1

								//     *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1

								//     *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1

								//     *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1

								// The loop is software-pipelined: the bytes at offsets 4 and 6 of pbn and
								// offset 4 of plast are loaded ahead of the pass that uses them, first
								// here and then at the bottom of each pass after both pointers have been
								// advanced.  The loads issued in the final pass therefore read a few bytes
								// beyond the span being averaged; those values are not used.


									mov		al, [edx+4]

									mov		bl, [ebp+4]

									mov		bh, [ebp+6]

									shl		ebx, 16


								L11:

								; 1

									mov		ah, [edx+6]

									mov		bl, [ebp]

								; 2

									shl		eax, 16

									mov		bh, [ebp+2]

								; 3

									mov		al, [edx]

									lea		edi, [edi+4]

								; 4

									mov		ah, [edx+2]

									lea		edx, [edx+8]

								; 5

									and		eax, 0xFEFEFEFE

									lea		ebp, [ebp+8]

								; 6

									shr		eax, 1

									and		ebx, 0xFEFEFEFE

								; 7

									shr		ebx, 1

									nop

								; 8

									add		eax, ebx

									mov		bl, [ebp+4]

								; 9

									shr		eax, 1

									mov		bh, [ebp+6]

								; 10

									shl		ebx, 16

									and		eax, 0x7F7F7F7F

								; 11

									mov		[edi-4], eax

									mov		al, [edx+4]

								; 12

									cmp		ebp, ecx

									jl		L11

								//   YPlane += ypitch_adj;

									add		edi, [esp + YPITCH_ADJ]

								// Recover ptrs to UPlane and VPlane

									mov		edx, [esp + UPLANE]

									mov		ebp, [esp + VPLANE]

								// j++, continue while j < LumaIters

								L10:

									mov		eax, [esp + LOOP_J]

									inc		eax

									mov		[esp + LOOP_J], eax

									cmp		eax, [esp + LUMA_ITERS]

									jl		L4


								// Epilogue: release the locals and restore the callee-save registers in
								// the reverse of the order in which they were pushed.

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								/*
								 * UYVY_to_YUV12_Flip
								 *
								 * Converts a packed UYVY image (bytes U0 Y0 V0 Y1 for each pixel pair) into
								 * separate Y, U and V planes, shrinking it if necessary by dropping whole
								 * source rows and columns.
								 *
								 *   lpbiInput    - header of the source image; biWidth and biHeight are read.
								 *   pImage       - source bytes, biWidth * 2 bytes per row.
								 *   YPlane       - destination luma, FrameWidth x FrameHeight samples.
								 *   UPlane,
								 *   VPlane       - destination chroma, FrameWidth/2 x FrameHeight/2 samples.
								 *   FrameWidth,
								 *   FrameHeight  - destination size; each must be a multiple of 4.
								 *   pitch        - distance in bytes between destination rows (all planes).
								 *
								 * Every stored sample is the source byte shifted right by one, i.e. 0..127.
								 * Despite the name, destination rows are written in the same order in which
								 * the source rows are read.
								 *
								 * Returns false, without writing anything, when the source is smaller than
								 * the frame in either dimension or when a frame dimension is not a multiple
								 * of 4.  Returns true otherwise.
								 *
								 * When the source is larger than the frame, (source - frame) rows/columns are
								 * dropped at a regular interval (nRowSkipDelta / nColSkipDelta).  Chroma is
								 * taken on that same walk: from every even destination row and column, using
								 * the U/V bytes of the pixel pair containing the selected source pixel.  The
								 * walk consumes at most frame + skipped <= source rows/columns, so no source
								 * index can run past biHeight or biWidth.  (A separate chroma pass that only
								 * accepts even source rows/columns has no such bound and reads past the image
								 * whenever the skip interval is even, e.g. for 2:1 decimation.)
								 */
								bool UYVY_to_YUV12_Flip(
									LPBITMAPINFOHEADER lpbiInput,
									U8 * pImage,
									U8 * YPlane,
									U8 * UPlane,
									U8 * VPlane,
									UN  FrameWidth,
									UN  FrameHeight,
									const int pitch)
								{
									BYTE *pRowStartY, *pRowStartU, *pRowStartV, *pRowStartSrc, *pChromaPair;
									int nRowsToSkip = 0, nColsToSkip = 0, nColsToSkipPerRow = 0;
									int nRowSkipDelta = 0xffffff, nColSkipDelta = 0xffffff;
									int nSrcRowIndex, nDstRowIndex, nSrcColIndex, nDstColIndex;
									bool bChromaRow;

									if ((FrameWidth != (DWORD)(lpbiInput->biWidth)) || (FrameHeight != (DWORD)(lpbiInput->biHeight)))
									{
										nColsToSkipPerRow = lpbiInput->biWidth - FrameWidth;
										nRowsToSkip = lpbiInput->biHeight - FrameHeight;

										// we can only shrink, never enlarge
										if ((nColsToSkipPerRow < 0) || (nRowsToSkip < 0))
										{
											return false;
										}

										// nXXXSkipDelta dictate how often we "skip" a row or col
										// (source size divided by the skip count, rounded up)
										if (nRowsToSkip)
										{
											nRowSkipDelta = (lpbiInput->biHeight + (nRowsToSkip - 1)) / nRowsToSkip;
										}

										if (nColsToSkipPerRow)
										{
											nColSkipDelta = (lpbiInput->biWidth + (nColsToSkipPerRow - 1)) / nColsToSkipPerRow;
										}
									}

									// quick check to make sure we're processing CIF, QCIF, or SQCIF
									if ((FrameWidth % 4) || (FrameHeight % 4))
									{
										return false;
									}

									nSrcRowIndex = 0;
									nDstRowIndex = 0;

									while ((DWORD)nDstRowIndex < FrameHeight)
									{
										// do we need to skip this source row ?
										if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0))
										{
											nRowsToSkip--;
											nSrcRowIndex++;
											continue;
										}

										pRowStartSrc = pImage + (lpbiInput->biWidth * nSrcRowIndex * 2);
										pRowStartY = YPlane + (pitch * nDstRowIndex);

										// chroma planes have half the rows: one per even destination row
										bChromaRow = ((nDstRowIndex & 1) == 0);
										pRowStartU = UPlane + (pitch * (nDstRowIndex / 2));
										pRowStartV = VPlane + (pitch * (nDstRowIndex / 2));

										nSrcColIndex = 0;
										nDstColIndex = 0;
										nColsToSkip = nColsToSkipPerRow;  // the column budget restarts on every row

										while ((DWORD)nDstColIndex < FrameWidth)
										{
											// do we need to skip this source column ?
											if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0))
											{
												nColsToSkip--;
												nSrcColIndex++;
												continue;
											}

											// Y is the second byte of each two-byte pixel slot
											pRowStartY[nDstColIndex] = pRowStartSrc[(nSrcColIndex * 2) + 1] >> 1;

											// one chroma sample per even destination column on a chroma row
											if (bChromaRow && ((nDstColIndex & 1) == 0))
											{
												// U and V are shared by a pixel pair: round the source column
												// down to the pair's first pixel, U is at +0 and V at +2
												pChromaPair = pRowStartSrc + ((nSrcColIndex & ~1) * 2);
												pRowStartU[nDstColIndex / 2] = pChromaPair[0] >> 1;
												pRowStartV[nDstColIndex / 2] = pChromaPair[2] >> 1;
											}

											nSrcColIndex++;
											nDstColIndex++;
										}

										nSrcRowIndex++;
										nDstRowIndex++;
									}

									// and we are done!
									return true;
								}


								// IA_H26X_UYVYtoYUV12
								//
								// Naked inline-assembly converter from packed UYVY input to separate Y, U and
								// V planes.  Every sample written is the source byte shifted right by one and
								// masked to 7 bits.  Y is written for every output row; U and V are written
								// only on rows where the inner counter k is even.
								//
								// The source pointer starts on a late row of the input and is moved back one
								// source row after each output row, so source rows are consumed from the end
								// of the image toward its start.
								//
								// When the input is taller than the frame ("stretch"), each group of 12
								// output rows consists of 11 copied rows plus one row that is the average of
								// two source rows.
								//
								// NOTE(review): the second parameter is named BGR24Image but is read as UYVY
								// bytes (the stack layout below calls it lpInput) - presumably a leftover
								// name; confirm before renaming.
								// NOTE(review): the only call to this routine visible in colorCnvtFrame is
								// commented out.

								__declspec(naked)

								_STATIC void IA_H26X_UYVYtoYUV12(

									LPBITMAPINFOHEADER lpbiInput,

									U8 * BGR24Image,

									U8 * YPlane,

									U8 * UPlane,

									U8 * VPlane,

									UN  FrameWidth,

									UN  FrameHeight,

									const int pitch)

								{

								// Permanent (callee-save) registers - ebx, esi, edi, ebp

								// Temporary (caller-save) registers - eax, ecx, edx

								//

								// Stack frame layout

								//	| pitch			|  + 96

								//	| FrameHeight	|  + 92

								//	| FrameWidth	|  + 88

								//	| VPlane		|  + 84

								//	| UPlane		|  + 80

								//	| YPlane		|  + 76

								//	| lpInput		|  + 72

								//	| lpbiInput		|  + 68

								//	----------------------------

								//	| return addr	|  + 64

								//	| saved ebp		|  + 60

								//	| saved ebx		|  + 56

								//	| saved esi		|  + 52

								//	| saved edi		|  + 48


								//  | pyprev		|  + 44

								//  | pyspace       |  + 40

								//  | pynext        |  + 36

								//	| peol			|  + 32

								//	| j				|  + 28

								//	| k				|  + 24

								//	| iBackTwoLines	|  + 20

								//	| stretch		|  + 16

								//	| mark			|  + 12

								//	| LumaIters		|  +  8

								//	| ypitch_adj	|  +  4

								//	| uvpitch_adj	|  +  0


									_asm {


								// Prologue: save the callee-save registers and reserve the locals.

									push	ebp

									push	ebx

									push	esi

									push	edi

									sub		esp, LOCALSIZE


								// assign (ebx, lpbiInput)

									mov		ebx, [esp + LPBI_INPUT]

								// ypitch_adj = pitch - FrameWidth

								// assign (ecx, FrameWidth)

								// assign (edx, pitch)

									mov		ecx, [esp + FRAME_WIDTH]

									mov		edx, [esp + PITCH_PARM]

									mov		eax, edx

									sub		eax, ecx

									mov		[esp + YPITCH_ADJ], eax

								// uvpitch_adj = pitch - (FrameWidth >> 1)

								// kill (edx, pitch)

									mov		ebp, ecx

									shr		ebp, 1

									sub		edx, ebp

									mov		[esp + UVPITCH_ADJ], edx

								// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4

								// assign (edx, LumaIters)

								// NOTE(review): the loop below exits only when the count reaches exactly

								// zero (jnz), so FrameHeight has to be a positive multiple of 48.

									xor		edx, edx

									mov		eax, [esp + FRAME_HEIGHT]

								L1:

									lea		edx, [edx + 4]

									sub		eax, 48

									jnz		L1

								// width_adj = lpbiInput->biWidth - FrameWidth;

								// assign (esi, width_adj)

									mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth

									sub		esi, [esp + FRAME_WIDTH]

								// aspect = (width_adj ? LumaIters : 0)

								// assign (edi, aspect)

								// kill (edx, LumaIters)

									mov		[esp + LUMA_ITERS], edx

									xor		edi, edi

									test	esi, esi

									jz		L2

									mov		edi, edx

								// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1

								// assign (edx, height_adj)

								L2:

									mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight

									sub		edx, [esp + FRAME_HEIGHT]

									add		edx, edi

									shr		edx, 1

								// stretch = (height_adj ? 1 : 0)

									xor		eax, eax

									test	edx, edx

									jz		L3

									inc		eax

								L3:

									mov		[esp + STRETCH], eax

								// mark = 12 - stretch

								// (rows copied per j iteration; with stretch the 12th row is interpolated)

									mov		ebp, 12

									sub		ebp, eax

									mov		[esp + MARK], ebp

								// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)

								// Added after a row has advanced pnext by FrameWidth*2 bytes, this leaves

								// pnext exactly one source row (biWidth*2 bytes) before where that row began.

									mov		ebp, (LPBITMAPINFOHEADER)[ebx].biWidth

									add		ebp, [esp + FRAME_WIDTH]

									shl		ebp, 1

									neg		ebp

									mov		[esp + BACK_TWO_LINES], ebp

								// pnext = lpInput +

								//            ((lpbiInput->biWidth << 1) *

								//             ((FrameHeight - aspect - 1) + height_adj)) +

								//            width_adj

								// kill (ebx, lpbiInput)

								// kill (ecx, FrameWidth)

								// kill (edx, height_adj)

								// kill (esi, width_adj)

								// kill (edi, aspect)

								// assign (esi, pnext)

								// (the one-operand imul leaves its product in edx:eax; only eax is used)

									mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth

									shl		eax, 1

									mov		ebx, [esp + FRAME_HEIGHT]

									sub		ebx, edi

									dec		ebx

									add		ebx, edx

									imul	ebx

									add		esi, eax

									add		esi, [esp + LP_INPUT]

								// assign (edi, YPlane)

								// assign (edx, UPlane)

								// assign (ebp, VPlane)

									mov		edi, [esp + YPLANE]

									mov		edx, [esp + UPLANE]

									mov		ebp, [esp + VPLANE]

								// for (j = 0; j < LumaIters; j++)

									xor		eax, eax

									mov		[esp + LOOP_J], eax

								L4:

								// for (k = 0; k < mark; k++)

									xor		eax, eax

									mov		[esp + LOOP_K], eax

								L5:

								// for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8)

									mov		ecx, [esp + FRAME_WIDTH]

									shl		ecx, 1

									add		ecx, esi

									mov		[esp + PEOL], ecx

								// if (0 == (k & 1)) {

									mov		eax, [esp + LOOP_K]

									test	eax, 1

									jnz		L6

								// Even row: 16 source bytes give 8 Y, 4 U and 4 V samples.

								// *(YPlane+0) = *(pnext+ 1) >> 1; *(YPlane+1) = *(pnext+ 3) >> 1

								// *(YPlane+2) = *(pnext+ 5) >> 1; *(YPlane+3) = *(pnext+ 7) >> 1

								// *(YPlane+4) = *(pnext+ 9) >> 1; *(YPlane+5) = *(pnext+11) >> 1

								// *(YPlane+6) = *(pnext+13) >> 1; *(YPlane+7) = *(pnext+15) >> 1

								// *(UPlane+0) = *(pnext+ 0) >> 1; *(UPlane+1) = *(pnext+ 4) >> 1

								// *(UPlane+2) = *(pnext+ 8) >> 1; *(UPlane+3) = *(pnext+12) >> 1

								// *(VPlane+0) = *(pnext+ 2) >> 1; *(VPlane+1) = *(pnext+ 6) >> 1

								// *(VPlane+2) = *(pnext+10) >> 1; *(VPlane+3) = *(pnext+14) >> 1

								// or graphically

								//         *************************************************************************************************

								// Values  * U 0 * Y 0 * V 0 * Y 1 * U 1 * Y 2 * V 1 * Y 3 * U 2 * Y 4 * V 2 * Y 5 * U 3 * Y 6 * V 3 * Y 7 *

								//         *************************************************************************************************

								// Y Offsets        1           3           5           7           9           11          13          15

								// U Offsets  0                       4                       8                       12

								// V Offsets              2                       6                       10                      14

								// Register usage:

								// eax - accumulate Y values

								// ebx - accumulate U values

								// ecx - accumulate V values

								// esi - ptr to interlaced (UYVY) input

								// edi - ptr for writing Y values

								// edx - ptr for writing U values

								// ebp - ptr for writing V values

								// Four samples are packed into each 32-bit register, shifted right once as

								// a group, and masked with 7F7F7F7Fh to drop the bit that crossed over from

								// the neighbouring byte.

								L7:

								; 1

									mov		al, [esi+5]			; Y2

									mov		bl, [esi+8]			; U2

								; 2

									mov		ah, [esi+7]			; Y3

									mov		bh, [esi+12]		; U3

								; 3

									shl		eax, 16

									mov		cl, [esi+10]		; V2

								; 4

									shl		ebx, 16

									mov		ch, [esi+14]		; V3

								; 5

									shl		ecx, 16

									mov		al, [esi+1]			; Y0

								; 6

									mov		bh, [esi+4]			; U1

									mov		ah, [esi+3]			; Y1

								; 7

									shr		eax, 1

									mov		bl, [esi]			; U0

								; 8

									shr		ebx, 1

									mov		ch, [esi+6]			; V1

								; 9

									and		eax, 07F7F7F7FH

									mov		cl, [esi+2]			; V0

								; 10

									shr		ecx, 1

									and		ebx, 07F7F7F7FH

								; 11

									mov		[edi], eax

									and		ecx, 07F7F7F7FH

								; 12

									mov		al, [esi+13]		; Y6

									mov		[edx], ebx

								; 13

									mov		ah, [esi+15]		; Y7

									mov		[ebp], ecx

								; 14

									shl		eax, 16

									mov		ecx, [esp + PEOL]

								; 15

									mov		al, [esi+9]			; Y4

									lea		edi, [edi+8]

								; 16

									mov		ah, [esi+11]		; Y5

									lea		edx, [edx+4]

								; 17

									shr		eax, 1

									lea		ebp, [ebp+4]

								; 18

									and		eax, 07F7F7F7FH

									lea		esi, [esi+16]

								; 19

									mov		[edi-4], eax

									cmp		esi, ecx

								; 20

									jl		L7


									jmp		L8

								// } else {

								// Odd row: only the 8 Y samples are stored; U and V bytes are not read.

								// *(YPlane+0) = *(pnext+ 1) >> 1; *(YPlane+1) = *(pnext+ 3) >> 1

								// *(YPlane+2) = *(pnext+ 5) >> 1; *(YPlane+3) = *(pnext+ 7) >> 1

								// *(YPlane+4) = *(pnext+ 9) >> 1; *(YPlane+5) = *(pnext+11) >> 1

								// *(YPlane+6) = *(pnext+13) >> 1; *(YPlane+7) = *(pnext+15) >> 1

								// }

								// Register usage:

								// eax, ebx - accumulate Y values

								// ecx - peol

								// esi - ptr to interlaced (UYVY) input

								// edi - ptr for writing Y values

								L6:

								; 1

									mov		al, [esi+5]			; Y2

									mov		bl, [esi+13]		; Y6

								; 2

									mov		ah, [esi+7]			; Y3

									mov		bh, [esi+15]		; Y7

								; 3

									shl		eax, 16

									lea		edi, [edi+8]

								; 4

									shl		ebx, 16

									mov		al, [esi+1]			; Y0

								; 5

									mov		ah, [esi+3]			; Y1

									mov		bh, [esi+11]		; Y5

								; 6

									shr		eax, 1

									mov		bl, [esi+9]			; Y4

								; 7

									shr		ebx, 1

									and		eax, 07F7F7F7FH

								; 8

									mov		[edi-8], eax

									and		ebx, 07F7F7F7FH

								; 9

									mov		[edi-8+4], ebx

									lea		esi, [esi+16]

								; 10

									cmp		esi, ecx

									jl		L6

								L8:

								// pnext += iBackTwoLines

									add		esi, [esp + BACK_TWO_LINES]

								// YPlane += ypitch_adj

									add		edi, [esp + YPITCH_ADJ]

								// if (0 == (k&1))

									mov		eax, [esp + LOOP_K]

									test	eax, 1

									jnz		L9

								// UPlane += uvpitch_adj

									add		edx, [esp + UVPITCH_ADJ]

								// VPlane += uvpitch_adj

									add		ebp, [esp + UVPITCH_ADJ]

								L9:

								// k++; repeat while k < mark

									mov		eax, [esp + LOOP_K]

									inc		eax

									mov		[esp + LOOP_K], eax

									cmp		eax, [esp + MARK]

									jl		L5

								// if (stretch)

									mov		eax, [esp + STRETCH]

									test	eax, eax

									jz		L10

								// Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average.

								// (the UPlane/VPlane argument slots on the stack double as the save area)

									mov		[esp + UPLANE], edx

									mov		[esp + VPLANE], ebp

								// plast = pnext - (lpbiInput->biWidth << 1)

								// assign (plast, edx)

									mov		edx, esi

									mov		eax, [esp + LPBI_INPUT]

									mov		eax, (LPBITMAPINFOHEADER)[eax].biWidth

									shl		eax, 1

									sub		edx, eax

								// pbn = pnext

								// assign (pbn, ebp)

									mov		ebp, esi

								// for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8)

									mov		ecx, [esp + FRAME_WIDTH]

									shl		ecx, 1

									add		ecx, ebp

								// Only an extra Y row is produced here; pnext (esi) itself is not advanced.

								//     *(YPlane+0) = ((*(plast+1) >> 1) + (*(pbn+1) >> 1)) >> 1

								//     *(YPlane+1) = ((*(plast+3) >> 1) + (*(pbn+3) >> 1)) >> 1

								//     *(YPlane+2) = ((*(plast+5) >> 1) + (*(pbn+5) >> 1)) >> 1

								//     *(YPlane+3) = ((*(plast+7) >> 1) + (*(pbn+7) >> 1)) >> 1


								// Preload the first bytes; the loop body reloads them for the next group

								// at the end of each pass.

									mov		al, [edx+5]

									mov		bl, [ebp+5]

									mov		bh, [ebp+7]

									shl		ebx, 16


								L11:

								; 1

									mov		ah, [edx+7]

									mov		bl, [ebp+1]

								; 2

									shl		eax, 16

									mov		bh, [ebp+3]

								; 3

									mov		al, [edx+1]

									lea		edi, [edi+4]

								; 4

									mov		ah, [edx+3]

									lea		edx, [edx+8]

								; 5

									and		eax, 0xFEFEFEFE

									lea		ebp, [ebp+8]

								; 6

									shr		eax, 1

									and		ebx, 0xFEFEFEFE

								; 7

									shr		ebx, 1

									nop

								; 8

									add		eax, ebx

									mov		bl, [ebp+5]

								; 9

									shr		eax, 1

									mov		bh, [ebp+7]

								; 10

									shl		ebx, 16

									and		eax, 0x7F7F7F7F

								; 11

									mov		[edi-4], eax

									mov		al, [edx+5]

								; 12

									cmp		ebp, ecx

									jl		L11

								//   YPlane += ypitch_adj;

									add		edi, [esp + YPITCH_ADJ]

								// Recover ptrs to UPlane and VPlane

									mov		edx, [esp + UPLANE]

									mov		ebp, [esp + VPLANE]

								L10:

								// j++; repeat while j < LumaIters

									mov		eax, [esp + LOOP_J]

									inc		eax

									mov		[esp + LOOP_J], eax

									cmp		eax, [esp + LUMA_ITERS]

									jl		L4


								// Epilogue: release the locals and restore the callee-save registers.

									add		esp, LOCALSIZE

									pop		edi

									pop		esi

									pop		ebx

									pop		ebp

									ret


									}

								}


								// Retire the stack-frame offset names used by the naked assembly converters

								// above (they appear there as [esp + NAME] operands) so that they do not

								// leak into the code that follows.

								#undef LOCALSIZE


								#undef PITCH_PARM

								#undef FRAME_HEIGHT

								#undef FRAME_WIDTH

								#undef VPLANE

								#undef UPLANE

								#undef YPLANE

								#undef LP_INPUT

								#undef LPBI_INPUT


								#undef PYPREV

								#undef PYSPACE

								#undef PYNEXT

								#undef PEOL

								#undef LOOP_J

								#undef LOOP_K

								#undef BACK_TWO_LINES

								#undef STRETCH

								#undef MARK

								#undef LUMA_ITERS

								#undef YPITCH_ADJ

								#undef UVPITCH_ADJ


								/*************************************************************
								 *  Name:         colorCnvtFrame
								 *  Description:  Color convert and copy input frame.
								 *
								 *  Picks the input converter from the input bitmap header and
								 *  runs it once:
								 *
								 *    BI_RGB, 24 bpp            IA_H26X_BGR24toYUV12
								 *    BI_RGB, 16 bpp            IA_H26X_BGR16555toYUV12
								 *    BI_RGB,  8 bpp            IA_H26X_CLUT8toYUV12
								 *    BI_RGB,  4 bpp            IA_H26X_CLUT4toYUV12
								 *    FOURCC_YVU9               IA_H26X_YVU9toYUV12
								 *    FOURCC_YUV12 / _IYUV      IA_H26X_YUV12toEncYUV12
								 *    FOURCC_YUY2               IA_H26X_YUY2toYUV12
								 *    FOURCC_UYVY               UYVY_to_YUV12_Flip
								 *
								 *  Parameters:
								 *    EC          supplies FrameWidth and FrameHeight.
								 *    lpCompInst  not used by this routine.
								 *    lpicComp    supplies lpInput (source bytes) and lpbiInput.
								 *    YPlane, UPlane, VPlane
								 *                destination planes, passed with pitch PITCH.
								 *
								 *  Any other input format, and a failed UYVY conversion, is
								 *  reported through DBOUT; nothing is returned to the caller.
								 *
								 *  With _CODEC_STATS defined and pEncoderStats non-NULL, the
								 *  PENTIUM_TIMER() difference across the converter call is
								 *  stored in pEncoderStats->color_convertor_time.
								 ************************************************************/
								void colorCnvtFrame(
								    T_H263EncoderCatalog * EC,
								    LPCODINST              lpCompInst,
								    ICCOMPRESS           * lpicComp,
								    U8                   * YPlane,
								    U8                   * UPlane,
								    U8                   * VPlane
								)
								{
								    U8 *RGBCursor = (U8 *) lpicComp->lpInput;
								    LPBITMAPINFOHEADER lpbiInput = lpicComp->lpbiInput;

								    /*  The Connectix Quick Cam requires RGB to YUV12 conversion.
								     *  The B/W camera generates palette versions (8 and 4 bit).
								     *  The color camera generates RGB24 for million colors and
								     *  RGB16555 for thousands colors.
								     */

								    if (BI_RGB == lpbiInput->biCompression)
								    {
								        switch (lpbiInput->biBitCount)
								        {
								        case 24:
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								            }
								#endif
								            IA_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								                EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time =
								                    PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								            }
								#endif
								            break;

								        case 16:
								            // To use a common routine for all possible combinations of RGB16,
								            // a bitfield number is passed. This number identifies the proper bit shift
								            // and masking values to extract the color information
								            // from the 16-bit pixel words.
								            //
								            //   number             shift              mask
								            //                     B, G, R
								            //   ------          -----------       ----------------
								            //    555              2, 3, 8         0x7C, 0x7C, 0x7C
								            //    664              3, 3, 9         0x78, 0x7E, 0x7E
								            //    565              2, 4, 9         0x7C, 0x7E, 0x7C
								            //    655              2, 3, 9         0x7C, 0x7C, 0x7E
								            //
								            // Only 555 falls under BI_RGB. The others are specified using the
								            // BI_BITFIELDS compression specification. For BI_BITFIELDS, call
								            // Build16bitModeID to get the actual bitfield number. This routine requires the
								            // three array elements in the bmiColors field of a BITMAPINFO object.
								            //
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								            }
								#endif
								            IA_H26X_BGR16555toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								                EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time =
								                    PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								            }
								#endif
								            break;

								        case 8:
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								            }
								#endif
								            IA_H26X_CLUT8toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								                EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time =
								                    PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								            }
								#endif
								            break;

								        case 4:
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								            }
								#endif
								            IA_H26X_CLUT4toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								                EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								            if (pEncoderStats) {
								                pEncoderStats->color_convertor_time =
								                    PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								            }
								#endif
								            break;

								        default:
								            DBOUT("ERROR: Unexpected input format detected.");
								            break;
								        }
								    }
								    else if (FOURCC_YVU9 == lpbiInput->biCompression)
								    {
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								        }
								#endif
								        IA_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								            EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time =
								                PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								        }
								#endif
								    }
								    else if ((FOURCC_YUV12 == lpbiInput->biCompression) || (FOURCC_IYUV == lpbiInput->biCompression))
								    {
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								        }
								#endif
								        IA_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								            EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time =
								                PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								        }
								#endif
								    }
								    else if (FOURCC_YUY2 == lpbiInput->biCompression)
								    {
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								        }
								#endif
								        IA_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								            EC->FrameWidth, EC->FrameHeight, PITCH);
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time =
								                PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								        }
								#endif
								    }
								    else if (FOURCC_UYVY == lpbiInput->biCompression)
								    {
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time = PENTIUM_TIMER();
								        }
								#endif
								        // UYVY goes through the C converter (IA_H26X_UYVYtoYUV12 is not
								        // called).  It is the only converter here that can refuse its
								        // input, so its result is checked rather than dropped.
								        if (!UYVY_to_YUV12_Flip(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
								                EC->FrameWidth, EC->FrameHeight, PITCH))
								        {
								            DBOUT("ERROR: UYVY to YUV12 conversion failed.");
								        }
								#if defined(_CODEC_STATS)
								        if (pEncoderStats) {
								            pEncoderStats->color_convertor_time =
								                PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
								        }
								#endif
								    }
								    else
								    {
								        DBOUT("ERROR: Unexpected input format detected.");
								    }
								}