|
|
/* *************************************************************************
** INTEL Corporation Proprietary Information ** ** This listing is supplied under the terms of a license ** agreement with INTEL Corporation and may not be copied ** nor disclosed except in accordance with the terms of ** that agreement. ** ** Copyright (c) 1995 Intel Corporation. ** All Rights Reserved. ** ** ************************************************************************* */
////////////////////////////////////////////////////////////////////////////
//
// $Author: MDUDA $
// $Date: 21 Nov 1996 17:33:56 $
// $Archive: S:\h26x\src\enc\excolcnv.cpv $
// $Header: S:\h26x\src\enc\excolcnv.cpv 1.45 21 Nov 1996 17:33:56 MDUDA $
// $Log: S:\h26x\src\enc\excolcnv.cpv $
//
// Rev 1.45 21 Nov 1996 17:33:56 MDUDA
// Added more non-compressed YUV12 support (RGB16 and RGB24).
// Also rewrote IA_YUV12toEncYUV12 to be more readable.
//
// Rev 1.44 31 Oct 1996 10:05:48 KLILLEVO
// changed from DBOUT to DbgLog
//
// Rev 1.43 22 Oct 1996 16:44:22 MDUDA
// Added IA support for YUY2 input color conversion and cleaned up C version.
// Now using IA version.
//
// Rev 1.42 18 Oct 1996 14:31:32 MDUDA
//
// Added a C-version of YUY2 input color conversion.
//
// Rev 1.41 11 Oct 1996 16:04:50 MDUDA
// Using new RGB to YUV lookup tables.
//
// Rev 1.40 03 Oct 1996 10:43:58 AGUPTA2
// Got rid of segment directives; made tables read-only.
//
// Rev 1.39 13 Sep 1996 13:34:04 MDUDA
// Fixed YVU9 bug where input = output frame size was not colored
// (U and V planes) properly.
//
// Rev 1.38 11 Sep 1996 15:45:06 MDUDA
// Modified RGB look-up tables and added C_H26X_YUV12toEncYUV12 and
// IA_H26X_YUV12toEncYUV12.
//
// Rev 1.37 03 Sep 1996 14:54:46 MDUDA
// Fixed problem causing VC++ 4.1 internal compiler error. Replaced
// inline assembler constructs such as [ebx.biWidth] with
// (LPBITMAPINFOHEADER)[ebx].biWidth.
//
// Rev 1.36 29 Aug 1996 16:31:14 MDUDA
// Added Pentium assembler versions for all RGB conversion routines.
// Also, rewrote YVU9 support to allow input frame sizes other
// than 160x120 and 240x180.
//
// Rev 1.35 16 Aug 1996 12:17:48 MDUDA
// Fixed bug where U and V values in the BGR converters were treated as unsigned
// values. Also did some general cleanup of BGR converters in preparation for
// doing Pentium assembler version.
//
// Rev 1.34 13 Aug 1996 10:35:38 MDUDA
// Added support for RGB4. Generalized RGB LUT support for 4-bit and
// 8-bit pixels into a single routine.
//
// Rev 1.33 09 Aug 1996 09:45:02 MDUDA
// Added support for RGB16 format on input. This is for the color
// Quick Cam. Also, generalized RGB16 for other bit combinations.
// However, these can only be specified under BI_BITFIELDS format.
//
// Rev 1.32 02 Aug 1996 13:44:48 MDUDA
// modified H26X_BGR24toYUV12 to crop and stretch 240x180 and 160x120
// frames
//
// Rev 1.31 01 Aug 1996 14:03:50 MDUDA
//
// Optimized H26X_YVU9toYUV12 by rewriting function in assembler code. Used in
// _asm. Also re-arranged functions so that colorCnvtFrame is at the end of
// the file.
//
// Rev 1.30 22 Jul 1996 13:28:22 BECHOLS
// Added a CLUT8 to YUV12 color convertor (CC). This CC crops and stretches
// either the 240x180 or the 160x120 image size to produce QCIF and SubQCIF
// image sizes respectively.
//
// Rev 1.29 11 Jul 1996 15:47:02 MDUDA
//
// Modified H263_YVU9toYUV12 to create subQCIF and QCIF from
// 160x120 and 240x180 images, respectively. To fit the new
// formats, the original images are cropped and stretched using a
// dither pattern for the color planes.
//
// Rev 1.28 14 May 1996 12:04:08 KLILLEVO
// changed RGB->YUV color conversion to use the inverse
// of the output YUV->RGB conversion instead of the conversion
// "recommended by the CCIR". Compression performance for RGB
// input was significantly improved (33% less bits for same
// fixed QP)
//
// Rev 1.27 04 May 1996 21:55:20 BECHOLS
// For RGB24 to YVU12 conversion, I unrolled the inner loop by 8 and changed
// the writes to DWORD vs. BYTE writes. This resulted in a 30% reduction in
// the execution time.
//
// Rev 1.26 10 Apr 1996 16:44:14 RHAZRA
// Fixed a bug in 320x240 mode for the H26X_YUV12toEncYUV12() function.
// DWORD should be and-ed with 0x7f7f7f7f and not 0x7f7f7f.
//
// Rev 1.25 27 Mar 1996 15:10:08 SCDAY
// Optimized H26X_YUV12toEncYUV12 'C' code to read/write DWORDs
//
// Rev 1.24 08 Jan 1996 17:46:14 unknown
//
// Correct logic on bIs320x240 check
//
// Rev 1.23 05 Jan 1996 17:34:38 RMCKENZX
// corrected chroma pad value to 0x40 to achieve black padding
//
// Rev 1.22 05 Jan 1996 17:29:46 RMCKENZX
// Added code to pad out 320x240 stills to 352x288
// full CIF images.
//
// Rev 1.21 04 Jan 1996 18:37:20 TRGARDOS
// Added code to permit 320x240 input and then set a boolean
// bIs320x240.
//
// Rev 1.20 02 Jan 1996 17:09:04 TRGARDOS
// Moved colorCnvFrame into this file and made the
// color convertor functions static.
//
// Rev 1.19 27 Dec 1995 15:32:56 RMCKENZX
// Added copyright notice
//
// Rev 1.18 06 Dec 1995 09:35:42 TRGARDOS
// Added Brian's fix to the input color convertor to avoid
// overflow of the chars.
//
// Rev 1.17 27 Nov 1995 16:09:04 TRGARDOS
// Removed two unused variables to get rid of compiler warnings.
//
// Rev 1.16 30 Oct 1995 14:34:12 TRGARDOS
// Fixed 240x180 to center clip.
//
// Rev 1.15 30 Oct 1995 12:03:16 TRGARDOS
// Added color convertor support for YUV9 240x180.
//
// Rev 1.14 28 Oct 1995 15:39:28 TRGARDOS
// Fixed color conversion problem from YVU9 to YVU12.
//
// Rev 1.13 12 Oct 1995 17:40:12 TRGARDOS
// Fixed YUV12 input color convertor.
//
// Rev 1.12 12 Oct 1995 12:04:16 TRGARDOS
// Changed some variable names in YUV12 convertor.
//
// Rev 1.11 10 Oct 1995 16:34:12 TRGARDOS
// Added YUV12 input support.
//
// Rev 1.10 28 Sep 1995 17:02:36 DBRUCKS
// fix colorIn to not swap left to right
//
// Rev 1.9 15 Sep 1995 16:37:38 TRGARDOS
//
//
// Rev 1.8 13 Sep 1995 17:09:22 TRGARDOS
//
// Finished adding encoder support for YVU9 160x120 frames.
//
// Rev 1.7 11 Sep 1995 11:14:06 DBRUCKS
// add h261 ifdef
//
// Rev 1.6 07 Sep 1995 09:27:54 TRGARDOS
// Added YVU9 to YVU12 color convertor.
//
// Rev 1.5 05 Sep 1995 15:50:46 TRGARDOS
// Added color back in to convertors.
//
// Rev 1.4 01 Sep 1995 17:51:42 TRGARDOS
// Fixed bugs in color converter.
//
// Rev 1.3 01 Sep 1995 10:13:42 TRGARDOS
// Debugging bit stream errors.
//
// Rev 1.2 30 Aug 1995 12:42:26 TRGARDOS
// Fixed bugs in intra AC coef VLC coding.
//
// Rev 1.1 02 Aug 1995 17:28:06 TRGARDOS
//
// Cleaned up stuff to get stub working under new
// version control system.
//
// Rev 1.0 31 Jul 1995 13:07:10 DBRUCKS
// Initial revision.
//
// Rev 1.0 17 Jul 1995 14:46:16 CZHU
// Initial revision.
//
// Rev 1.0 17 Jul 1995 14:14:22 CZHU
// Initial revision.
;////////////////////////////////////////////////////////////////////////////
/*
CCIR 601 specifies a conversion from RGB to YCrCb. What we call U and V are equivalent to Cb and Cr: U = Cb, V = Cr.
From CCIR 601-2 Annex II, we can go from RGB with values in the range of 0-255, to YUV values in the same range by the equations:
    Y = ( 77*R + 150*G + 29*B ) >> 8;
    V = ( ( 131*R - 110*G - 21*B ) >> 8 ) + 128; // Cr
    U = ( ( (-44)*R - 87*G + 131*B ) >> 8 ) + 128; // Cb
This has now been changed to the inverse of the YUV->RGB conversion used on the output, since the old version produced way too many bits. The new version is:
    Y = ( ( 16836*R + 33056*G + 6416*B ) >> 16 ) + 16;
    V = ( ( 28777*R - 24117*G - 4660*B ) >> 16 ) + 128; // Cr
    U = ( ( (-9726)*R - 19064*G + 28790*B ) >> 16 ) + 128; // Cb
*/
#include "precomp.h"
#if defined(_CODEC_STATS)
// 2^32: weight of the high dword (EDX) of the 64-bit counter value.
static const double RDTSC_SHIFT_32 = 4294967296.0;

// Reads the processor time-stamp counter and scales it by RDTSC_CLOCK_FREQ.
// Returns (ticks / RDTSC_CLOCK_FREQ) * 1000.0, or 0.0 when RDTSC_CLOCK_FREQ
// is zero. Only the low 16 bits of the high dword are used, so the tick
// count wraps at 2^48.
// NOTE(review): the result is presumably in milliseconds, assuming
// RDTSC_CLOCK_FREQ is expressed in Hz - confirm where it is defined.
static double PENTIUM_TIMER()
{
    unsigned long int tsc_low, tsc_high;
    double low_part, high_part, scaled;

    __asm {
        _emit 0x0f              // raw opcode bytes 0F 31 = RDTSC
        _emit 0x31
        mov tsc_low, eax        // low 32 bits of the counter
        mov tsc_high, edx       // high 32 bits of the counter
    }

    // Guard against dividing by an unset clock frequency.
    if (!RDTSC_CLOCK_FREQ) {
        return( 0.0 );
    }

    low_part = (double) tsc_low;
    high_part = (double) (tsc_high & 0xFFFF);
    scaled = (low_part + high_part * RDTSC_SHIFT_32) / RDTSC_CLOCK_FREQ;
    return( scaled * 1000.0 );
}
#endif
// Linkage control for the file-local functions.
// Local functions are declared with _STATIC, which normally expands to
// "static". VTune doesn't recognize static functions, so we need some way to
// turn off the static attribute if VTune statistics are to be collected.
// For now, simply build the driver with _VTUNE defined: _STATIC then expands
// to nothing and the functions lose their "static" qualifier.
#if defined(_VTUNE)
#define _STATIC
#else
#define _STATIC static
#endif
// These are the look-up tables for the RGB converters. They are 8 bytes/entry
// to allow addressing via the scale by 8 indexed addressing mode. A pseudo-SIMD
// arrangement is used in these tables. Since all R, G and B contributions to the
// Y value are positive and fit in 15 bits, these are stored in the lower 16-bits
// of the YU word. In some cases, the U contribution is negative so it is placed
// in the upper 16 bits of the YU word. When a Y value is calculated, the U value
// is calculated in parallel. The V contribution is negative in some cases, but it
// gets its own word.
// This is the code that was used to generate the tables.
// It is compiled out (#if 0) and kept only as a record of how the table
// values were derived. For each of R, G and B it prints 128 entries, one for
// every odd component value 1, 3, 5, ..., 255 (i*4 + j + 1 with j = 0, 2).
// Each coefficient product is shifted right by 9 bits. The Y term goes in the
// low 16 bits of YU and the U term in the high 16 bits; U is negated for R
// and G, V is negated for G and B.
// Note that the generator prints "struct YUV XYUV[]" without the "const"
// qualifier that the tables in this file carry.
// NOTE(review): the R and G entries left-shift a negative value
// ((-(...)) << 16), which ISO C leaves undefined; the tables below are what
// the original tool chain produced.
#if 0
// Fixed-point (16 fractional bits) magnitudes of the RGB->YUV coefficients;
// the signs are applied in the printf expressions below.
#define YRCoef 16836
#define YGCoef 33056
#define YBCoef 6416
#define URCoef 9726
#define UGCoef 19064
#define UBCoef 28790
#define VRCoef 28777
#define VGCoef 24117
#define VBCoef 4660
#include <stdio.h>
void main() {
    int i,j;
    // Emit the entry type.
    printf("struct YUV {\n");
    printf(" int YU;\n");
    printf(" int V;\n");
    printf("};\n\n");
    // Red contributions: +Y, -U, +V.
    printf("struct YUV RYUV[] = {\n");
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            printf("{0x%.8x, 0x%.8x}, ", ((YRCoef*((i*4)+j+1))>>9) | ((-(((URCoef*((i*4)+j+1)))>>9))<<16), ((VRCoef*((i*4)+j+1))>>9));
        }
        printf("\n");
    }
    printf("};\n");
    // Green contributions: +Y, -U, -V.
    printf("struct YUV GYUV[] = {\n");
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            printf("{0x%.8x, 0x%.8x}, ", ((YGCoef*((i*4)+j+1))>>9) | ((-(((UGCoef*((i*4)+j+1)))>>9))<<16), -((VGCoef*((i*4)+j+1))>>9));
        }
        printf("\n");
    }
    printf("};\n");
    // Blue contributions: +Y, +U, -V.
    printf("struct YUV BYUV[] = {\n");
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            printf("{0x%.8x, 0x%.8x}, ", ((YBCoef*((i*4)+j+1))>>9) | (((UBCoef*((i*4)+j+1))>>9)<<16), -((VBCoef*((i*4)+j+1))>>9));
        }
        printf("\n");
    }
    printf("};\n");
}
#endif
// One look-up table entry: the contribution of a single colour component
// value (R, G or B) to the Y, U and V results.
//   YU - Y contribution in the low 16 bits and the (signed) U contribution in
//        the high 16 bits, so a single 32-bit add accumulates both.
//   V  - the (signed) V contribution.
// RYUV, GYUV and BYUV are the generated tables for the red, green and blue
// components. The generator emits 128 entries per table and the converters
// index them with the top 7 bits of the 8-bit component, i.e. (component >> 1).
// The values are generated data - do not edit them by hand.
struct YUV { int YU; int V; };
const struct YUV RYUV[] = { {0xffee0020, 0x00000038}, {0xffc80062, 0x000000a8}, {0xffa200a4, 0x00000119}, {0xff7c00e6, 0x00000189}, {0xff560127, 0x000001f9}, {0xff300169, 0x0000026a}, {0xff0a01ab, 0x000002da}, {0xfee401ed, 0x0000034b}, {0xfebe022f, 0x000003bb}, {0xfe980270, 0x0000042b}, {0xfe7202b2, 0x0000049c}, {0xfe4c02f4, 0x0000050c}, {0xfe260336, 0x0000057d}, {0xfe000377, 0x000005ed}, {0xfdda03b9, 0x0000065d}, {0xfdb403fb, 0x000006ce}, {0xfd8e043d, 0x0000073e}, {0xfd68047e, 0x000007af}, {0xfd4204c0, 0x0000081f}, {0xfd1c0502, 0x0000088f}, {0xfcf60544, 0x00000900}, {0xfcd00585, 0x00000970}, {0xfcaa05c7, 0x000009e1}, {0xfc840609, 0x00000a51}, {0xfc5e064b, 0x00000ac2}, {0xfc38068d, 0x00000b32}, {0xfc1206ce, 0x00000ba2}, {0xfbec0710, 0x00000c13}, {0xfbc60752, 0x00000c83}, {0xfba00794, 0x00000cf4}, {0xfb7a07d5, 0x00000d64}, {0xfb540817, 0x00000dd4}, {0xfb2e0859, 0x00000e45}, {0xfb08089b, 0x00000eb5}, {0xfae208dc, 0x00000f26}, {0xfabc091e, 0x00000f96}, {0xfa960960, 0x00001006}, {0xfa7009a2, 0x00001077}, {0xfa4a09e3, 0x000010e7}, {0xfa240a25, 0x00001158}, {0xf9fe0a67, 0x000011c8}, {0xf9d80aa9, 0x00001239}, {0xf9b20aeb, 0x000012a9}, {0xf98c0b2c, 0x00001319}, {0xf9660b6e, 0x0000138a}, {0xf9400bb0, 0x000013fa}, {0xf91a0bf2, 0x0000146b}, {0xf8f40c33, 0x000014db}, {0xf8ce0c75, 0x0000154b}, {0xf8a80cb7, 0x000015bc}, {0xf8820cf9, 0x0000162c}, {0xf85c0d3a, 0x0000169d}, {0xf8360d7c, 0x0000170d}, {0xf8100dbe, 0x0000177d}, {0xf7ea0e00, 0x000017ee}, {0xf7c40e41, 0x0000185e}, {0xf79e0e83, 0x000018cf}, {0xf7780ec5, 0x0000193f}, {0xf7520f07, 0x000019af}, {0xf72c0f49, 0x00001a20}, {0xf7060f8a, 0x00001a90}, {0xf6e00fcc, 0x00001b01}, {0xf6ba100e, 0x00001b71}, {0xf6941050, 0x00001be2}, {0xf66e1091, 0x00001c52}, {0xf64810d3, 0x00001cc2}, {0xf6221115, 0x00001d33}, {0xf5fc1157, 0x00001da3}, {0xf5d61198, 0x00001e14}, {0xf5b011da, 0x00001e84}, {0xf58a121c, 0x00001ef4}, {0xf564125e, 0x00001f65}, {0xf53e12a0, 0x00001fd5}, {0xf51812e1, 0x00002046}, {0xf4f21323, 0x000020b6}, {0xf4cc1365, 
0x00002126}, {0xf4a613a7, 0x00002197}, {0xf48013e8, 0x00002207}, {0xf45a142a, 0x00002278}, {0xf434146c, 0x000022e8}, {0xf40e14ae, 0x00002359}, {0xf3e814ef, 0x000023c9}, {0xf3c21531, 0x00002439}, {0xf39c1573, 0x000024aa}, {0xf37615b5, 0x0000251a}, {0xf35015f6, 0x0000258b}, {0xf32a1638, 0x000025fb}, {0xf304167a, 0x0000266b}, {0xf2de16bc, 0x000026dc}, {0xf2b816fe, 0x0000274c}, {0xf292173f, 0x000027bd}, {0xf26c1781, 0x0000282d}, {0xf24617c3, 0x0000289d}, {0xf2201805, 0x0000290e}, {0xf1fa1846, 0x0000297e}, {0xf1d41888, 0x000029ef}, {0xf1ae18ca, 0x00002a5f}, {0xf188190c, 0x00002acf}, {0xf162194d, 0x00002b40}, {0xf13c198f, 0x00002bb0}, {0xf11619d1, 0x00002c21}, {0xf0f01a13, 0x00002c91}, {0xf0ca1a54, 0x00002d02}, {0xf0a41a96, 0x00002d72}, {0xf07e1ad8, 0x00002de2}, {0xf0581b1a, 0x00002e53}, {0xf0321b5c, 0x00002ec3}, {0xf00c1b9d, 0x00002f34}, {0xefe61bdf, 0x00002fa4}, {0xefc01c21, 0x00003014}, {0xef9a1c63, 0x00003085}, {0xef741ca4, 0x000030f5}, {0xef4e1ce6, 0x00003166}, {0xef281d28, 0x000031d6}, {0xef021d6a, 0x00003246}, {0xeedc1dab, 0x000032b7}, {0xeeb61ded, 0x00003327}, {0xee901e2f, 0x00003398}, {0xee6a1e71, 0x00003408}, {0xee441eb2, 0x00003479}, {0xee1e1ef4, 0x000034e9}, {0xedf81f36, 0x00003559}, {0xedd21f78, 0x000035ca}, {0xedac1fba, 0x0000363a}, {0xed861ffb, 0x000036ab}, {0xed60203d, 0x0000371b}, {0xed3a207f, 0x0000378b}, {0xed1420c1, 0x000037fc}, }; const struct YUV GYUV[] = { {0xffdb0040, 0xffffffd1}, {0xff9100c1, 0xffffff73}, {0xff460142, 0xffffff15}, {0xfefc01c3, 0xfffffeb7}, {0xfeb10245, 0xfffffe59}, {0xfe6702c6, 0xfffffdfa}, {0xfe1c0347, 0xfffffd9c}, {0xfdd203c8, 0xfffffd3e}, {0xfd880449, 0xfffffce0}, {0xfd3d04ca, 0xfffffc82}, {0xfcf3054b, 0xfffffc23}, {0xfca805cc, 0xfffffbc5}, {0xfc5e064e, 0xfffffb67}, {0xfc1306cf, 0xfffffb09}, {0xfbc90750, 0xfffffaaa}, {0xfb7e07d1, 0xfffffa4c}, {0xfb340852, 0xfffff9ee}, {0xfae908d3, 0xfffff990}, {0xfa9f0954, 0xfffff932}, {0xfa5409d5, 0xfffff8d3}, {0xfa0a0a57, 0xfffff875}, {0xf9bf0ad8, 0xfffff817}, {0xf9750b59, 0xfffff7b9}, 
{0xf92a0bda, 0xfffff75b}, {0xf8e00c5b, 0xfffff6fc}, {0xf8960cdc, 0xfffff69e}, {0xf84b0d5d, 0xfffff640}, {0xf8010dde, 0xfffff5e2}, {0xf7b60e60, 0xfffff584}, {0xf76c0ee1, 0xfffff525}, {0xf7210f62, 0xfffff4c7}, {0xf6d70fe3, 0xfffff469}, {0xf68c1064, 0xfffff40b}, {0xf64210e5, 0xfffff3ad}, {0xf5f71166, 0xfffff34e}, {0xf5ad11e7, 0xfffff2f0}, {0xf5621269, 0xfffff292}, {0xf51812ea, 0xfffff234}, {0xf4cd136b, 0xfffff1d6}, {0xf48313ec, 0xfffff177}, {0xf439146d, 0xfffff119}, {0xf3ee14ee, 0xfffff0bb}, {0xf3a4156f, 0xfffff05d}, {0xf35915f0, 0xffffeffe}, {0xf30f1672, 0xffffefa0}, {0xf2c416f3, 0xffffef42}, {0xf27a1774, 0xffffeee4}, {0xf22f17f5, 0xffffee86}, {0xf1e51876, 0xffffee27}, {0xf19a18f7, 0xffffedc9}, {0xf1501978, 0xffffed6b}, {0xf10519f9, 0xffffed0d}, {0xf0bb1a7b, 0xffffecaf}, {0xf0701afc, 0xffffec50}, {0xf0261b7d, 0xffffebf2}, {0xefdb1bfe, 0xffffeb94}, {0xef911c7f, 0xffffeb36}, {0xef471d00, 0xffffead8}, {0xeefc1d81, 0xffffea79}, {0xeeb21e02, 0xffffea1b}, {0xee671e84, 0xffffe9bd}, {0xee1d1f05, 0xffffe95f}, {0xedd21f86, 0xffffe901}, {0xed882007, 0xffffe8a2}, {0xed3d2088, 0xffffe844}, {0xecf32109, 0xffffe7e6}, {0xeca8218a, 0xffffe788}, {0xec5e220b, 0xffffe72a}, {0xec13228d, 0xffffe6cb}, {0xebc9230e, 0xffffe66d}, {0xeb7e238f, 0xffffe60f}, {0xeb342410, 0xffffe5b1}, {0xeaea2491, 0xffffe552}, {0xea9f2512, 0xffffe4f4}, {0xea552593, 0xffffe496}, {0xea0a2614, 0xffffe438}, {0xe9c02696, 0xffffe3da}, {0xe9752717, 0xffffe37b}, {0xe92b2798, 0xffffe31d}, {0xe8e02819, 0xffffe2bf}, {0xe896289a, 0xffffe261}, {0xe84b291b, 0xffffe203}, {0xe801299c, 0xffffe1a4}, {0xe7b62a1d, 0xffffe146}, {0xe76c2a9f, 0xffffe0e8}, {0xe7212b20, 0xffffe08a}, {0xe6d72ba1, 0xffffe02c}, {0xe68c2c22, 0xffffdfcd}, {0xe6422ca3, 0xffffdf6f}, {0xe5f82d24, 0xffffdf11}, {0xe5ad2da5, 0xffffdeb3}, {0xe5632e26, 0xffffde55}, {0xe5182ea8, 0xffffddf6}, {0xe4ce2f29, 0xffffdd98}, {0xe4832faa, 0xffffdd3a}, {0xe439302b, 0xffffdcdc}, {0xe3ee30ac, 0xffffdc7e}, {0xe3a4312d, 0xffffdc1f}, {0xe35931ae, 0xffffdbc1}, {0xe30f322f, 
0xffffdb63}, {0xe2c432b1, 0xffffdb05}, {0xe27a3332, 0xffffdaa6}, {0xe22f33b3, 0xffffda48}, {0xe1e53434, 0xffffd9ea}, {0xe19b34b5, 0xffffd98c}, {0xe1503536, 0xffffd92e}, {0xe10635b7, 0xffffd8cf}, {0xe0bb3638, 0xffffd871}, {0xe07136ba, 0xffffd813}, {0xe026373b, 0xffffd7b5}, {0xdfdc37bc, 0xffffd757}, {0xdf91383d, 0xffffd6f8}, {0xdf4738be, 0xffffd69a}, {0xdefc393f, 0xffffd63c}, {0xdeb239c0, 0xffffd5de}, {0xde673a41, 0xffffd580}, {0xde1d3ac3, 0xffffd521}, {0xddd23b44, 0xffffd4c3}, {0xdd883bc5, 0xffffd465}, {0xdd3d3c46, 0xffffd407}, {0xdcf33cc7, 0xffffd3a9}, {0xdca93d48, 0xffffd34a}, {0xdc5e3dc9, 0xffffd2ec}, {0xdc143e4a, 0xffffd28e}, {0xdbc93ecc, 0xffffd230}, {0xdb7f3f4d, 0xffffd1d2}, {0xdb343fce, 0xffffd173}, {0xdaea404f, 0xffffd115}, }; const struct YUV BYUV[] = { {0x0038000c, 0xfffffff7}, {0x00a80025, 0xffffffe5}, {0x0119003e, 0xffffffd3}, {0x01890057, 0xffffffc1}, {0x01fa0070, 0xffffffaf}, {0x026a0089, 0xffffff9c}, {0x02da00a2, 0xffffff8a}, {0x034b00bb, 0xffffff78}, {0x03bb00d5, 0xffffff66}, {0x042c00ee, 0xffffff54}, {0x049c0107, 0xffffff41}, {0x050d0120, 0xffffff2f}, {0x057d0139, 0xffffff1d}, {0x05ee0152, 0xffffff0b}, {0x065e016b, 0xfffffef9}, {0x06cf0184, 0xfffffee6}, {0x073f019d, 0xfffffed4}, {0x07b001b6, 0xfffffec2}, {0x082001cf, 0xfffffeb0}, {0x089001e8, 0xfffffe9e}, {0x09010201, 0xfffffe8b}, {0x0971021a, 0xfffffe79}, {0x09e20233, 0xfffffe67}, {0x0a52024c, 0xfffffe55}, {0x0ac30266, 0xfffffe43}, {0x0b33027f, 0xfffffe30}, {0x0ba40298, 0xfffffe1e}, {0x0c1402b1, 0xfffffe0c}, {0x0c8502ca, 0xfffffdfa}, {0x0cf502e3, 0xfffffde8}, {0x0d6602fc, 0xfffffdd5}, {0x0dd60315, 0xfffffdc3}, {0x0e46032e, 0xfffffdb1}, {0x0eb70347, 0xfffffd9f}, {0x0f270360, 0xfffffd8c}, {0x0f980379, 0xfffffd7a}, {0x10080392, 0xfffffd68}, {0x107903ab, 0xfffffd56}, {0x10e903c4, 0xfffffd44}, {0x115a03dd, 0xfffffd31}, {0x11ca03f7, 0xfffffd1f}, {0x123b0410, 0xfffffd0d}, {0x12ab0429, 0xfffffcfb}, {0x131c0442, 0xfffffce9}, {0x138c045b, 0xfffffcd6}, {0x13fc0474, 0xfffffcc4}, {0x146d048d, 0xfffffcb2}, 
{0x14dd04a6, 0xfffffca0}, {0x154e04bf, 0xfffffc8e}, {0x15be04d8, 0xfffffc7b}, {0x162f04f1, 0xfffffc69}, {0x169f050a, 0xfffffc57}, {0x17100523, 0xfffffc45}, {0x1780053c, 0xfffffc33}, {0x17f10555, 0xfffffc20}, {0x1861056e, 0xfffffc0e}, {0x18d20588, 0xfffffbfc}, {0x194205a1, 0xfffffbea}, {0x19b205ba, 0xfffffbd8}, {0x1a2305d3, 0xfffffbc5}, {0x1a9305ec, 0xfffffbb3}, {0x1b040605, 0xfffffba1}, {0x1b74061e, 0xfffffb8f}, {0x1be50637, 0xfffffb7d}, {0x1c550650, 0xfffffb6a}, {0x1cc60669, 0xfffffb58}, {0x1d360682, 0xfffffb46}, {0x1da7069b, 0xfffffb34}, {0x1e1706b4, 0xfffffb22}, {0x1e8806cd, 0xfffffb0f}, {0x1ef806e6, 0xfffffafd}, {0x1f6806ff, 0xfffffaeb}, {0x1fd90719, 0xfffffad9}, {0x20490732, 0xfffffac7}, {0x20ba074b, 0xfffffab4}, {0x212a0764, 0xfffffaa2}, {0x219b077d, 0xfffffa90}, {0x220b0796, 0xfffffa7e}, {0x227c07af, 0xfffffa6c}, {0x22ec07c8, 0xfffffa59}, {0x235d07e1, 0xfffffa47}, {0x23cd07fa, 0xfffffa35}, {0x243e0813, 0xfffffa23}, {0x24ae082c, 0xfffffa11}, {0x251e0845, 0xfffff9fe}, {0x258f085e, 0xfffff9ec}, {0x25ff0877, 0xfffff9da}, {0x26700890, 0xfffff9c8}, {0x26e008aa, 0xfffff9b6}, {0x275108c3, 0xfffff9a3}, {0x27c108dc, 0xfffff991}, {0x283208f5, 0xfffff97f}, {0x28a2090e, 0xfffff96d}, {0x29130927, 0xfffff95b}, {0x29830940, 0xfffff948}, {0x29f40959, 0xfffff936}, {0x2a640972, 0xfffff924}, {0x2ad4098b, 0xfffff912}, {0x2b4509a4, 0xfffff8ff}, {0x2bb509bd, 0xfffff8ed}, {0x2c2609d6, 0xfffff8db}, {0x2c9609ef, 0xfffff8c9}, {0x2d070a08, 0xfffff8b7}, {0x2d770a21, 0xfffff8a4}, {0x2de80a3b, 0xfffff892}, {0x2e580a54, 0xfffff880}, {0x2ec90a6d, 0xfffff86e}, {0x2f390a86, 0xfffff85c}, {0x2faa0a9f, 0xfffff849}, {0x301a0ab8, 0xfffff837}, {0x308a0ad1, 0xfffff825}, {0x30fb0aea, 0xfffff813}, {0x316b0b03, 0xfffff801}, {0x31dc0b1c, 0xfffff7ee}, {0x324c0b35, 0xfffff7dc}, {0x32bd0b4e, 0xfffff7ca}, {0x332d0b67, 0xfffff7b8}, {0x339e0b80, 0xfffff7a6}, {0x340e0b99, 0xfffff793}, {0x347f0bb2, 0xfffff781}, {0x34ef0bcc, 0xfffff76f}, {0x35600be5, 0xfffff75d}, {0x35d00bfe, 0xfffff74b}, {0x36400c17, 
0xfffff738}, {0x36b10c30, 0xfffff726}, {0x37210c49, 0xfffff714}, {0x37920c62, 0xfffff702}, {0x38020c7b, 0xfffff6f0}, };
// COEF_WIDTH is the number of fractional bits left in the summed look-up
// table values. SHIFT_WIDTH is the right shift the C converter applies to
// such a sum (t >> SHIFT_WIDTH) before adding the Y or U/V offset.
#define COEF_WIDTH 8
#define SHIFT_WIDTH COEF_WIDTH
//
// All of the RGB converters follow the template given below. The converters make
// some assumptions about the frame size. All output frame sizes are assumed to
// have a frame height that is a multiple of 48. Also, the output frame width
// is assumed to be a multiple of 8. If the input frame size is equal
// to the output frame size, no stretching or cropping is done. Otherwise, the
// image is cropped and stretched for an 11:12 aspect ratio.
//
#if 0
void rgb_color_converter() {
    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = FrameWidth; i > 0; i -= m, pnext += n) {
                compute m Y values using look-up tables
                if (0 == (k&1)) {
                    compute m/2 U,V values using look-up tables
                }
            }
            // Fill the line left open by the previous group with the average
            // of the lines above and below it.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    t = (*pyprev++ & 0xFEFEFEFE) >> 1;
                    t += (*pynext++ & 0xFEFEFEFE) >> 1;
                    *pyspace++ = t;
                    t = (*pyprev++ & 0xFEFEFEFE) >> 1;
                    t += (*pynext++ & 0xFEFEFEFE) >> 1;
                    *pyspace++ = t;
                }
            }
            pnext += iBackTwoLines;
            py += ypitch_adj;
            if (0 == (k&1)) {
                pu += uvpitch_adj;
                pv += uvpitch_adj;
            }
        }
        // Leave one output line open; it is filled in once the next line exists.
        if (stretch) {
            pyprev = py - pitch;
            pyspace = py;
            pynext = (py += pitch);
        }
    }
    // The last open line has no line below it, so copy the line above.
    if (stretch) {
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
}
#endif
//
// For the IA versions, the strategy is to compute the Y value for an odd RGB value
// followed by computing the Y value for the corresponding even RGB value. The registers
// are then set with the proper values to compute U and V values for the even RGB
// value. This avoids repeating the shifting and masking needed to extract the Red,
// Green and Blue components.
//
/*****************************************************************************
 *
 *  H26X_BGR24toYUV12()
 *
 *  Convert from BGR24 to YUV12 (YCrCb 4:2:0) and copy to destination memory
 *  with the pitch given by the pitch parameter. The input data is stored in
 *  the order B,G,R,B,G,R...
 *
 */
#if 0
// C reference version of the BGR24 to YUV12 input colour converter. It is
// compiled out (#if 0); the comments in IA_H26X_BGR24toYUV12 quote the same
// expressions step by step.
//
//   lpbiInput   - input bitmap header; only biWidth and biHeight are read.
//   lpInput     - input pixels, 3 bytes per pixel in B,G,R order, read from
//                 the last line towards the first.
//   YPlane, UPlane, VPlane - output planes; all three use row stride "pitch".
//   FrameWidth, FrameHeight - output frame size. The loops step the width by
//                 4 (8 when stretching) and the height by 48.
//   pitch       - distance in bytes between consecutive output rows.
//
// If the input is larger than the output, the input is cropped (centred) and
// every 11 input lines are stretched to 12 output lines; the extra luma line
// is the average of its two neighbours.
_STATIC void C_H26X_BGR24toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch) {
    U32 *pnext, *pyprev, *pyspace, *pynext;
    U32 tm;
    int t;
    int i, j, k;
    int iBackTwoLines;
    int stretch, mark, aspect;
    int height_adj, width_adj;
    int LumaIters = 0;
    // Distance from the end of one output row to the start of the next.
    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);
    // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }
    // Horizontal crop: half of the surplus pixels, times 3 bytes per pixel.
    width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
    width_adj += (width_adj << 1);
    // When cropping, one output line per group of 12 is synthesized, so only
    // FrameHeight - LumaIters input lines are consumed.
    aspect = (width_adj ? LumaIters : 0);
    // Vertical crop: half of the surplus input lines.
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj ? 1 : 0);
    // Input lines converted per group: 11 when stretching, otherwise 12.
    mark = 12 - stretch;
    // The input image is upside down - process the lines in reverse order.
    // Move from end of line N to beginning of line N-1
    // (expressed in U32 units, because pnext is a U32 pointer).
    iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 2);
    iBackTwoLines += (iBackTwoLines << 1);
    // Point to the beginning of the last line.
    pnext = (U32 *) (lpInput + ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) * ((FrameHeight - aspect - 1) + height_adj)) + width_adj);
    for ( j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            // Each pass converts 4 pixels held in 3 DWORDs (most significant
            // byte first):
            //   pnext[0] = B2 R1 G1 B1
            //   pnext[1] = G3 B3 R2 G2
            //   pnext[2] = R4 G4 B4 R3
            for (i = FrameWidth; i > 0; i -= 4, pnext += 3) {
                // Y of pixel 2.
                tm = pnext[0];
                t = BYUV[tm>>25].YU;
                tm = pnext[1];
                t += (GYUV[(tm>>1)&0x7F].YU + RYUV[(tm>>9)&0x7F].YU);
                *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                // Y of pixel 1; the high half of t holds its U value.
                tm = pnext[0];
                t = (BYUV[(tm>>1)&0x7F].YU + GYUV[(tm>>9)&0x7F].YU + RYUV[(tm>>17)&0x7F].YU);
                *YPlane = (U8)((t>>SHIFT_WIDTH)+8);
                // Chroma is produced on even lines only, from pixel 1.
                if (0 == (k&1)) {
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>17)&0x7F].V + GYUV[(tm>>9)&0x7F].V + BYUV[(tm>>1)&0x7F].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                }
                // Y of pixel 4.
                tm = pnext[2];
                t = (BYUV[(tm>>9)&0x7F].YU + GYUV[(tm>>17)&0x7F].YU + RYUV[tm>>25].YU);
                *(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);
                // Y of pixel 3; the high half of t holds its U value.
                tm = pnext[1];
                t = BYUV[(tm>>17)&0x7F].YU + GYUV[tm>>25].YU;
                tm = pnext[2];
                t += RYUV[(tm>>1)&0x7F].YU;
                *(YPlane+2) = (U8)((t>>SHIFT_WIDTH)+8);
                YPlane += 4;
                // Chroma on even lines only, from pixel 3.
                if (0 == (k&1)) {
                    *UPlane++ = (U8)((t>>24)+64);
                    t = RYUV[(tm>>1)&0x7F].V;
                    tm = pnext[1];
                    t += GYUV[tm>>25].V + BYUV[(tm>>17)&0x7F].V;
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                }
            }
            // The first line of every group except the first one completes
            // the pair around the line left open by the previous group:
            // fill that line with the byte-wise average of its neighbours.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }
            pnext += iBackTwoLines;
            YPlane += ypitch_adj;
            // Increment after even lines.
            if(0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        } // end of for k
        // Skip one output line; remember it and its two neighbours so it can
        // be interpolated once the line below it has been written.
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j
    // The last skipped line has no line below it: copy the line above.
    if (stretch) {
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of C_H26X_BGR24toYUV12()
#endif
__declspec(naked) _STATIC void IA_H26X_BGR24toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * BGR24Image, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | + 96
// | FrameHeight | + 92
// | FrameWidth | + 88
// | VPlane | + 84
// | UPlane | + 80
// | YPlane | + 76
// | lpInput | + 72
// | lpbiInput | + 68
// ----------------------------
// | return addr | + 64
// | saved ebp | + 60
// | saved ebx | + 56
// | saved esi | + 52
// | saved edi | + 48
// | pyprev | + 44
// | pyspace | + 40
// | pynext | + 36
// | i | + 32
// | j | + 28
// | k | + 24
// | iBackTwoLines | + 20
// | stretch | + 16
// | mark | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 48
#define PITCH_PARM 96
#define FRAME_HEIGHT 92
#define FRAME_WIDTH 88
#define VPLANE 84
#define UPLANE 80
#define YPLANE 76
#define LP_INPUT 72
#define LPBI_INPUT 68
#define PYPREV 44
#define PYSPACE 40
#define PYNEXT 36
#define LOOP_I 32
#define LOOP_J 28
#define LOOP_K 24
#define BACK_TWO_LINES 20
#define STRETCH 16
#define MARK 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT] // ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx shr ebp, 1 sub edx, ebp mov [esp + UVPITCH_ADJ], edx // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
xor edx, edx mov eax, [esp + FRAME_HEIGHT] L1: lea edx, [edx + 4] sub eax, 48 jnz L1 // width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
// width_adj += width_adj << 1
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth sub esi, [esp + FRAME_WIDTH] mov eax, esi shr eax, 1 add esi, eax // aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx xor edi, edi test esi, esi jz L2 mov edi, edx // height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2: mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight sub edx, [esp + FRAME_HEIGHT] add edx, edi shr edx, 1 // stretch = (height_adj ? 1 : 0)
xor eax, eax test edx, edx jz L3 inc eax L3: mov [esp + STRETCH], eax // mark = 12 - stretch
mov ebp, 12 sub ebp, eax mov [esp + MARK], ebp // iBackTwoLines = -(lpbiInput->biWidth + FrameWidth)
// iBackTwoLines += (iBackTwoLines << 1)
mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth add ebp, [esp + FRAME_WIDTH] neg ebp mov eax, ebp shl eax, 1 add ebp, eax mov [esp + BACK_TWO_LINES], ebp // pnext = lpInput +
// ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *
// ((FrameHeight - aspect - 1) + height_adj)) +
// width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth shl eax, 1 add eax, (LPBITMAPINFOHEADER)[ebx].biWidth mov ebx, [esp + FRAME_HEIGHT] sub ebx, edi dec ebx add ebx, edx imul ebx add esi, eax add esi, [esp + LP_INPUT] // assign (edi, YPlane)
mov edi, [esp + YPLANE] // for (j = 0; j < LumaIters; j++)
xor eax, eax mov [esp + LOOP_J], eax // for (k = 0; k < mark; k++)
L4: xor eax, eax mov [esp + LOOP_K], eax // for (i = FrameWidth; i > 0; i -= 4, pnext += 12)
L5: mov eax, [esp + FRAME_WIDTH] mov [esp + LOOP_I], eax // This jump is here to make sure the following loop starts in the U pipe
jmp L6 L6: // ---------------------
// | B2 | R1 | G1 | B1 | pnext[0]
// ---------------------
// | G3 | B3 | R2 | G2 | pnext[1]
// ---------------------
// | R4 | G4 | B4 | R3 | pnext[2]
// ---------------------
// t0 = pnext[0]
// t1 = pnext[1]
// t = ( BYUV[t0>>25].YU +
// GYUV[(t1>> 1)&0x7F].YU +
// RYUV[(t1>> 9)&0x7F].YU )
// *(YPlane+1) = ((t>>8)+8)
// t = ( BYUV[(t0>> 1)&0x7F].YU +
// GYUV[(t0>> 9)&0x7F].YU +
// RYUV[(t0>>17)&0x7F].YU )
// *YPlane = ((t>>8)+8)
// assign(eax: B2,Y1,Y2,U)
// assign(ebx: B1,V)
// assign(ecx: G2,G1)
// assign(edx: R2,R1)
// assign(ebp: B1)
// 1
mov eax, [esi] mov ecx, [esi + 4] // 2
mov ebx, eax mov edx, ecx // 3
shr eax, 25 and ecx, 0xFE // 4
shr ecx, 1 and edx, 0xFE00 // 5
shr edx, 9 and ebx, 0xFEFEFE // 6
mov eax, [BYUV+eax*8].YU nop // 7
add eax, [GYUV+ecx*8].YU mov ecx, ebx // 8
add eax, [RYUV+edx*8].YU mov edx, ebx // 9
and ebx, 0xFE add eax, 0x800 // 10
sar eax, 8 nop // 11
shr ebx, 1 nop // 12
shr ecx, 9 mov [edi + 1], al // 13
shr edx, 17 and ecx, 0x7F // 14
mov eax, [BYUV+ebx*8].YU and edx, 0x7F // 15
add eax, [GYUV+ecx*8].YU mov ebp, ebx // 16
add eax, [RYUV+edx*8].YU nop // 17
sar eax, 8 mov ebx, [esp + LOOP_K] // 18
add eax, 8 and ebx, 1 // 19
mov [edi], al jnz L9
// At this point, ebp: B1, ecx: G1, edx: R1
// t0 = pnext[0]
// *UPlane++ = ((t>>24)+64)
// t = ( RYUV[(t0>>17)&0x7F].V +
// GYUV[(t0>> 9)&0x7F].V +
// BYUV[(t0>> 1)&0x7F].V )
// *VPlane++ = ((t>>8)+64)
// 20
mov ebx, [RYUV+edx*8].V mov edx, [esp + UPLANE] // 21
sar eax, 16 add ebx, [GYUV+ecx*8].V // 22
add eax, 64 add ebx, [BYUV+ebp*8].V // 23
mov [edx], al inc edx // 24
mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 25
sar ebx, 8 inc edx // 26
add ebx, 64 mov [esp + VPLANE], edx // 27
mov [edx - 1], bl nop
L9: // ---------------------
// | B2 | R1 | G1 | B1 | pnext[0]
// ---------------------
// | G3 | B3 | R2 | G2 | pnext[1]
// ---------------------
// | R4 | G4 | B4 | R3 | pnext[2]
// ---------------------
// t1 = pnext[1]
// t2 = pnext[2]
// t = ( BYUV[(t2>> 9)&0x7F].YU +
// GYUV[(t2>>17)&0x7F].YU +
// RYUV[t2>>25].YU )
// *(YPlane+3) = ((t>>8)+8)
// t = ( BYUV[(t1>>17)&0x7F].YU +
// GYUV[t1>>25].YU +
// RYUV[(t2>> 1)&0x7F].YU )
// *(YPlane+2) = ((t>>8)+8)
// YPlane += 4
// assign(eax: B4,Y3,Y4,U)
// assign(ebx: R3,V)
// assign(ecx: G4,G3)
// assign(edx: R4/B3)
// assign(ebp: R3)
// 28
mov ebp, [esi + 4] mov ebx, [esi + 8] // 29
mov eax, ebx mov ecx, ebx // 30
shr eax, 9 mov edx, ebx // 31
shr ecx, 17 and eax, 0x7F // 32
shr edx, 25 and ecx, 0x7F // 33
mov eax, [BYUV+eax*8].YU nop // 34
add eax, [GYUV+ecx*8].YU and ebx, 0xFE // 35
add eax, [RYUV+edx*8].YU mov ecx, ebp // 36
shr ebx, 1 add eax, 0x800 // 37
sar eax, 8 mov edx, ebp // 38
shr edx, 17 mov [edi + 3], al // 39
shr ecx, 25 and edx, 0x7F // 40
mov eax, [RYUV+ebx*8].YU mov ebp, ebx // 41
add eax, [GYUV+ecx*8].YU nop // 42
add eax, [BYUV+edx*8].YU nop // 43
sar eax, 8 mov ebx, [esp + LOOP_K] // 44
add eax, 8 and ebx, 1 // 45
mov [edi + 2], al jnz L16
// At this point, ebp: R3, ecx: G3, edx: B3
// t1 = pnext[1]
// t2 = pnext[2]
// *UPlane++ = ((t>>24)+64)
// t = ( RYUV[(t2>> 1)&0x7F].V +
// GYUV[t1>>25].V +
// BYUV[(t1>>17)&0x7F].V )
// *VPlane++ = ((t>>8)+64)
// 46
mov ebx, [BYUV+edx*8].V mov edx, [esp + UPLANE] // 47
sar eax, 16 add ebx, [GYUV+ecx*8].V // 48
add eax, 64 add ebx, [RYUV+ebp*8].V // 49
mov [edx], al inc edx // 50
mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 51
sar ebx, 8 inc edx // 52
add ebx, 64 mov [esp + VPLANE], edx // 53
mov [edx - 1], bl nop L16: // 54
mov eax, [esp + LOOP_I] lea esi, [esi + 12] // 55
sub eax, 4 lea edi, [edi + 4] // 56
mov [esp + LOOP_I], eax jnz L6
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH] test eax, eax jz L21 mov eax, [esp + LOOP_K] test eax, eax jnz L21 mov eax, [esp + LOOP_J] test eax, eax jz L21
// spill YPlane ptr
mov [esp + YPLANE], edi nop
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV] mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT] mov ebp, [esp + FRAME_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
L22: // 1
mov eax, [ebx] lea ebx, [ebx + 4] // 2
mov ecx, [edx] lea edx, [edx + 4] // 3
shr ecx, 1 and eax, 0xFEFEFEFE // 4
shr eax, 1 and ecx, 0x7F7F7F7F // 5
add eax, ecx mov ecx, [ebx] // 6
shr ecx, 1 mov [edi], eax // 7
mov eax, [edx] and ecx, 0x7F7F7F7F // 8
shr eax, 1 lea edi, [edi + 4] // 9
and eax, 0x7F7F7F7F lea ebx, [ebx + 4] // 10
lea edx, [edx + 4] add eax, ecx // 11
mov [edi], eax lea edi, [edi + 4] // 12
sub ebp, 8 jnz L22 // kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L21: add esi, [esp + BACK_TWO_LINES] // YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ] // if(0 == (k&1))
mov eax, [esp + LOOP_K] and eax, 1 jnz L23 // UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ] add [esp + UPLANE], eax add [esp + VPLANE], eax
L23: inc DWORD PTR [esp + LOOP_K] mov eax, [esp + LOOP_K] cmp eax, [esp + MARK] jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0 je L24 // pyprev = YPlane - pitch
mov eax, edi sub eax, [esp + PITCH_PARM] mov [esp + PYPREV], eax // pyspace = YPlane
mov [esp + PYSPACE], edi // pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM] mov [esp + PYNEXT], edi
L24: inc DWORD PTR [esp + LOOP_J] mov eax, [esp + LOOP_J] cmp eax, [esp + LUMA_ITERS] jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// if (stretch)
mov esi, [esp + PYPREV] cmp DWORD PTR [esp + STRETCH], 0 je L26
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH] mov edi, [esp + PYSPACE] L25: mov ecx, [esi] lea esi, [esi + 4] mov [edi], ecx lea edi, [edi + 4] sub ebp, 4 jnz L25 // kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L26: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret
} }
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
#if 0
// C_H26X_BGR16toYUV12
//
// Disabled C reference version of the 16-bit RGB to YUV12 conversion.  The
// local names below are the ones used in the pseudo-code comments of the
// assembler version, so they are kept unchanged.
//
//   lpbiInput   - header describing the input bitmap (biWidth, biHeight)
//   lpInput     - input pixel data, two 16-bit pixels per 32-bit word
//   YPlane      - destination luma plane
//   UPlane      - destination U plane
//   VPlane      - destination V plane
//   FrameWidth  - output width in pixels
//   FrameHeight - output height in lines
//   bitfield    - pixel layout selector; only 555 is enabled
//   pitch       - distance in bytes between successive output lines
//
// Lines are produced in groups of "mark" input lines.  When the input has
// fewer lines than the output (stretch != 0) each group leaves one output
// line empty; that line is filled later with the average of its two
// neighbours, and the very last one with a copy of the preceding line.
_STATIC void C_H26X_BGR16toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    UN bitfield,
    const int pitch)
{
    U32 *pnext, *pyprev, *pyspace, *pynext;
    U32 tm;
    int t;
    int i, j, k;
    int iBackTwoLines;
    int stretch, mark, aspect;
    int width_adj, height_adj;
    int LumaIters = 0;
    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);

    // Count up instead of dividing: LumaIters = (FrameHeight/12).
    i = FrameHeight;
    while (i > 0) {
        LumaIters += 4;
        i -= 48;
    }

    width_adj = lpbiInput->biWidth - FrameWidth;
    if (width_adj) {
        aspect = LumaIters;
    } else {
        aspect = 0;
    }
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj != 0);
    mark = 12 - stretch;

    // The input image is upside down, so its lines are visited in reverse
    // order: after a line has been consumed, step from the end of line N
    // back to the beginning of line N-1.
    iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 1);

    // Start at the beginning of the last input line that is used.
    pnext = (U32 *)(lpInput +
                    ((lpbiInput->biWidth << 1) *
                     ((FrameHeight - aspect - 1) + height_adj)) +
                    width_adj);

    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            // One 32-bit word (two pixels) per pass.
            for (i = FrameWidth; i > 0; i -= 2, pnext++) {
                tm = *pnext;

                // Luma for both pixels; the low half-word is the first pixel.
                switch (bitfield) {
                // 555 2, 3, 8 0x7C, 0x7C, 0x7C
                case 555:
                    t = (BYUV[(tm>>14)&0x7C].YU +
                         GYUV[(tm>>19)&0x7C].YU +
                         RYUV[(tm>>24)&0x7C].YU);
                    YPlane[1] = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<2)&0x7C].YU +
                         GYUV[(tm>>3)&0x7C].YU +
                         RYUV[(tm>>8)&0x7C].YU);
                    YPlane[0] = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
#if 0
                // Beware - untested code ahead
                // 664 3, 3, 9 0x78, 0x7E, 0x7E
                case 664:
                    t = (BYUV[(tm>>13)&0x78].YU +
                         GYUV[(tm>>19)&0x7E].YU +
                         RYUV[(tm>>25)&0x7E].YU);
                    YPlane[1] = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<3)&0x78].YU +
                         GYUV[(tm>>3)&0x7E].YU +
                         RYUV[(tm>>9)&0x7E].YU);
                    YPlane[0] = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
                // 565 2, 4, 9 0x7C, 0x7E, 0x7C
                case 565:
                    t = (BYUV[(tm>>14)&0x7C].YU +
                         GYUV[(tm>>20)&0x7E].YU +
                         RYUV[(tm>>25)&0x7C].YU);
                    YPlane[1] = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<2)&0x7C].YU +
                         GYUV[(tm>>4)&0x7E].YU +
                         RYUV[(tm>>9)&0x7C].YU);
                    YPlane[0] = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
                // 655 2, 3, 9 0x7C, 0x7C, 0x7E
                case 655:
                    t = (BYUV[(tm>>14)&0x7C].YU +
                         GYUV[(tm>>19)&0x7C].YU +
                         RYUV[(tm>>25)&0x7E].YU);
                    YPlane[1] = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<2)&0x7C].YU +
                         GYUV[(tm>>3)&0x7C].YU +
                         RYUV[(tm>>9)&0x7E].YU);
                    YPlane[0] = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
#endif
                }

                // Chroma is only produced on even lines of a group.
                if (k & 1) {
                    continue;
                }

                // U comes from the top byte of the sum left in t by the
                // first pixel; V needs its own table sum.
                switch (bitfield) {
                // 555 2, 3, 8 0x7C, 0x7C, 0x7C
                case 555:
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>8)&0x7C].V +
                         GYUV[(tm>>3)&0x7C].V +
                         BYUV[(tm<<2)&0x7C].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                    break;
#if 0
                // Beware - untested code ahead
                // 664 3, 3, 9 0x78, 0x7E, 0x7E
                case 664:
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>9)&0x7E].V +
                         GYUV[(tm>>3)&0x7E].V +
                         BYUV[(tm<<3)&0x78].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                    break;
                // 565 2, 4, 9 0x7C, 0x7E, 0x7C
                case 565:
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>9)&0x7C].V +
                         GYUV[(tm>>4)&0x7E].V +
                         BYUV[(tm<<2)&0x7C].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                    break;
                // 655 2, 3, 9 0x7C, 0x7C, 0x7E
                case 655:
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>9)&0x7E].V +
                         GYUV[(tm>>3)&0x7C].V +
                         BYUV[(tm<<2)&0x7C].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                    break;
#endif
                }
            }

            // Once the first line of a new group exists, fill the line that
            // was left empty with the average of the lines around it
            // (four luma bytes per 32-bit word, eight bytes per pass).
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;

                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }

            pnext += iBackTwoLines;
            YPlane += ypitch_adj;

            // Increment after even lines.
            if (!(k & 1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        } // end of for k

        // Leave one luma line empty and remember its neighbours.
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            YPlane += pitch;
            pynext = (U32 *)YPlane;
        }
    } // end of for j

    // The last empty line has no following neighbour: copy the previous one.
    if (stretch) {
        i = FrameWidth;
        while (i > 0) {
            *pyspace++ = *pyprev++;
            i -= 4;
        }
    }
} // end of C_H26X_BGR16toYUV12
#endif
// IA_H26X_BGR16555toYUV12
//
// Hand-scheduled x86 inline-assembler routine that converts 16-bit RGB input
// in 5:5:5 layout (blue in bits 0-4, green in bits 5-9, red in bits 10-14 of
// each pixel) to separate Y, U and V output planes.  Each pass of the inner
// loop loads one 32-bit word (two pixels), looks the three colour components
// of each pixel up in the BYUV/GYUV/RYUV tables and stores two luma bytes.
// On even lines of a group ((k & 1) == 0) it also stores one U and one V byte
// per pixel pair, both derived from the first (low half-word) pixel.
//
// The function is naked: it saves ebp/ebx/esi/edi itself, reserves LOCALSIZE
// bytes of locals and addresses parameters and locals relative to esp using
// the offsets #defined below.
//
// Input lines are walked backwards (iBackTwoLines is negative).  When
// height_adj is non-zero ("stretch") every group of lines leaves one output
// luma line empty; it is later filled with the average of its neighbours,
// and the last such line with a copy of the line before it.
//
// Preconditions implied by the loop exits, which all test for exactly zero:
//   - FrameHeight must be a positive multiple of 48 (loop at L1);
//   - FrameWidth must be a positive multiple of 2 (pixel loop at L6), of 8
//     when stretching (averaging loop at L15) and of 4 for the final copy
//     (loop at L18).
//
// The bare numbers in trailing comments inside the pixel and averaging loops
// appear to be the scheduled cycle of the instruction pair on the next line.
__declspec(naked) _STATIC void IA_H26X_BGR16555toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * lpInput, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp once the prologue has run)
// | pitch | + 96
// | FrameHeight | + 92
// | FrameWidth | + 88
// | VPlane | + 84
// | UPlane | + 80
// | YPlane | + 76
// | lpInput | + 72
// | lpbiInput | + 68
// ----------------------------
// | return addr | + 64
// | saved ebp | + 60
// | saved ebx | + 56
// | saved esi | + 52
// | saved edi | + 48
// | pyprev | + 44
// | pyspace | + 40
// | pynext | + 36
// | i | + 32
// | j | + 28
// | k | + 24
// | iBackTwoLines | + 20
// | stretch | + 16
// | mark | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 48
#define PITCH_PARM 96
#define FRAME_HEIGHT 92
#define FRAME_WIDTH 88
#define VPLANE 84
#define UPLANE 80
#define YPLANE 76
#define LP_INPUT 72
#define LPBI_INPUT 68
#define PYPREV 44
#define PYSPACE 40
#define PYNEXT 36
#define LOOP_I 32
#define LOOP_J 28
#define LOOP_K 24
#define BACK_TWO_LINES 20
#define STRETCH 16
#define MARK 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
// Prologue: save the callee-save registers and reserve the locals.
_asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT]
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx shr ebp, 1 sub edx, ebp mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// NOTE: the loop is left only when the counter reaches exactly zero (jnz),
// so FrameHeight has to be a positive multiple of 48.
// assign (edx, LumaIters)
xor edx, edx mov eax, [esp + FRAME_HEIGHT] L1: lea edx, [edx + 4] sub eax, 48 jnz L1
// width_adj = lpbiInput->biWidth - FrameWidth
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth sub esi, [esp + FRAME_WIDTH]
// aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx xor edi, edi test esi, esi jz L2 mov edi, edx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2: mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight sub edx, [esp + FRAME_HEIGHT] add edx, edi shr edx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax test edx, edx jz L3 inc eax L3: mov [esp + STRETCH], eax
// mark = 12 - stretch
mov ebp, 12 sub ebp, eax mov [esp + MARK], ebp
// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
// (a byte count: two bytes per pixel)
mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth add ebp, [esp + FRAME_WIDTH] shl ebp, 1 neg ebp mov [esp + BACK_TWO_LINES], ebp
// pnext = lpInput +
// ((lpbiInput->biWidth << 1) *
// ((FrameHeight - aspect - 1) + height_adj)) +
// width_adj
// imul leaves the low half of the product in eax and overwrites edx, which
// is fine because height_adj has already been added into ebx.
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth shl eax, 1 mov ebx, [esp + FRAME_HEIGHT] sub ebx, edi dec ebx add ebx, edx imul ebx add esi, eax add esi, [esp + LP_INPUT]
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4: xor eax, eax mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 2, pnext += 4)
L5: mov eax, [esp + FRAME_WIDTH] mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts on the U pipe
jmp L6 L6:
// tm = pnext[0]
// Second pixel (high half-word):
// t = ( BYUV[(tm>>14)&0x7C].YU +
// GYUV[(tm>>19)&0x7C].YU +
// RYUV[(tm>>24)&0x7C].YU )
// *(YPlane+1) = (U8)((t>>8)+8)
// First pixel (low half-word):
// t = ( BYUV[(tm<< 2)&0x7C].YU +
// GYUV[(tm>> 3)&0x7C].YU +
// RYUV[(tm>> 8)&0x7C].YU )
// *YPlane = (U8)((t>>8)+8)
// YPlane += 2
// The masked component values are scaled by 8 in the table addressing.
// assign(eax: B2/Y1/Y2/U)
// assign(ebx: B1/V)
// assign(ecx: G2/G1)
// assign(edx: R2/R1)
// assign(ebp: B1)
// 1
mov eax, [esi] nop // 2
mov ebx, eax mov ecx, eax // 3
shr eax, 14 mov edx, ebx // 4
shr ecx, 19 and eax, 0x7C // 5
shr edx, 24 and ecx, 0x7C // 6
mov eax, [BYUV+eax*8].YU and edx, 0x7C // 7
add eax, [GYUV+ecx*8].YU mov ecx, ebx // 8
add eax, [RYUV+edx*8].YU mov edx, ebx // 9
sar eax, 8 and ebx, 0x1F // 10
shl ebx, 2 add eax, 8 // 11
shr ecx, 3 mov [edi + 1], al // 12
shr edx, 8 and ecx, 0x7C // 13
mov eax, [BYUV+ebx*8].YU and edx, 0x7C // 14
add eax, [GYUV+ecx*8].YU mov ebp, ebx // 15
add eax, [RYUV+edx*8].YU nop // 16
sar eax, 8 mov ebx, [esp + LOOP_K] // 17
add eax, 8 and ebx, 1 // 18
// The store below leaves the flags from "and ebx, 1" intact, so the jnz
// skips the chroma code on odd lines (k & 1).
mov [edi], al jnz L9
// At this point, ebp: B1, ecx: G1, edx: R1
// eax still holds the first pixel's table sum, already shifted right by 8,
// so a further shift by 16 brings its top byte down for U.
// *UPlane++ = (U8)((t>>24)+64)
// t = ( RYUV[(tm>> 8)&0x7C].V +
// GYUV[(tm>> 3)&0x7C].V +
// BYUV[(tm<< 2)&0x7C].V )
// *VPlane++ = (U8)((t>>8)+64)
// 19
mov ebx, [RYUV+edx*8].V mov edx, [esp + UPLANE] // 20
sar eax, 16 add ebx, [GYUV+ecx*8].V // 21
add eax, 64 add ebx, [BYUV+ebp*8].V // 22
mov [edx], al inc edx // 23
mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 24
sar ebx, 8 inc edx // 25
add ebx, 64 mov [esp + VPLANE], edx // 26
mov [edx - 1], bl nop L9: // 27
mov eax, [esp + LOOP_I] lea esi, [esi + 4] // 28
sub eax, 2 lea edi, [edi + 2] // 29
mov [esp + LOOP_I], eax jnz L6
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH] test eax, eax jz L14 mov eax, [esp + LOOP_K] test eax, eax jnz L14 mov eax, [esp + LOOP_J] test eax, eax jz L14
// spill YPlane ptr (its parameter slot doubles as the spill location)
mov [esp + YPLANE], edi nop
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV] mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT] mov ebp, [esp + FRAME_WIDTH]
// Average four luma bytes at a time: each byte is halved (the mask keeps
// bits from crossing byte boundaries) and the two halves are added.
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
L15: // 1
mov eax, [ebx] lea ebx, [ebx + 4] // 2
mov ecx, [edx] lea edx, [edx + 4] // 3
shr ecx, 1 and eax, 0xFEFEFEFE // 4
shr eax, 1 and ecx, 0x7F7F7F7F // 5
add eax, ecx mov ecx, [ebx] // 6
shr ecx, 1 mov [edi], eax // 7
mov eax, [edx] and ecx, 0x7F7F7F7F // 8
shr eax, 1 lea edi, [edi + 4] // 9
and eax, 0x7F7F7F7F lea ebx, [ebx + 4] // 10
lea edx, [edx + 4] add eax, ecx // 11
mov [edi], eax lea edi, [edi + 4] // 12
sub ebp, 8 jnz L15
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L14: add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K] and eax, 1 jnz L16
// UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ] add [esp + UPLANE], eax add [esp + VPLANE], eax
// end of for k: k++, repeat while k < mark
L16: inc DWORD PTR [esp + LOOP_K] mov eax, [esp + LOOP_K] cmp eax, [esp + MARK] jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0 je L17
// pyprev = YPlane - pitch
mov eax, edi sub eax, [esp + PITCH_PARM] mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM] mov [esp + PYNEXT], edi
// end of for j: j++, repeat while j < LumaIters
L17: inc DWORD PTR [esp + LOOP_J] mov eax, [esp + LOOP_J] cmp eax, [esp + LUMA_ITERS] jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// if (stretch)
mov esi, [esp + PYPREV] cmp DWORD PTR [esp + STRETCH], 0 je L19
// Fill the last empty luma line with a copy of the line before it.
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH] mov edi, [esp + PYSPACE] L18: mov ecx, [esi] lea esi, [esi + 4] mov [edi], ecx lea edi, [edi + 4] sub ebp, 4 jnz L18
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
// Epilogue: release the locals and restore the callee-save registers.
L19: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret
} }
// Remove the frame-offset names so that the next routine can define its own.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/*****************************************************************************
 *
 * H26X_CLUTtoYUV12()
 *
 * Convert from CLUT8/CLUT4 (palettized) input to YUV12 (YCrCb 4:2:0) and copy
 * to destination memory with the pitch given by the pitch parameter.
 *
 * This is needed to support the QuickCam.
 */
#if 0
// C_H26X_CLUTtoYUV12
//
// Disabled C reference version of the palettized (8 or 4 bits per pixel)
// to YUV12 conversion.  The local names below are the ones used in the
// pseudo-code comments of the assembler versions, so they are kept
// unchanged.
//
//   lpbiInput   - bitmap header; the colour table is read from the bytes
//                 that immediately follow it
//   lpInput     - input pixel data (palette indices)
//   YPlane      - destination luma plane
//   UPlane      - destination U plane
//   VPlane      - destination V plane
//   FrameWidth  - output width in pixels
//   FrameHeight - output height in lines
//   pixel_bits  - 8 or 4
//   pitch       - distance in bytes between successive output lines
_STATIC void C_H26X_CLUTtoYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    UN pixel_bits,
    const int pitch)
{
    U32 *pnext, *pyprev, *pyspace, *pynext;
    U32 tm, tn;
    int t;
    int i, j, k, m, n;
    int p;
    int iNextLine, iBackTwoLines;
    int stretch, mark, aspect;
    int width_adj, height_adj;
    int yshift, uvshift;
    int pixel_mask, loop_cnt, loop_limit;
    RGBQUAD *lpCEntry;
    RGBQUAD *lpCTable = (RGBQUAD *)((U8 *)lpbiInput + sizeof(BITMAPINFOHEADER));
    int LumaIters = 0;
    int ypitch_adj = (pitch - FrameWidth);
    int uvpitch_adj = (pitch - (FrameWidth >> 1));

    ASSERT((8 == pixel_bits) || (4 == pixel_bits));

    // Count up instead of dividing: LumaIters = (FrameHeight/12).
    i = FrameHeight;
    while (i > 0) {
        LumaIters += 4;
        i -= 48;
    }

    width_adj = ((lpbiInput->biWidth - FrameWidth) >> 1);
    if (width_adj) {
        aspect = LumaIters;
    } else {
        aspect = 0;
    }
    height_adj = ((lpbiInput->biHeight - (FrameHeight - aspect)) >> 1);
    stretch = (height_adj != 0);
    mark = 12 - stretch;

    iNextLine = lpbiInput->biWidth;
    iBackTwoLines = -((iNextLine + (int)FrameWidth) >> 2);

    // Per-depth constants.  A 32-bit word carries four 8-bit or eight 4-bit
    // indices, so eight pixels need two words or one word respectively.
    if (8 == pixel_bits) {
        yshift = 8;
        uvshift = 16;
        pixel_mask = 0xFF;
        loop_cnt = 2;
        loop_limit = 4;
    } else {
        yshift = 4;
        uvshift = 8;
        pixel_mask = 0xF;
        loop_cnt = 1;
        loop_limit = 8;
        width_adj >>= 1;
        iNextLine >>= 1;
        iBackTwoLines >>= 1;
    }

    // The input image is upside down - process the lines in reverse order.
    // Point to the beginning of the last line.
    pnext = (U32 *)(lpInput +
                    (iNextLine * ((FrameHeight - aspect - 1) + height_adj)) +
                    width_adj);

    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            // Eight pixels per pass.
            for (i = FrameWidth; i > 0; i -= 8) {
                for (n = 0; n < loop_cnt; n++) {
                    tm = *pnext++;
                    if (4 == pixel_bits) {
                        // Swap the two nibbles of every byte so that the
                        // leftmost pixel ends up in the low nibble.
                        tm = ( ((tm >> 4) & 0x0F0F0F0F) |
                               ((tm << 4) & 0xF0F0F0F0) );
                    }
                    tn = tm;

                    // Luma: four pixels per pass of the m loop.
                    for (m = 0; m < loop_limit; m += 4) {
                        for (p = 0; p < 4; p++) {
                            lpCEntry = &lpCTable[tm&pixel_mask];
                            t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
                                  GYUV[lpCEntry->rgbGreen>>1].YU +
                                  RYUV[lpCEntry->rgbRed>>1].YU );
                            *YPlane++ = (U8)((t>>8)+8);
                            tm >>= yshift;
                        }
                    }

                    // Chroma: on even lines only, from every other pixel.
                    if (!(k & 1)) {
                        for (m = 0; m < loop_limit; m += 2, tn >>= uvshift) {
                            lpCEntry = &lpCTable[tn&pixel_mask];
                            t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
                                  RYUV[lpCEntry->rgbRed>>1].YU +
                                  GYUV[lpCEntry->rgbGreen>>1].YU );
                            *UPlane++ = (U8)((t>>24)+64);
                            t = ( RYUV[lpCEntry->rgbRed>>1].V +
                                  GYUV[lpCEntry->rgbGreen>>1].V +
                                  BYUV[lpCEntry->rgbBlue>>1].V );
                            *VPlane++ = (U8)((t>>8)+64);
                        }
                    }
                }
            }

            // Once the first line of a new group exists, fill the line that
            // was left empty with the average of the lines around it.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;

                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }

            pnext += iBackTwoLines;
            YPlane += ypitch_adj;

            // Increment after even lines.
            if (!(k & 1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        }

        // Leave one luma line empty and remember its neighbours.
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            YPlane += pitch;
            pynext = (U32 *)YPlane;
        }
    }

    // The last empty line has no following neighbour: copy the previous one.
    if (stretch) {
        i = FrameWidth;
        while (i > 0) {
            *pyspace++ = *pyprev++;
            i -= 4;
        }
    }
} // end of H26X_CLUTtoYUV12()
#endif
// IA_H26X_CLUT8toYUV12
//
// Hand-scheduled x86 inline-assembler routine that converts 8-bit palettized
// input to separate Y, U and V output planes.  Each input byte selects a
// four-byte RGBQUAD entry in the colour table located immediately after the
// BITMAPINFOHEADER; the entry's blue, green and red bytes are then looked up
// in the BYUV/GYUV/RYUV tables.  Each pass of the inner loop consumes two
// input bytes and stores two luma bytes; on even lines of a group
// ((k & 1) == 0) it also stores one U and one V byte, both derived from the
// first of the two pixels.
//
// The function is naked: it saves ebp/ebx/esi/edi itself, reserves LOCALSIZE
// bytes of locals and addresses parameters and locals relative to esp using
// the offsets #defined below.
//
// Input lines are walked backwards (iBackTwoLines is negative).  When
// height_adj is non-zero ("stretch") every group of lines leaves one output
// luma line empty; it is later filled with the average of its neighbours,
// and the last such line with a copy of the line before it.
//
// Preconditions implied by the loop exits, which all test for exactly zero:
//   - FrameHeight must be a positive multiple of 48 (loop at L1);
//   - FrameWidth must be a positive multiple of 2 (pixel loop at L6), of 8
//     when stretching (averaging loop at L15) and of 4 for the final copy
//     (loop at L18).
//
// The bare numbers in trailing comments inside the pixel and averaging loops
// appear to be the scheduled cycle of the instruction pair on the next line.
__declspec(naked) _STATIC void IA_H26X_CLUT8toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * lpInput, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp once the prologue has run)
// | pitch | +100
// | FrameHeight | + 96
// | FrameWidth | + 92
// | VPlane | + 88
// | UPlane | + 84
// | YPlane | + 80
// | lpInput | + 76
// | lpbiInput | + 72
// ----------------------------
// | return addr | + 68
// | saved ebp | + 64
// | saved ebx | + 60
// | saved esi | + 56
// | saved edi | + 52
// | pyprev | + 48
// | pyspace | + 44
// | pynext | + 40
// | i | + 36
// | j | + 32
// | k | + 28
// | iBackTwoLines | + 24
// | stretch | + 20
// | mark | + 16
// | LumaIters | + 12 (lpCEntry lives in registers only)
// | lpCTable | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 52
#define PITCH_PARM 100
#define FRAME_HEIGHT 96
#define FRAME_WIDTH 92
#define VPLANE 88
#define UPLANE 84
#define YPLANE 80
#define LP_INPUT 76
#define LPBI_INPUT 72
#define PYPREV 48
#define PYSPACE 44
#define PYNEXT 40
#define LOOP_I 36
#define LOOP_J 32
#define LOOP_K 28
#define BACK_TWO_LINES 24
#define STRETCH 20
#define MARK 16
#define LUMA_ITERS 12
#define LPCTABLE 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
// Prologue: save the callee-save registers and reserve the locals.
_asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE
// lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)
// assign (ebx, lpbiInput)
mov eax, [esp + LPBI_INPUT] mov ebx, eax add eax, TYPE BITMAPINFOHEADER mov [esp + LPCTABLE], eax
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (ecx, FrameWidth)
// kill (edx, pitch)
shr ecx, 1 sub edx, ecx mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// NOTE: the loop is left only when the counter reaches exactly zero (jnz),
// so FrameHeight has to be a positive multiple of 48.
// assign (ecx, LumaIters)
xor ecx, ecx mov eax, [esp + FRAME_HEIGHT] L1: lea ecx, [ecx + 4] sub eax, 48 jnz L1
// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
// assign (edx, width_adj)
mov edx, (LPBITMAPINFOHEADER)[ebx].biWidth sub edx, [esp + FRAME_WIDTH] shr edx, 1
// aspect = (width_adj ? LumaIters : 0)
// assign (esi, aspect)
// kill (ecx, LumaIters)
mov [esp + LUMA_ITERS], ecx xor esi, esi test edx, edx jz L2 mov esi, ecx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (ecx, height_adj)
L2: mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight sub ecx, [esp + FRAME_HEIGHT] add ecx, esi shr ecx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax test ecx, ecx jz L3 inc eax L3: mov [esp + STRETCH], eax
// mark = 12 - stretch
mov edi, 12 sub edi, eax mov [esp + MARK], edi
// iNextLine = lpbiInput->biWidth
// kill (ebx, lpbiInput)
// assign (ebx, iNextLine)
mov ebx, (LPBITMAPINFOHEADER)[ebx].biWidth
// iBackTwoLines = -(iNextLine + FrameWidth)
mov edi, [esp + FRAME_WIDTH] add edi, ebx neg edi mov [esp + BACK_TWO_LINES], edi
// pnext = lpInput +
// (iNextLine*((FrameHeight-aspect-1) + height_adj)) +
// width_adj
// width_adj (edx) is added to esi before the imul because imul overwrites
// edx; the low half of the product is left in eax.
// kill (ebx, iNextLine)
// kill (ecx, height_adj)
// kill (edx, width_adj)
// kill (esi, aspect)
// assign (esi, pnext)
mov eax, [esp + FRAME_HEIGHT] sub eax, esi dec eax add eax, ecx mov esi, [esp + LP_INPUT] add esi, edx imul ebx add esi, eax
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4: xor eax, eax mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 2, pnext += 2)
L5: mov eax, [esp + FRAME_WIDTH] mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts on the U pipe
jmp L6 L6:
// lpCEntry = &lpCTable[*(pnext+1)]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *(YPlane+1) = (U8)((t>>8)+8)
// lpCEntry = &lpCTable[*pnext]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *YPlane = (U8)((t>>8)+8)
// YPlane += 2
// *UPlane++ = (U8)((t>>24)+64)
// t = ( RYUV[lpCEntry->rgbRed>>1].V +
// GYUV[lpCEntry->rgbGreen>>1].V +
// BYUV[lpCEntry->rgbBlue>>1].V )
// *VPlane++ = (U8)((t>>8)+64)
// The ">>1" table index is formed by clearing bit 0 of the colour byte
// (and ..., 0xFE) and scaling by 4 in the table addressing.
// assign (ebp: lpCEntry,B1)
// assign (eax: P2,B2,Y2,Y1,U)
// assign (ebx: B1,V)
// assign (ecx: G2,G1)
// assign (edx: R2,R1)
// 1
xor eax, eax mov ebp, [esp + LPCTABLE] // 2
mov al, [esi + 1] xor ecx, ecx // 3
lea ebx, [ebp+eax*4] xor edx, edx // 4
mov al, (LPRGBQUAD)[ebx].rgbBlue nop // 5
mov cl, (LPRGBQUAD)[ebx].rgbGreen and al, 0xFE // 6
mov dl, (LPRGBQUAD)[ebx].rgbRed and cl, 0xFE // 7
mov eax, [BYUV+eax*4].YU and dl, 0xFE // 8
add eax, [GYUV+ecx*4].YU xor ebx, ebx // 9
add eax, [RYUV+edx*4].YU mov bl, [esi] // 10
sar eax, 8 lea ebp, [ebp+ebx*4] // 11
add eax, 8 nop // 12
mov [edi + 1], al mov bl, (LPRGBQUAD)[ebp].rgbBlue // 13
mov cl, (LPRGBQUAD)[ebp].rgbGreen and bl, 0xFE // 14
mov dl, (LPRGBQUAD)[ebp].rgbRed and cl, 0xFE // 15
mov eax, [BYUV+ebx*4].YU and dl, 0xFE // 16
add eax, [GYUV+ecx*4].YU mov ebp, ebx // 17
add eax, [RYUV+edx*4].YU nop // 18
sar eax, 8 mov ebx, [esp + LOOP_K] // 19
add eax, 8 and ebx, 1 // 20
// The store below leaves the flags from "and ebx, 1" intact, so the jnz
// skips the chroma code on odd lines (k & 1).
mov [edi], al jnz L9 // 21
mov ebx, [RYUV+edx*4].V mov edx, [esp + UPLANE] // 22
sar eax, 16 add ebx, [GYUV+ecx*4].V // 23
add eax, 64 add ebx, [BYUV+ebp*4].V // 24
mov [edx], al inc edx // 25
mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 26
sar ebx, 8 inc edx // 27
add ebx, 64 mov [esp + VPLANE], edx // 28
mov [edx - 1], bl nop L9: // 29
mov eax, [esp + LOOP_I] lea esi, [esi + 2] // 30
sub eax, 2 lea edi, [edi + 2] // 31
mov [esp + LOOP_I], eax jnz L6
// only esi (pnext) is live at this point (after line loop)
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH] test eax, eax jz L14 mov eax, [esp + LOOP_K] test eax, eax jnz L14 mov eax, [esp + LOOP_J] test eax, eax jz L14
// spill YPlane ptr (its parameter slot doubles as the spill location)
mov [esp + YPLANE], edi nop
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV] mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT] mov ebp, [esp + FRAME_WIDTH]
// Average four luma bytes at a time: each byte is halved (the mask keeps
// bits from crossing byte boundaries) and the two halves are added.
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
L15: // 1
mov eax, [ebx] lea ebx, [ebx + 4] // 2
mov ecx, [edx] lea edx, [edx + 4] // 3
shr ecx, 1 and eax, 0xFEFEFEFE // 4
shr eax, 1 and ecx, 0x7F7F7F7F // 5
add eax, ecx mov ecx, [ebx] // 6
shr ecx, 1 mov [edi], eax // 7
mov eax, [edx] and ecx, 0x7F7F7F7F // 8
shr eax, 1 lea edi, [edi + 4] // 9
and eax, 0x7F7F7F7F lea ebx, [ebx + 4] // 10
lea edx, [edx + 4] add eax, ecx // 11
mov [edi], eax lea edi, [edi + 4] // 12
sub ebp, 8 jnz L15
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L14: add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K] and eax, 1 jnz L16
// UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ] add [esp + UPLANE], eax add [esp + VPLANE], eax
// end of for k: k++, repeat while k < mark
L16: inc DWORD PTR [esp + LOOP_K] mov eax, [esp + LOOP_K] cmp eax, [esp + MARK] jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0 je L17
// pyprev = YPlane - pitch
mov eax, edi sub eax, [esp + PITCH_PARM] mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM] mov [esp + PYNEXT], edi
// end of for j: j++, repeat while j < LumaIters
L17: inc DWORD PTR [esp + LOOP_J] mov eax, [esp + LOOP_J] cmp eax, [esp + LUMA_ITERS] jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// if (stretch)
mov esi, [esp + PYPREV] cmp DWORD PTR [esp + STRETCH], 0 je L19
// Fill the last empty luma line with a copy of the line before it.
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH] mov edi, [esp + PYSPACE] L18: mov ecx, [esi] lea esi, [esi + 4] mov [edi], ecx lea edi, [edi + 4] sub ebp, 4 jnz L18
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
// Epilogue: release the locals and restore the callee-save registers.
L19: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret
} }
// Remove the frame-offset names so that the next routine can define its own.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef LPCTABLE
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
__declspec(naked) _STATIC void IA_H26X_CLUT4toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * lpInput, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | +100
// | FrameHeight | + 96
// | FrameWidth | + 92
// | VPlane | + 88
// | UPlane | + 84
// | YPlane | + 80
// | lpInput | + 76
// | lpbiInput | + 72
// ----------------------------
// | return addr | + 68
// | saved ebp | + 64
// | saved ebx | + 60
// | saved esi | + 56
// | saved edi | + 52
// | pyprev | + 48
// | pyspace | + 44
// | pynext | + 40
// | i | + 36
// | j | + 32
// | k | + 28
// | iBackTwoLines | + 24
// | stretch | + 20
// | mark | + 16
// | lpCEntry | + 12
// | lpCTable | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 52
#define PITCH_PARM 100
#define FRAME_HEIGHT 96
#define FRAME_WIDTH 92
#define VPLANE 88
#define UPLANE 84
#define YPLANE 80
#define LP_INPUT 76
#define LPBI_INPUT 72
#define PYPREV 48
#define PYSPACE 44
#define PYNEXT 40
#define LOOP_I 36
#define LOOP_J 32
#define LOOP_K 28
#define BACK_TWO_LINES 24
#define STRETCH 20
#define MARK 16
#define LUMA_ITERS 12
#define LPCTABLE 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE
// lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)
// assign (ebx, lpbiInput)
mov eax, [esp + LPBI_INPUT] mov ebx, eax add eax, TYPE BITMAPINFOHEADER mov [esp + LPCTABLE], eax // ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (ecx, FrameWidth)
// kill (edx, pitch)
shr ecx, 1 sub edx, ecx mov [esp + UVPITCH_ADJ], edx // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (ecx, LumaIters)
xor ecx, ecx mov eax, [esp + FRAME_HEIGHT] L1: lea ecx, [ecx + 4] sub eax, 48 jnz L1 // width_adj = ((lpbiInput->biWidth - FrameWidth) >> 2
// assign (edx, width_adj)
mov edx, (LPBITMAPINFOHEADER)[ebx].biWidth sub edx, [esp + FRAME_WIDTH] shr edx, 2 // aspect = (width_adj ? LumaIters : 0)
// assign (esi, aspect)
// kill (ecx, LumaIters)
mov [esp + LUMA_ITERS], ecx xor esi, esi test edx, edx jz L2 mov esi, ecx // height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (ecx, height_adj)
L2: mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight sub ecx, [esp + FRAME_HEIGHT] add ecx, esi shr ecx, 1 // stretch = (height_adj ? 1 : 0)
xor eax, eax test ecx, ecx jz L3 inc eax L3: mov [esp + STRETCH], eax // mark = 12 - stretch
mov edi, 12 sub edi, eax mov [esp + MARK], edi // iNextLine = lpbiInput->biWidth >> 1
// kill (ebx, lpbiInput)
// assign (ebx, iNextLine)
mov ebx, (LPBITMAPINFOHEADER)[ebx].biWidth shr ebx, 1 // iBackTwoLines = -(iNextline + (FrameWidth >> 1))
mov edi, [esp + FRAME_WIDTH] shr edi, 1 add edi, ebx neg edi mov [esp + BACK_TWO_LINES], edi // pnext = lpInput+(iNextLine*((FrameHeight-aspect-1)+height_adj))+ width_adj
// kill (ebx, iNextLine)
// kill (ecx, height_adj)
// kill (edx, width_adj)
// kill (esi, aspect)
// assign (esi, pnext)
mov eax, [esp + FRAME_HEIGHT] sub eax, esi dec eax add eax, ecx mov esi, [esp + LP_INPUT] add esi, edx imul ebx add esi, eax // assign (edi, YPlane)
mov edi, [esp + YPLANE] // for (j = 0; j < LumaIters; j++)
xor eax, eax mov [esp + LOOP_J], eax // for (k = 0; k < mark; k++)
L4: xor eax, eax mov [esp + LOOP_K], eax // for (i = FrameWidth; i > 0; i -= 2, pnext++)
L5: mov eax, [esp + FRAME_WIDTH] mov [esp + LOOP_I], eax // This jump is here to make sure the following loop starts on the U pipe
jmp L6 L6: // lpCEntry = &lpCTable[*pnext&0xF]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *(YPlane+1) = (U8)((t>>8)+8)
// lpCEntry = &lpCTable[(*pnext>>4)&0xF]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *YPlane = (U8)((t>>8)+8)
// YPlane += 2
// *UPlane++ = (U8)((t>24)+64)
// t = ( RYUV[lpCEntry->rgbRed>>1].V +
// GYUV[lpCEntry->rgbGreen>>1].V +
// BYUV[lpCEntry->rgbBlue>>1].V )
// *VPlane++ = (U8)((t>>8)+64)
// assign (ebp: lpCEntry,B1)
// assign (eax: P2,B2,Y2,Y1,U)
// assign (ebx: B1,V)
// assign (ecx: G2,G1)
// assign (edx: R2,R1)
// 1
mov al, [esi] mov ebp, [esp + LPCTABLE] // 2
and eax, 0xF xor ecx, ecx // 3
lea ebx, [ebp+eax*4] xor edx, edx // 4
mov al, (LPRGBQUAD)[ebx].rgbBlue nop // 5
mov cl, (LPRGBQUAD)[ebx].rgbGreen and al, 0xFE // 6
mov dl, (LPRGBQUAD)[ebx].rgbRed and cl, 0xFE // 7
mov eax, [BYUV+eax*4].YU and dl, 0xFE // 8
add eax, [GYUV+ecx*4].YU mov bl, [esi] // 9
add eax, [RYUV+edx*4].YU and ebx, 0xF0 //
shr ebx, 4 nop // 10
shr eax, 8 lea ebp, [ebp+ebx*4] // 11
add eax, 8 nop // 12
mov [edi + 1], al mov bl, (LPRGBQUAD)[ebp].rgbBlue // 13
mov cl, (LPRGBQUAD)[ebp].rgbGreen and bl, 0xFE // 14
mov dl, (LPRGBQUAD)[ebp].rgbRed and cl, 0xFE // 15
mov eax, [BYUV+ebx*4].YU and dl, 0xFE // 16
add eax, [GYUV+ecx*4].YU mov ebp, ebx // 17
add eax, [RYUV+edx*4].YU nop // 18
shr eax, 8 mov ebx, [esp + LOOP_K] // 19
add eax, 8 and ebx, 1 // 20
mov [edi], al jnz L9 // 21
mov ebx, [RYUV+edx*4].V mov edx, [esp + UPLANE] // 22
sar eax, 16 add ebx, [GYUV+ecx*4].V // 23
add eax, 64 add ebx, [BYUV+ebp*4].V // 24
mov [edx], al inc edx // 25
mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 26
sar ebx, 8 inc edx // 27
add ebx, 64 mov [esp + VPLANE], edx // 28
mov [edx - 1], bl nop L9: // 32
mov eax, [esp + LOOP_I] lea esi, [esi + 1] // 33
sub eax, 2 lea edi, [edi + 2] // 34
mov [esp + LOOP_I], eax jnz L6
// only esi (pnext) is live at this point (after line loop)
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH] test eax, eax jz L14 mov eax, [esp + LOOP_K] test eax, eax jnz L14 mov eax, [esp + LOOP_J] test eax, eax jz L14
// spill YPlane ptr
mov [esp + YPLANE], edi nop
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV] mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT] mov ebp, [esp + FRAME_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
L15: // 1
mov eax, [ebx] lea ebx, [ebx + 4] // 2
mov ecx, [edx] lea edx, [edx + 4] // 3
shr ecx, 1 and eax, 0xFEFEFEFE // 4
shr eax, 1 and ecx, 0x7F7F7F7F // 5
add eax, ecx mov ecx, [ebx] // 6
shr ecx, 1 mov [edi], eax // 7
mov eax, [edx] and ecx, 0x7F7F7F7F // 8
shr eax, 1 lea edi, [edi + 4] // 9
and eax, 0x7F7F7F7F lea ebx, [ebx + 4] // 10
lea edx, [edx + 4] add eax, ecx // 11
mov [edi], eax lea edi, [edi + 4] // 12
sub ebp, 8 jnz L15 // kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L14: add esi, [esp + BACK_TWO_LINES] // YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ] // if(0 == (k&1))
mov eax, [esp + LOOP_K] and eax, 1 jnz L16 // UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ] add [esp + UPLANE], eax add [esp + VPLANE], eax
L16: inc DWORD PTR [esp + LOOP_K] mov eax, [esp + LOOP_K] cmp eax, [esp + MARK] jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0 je L17 // pyprev = YPlane - pitch
mov eax, edi sub eax, [esp + PITCH_PARM] mov [esp + PYPREV], eax // pyspace = YPlane
mov [esp + PYSPACE], edi // pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM] mov [esp + PYNEXT], edi
L17: inc DWORD PTR [esp + LOOP_J] mov eax, [esp + LOOP_J] cmp eax, [esp + LUMA_ITERS] jl L4
// kill(esi, pnext)
// if (stretch)
mov esi, [esp + PYPREV] cmp DWORD PTR [esp + STRETCH], 0 je L19
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov edi, [esp + PYSPACE] mov ebp, [esp + FRAME_WIDTH] L18: mov ecx, [esi] lea esi, [esi + 4] mov [edi], ecx lea edi, [edi + 4] sub ebp, 4 jnz L18 // kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L19: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret
} }
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef LPCTABLE
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/***************************************************
 * H26X_YVU9toYUV12()
 *   Convert from YVU9 to YUV12 and copy to destination
 *   memory whose row pitch is given by the pitch
 *   argument (originally described as the constant PITCH).
 *
 * uv_plane_common()
 *   Helper function to convert V and U plane information.
 *   Since the process is similar for both planes, the
 *   conversion code was included in this subroutine.
 *
 ***************************************************/
#if 0
/*
 * Packed-byte helpers for the C reference routines below. Each macro works on
 * four (or eight) samples held in a single integer.
 */

/* val = dword at src with every byte halved (LSB cleared, then shifted right by 1). */
#define READ_DWORD_AND_SHIFT(val,src) \
    (((val) = *((unsigned int *)(src))), ((val) &= 0xFEFEFEFE), ((val) >>= 1))

/* 64-bit form of READ_DWORD_AND_SHIFT. */
#define READ_QWORD_AND_SHIFT(val,src) \
    (((val) = *((unsigned __int64 *)(src))), ((val) &= 0xFEFEFEFEFEFEFEFE), ((val) >>= 1))

/* Store val as a dword / qword at dest. */
#define WRITE_DWORD(dest,val) ((*(unsigned int *)(dest)) = (val))
#define WRITE_QWORD(dest,val) ((*(unsigned __int64 *)(dest)) = (val))

/*
 * out = per-byte average of in1 and in2. The routines below only pass values
 * that have already been halved, so a byte sum cannot carry into its neighbour.
 */
#define AVERAGE_DWORDS(out,in1,in2) ((out) = ((((in1) + (in2)) & 0xFEFEFEFE) >> 1))

/* Write bytes 0,0,1,1 of val (lowest address first) to the dword at dest. */
#define DUP_LOWER_TWO_BYTES(dest,val) \
    (*((unsigned int *)(dest)) = (((val) & 0x000000FF) | (((val) << 8) & 0x0000FF00) | \
     (((val) << 8) & 0x00FF0000) | (((val) << 16) & 0xFF000000)))

/* Write bytes 2,2,3,3 of val (lowest address first) to the dword at dest. */
#define DUP_UPPER_TWO_BYTES(dest,val) \
    (*((unsigned int *)(dest)) = ((((val) >> 16) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \
     (((val) >> 8) & 0x00FF0000) | ((val) & 0xFF000000)))

/*
 * C_uv_plane_common - C reference for IA_uv_plane_common().
 *
 * Expands one chroma source plane into Plane. Every source byte is halved and
 * written twice horizontally, and every source row yields two destination
 * rows: the doubled row itself (pdest_copy) and the doubled average of that
 * row with the following source row (pdest_avg).
 *
 * When spitch_adj is non-zero ("stretch"), each outer iteration handles
 * mark + (flag & 1) source rows and then writes one extra destination row
 * built from the previous pair of source rows, averaged twice.
 *
 *   psrc             first source byte
 *   Plane            first destination byte
 *   pitch            byte distance between destination rows
 *   OutputFrameWidth bytes written per destination row
 *   ChromaIters      number of outer iterations
 *   spitch_adj       bytes skipped at the end of every source row
 */
_STATIC void C_uv_plane_common(
    U8 *psrc,
    U8 *Plane,
    UN pitch,
    UN OutputFrameWidth,
    UN ChromaIters,
    UN spitch_adj)
{
    U8 *pnext      = psrc + (OutputFrameWidth>>1) + spitch_adj;
    U8 *pdest_copy = Plane;
    U8 *pdest_avg  = Plane + pitch;
    int dpitch_adj = pitch - OutputFrameWidth;
    int stretch    = (spitch_adj ? 1 : 0);
    int mark       = 6 - stretch;
    int flag       = stretch;
    int i, j, k;
    UN t1,t2;

    j = ChromaIters;
    while (j > 0) {
        k = mark + (flag & 1);
        while (k > 0) {
            /* Very last source row without stretching: average it with itself. */
            if ((0 == stretch) && (1 == j) && (1 == k)) {
                pnext = psrc;
            }

            i = OutputFrameWidth;
            while (i > 0) {
                /* Doubled copy of the current source row. */
                READ_DWORD_AND_SHIFT(t1,psrc);
                DUP_LOWER_TWO_BYTES(pdest_copy,t1);
                DUP_UPPER_TWO_BYTES((pdest_copy+4),t1);

                /* Doubled average of the current and the next source row. */
                READ_DWORD_AND_SHIFT(t2,pnext);
                AVERAGE_DWORDS(t1,t1,t2);
                DUP_LOWER_TWO_BYTES(pdest_avg,t1);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),t1);

                i -= 8;
                psrc += 4;
                pnext += 4;
                pdest_copy += 8;
                pdest_avg += 8;
            }

            psrc += spitch_adj;
            pnext += spitch_adj;
            pdest_copy = pdest_avg + dpitch_adj;
            pdest_avg = pdest_copy + pitch;
            k--;
        }

        if (stretch) {
            /* Step both source pointers back one row and emit the extra row. */
            psrc -= ((OutputFrameWidth>>1) + spitch_adj);
            pnext -= ((OutputFrameWidth>>1) + spitch_adj);
            pdest_avg = pdest_copy;

            i = OutputFrameWidth;
            while (i > 0) {
                READ_DWORD_AND_SHIFT(t1,psrc);
                READ_DWORD_AND_SHIFT(t2,pnext);
                AVERAGE_DWORDS(t1,t1,t2);
                AVERAGE_DWORDS(t1,t1,t2);
                DUP_LOWER_TWO_BYTES(pdest_avg,t1);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),t1);

                i -= 8;
                psrc += 4;
                pnext += 4;
                pdest_avg += 8;
            }

            psrc += spitch_adj;
            pnext += spitch_adj;
            pdest_copy = pdest_avg + dpitch_adj;
            pdest_avg = pdest_copy + pitch;
            flag++;
        }
        j--;
    }
}

/*
 * C_H26X_YVU9toYUV12 - C reference for IA_H26X_YVU9toYUV12().
 *
 * Luma: copies FrameWidth bytes per row from the input (starting
 * biWidth * height_adj + width_adj bytes into lpInput) to YPlane, halving
 * every byte. Rows are handled in LumaIters groups of mark rows; when stretch
 * is set, each group is followed by one extra row averaging the row biWidth
 * bytes before pnext with the row at pnext.
 *
 * Chroma: the V source plane is taken to follow the biWidth * biHeight luma
 * bytes, the U source plane follows (biWidth>>2) * (biHeight>>2) bytes later;
 * both are expanded by C_uv_plane_common().
 */
_STATIC void C_H26X_YVU9toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
    U8 *pnext, *plast, *pbn;
    U8 *pvsrc, *pusrc;
    int width_adj, height_adj;
    int stretch, mark, aspect;
    int iNextLine;
    int i, j, k, t;
    int LumaIters = 0;
    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);   /* not referenced by this C version */
    U32 t_last, t_next;

    /* Four luma iterations for every 48 rows of frame height. */
    i = FrameHeight;
    while (i > 0) {
        LumaIters += 4;
        i -= 48;
    }

    width_adj  = (lpbiInput->biWidth - FrameWidth) >> 1;
    aspect     = (width_adj ? LumaIters : 0);
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch    = (height_adj ? 1 : 0);
    mark       = 12 - stretch;
    iNextLine  = width_adj << 1;
    pnext      = lpInput + (lpbiInput->biWidth * height_adj) + width_adj;

    j = LumaIters;
    while (j > 0) {
        k = mark;
        while (k > 0) {
            i = FrameWidth;
            while (i > 0) {
                *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;
                i -= 4;
                YPlane += 4;
                pnext += 4;
            }
            pnext += iNextLine;
            YPlane += ypitch_adj;
            k--;
        }

        if (stretch) {
            plast = pnext - lpbiInput->biWidth;
            pbn = pnext;
            i = FrameWidth;
            while (i > 0) {
                t_last = (*(U32 *)plast & 0xFEFEFEFE) >> 1;
                t_next = (*(U32 *)pbn & 0xFEFEFEFE) >> 1;
                *(U32 *)YPlane = ((t_last + t_next) & 0xFEFEFEFE) >> 1;
                i -= 4;
                YPlane += 4;
                plast += 4;
                pbn += 4;
            }
            YPlane += ypitch_adj;
        }
        j--;
    }

    pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight);
    pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2));
    t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2);
    pvsrc += t;
    pusrc += t;

    C_uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
    C_uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
}
#endif
// IA_uv_plane_common - x86 inline-assembly chroma plane expander.
//
// Reads the source chroma plane at psrc one dword (4 bytes) at a time. Every
// byte is halved ((b & 0xFE) >> 1) and written twice horizontally, so 4 source
// bytes become 8 destination bytes. Each source row produces two destination
// rows: pdest_copy receives the doubled row itself and pdest_avg receives the
// doubled average of that row and the following source row (pnext).
//
// stretch is 1 when spitch_adj is non-zero. In that case the k loop handles
// mark + (flag & 1) source rows per j iteration (flag is incremented on every
// j iteration, so the count alternates between 6 and 5), and one additional
// destination row is written after the k loop, built from the previous pair of
// source rows with the average applied twice.
//
// Parameters (all read from the stack, see the frame layout below):
//   psrc             - first source byte to read
//   Plane            - first destination byte to write
//   pitch            - byte distance between destination rows
//   OutputFrameWidth - bytes written per destination row
//   ChromaIters      - iteration count of the outer (j) loop
//   spitch_adj       - bytes added to the source pointers at the end of each
//                      source row; a non-zero value also selects stretch mode
//
// The routine is naked: it saves and restores ebp, ebx, esi and edi itself and
// returns with a plain ret, so the caller removes the six arguments.
// NOTE(review): the i, k and j loops all end on jnz after a subtract/decrement,
// so OutputFrameWidth is presumably always a non-zero multiple of 8 and
// ChromaIters non-zero - confirm against the callers.
__declspec(naked) _STATIC void IA_uv_plane_common( U8 *psrc, U8 *Plane, UN pitch, UN OutputFrameWidth, UN ChromaIters, UN spitch_adj)
{ // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | spitch_adj | + 64
// | ChromaIters | + 60
// | OutputFrameWidth| + 56
// | pitch | + 52
// | Plane | + 48
// | psrc | + 44
// -----------------------------
// | return addr | + 40
// | saved ebp | + 36
// | saved ebx | + 32
// | saved esi | + 28
// | saved edi | + 24
// | dpitch_adj | + 20
// | stretch | + 16
// | mark | + 12
// | flag | + 8
// | j | + 4
// | k | + 0
// Offsets below are relative to esp after the four pushes and the
// sub esp, LOCALSIZE in the prologue.
#define LOCALSIZE 24
#define SPITCH_ADJ 64
#define CHROMA_ITERS 60
#define OUTPUT_FRAME_WIDTH 56
#define PITCH_PARM 52
#define PLANE 48
#define PSRC 44
#define DPITCH_ADJ 20
#define STRETCH 16
#define MARK 12
#define FLAG 8
#define LOOP_J 4
#define LOOP_K 0
_asm {
push ebp push ebx push esi push edi sub esp, LOCALSIZE
// pnext = psrc + (OutputFrameWidth>>1) + spitch_adj
// pdest_copy = Plane
// pdest_avg = Plane + pitch
// assign (esi, psrc)
// assign (ecx, pnext)
// assign (edi, pdest_copy)
// assign (edx, pdest_avg)
// assign (ebp, i)
mov esi, [esp + PSRC] mov ecx, esi mov eax, [esp + OUTPUT_FRAME_WIDTH] shr eax, 1 add eax, [esp + SPITCH_ADJ] add ecx, eax mov edi, [esp + PLANE] mov edx, edi add edx, [esp + PITCH_PARM] // dpitch_adj = pitch - OutputFrameWidth
mov eax, [esp + PITCH_PARM] sub eax, [esp + OUTPUT_FRAME_WIDTH] mov [esp + DPITCH_ADJ], eax // stretch = (spitch_adj ? 1 : 0)
xor ebx, ebx mov eax, [esp + SPITCH_ADJ] test eax, eax jz L1 inc ebx L1: mov [esp + STRETCH], ebx // mark = 6 - stretch
mov eax, 6 sub eax, ebx mov [esp + MARK], eax // flag = stretch
mov DWORD PTR [esp + FLAG], ebx
// for (j = ChromaIters; j > 0; j--)
mov eax, [esp + CHROMA_ITERS] mov [esp + LOOP_J], eax L2: // for (k = mark + (flag & 1); k > 0; k--)
mov eax, [esp + FLAG] and eax, 1 add eax, [esp + MARK] mov [esp + LOOP_K], eax L3: // if (!stretch && (1 == j) && (1 == k))
// (j and k count down, so 1 == j && 1 == k is the final source row; there
// pnext is pointed back at psrc so the row is averaged with itself)
mov eax, [esp + STRETCH] test eax, eax jnz L4 mov eax, [esp + LOOP_J] cmp eax, 1 jne L4 mov eax, [esp + LOOP_K] cmp eax, 1 jne L4 // pnext = psrc
mov ecx, esi L4: // for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4,
// pdest_copy += 8, pdest_avg += 8)
mov ebp, [esp + OUTPUT_FRAME_WIDTH] // Pentium pipeline scheduling has not been performed on the following loop code yet
L5: // READ_DWORD_AND_SHIFT(t1,psrc)
mov eax, [esi] and eax, 0xFEFEFEFE shr eax, 1 // DUP_LOWER_TWO_BYTES(pdest_copy,t1)
// ebx is assembled as (ah, ah, al, al) from high byte to low byte
mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi], ebx // DUP_UPPER_TWO_BYTES((pdest_copy+4),t1)
shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edi+4], ebx // READ_DWORD_AND_SHIFT(t2,pnext)
// AVERAGE_DWORDS(t1,t1,t2)
// t1 is re-read from [esi] here because eax was shifted by the duplication above
mov eax, [esi] and eax, 0xFEFEFEFE shr eax, 1 mov ebx, [ecx] and ebx, 0xFEFEFEFE shr ebx, 1 add eax, ebx and eax, 0xFEFEFEFE shr eax, 1 // DUP_LOWER_TWO_BYTES(pdest_avg,t1)
mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edx], ebx // DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)
shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edx+4], ebx // end of i loop
lea esi, [esi + 4] lea ecx, [ecx + 4] lea edi, [edi + 8] lea edx, [edx + 8] sub ebp, 8 jnz L5
// psrc += spitch_adj
// pnext += spitch_adj
// pdest_copy = pdest_avg + dpitch_adj
// pdest_avg = pdest_copy + pitch
add esi, [esp + SPITCH_ADJ] add ecx, [esp + SPITCH_ADJ] mov eax, edx add eax, [esp + DPITCH_ADJ] mov edi, eax mov edx, edi add edx, [esp + PITCH_PARM] // end of k loop
dec DWORD PTR [esp + LOOP_K] jnz L3
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0 jz L6
// psrc -= ((OutputFrameWidth>>1)+spitch_adj)
// pnext -= ((OutputFrameWidth>>1)+spitch_adj)
// pdest_avg = pdest_copy
mov eax, [esp + OUTPUT_FRAME_WIDTH] shr eax, 1 add eax, [esp + SPITCH_ADJ] sub esi, eax sub ecx, eax mov edx, edi // for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4, pdest_avg += 8)
mov ebp, [esp + OUTPUT_FRAME_WIDTH] // Pentium pipeline scheduling has not been performed on the following loop code yet
L7: // READ_DWORD_AND_SHIFT(t1,psrc)
mov eax, [esi] and eax, 0xFEFEFEFE shr eax, 1 // READ_DWORD_AND_SHIFT(t2,pnext)
mov ebx, [ecx] and ebx, 0xFEFEFEFE shr ebx, 1 // AVERAGE_DWORDS(t1,t1,t2)
// AVERAGE_DWORDS(t1,t1,t2)
// (averaging with t2 twice weights the result 1/4 t1 + 3/4 t2)
add eax, ebx and eax, 0xFEFEFEFE shr eax, 1 add eax, ebx and eax, 0xFEFEFEFE shr eax, 1 // DUP_LOWER_TWO_BYTES(pdest_avg,t1)
mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edx], ebx // DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)
shr eax, 16 mov bl, ah mov bh, ah shl ebx, 16 mov bl, al mov bh, al mov [edx+4], ebx // end of i loop
lea esi, [esi + 4] lea ecx, [ecx + 4] lea edx, [edx + 8] sub ebp, 8 jnz L7
// psrc += spitch_adj
// pnext += spitch_adj
// pdest_copy = pdest_avg + dpitch_adj
// pdest_avg = pdest_copy + pitch
// flag++
add esi, [esp + SPITCH_ADJ] add ecx, [esp + SPITCH_ADJ] mov eax, edx add eax, [esp + DPITCH_ADJ] mov edi, eax mov edx, edi add edx, [esp + PITCH_PARM] inc DWORD PTR [esp + FLAG]
// end of j loop
L6: dec DWORD PTR [esp + LOOP_J] jnz L2
// epilogue: drop the locals, restore the callee-save registers, plain ret
add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret
} }
#undef LOCALSIZE
#undef SPITCH_ADJ
#undef CHROMA_ITERS
#undef OUTPUT_FRAME_WIDTH
#undef PITCH_PARM
#undef PLANE
#undef PSRC
#undef DPITCH_ADJ
#undef STRETCH
#undef MARK
#undef FLAG
#undef LOOP_J
#undef LOOP_K
// IA_H26X_YVU9toYUV12 - x86 inline-assembly YVU9 to YUV12 input conversion.
//
// Luma: starting at lpInput + biWidth * height_adj + width_adj, copies
// FrameWidth bytes per row into YPlane, halving every byte
// ((dword & 0xFEFEFEFE) >> 1). After each row the source pointer skips
// iNextLine (2 * width_adj) bytes and the destination skips ypitch_adj bytes.
// Rows are processed in LumaIters groups of mark rows (LumaIters grows by 4
// for every 48 rows of FrameHeight). When stretch is set (height_adj != 0),
// mark is 11 and each group is followed by one extra output row that averages
// the input row biWidth bytes before pnext (plast) with the row at pnext (pbn).
//
// Chroma: the V source plane is taken to start biWidth * biHeight bytes after
// lpInput and the U source plane (biWidth>>2) * (biHeight>>2) bytes after
// that; both are offset by t and passed to IA_uv_plane_common together with
// UPlane / VPlane.
//
// The routine is naked: all eight arguments are read from the stack (see the
// frame layout below), ebp/ebx/esi/edi are saved and restored here, and the
// function returns with a plain ret.
__declspec(naked) _STATIC void IA_H26X_YVU9toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 *lpInput, U8 *YPlane, U8 *UPlane, U8 *VPlane, UN FrameWidth, UN FrameHeight, const int pitch)
{ // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | + 88
// | FrameHeight | + 84
// | FrameWidth | + 80
// | VPlane | + 76
// | UPlane | + 72
// | YPlane | + 68
// | lpInput | + 64
// | lpbiInput | + 60
// -----------------------------
// | return addr | + 56
// | saved ebp | + 52
// | saved ebx | + 48
// | saved esi | + 44
// | saved edi | + 40
// | width_adj | + 36
// | height_adj | + 32
// | stretch | + 28
// | mark | + 24
// | iNextLine | + 20
// | j | + 16
// | k | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
// Offsets below are relative to esp after the four pushes and the
// sub esp, LOCALSIZE in the prologue.
#define LOCALSIZE 40
#define PITCH_PARM 88
#define FRAME_HEIGHT 84
#define FRAME_WIDTH 80
#define VPLANE 76
#define UPLANE 72
#define YPLANE 68
#define LP_INPUT 64
#define LPBI_INPUT 60
#define WIDTH_ADJ 36
#define HEIGHT_ADJ 32
#define STRETCH 28
#define MARK 24
#define NEXT_LINE 20
#define LOOP_J 16
#define LOOP_K 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
push ebp push ebx push esi push edi sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT] // ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx shr ebp, 1 sub edx, ebp mov [esp + UVPITCH_ADJ], edx // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
// NOTE(review): the loop below exits on jnz, i.e. only when the count reaches
// exactly 0, whereas the C form tests i > 0 - FrameHeight is presumably always
// a non-zero multiple of 48; confirm against callers.
xor edx, edx mov eax, [esp + FRAME_HEIGHT] L1: lea edx, [edx + 4] sub eax, 48 jnz L1 // width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth sub esi, [esp + FRAME_WIDTH] shr esi, 1 mov [esp + WIDTH_ADJ], esi // aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx xor edi, edi test esi, esi jz L2 mov edi, edx // height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2: mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight sub edx, [esp + FRAME_HEIGHT] add edx, edi shr edx, 1 mov [esp + HEIGHT_ADJ], edx // stretch = (height_adj ? 1 : 0)
xor eax, eax test edx, edx jz L3 inc eax L3: mov [esp + STRETCH], eax // mark = 12 - stretch
mov ebp, 12 sub ebp, eax mov [esp + MARK], ebp // iNextLine = width_adj << 1
mov ebp, esi shl ebp, 1 mov [esp + NEXT_LINE], ebp // pnext = lpInput + (lpbiInput->biWidth * height_adj) + width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
// (the one-operand imul ebx below leaves its product in edx:eax, which is why
// edx / height_adj is listed as killed)
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth mov ebx, edx imul ebx add esi, eax add esi, [esp + LP_INPUT] // assign (edi, YPlane)
mov edi, [esp + YPLANE] // for (j = LumaIters; j > 0; j--)
mov eax, [esp + LUMA_ITERS] mov [esp + LOOP_J], eax // for (k = mark; k > 0; k--)
L4: mov eax, [esp + MARK] mov [esp + LOOP_K], eax // for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, pnext += 4)
// assign (ebp, i)
L5: mov ebp, [esp + FRAME_WIDTH] // This jump is here to make sure the following loop starts on the U pipe
jmp L6 L6: // *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;
// 1
mov eax, [esi] lea esi, [esi + 4] // 2
and eax, 0xFEFEFEFE lea edi, [edi + 4] // 3
shr eax, 1 sub ebp, 4 // 4
// (the mov below does not change the flags, so jnz still tests the sub above)
mov [edi - 4], eax jnz L6
// pnext += iNextLine
// YPlane += ypitch_adj
add esi, [esp + NEXT_LINE] add edi, [esp + YPITCH_ADJ]
// end of k loop
mov eax, [esp + LOOP_K] sub eax, 1 mov [esp + LOOP_K], eax jnz L5
// if (stretch)
mov eax, [esp + STRETCH] test eax, eax jz L7
// plast = pnext - lpbiInput->biWidth
// pbn = pnext
// assign (ecx, plast)
// assign (edx, pbn)
mov ecx, esi mov eax, [esp + LPBI_INPUT] sub ecx, (LPBITMAPINFOHEADER)[eax].biWidth mov edx, esi
// for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, plast += 4, pbn += 4)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH] // This jump is here just to make sure the loop code starts with the U pipe
jmp L8 L8: // *(U32 *)YPlane =
// ( ( ((*(U32 *)plast & 0xFEFEFEFE) >> 1) +
// ((*(U32 *)pbn & 0xFEFEFEFE) >> 1) ) & 0xFEFEFEFE ) >> 1
// 1
mov eax, [ecx] lea ecx, [ecx + 4] // 2
shr eax, 1 // 3
and eax, 0x7F7F7F7F mov ebx, [edx] // 4
shr ebx, 1 lea edi, [edi + 4] // 5
and ebx, 0x7F7F7F7F // 6
add eax, ebx // 7
and eax, 0xFEFEFEFE // 8
shr eax, 1 // 9
mov [edi - 4], eax sub ebp, 4 // 10
lea edx, [edx + 4] jnz L8
// YPlane += ypitch_adj
add edi, [esp + YPITCH_ADJ]
L7: // end of the LumaIters loop
dec DWORD PTR [esp + LOOP_J] jnz L4
// pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight)
// assign (esi, pvsrc)
mov eax, [esp + LPBI_INPUT] mov ebx, (LPBITMAPINFOHEADER)[eax].biWidth mov eax, (LPBITMAPINFOHEADER)[eax].biHeight imul ebx add eax, [esp + LP_INPUT] mov esi, eax // pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2))
// assign (edi, pusrc)
mov eax, [esp + LPBI_INPUT] mov ecx, (LPBITMAPINFOHEADER)[eax].biWidth shr ecx, 2 mov eax, (LPBITMAPINFOHEADER)[eax].biHeight shr eax, 2 imul ecx add eax, esi mov edi, eax // t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2)
// assign (eax, t)
mov eax, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[eax].biWidth shr eax, 2 mov ebx, [esp + HEIGHT_ADJ] shr ebx, 2 imul ebx mov ebx, [esp + WIDTH_ADJ] shr ebx, 2 add eax, ebx // pvsrc += t
// pusrc += t
add esi, eax add edi, eax
// uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)
// ebp snapshots esp so the frame offsets stay valid while the arguments are
// pushed (last argument first); the lea after the call removes the 24 bytes
// of arguments.
mov ebp, esp mov eax, [ebp + WIDTH_ADJ] shr eax, 1 push eax mov eax, [ebp + LUMA_ITERS] shr eax, 1 push eax mov eax, [ebp + FRAME_WIDTH] shr eax, 1 push eax push DWORD PTR [ebp + PITCH_PARM] push DWORD PTR [ebp + UPLANE] push edi call IA_uv_plane_common lea esp, [esp + 24]
// uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)
// (esi still holds pvsrc: IA_uv_plane_common saves and restores esi)
mov ebp, esp mov eax, [ebp + WIDTH_ADJ] shr eax, 1 push eax mov eax, [ebp + LUMA_ITERS] shr eax, 1 push eax mov eax, [ebp + FRAME_WIDTH] shr eax, 1 push eax push DWORD PTR [ebp + PITCH_PARM] push DWORD PTR [ebp + VPLANE] push esi call IA_uv_plane_common lea esp, [esp + 24]
// epilogue: drop the locals, restore the callee-save registers, plain ret
add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret
} }
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef WIDTH_ADJ
#undef HEIGHT_ADJ
#undef STRETCH
#undef MARK
#undef NEXT_LINE
#undef LOOP_J
#undef LOOP_K
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/***************************************************
 * H26X_YUV12toEncYUV12()
 *   Copy YUV12 data to encoder memory at the
 *   appropriate location. It is assumed that the input
 *   data is stored as rows of Y, followed by rows of U,
 *   then rows of V.
 *
 ***************************************************/
#if 0
/*
 * C_H26X_YUV12toEncYUV12 - C reference for IA_H26X_YUV12toEncYUV12().
 *
 * Copies planar YUV12 input (rows of Y, then rows of U, then rows of V) into
 * the encoder's Y, U and V planes, halving every sample
 * ((dword >> 1) & 0x7F7F7F7F).
 *
 * Input smaller than the frame: the rest of each output row and the remaining
 * output rows are filled with 0 (Y) or 0x40 (U and V).
 *
 * Input larger than the frame: the copy is cropped to the frame size, the way
 * the assembly version does at its NoCrop0 / NoCrop1 labels. The surplus
 * bytes at the end of every input row are skipped and the start of each
 * chroma plane is computed from the full input dimensions instead of assuming
 * that pnext has walked over every input byte. Without this the copy loops
 * would write more bytes per row (and more rows) than the frame holds.
 *
 *   lpbiInput   input bitmap header; biWidth and biHeight are used
 *   lpInput     start of the input Y plane
 *   YPlane      destination Y plane
 *   UPlane      destination U plane
 *   VPlane      destination V plane
 *   FrameWidth  encoder frame width in pixels
 *   FrameHeight encoder frame height in pixels
 *   pitch       byte distance between rows of the destination planes
 *
 * The inner loops move 8 bytes per pass, so the copied and padded widths are
 * expected to be multiples of 8, as in the assembly version.
 */
_STATIC void C_H26X_YUV12toEncYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
    int i, j;
    U32 *pnext;

    /* Size of the input Y plane: U starts right after it, V a quarter later. */
    int yplane_size = lpbiInput->biHeight * lpbiInput->biWidth;

    int ypitch_adj = pitch - FrameWidth;
    int yinput_height = lpbiInput->biHeight;
    int yinput_width = lpbiInput->biWidth;
    int yheight_diff = FrameHeight - yinput_height;
    int ywidth_diff = FrameWidth - yinput_width;
    int yinput_skip = 0;    /* input bytes cropped from the end of each Y row */

    int uvpitch_adj = pitch - (FrameWidth >> 1);
    int uvoutput_width = FrameWidth >> 1;
    int uvinput_height;
    int uvinput_width;
    int uvheight_diff;
    int uvwidth_diff;
    int uvinput_skip;

    /* Input taller than the frame: copy only FrameHeight rows, no padding. */
    if (yheight_diff < 0) {
        yinput_height = FrameHeight;
        yheight_diff = 0;
    }

    /* Input wider than the frame: copy FrameWidth bytes, skip the rest. */
    if (ywidth_diff < 0) {
        yinput_skip = yinput_width - FrameWidth;
        yinput_width = FrameWidth;
        ywidth_diff = 0;
    }

    uvinput_height = yinput_height >> 1;
    uvinput_width = yinput_width >> 1;
    uvheight_diff = yheight_diff >> 1;
    uvwidth_diff = ywidth_diff >> 1;
    uvinput_skip = yinput_skip >> 1;

    /* ---- Y plane ---- */
    pnext = (U32 *)lpInput;
    for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj) {
        for (i = yinput_width; i > 0; i -= 8) {
            *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4;
            *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4;
        }
        for (i = ywidth_diff; i > 0; i -= 8) {
            *(U32 *)YPlane = 0; YPlane += 4;
            *(U32 *)YPlane = 0; YPlane += 4;
        }
        pnext = (U32 *)((U8 *)pnext + yinput_skip);
    }
    for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj) {
        for (i = FrameWidth; i > 0; i -= 8) {
            *(U32 *)YPlane = 0; YPlane += 4;
            *(U32 *)YPlane = 0; YPlane += 4;
        }
    }

    /* ---- U plane ---- */
    pnext = (U32 *)(lpInput + yplane_size);
    for (j = uvinput_height; j > 0; j--, UPlane += uvpitch_adj) {
        for (i = uvinput_width; i > 0; i -= 8) {
            *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4;
            *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4;
        }
        for (i = uvwidth_diff; i > 0; i -= 8) {
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
        }
        pnext = (U32 *)((U8 *)pnext + uvinput_skip);
    }
    for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj) {
        for (i = uvoutput_width; i > 0; i -= 8) {
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
        }
    }

    /* ---- V plane ---- */
    pnext = (U32 *)(lpInput + yplane_size + (yplane_size >> 2));
    for (j = uvinput_height; j > 0; j--, VPlane += uvpitch_adj) {
        for (i = uvinput_width; i > 0; i -= 8) {
            *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4;
            *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4;
        }
        for (i = uvwidth_diff; i > 0; i -= 8) {
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
        }
        pnext = (U32 *)((U8 *)pnext + uvinput_skip);
    }
    for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj) {
        for (i = uvoutput_width; i > 0; i -= 8) {
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
        }
    }
}
#endif
// IA_H26X_YUV12toEncYUV12 - x86 inline-assembly copy of planar YUV12 input
// into the encoder's Y, U and V planes.
//
// Every copied byte is halved ((dword >> 1) & 0x7F7F7F7F). For each plane the
// routine copies the input rows, fills the remainder of each output row and
// any remaining output rows with a constant (0 for Y, 0x40 for U and V), and
// adds the pitch adjustment after every output row.
//
// If the input is taller or wider than the frame (the subtractions before
// NoCrop0 / NoCrop1 go negative) the copied height/width is clamped to the
// frame size and the corresponding fill amount is set to 0. For width, ebx
// receives the number of input bytes to skip at the end of each Y row; it is
// halved once before the U loop and reused unchanged for the V loop.
//
// The U source plane is taken to start biHeight * biWidth bytes after lpInput
// and the V source plane a further (biHeight * biWidth) >> 2 bytes on.
//
// The routine is naked: all eight arguments are read from the stack (see the
// frame layout below), ebp/ebx/esi/edi are saved and restored here, and the
// function returns with a plain ret.
// NOTE(review): every inner loop subtracts 8 and exits on jnz, so the copied
// and padded widths are presumably always multiples of 8 - confirm against the
// supported input sizes.
__declspec(naked) _STATIC void IA_H26X_YUV12toEncYUV12( LPBITMAPINFOHEADER lpbiInput, U8 *lpInput, U8 *YPlane, U8 *UPlane, U8 *VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | + 92
// | FrameHeight | + 88
// | FrameWidth | + 84
// | VPlane | + 80
// | UPlane | + 76
// | YPlane | + 72
// | lpInput | + 68
// | lpbiInput | + 64
// -----------------------------
// | return addr | + 60
// | saved ebp | + 56
// | saved ebx | + 52
// | saved esi | + 48
// | saved edi | + 44
// | ypitch_adj | + 40
// | yinput_height | + 36
// | yinput_width | + 32
// | yheight_diff | + 28
// | ywidth_diff | + 24
// | uvpitch_adj | + 20
// | uvoutput_width | + 16
// | uvinput_height | + 12
// | uvinput_width | + 8
// | uvheight_diff | + 4
// | uvwidth_diff | + 0
// Offsets below are relative to esp after the four pushes and the
// sub esp, LOCALSIZE in the prologue.
#define LOCALSIZE 44
#define PITCH_PARM 92
#define FRAME_HEIGHT 88
#define FRAME_WIDTH 84
#define VPLANE 80
#define UPLANE 76
#define YPLANE 72
#define LP_INPUT 68
#define LPBI_INPUT 64
#define YPITCH_ADJ 40
#define YINPUT_HEIGHT 36
#define YINPUT_WIDTH 32
#define YHEIGHT_DIFF 28
#define YWIDTH_DIFF 24
#define UVPITCH_ADJ 20
#define UVOUTPUT_WIDTH 16
#define UVINPUT_HEIGHT 12
#define UVINPUT_WIDTH 8
#define UVHEIGHT_DIFF 4
#define UVWIDTH_DIFF 0
_asm {
push ebp push ebx push esi push edi sub esp, LOCALSIZE
// NOTE(review): the FrameHeight value loaded into ebx on the next line is not
// read before ebx is reloaded with lpbiInput further down.
mov ebx, [esp + FRAME_HEIGHT] mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] // ypitch_adj = pitch - FrameWidth
mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvoutput_width = FrameWidth >> 1
mov ebp, ecx shr ebp, 1 mov [esp + UVOUTPUT_WIDTH], ebp // uvpitch_adj = pitch - (FrameWidth >> 1)
sub edx, ebp mov [esp + UVPITCH_ADJ], edx // yinput_height = lpbiInput->biHeight
// uvinput_height = yinput_height >> 1
// yinput_width = lpbiInput->biWidth
// uvinput_width = yinput_width >> 1
mov ebx, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[ebx].biHeight mov [esp + YINPUT_HEIGHT], eax shr eax, 1 mov [esp + UVINPUT_HEIGHT], eax mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth mov [esp + YINPUT_WIDTH], eax shr eax, 1 mov [esp + UVINPUT_WIDTH], eax // yheight_diff = FrameHeight - yinput_height
// uvheight_diff = yheight_diff >> 1;
// If the difference is negative (input taller than the frame) the copied
// height is clamped to FrameHeight and the padding height becomes 0.
mov eax, [esp + FRAME_HEIGHT] mov ebx, eax sub eax, [esp + YINPUT_HEIGHT] jns NoCrop0 xor eax, eax mov [esp + YINPUT_HEIGHT], ebx shr ebx, 1 mov [esp + UVINPUT_HEIGHT], ebx NoCrop0: mov [esp + YHEIGHT_DIFF], eax shr eax, 1 mov [esp + UVHEIGHT_DIFF], eax // ywidth_diff = FrameWidth - yinput_width
// uvwidth_diff = ywidth_diff >> 1;
// If the difference is negative (input wider than the frame) the copied width
// is clamped to FrameWidth, the padding width becomes 0 and ebx is set to
// yinput_width - FrameWidth, the input bytes to skip per Y row. Otherwise ebx
// stays 0.
mov eax, [esp + FRAME_WIDTH] xor ebx, ebx sub eax, [esp + YINPUT_WIDTH] jns NoCrop1 mov eax, [esp + FRAME_WIDTH] mov ebx, [esp + YINPUT_WIDTH] sub ebx, eax mov [esp + YINPUT_WIDTH], eax shr eax, 1 mov [esp + UVINPUT_WIDTH], eax xor eax, eax NoCrop1: mov [esp + YWIDTH_DIFF], eax shr eax, 1 mov [esp + UVWIDTH_DIFF], eax // assign (esi, lpInput)
mov esi, [esp + LP_INPUT]
// assign (edi, YPlane)
mov edi, [esp + YPLANE] // for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj)
// assign (ecx, j)
mov ecx, [esp + YINPUT_HEIGHT] L1: // for (i = yinput_width; i > 0; i -= 8)
// assign (ebp, i)
mov ebp, [esp + YINPUT_WIDTH] L2: // *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4
// *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4
// 1
mov eax, [esi] mov edx, [esi + 4] // 2
shr eax, 1 and edx, 0xFEFEFEFE // 3
shr edx, 1 and eax, 0x7F7F7F7F // 4
lea esi, [esi + 8] mov [edi], eax // 5
sub ebp, 8 mov [edi + 4], edx // 6
lea edi, [edi + 8] jnz L2 // for (i = ywidth_diff; i > 0; i -= 8)
// *(U32 *)YPlane = 0; YPlane += 4;
// *(U32 *)YPlane = 0; YPlane += 4;
// assign (ebp, i)
mov ebp, [esp + YWIDTH_DIFF] test ebp, ebp jz L3 L4: // 1
xor eax, eax sub ebp, 8 // 2
mov [edi], eax mov [edi + 4], eax // 3
lea edi, [edi + 8] jnz L4 // j--, YPlane += ypitch_adj
// (add esi, ebx skips the cropped input bytes at the end of the row)
L3: mov eax, [esp + YPITCH_ADJ] add edi, eax add esi, ebx dec ecx jnz L1
// for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj)
// assign (ecx, j)
mov ecx, [esp + YHEIGHT_DIFF] test ecx, ecx jz L7 L5: // for (i = FrameWidth; i > 0; i -= 8)
// *(U32 *)YPlane = 0; YPlane += 4;
// *(U32 *)YPlane = 0; YPlane += 4;
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH] L6: // 1
xor eax, eax sub ebp, 8 // 2
mov [edi], eax mov [edi + 4], eax // 3
lea edi, [edi + 8] jnz L6 // j--, YPlane += ypitch_adj
mov eax, [esp + YPITCH_ADJ] add edi, eax dec ecx jnz L5
L7: // recompute start of input U plane
mov edx, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[edx].biHeight mov ecx, (LPBITMAPINFOHEADER)[edx].biWidth imul eax, ecx // assign (esi, lpInput)
mov esi, [esp + LP_INPUT] add esi, eax // assign (edi, UPlane)
// (shr ebx, 1 below halves the per-row input skip for the chroma planes)
mov edi, [esp + UPLANE] shr ebx, 1 // for (j = uvinput_height; j > 0; j--, UPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVINPUT_HEIGHT] L8: // for (i = uvinput_width; i > 0; i -= 8)
// assign (ebp, i)
mov ebp, [esp + UVINPUT_WIDTH] L9: // *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4
// *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4
// 1
mov eax, [esi] mov edx, [esi + 4] // 2
shr eax, 1 and edx, 0xFEFEFEFE // 3
shr edx, 1 and eax, 0x7F7F7F7F // 4
lea esi, [esi + 8] mov [edi], eax // 5
sub ebp, 8 mov [edi + 4], edx // 6
lea edi, [edi + 8] jnz L9 // for (i = uvwidth_diff; i > 0; i -= 8)
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVWIDTH_DIFF] test ebp, ebp jz L11 L10: // 1
mov eax, 040404040H sub ebp, 8 // 2
mov [edi], eax mov [edi + 4], eax // 3
lea edi, [edi + 8] jnz L10 // j--, UPlane += uvpitch_adj
L11: mov eax, [esp + UVPITCH_ADJ] add edi, eax add esi, ebx dec ecx jnz L8
// for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVHEIGHT_DIFF] test ecx, ecx jz L14 L12: // for (i = uvoutput_width; i > 0; i -= 8)
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVOUTPUT_WIDTH] L13: // 1
mov eax, 040404040H sub ebp, 8 // 2
mov [edi], eax mov [edi + 4], eax // 3
lea edi, [edi + 8] jnz L13 // j--, UPlane += uvpitch_adj
mov eax, [esp + UVPITCH_ADJ] add edi, eax dec ecx jnz L12
L14: // recompute start of input V plane
// (lpInput + biHeight*biWidth + ((biHeight*biWidth) >> 2))
mov edx, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[edx].biHeight mov ecx, (LPBITMAPINFOHEADER)[edx].biWidth imul eax, ecx // assign (esi, lpInput)
mov esi, [esp + LP_INPUT] add esi, eax shr eax, 2 add esi, eax // assign (edi, VPlane)
mov edi, [esp + VPLANE] // for (j = uvinput_height; j > 0; j--, VPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVINPUT_HEIGHT] L15: // for (i = uvinput_width; i > 0; i -= 8)
// assign (ebp, i)
mov ebp, [esp + UVINPUT_WIDTH] L16: // *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4
// *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4
// 1
mov eax, [esi] mov edx, [esi + 4] // 2
shr eax, 1 and edx, 0xFEFEFEFE // 3
shr edx, 1 and eax, 0x7F7F7F7F // 4
lea esi, [esi + 8] mov [edi], eax // 5
sub ebp, 8 mov [edi + 4], edx // 6
lea edi, [edi + 8] jnz L16 // for (i = uvwidth_diff; i > 0; i -= 8)
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVWIDTH_DIFF] test ebp, ebp jz L18 L17: // 1
mov eax, 040404040H sub ebp, 8 // 2
mov [edi], eax mov [edi + 4], eax // 3
lea edi, [edi + 8] jnz L17 // j--, VPlane += uvpitch_adj
L18: mov eax, [esp + UVPITCH_ADJ] add edi, eax add esi, ebx dec ecx jnz L15
// for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVHEIGHT_DIFF] test ecx, ecx jz L21 L19: // for (i = uvoutput_width; i > 0; i -= 8)
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVOUTPUT_WIDTH] L20: // 1
mov eax, 040404040H sub ebp, 8 // 2
mov [edi], eax mov [edi + 4], eax // 3
lea edi, [edi + 8] jnz L20 // j--, VPlane += uvpitch_adj
mov eax, [esp + UVPITCH_ADJ] add edi, eax dec ecx jnz L19
// epilogue: drop the locals, restore the callee-save registers, plain ret
L21: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret } }
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef YPITCH_ADJ
#undef YINPUT_HEIGHT
#undef YINPUT_WIDTH
#undef YHEIGHT_DIFF
#undef YWIDTH_DIFF
#undef UVPITCH_ADJ
#undef UVOUTPUT_WIDTH
#undef UVINPUT_HEIGHT
#undef UVINPUT_WIDTH
#undef UVHEIGHT_DIFF
#undef UVWIDTH_DIFF
#if 0
// C_H26X_YUY2toYUV12
//
// Reference C version of the YUY2 input color converter (compiled out).
// Reads packed YUY2 (bytes Y0 U0 Y1 V0 ...) and writes 7-bit samples
// (source byte >> 1) into the Y, U and V planes.
//
//   lpbiInput              - input bitmap header (biWidth / biHeight used)
//   lpInput                - packed YUY2 input image
//   YPlane, UPlane, VPlane - destination planes, all using 'pitch'
//   FrameWidth/FrameHeight - output frame size
//   pitch                  - destination line pitch
//
// Output is produced in groups of 'mark' lines; when 'stretch' is set each
// group is followed by one extra luma line that averages two input lines.
// Chroma is written on even lines of a group only, averaged with the
// input line one row lower in memory.
void C_H26X_YUY2toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
    U8 *src, *eol, *row_lo, *row_hi;
    int groups = 0;
    int remaining, group, line, i;
    int take_chroma;

    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);
    int prev_line = -(lpbiInput->biWidth << 1);

    // Four groups for every 48 output lines.
    for (remaining = FrameHeight; remaining > 0; remaining -= 48) {
        groups += 4;
    }

    int width_adj = lpbiInput->biWidth - FrameWidth;
    int aspect = (width_adj ? groups : 0);
    int height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    int stretch = (height_adj ? 1 : 0);
    int mark = 12 - stretch;

    // Move from end of line N to beginning of line N-1
    int back_two_lines = -((lpbiInput->biWidth + (int)FrameWidth) << 1);

    // Point to the beginning of the last line.
    src = lpInput
        + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj))
        + width_adj;

    for (group = groups; group > 0; group--) {
        for (line = 0; line < mark; line++) {
            take_chroma = (0 == (line & 1));
            eol = src + (FrameWidth << 1);

            while (src < eol) {
                // Luma: every other input byte, halved.
                for (i = 0; i < 8; i++) {
                    YPlane[i] = src[2 * i] >> 1;
                }

                if (take_chroma) {
                    // Chroma: average this line with the one at prev_line.
                    for (i = 0; i < 4; i++) {
                        UPlane[i] = ((src[4 * i + 1] >> 1)
                                   + (src[4 * i + 1 + prev_line] >> 1)) >> 1;
                    }
                    for (i = 0; i < 4; i++) {
                        VPlane[i] = ((src[4 * i + 3] >> 1)
                                   + (src[4 * i + 3 + prev_line] >> 1)) >> 1;
                    }
                    UPlane += 4;
                    VPlane += 4;
                }

                src += 16;
                YPlane += 8;
            }

            src += back_two_lines;
            YPlane += ypitch_adj;
            if (take_chroma) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        }

        if (stretch) {
            // Extra luma line: average of the current input line and the
            // one a full input row lower in memory.
            row_lo = src - (lpbiInput->biWidth << 1);
            row_hi = src;
            eol = row_hi + (FrameWidth << 1);
            while (row_hi < eol) {
                for (i = 0; i < 4; i++) {
                    YPlane[i] = ((row_lo[2 * i] >> 1) + (row_hi[2 * i] >> 1)) >> 1;
                }
                YPlane += 4;
                row_lo += 8;
                row_hi += 8;
            }
            YPlane += ypitch_adj;
        }
    }
}
#endif
// IA_H26X_YUY2toYUV12
//
// Hand-scheduled inline-assembler conversion of packed YUY2 input (byte
// order Y0 U0 Y1 V0 ...) into separate Y, U and V planes.  Every sample
// stored in the planes is the source byte shifted right by one.
//
// The function is __declspec(naked): it pushes ebp/ebx/esi/edi itself,
// reserves LOCALSIZE bytes of locals, and reaches both its arguments and
// its locals through the esp-relative offsets #defined below.
//
// Arguments:
//   lpbiInput   - input bitmap header; biWidth and biHeight are read.
//   BGR24Image  - despite the name, the packed YUY2 input image (this is
//                 the "lpInput" slot at LP_INPUT).
//   YPlane, UPlane, VPlane - destination planes.
//   FrameWidth, FrameHeight - output frame size.
//   pitch       - line pitch applied to all three destination planes.
//
// Structure: LumaIters groups of 'mark' (12 - stretch) lines each.  Even
// lines of a group write Y, U and V; odd lines write Y only.  When
// 'stretch' is set, each group is followed by one additional Y line that
// averages two input lines.  Input lines are visited from higher to
// lower addresses (iBackTwoLines is negative).
//
// NOTE(review): the LumaIters loop at L1 exits on "jnz", i.e. only when
// the counter reaches exactly zero, although the pseudo-code says
// "i > 0".  This presumably relies on FrameHeight being a multiple of
// 48 - confirm against callers.
// NOTE(review): the stretch path stores the U/V write pointers into the
// UPLANE/VPLANE argument slots, overwriting the caller-supplied values.
__declspec(naked)
_STATIC void IA_H26X_YUY2toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * BGR24Image,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue: 4 pushes of
// 4 bytes plus LOCALSIZE bytes of locals put the return address at +64)
//	| pitch			|  + 96
//	| FrameHeight	|  + 92
//	| FrameWidth	|  + 88
//	| VPlane		|  + 84
//	| UPlane		|  + 80
//	| YPlane		|  + 76
//	| lpInput		|  + 72
//	| lpbiInput		|  + 68
//	----------------------------
//	| return addr	|  + 64
//	| saved ebp		|  + 60
//	| saved ebx		|  + 56
//	| saved esi		|  + 52
//	| saved edi		|  + 48
//	| pyprev		|  + 44
//	| pyspace		|  + 40
//	| pynext		|  + 36
//	| peol			|  + 32
//	| j				|  + 28
//	| k				|  + 24
//	| iBackTwoLines	|  + 20
//	| stretch		|  + 16
//	| mark			|  + 12
//	| LumaIters		|  +  8
//	| ypitch_adj	|  +  4
//	| uvpitch_adj	|  +  0
// PYPREV, PYSPACE and PYNEXT are defined but not referenced by the code
// of this function.
#define LOCALSIZE			 48
#define PITCH_PARM			 96
#define FRAME_HEIGHT		 92
#define FRAME_WIDTH			 88
#define VPLANE				 84
#define UPLANE				 80
#define YPLANE				 76
#define LP_INPUT			 72
#define LPBI_INPUT			 68
#define PYPREV				 44
#define PYSPACE				 40
#define PYNEXT				 36
#define PEOL				 32
#define LOOP_J				 28
#define LOOP_K				 24
#define BACK_TWO_LINES		 20
#define STRETCH				 16
#define MARK				 12
#define LUMA_ITERS			  8
#define YPITCH_ADJ			  4
#define UVPITCH_ADJ			  0
	_asm {
	// Prologue: save callee-save registers and reserve the locals.
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub		esp, LOCALSIZE
	// assign (ebx, lpbiInput)
	mov		ebx, [esp + LPBI_INPUT]
	// ypitch_adj = pitch - FrameWidth
	// assign (ecx, FrameWidth)
	// assign (edx, pitch)
	mov		ecx, [esp + FRAME_WIDTH]
	mov		edx, [esp + PITCH_PARM]
	mov		eax, edx
	sub		eax, ecx
	mov		[esp + YPITCH_ADJ], eax
	// uvpitch_adj = pitch - (FrameWidth >> 1)
	// kill (edx, pitch)
	mov		ebp, ecx
	shr		ebp, 1
	sub		edx, ebp
	mov		[esp + UVPITCH_ADJ], edx
	// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
	// (the branch below is jnz, so the loop ends only on exactly zero)
	// assign (edx, LumaIters)
	xor		edx, edx
	mov		eax, [esp + FRAME_HEIGHT]
L1:
	lea		edx, [edx + 4]
	sub		eax, 48
	jnz		L1
	// width_adj = lpbiInput->biWidth - FrameWidth;
	// assign (esi, width_adj)
	mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth
	sub		esi, [esp + FRAME_WIDTH]
	// aspect = (width_adj ? LumaIters : 0)
	// assign (edi, aspect)
	// kill (edx, LumaIters)
	mov		[esp + LUMA_ITERS], edx
	xor		edi, edi
	test	esi, esi
	jz		L2
	mov		edi, edx
	// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
	// assign (edx, height_adj)
L2:
	mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight
	sub		edx, [esp + FRAME_HEIGHT]
	add		edx, edi
	shr		edx, 1
	// stretch = (height_adj ? 1 : 0)
	xor		eax, eax
	test	edx, edx
	jz		L3
	inc		eax
L3:
	mov		[esp + STRETCH], eax
	// mark = 12 - stretch
	mov		ebp, 12
	sub		ebp, eax
	mov		[esp + MARK], ebp
	// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
	// (added after a line has been consumed: undoes the FrameWidth*2
	//  bytes just read and steps back one full input line)
	mov		ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
	add		ebp, [esp + FRAME_WIDTH]
	shl		ebp, 1
	neg		ebp
	mov		[esp + BACK_TWO_LINES], ebp
	// pnext = lpInput +
	//            ((lpbiInput->biWidth << 1) *
	//             ((FrameHeight - aspect - 1) + height_adj)) +
	//            width_adj
	// kill (ebx, lpbiInput)
	// kill (ecx, FrameWidth)
	// kill (edx, height_adj)
	// kill (esi, width_adj)
	// kill (edi, aspect)
	// assign (esi, pnext)
	// The one-operand imul leaves the low 32 bits of the product in eax
	// and overwrites edx.
	mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth
	shl		eax, 1
	mov		ebx, [esp + FRAME_HEIGHT]
	sub		ebx, edi
	dec		ebx
	add		ebx, edx
	imul	ebx
	add		esi, eax
	add		esi, [esp + LP_INPUT]
	// assign (edi, YPlane)
	// assign (edx, UPlane)
	// assign (ebp, VPlane)
	mov		edi, [esp + YPLANE]
	mov		edx, [esp + UPLANE]
	mov		ebp, [esp + VPLANE]
	// for (j = 0; j < LumaIters; j++)
	xor		eax, eax
	mov		[esp + LOOP_J], eax
L4:
	// for (k = 0; k < mark; k++)
	xor		eax, eax
	mov		[esp + LOOP_K], eax
L5:
	// for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8)
	mov		ecx, [esp + FRAME_WIDTH]
	shl		ecx, 1
	add		ecx, esi
	mov		[esp + PEOL], ecx
	// if (0 == (k & 1)) {
	mov		eax, [esp + LOOP_K]
	test	eax, 1
	jnz		L6
	//   *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1
	//   *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1
	//   *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1
	//   *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1
	//   *(UPlane+0) = *(pnext+ 1) >> 1; *(UPlane+1) = *(pnext+ 5) >> 1
	//   *(UPlane+2) = *(pnext+ 9) >> 1; *(UPlane+3) = *(pnext+13) >> 1
	//   *(VPlane+0) = *(pnext+ 3) >> 1; *(VPlane+1) = *(pnext+ 7) >> 1
	//   *(VPlane+2) = *(pnext+11) >> 1; *(VPlane+3) = *(pnext+15) >> 1
	// or graphically
	//           *************************************************************************************************
	// Values    * Y 0 * U 0 * Y 1 * V 0 * Y 2 * U 1 * Y 3 * V 1 * Y 4 * U 2 * Y 5 * V 2 * Y 6 * U 3 * Y 7 * V 3 *
	//           *************************************************************************************************
	// Y Offsets    0           2           4           6           8           10          12          14
	// U Offsets          1                       5                       9                       13
	// V Offsets                      3                       7                       11                      15
	// Register usage:
	// eax - accumulate Y values
	// ebx - accumulate U values
	// ecx - accumulate V values
	// esi - ptr to interlaced (YUYV) input
	// edi - ptr for writing Y values
	// edx - ptr for writing U values
	// ebp - ptr for writing V values
	// Four bytes are gathered into each 32-bit register, halved with one
	// shr, and masked with 7F7F7F7Fh to clear the bit shifted in from the
	// neighbouring byte.  The "; n" markers number the instruction pairs
	// (presumably Pentium pairing slots - confirm).
L7:
	; 1
	mov		al, [esi+4]		; Y2
	mov		bl, [esi+9]		; U2
	; 2
	mov		ah, [esi+6]		; Y3
	mov		bh, [esi+13]	; U3
	; 3
	shl		eax, 16
	mov		cl, [esi+11]	; V2
	; 4
	shl		ebx, 16
	mov		ch, [esi+15]	; V3
	; 5
	shl		ecx, 16
	mov		al, [esi]		; Y0
	; 6
	mov		bh, [esi+5]		; U1
	mov		ah, [esi+2]		; Y1
	; 7
	shr		eax, 1
	mov		bl, [esi+1]		; U0
	; 8
	shr		ebx, 1
	mov		ch, [esi+7]		; V1
	; 9
	and		eax, 07F7F7F7FH
	mov		cl, [esi+3]		; V0
	; 10
	shr		ecx, 1
	and		ebx, 07F7F7F7FH
	; 11
	mov		[edi], eax
	and		ecx, 07F7F7F7FH
	; 12
	mov		al, [esi+12]	; Y6
	mov		[edx], ebx
	; 13
	mov		ah, [esi+14]	; Y7
	mov		[ebp], ecx
	; 14
	shl		eax, 16
	mov		ecx, [esp + PEOL]
	; 15
	mov		al, [esi+8]		; Y4
	lea		edi, [edi+8]
	; 16
	mov		ah, [esi+10]	; Y5
	lea		edx, [edx+4]
	; 17
	shr		eax, 1
	lea		ebp, [ebp+4]
	; 18
	and		eax, 07F7F7F7FH
	lea		esi, [esi+16]
	; 19
	mov		[edi-4], eax
	cmp		esi, ecx
	; 20
	jl		L7
	jmp		L8
	// } else {
	//   *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1
	//   *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1
	//   *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1
	//   *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1
	// }
	// Register usage:
	// eax, ebx - accumulate Y values
	// ecx - peol (still holds the value computed at L5)
	// esi - ptr to interlaced (YUYV) input
	// edi - ptr for writing Y values
L6:
	; 1
	mov		al, [esi+4]		; Y2
	mov		bl, [esi+12]	; Y6
	; 2
	mov		ah, [esi+6]		; Y3
	mov		bh, [esi+14]	; Y7
	; 3
	shl		eax, 16
	lea		edi, [edi+8]
	; 4
	shl		ebx, 16
	mov		al, [esi]		; Y0
	; 5
	mov		ah, [esi+2]		; Y1
	mov		bh, [esi+10]	; Y5
	; 6
	shr		eax, 1
	mov		bl, [esi+8]		; Y4
	; 7
	shr		ebx, 1
	and		eax, 07F7F7F7FH
	; 8
	mov		[edi-8], eax
	and		ebx, 07F7F7F7FH
	; 9
	mov		[edi-8+4], ebx
	lea		esi, [esi+16]
	; 10
	cmp		esi, ecx
	jl		L6
L8:
	// pnext += iBackTwoLines
	add		esi, [esp + BACK_TWO_LINES]
	// YPlane += ypitch_adj
	add		edi, [esp + YPITCH_ADJ]
	// if (0 == (k&1))
	mov		eax, [esp + LOOP_K]
	test	eax, 1
	jnz		L9
	// UPlane += uvpitch_adj
	add		edx, [esp + UVPITCH_ADJ]
	// VPlane += uvpitch_adj
	add		ebp, [esp + UVPITCH_ADJ]
L9:
	// k++; loop while k < mark
	mov		eax, [esp + LOOP_K]
	inc		eax
	mov		[esp + LOOP_K], eax
	cmp		eax, [esp + MARK]
	jl		L5
	// if (stretch)
	mov		eax, [esp + STRETCH]
	test	eax, eax
	jz		L10
	// Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average.
	// (the argument slots on the stack are reused as the save area)
	mov		[esp + UPLANE], edx
	mov		[esp + VPLANE], ebp
	// plast = pnext - (lpbiInput->biWidth << 1)
	// assign (plast, edx)
	mov		edx, esi
	mov		eax, [esp + LPBI_INPUT]
	mov		eax, (LPBITMAPINFOHEADER)[eax].biWidth
	shl		eax, 1
	sub		edx, eax
	// pbn = pnext
	// assign (pbn, ebp)
	mov		ebp, esi
	// for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8)
	mov		ecx, [esp + FRAME_WIDTH]
	shl		ecx, 1
	add		ecx, ebp
	//   *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1
	//   *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1
	//   *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1
	//   *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1
	// The first loads of each iteration are issued ahead of the loop and
	// again at the bottom of the loop body, so the last pass also loads
	// bytes belonging to the group after the final one written.
	mov		al, [edx+4]
	mov		bl, [ebp+4]
	mov		bh, [ebp+6]
	shl		ebx, 16
L11:
	; 1
	mov		ah, [edx+6]
	mov		bl, [ebp]
	; 2
	shl		eax, 16
	mov		bh, [ebp+2]
	; 3
	mov		al, [edx]
	lea		edi, [edi+4]
	; 4
	mov		ah, [edx+2]
	lea		edx, [edx+8]
	; 5
	and		eax, 0xFEFEFEFE
	lea		ebp, [ebp+8]
	; 6
	shr		eax, 1
	and		ebx, 0xFEFEFEFE
	; 7
	shr		ebx, 1
	nop
	; 8
	add		eax, ebx
	mov		bl, [ebp+4]
	; 9
	shr		eax, 1
	mov		bh, [ebp+6]
	; 10
	shl		ebx, 16
	and		eax, 0x7F7F7F7F
	; 11
	mov		[edi-4], eax
	mov		al, [edx+4]
	; 12
	cmp		ebp, ecx
	jl		L11
	// YPlane += ypitch_adj;
	add		edi, [esp + YPITCH_ADJ]
	// Recover ptrs to UPlane and VPlane
	mov		edx, [esp + UPLANE]
	mov		ebp, [esp + VPLANE]
L10:
	// j++; loop while j < LumaIters
	mov		eax, [esp + LOOP_J]
	inc		eax
	mov		[esp + LOOP_J], eax
	cmp		eax, [esp + LUMA_ITERS]
	jl		L4
	// Epilogue: release the locals and restore callee-save registers.
	add		esp, LOCALSIZE
	pop		edi
	pop		esi
	pop		ebx
	pop		ebp
	ret
	}
}
// UYVY_to_YUV12_Flip
//
// Converts a packed UYVY image (bytes U0 Y0 V0 Y1 ...) into separate Y, U
// and V planes.  Every sample written is the source byte shifted right by
// one.  If the input is larger than the output frame, source rows and
// columns are dropped at regular intervals to shrink it.
//
//   lpbiInput   - input bitmap header; biWidth and biHeight are read.
//   pImage      - packed UYVY input, biWidth * 2 bytes per row.
//   YPlane, UPlane, VPlane - destination planes, all addressed with 'pitch'.
//   FrameWidth, FrameHeight - output size; both must be multiples of 4.
//   pitch       - destination line pitch in bytes.
//
// Returns true when all three planes were written.  Returns false, with the
// planes possibly only partly written, when
//   - the input is smaller than the output frame in either dimension,
//   - FrameWidth or FrameHeight is not a multiple of 4, or
//   - the row/column selection would need source samples outside the
//     biWidth x biHeight image.
//
// The last case used to read past the input.  With biHeight = 288 and
// FrameHeight = 144, for example, the chroma pass drops every odd row and
// also spends its whole skip budget on even rows, so the first row it reads
// is row 288.  The bounds checks below turn that into a reported failure.
//
// NOTE(review): rows are copied in source order; nothing in this routine
// reverses them, despite "Flip" in the name.
bool UYVY_to_YUV12_Flip(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * pImage,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
    DWORD dwFrameWidthHalf, dwFrameHeightHalf;
    BYTE *pRowStartY, *pRowStartSrc, *pRowStartU, *pRowStartV;
    int offset;

    int nRowsToSkip = 0, nColsToSkip = 0;
    int nRowSkipDelta = 0xffffff, nColSkipDelta = 0xffffff;
    int nSrcRowIndex, nDstRowIndex, nSrcColIndex, nDstColIndex;
    int COLUMNSTOSKIP = 0, ROWSTOSKIP = 0;

    if ((FrameWidth != (DWORD)(lpbiInput->biWidth)) ||
        (FrameHeight != (DWORD)(lpbiInput->biHeight)))
    {
        nColsToSkip = COLUMNSTOSKIP = lpbiInput->biWidth - FrameWidth;
        nRowsToSkip = ROWSTOSKIP = lpbiInput->biHeight - FrameHeight;
        if ((nColsToSkip < 0) || (nRowsToSkip < 0))
        {
            // Input smaller than output: enlarging is not supported.
            return false;
        }

        // nXXXSkipDelta dictate how often we "skip" a row or col
        if (nRowsToSkip)
        {
            nRowSkipDelta = (lpbiInput->biHeight + (nRowsToSkip - 1)) / nRowsToSkip;
        }
        if (nColsToSkip)
        {
            nColSkipDelta = (lpbiInput->biWidth + (nColsToSkip - 1)) / nColsToSkip;
        }
    }

    // quick check to make sure we're processing CIF, QCIF, or SQCIF
    if ((FrameWidth % 4) || (FrameHeight % 4))
    {
        return false;
    }

    dwFrameWidthHalf = FrameWidth / 2;
    dwFrameHeightHalf = FrameHeight / 2;

    // step 1, convert the Y values over
    nSrcRowIndex = 0;
    nDstRowIndex = 0;
    while ((DWORD)nDstRowIndex < FrameHeight)
    {
        // do we need to skip this row ?
        if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0))
        {
            nRowsToSkip--;
            nSrcRowIndex++;
            continue;
        }

        // Never read a row that lies outside the input image.
        if (nSrcRowIndex >= lpbiInput->biHeight)
        {
            return false;
        }

        pRowStartY = YPlane + (pitch * nDstRowIndex);
        // +1: in UYVY the luma samples sit at the odd byte offsets.
        pRowStartSrc = pImage + (lpbiInput->biWidth * nSrcRowIndex * 2) + 1;

        // Copy the Y values of the input row into the destination row
        nSrcColIndex = 0;
        nDstColIndex = 0;
        nColsToSkip = COLUMNSTOSKIP;

        while ((DWORD)nDstColIndex < FrameWidth)
        {
            // do we need to skip this column ?
            if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0))
            {
                nColsToSkip--;
                nSrcColIndex++;
                continue;
            }

            // Never read a pixel that lies outside the input row.
            if (nSrcColIndex >= lpbiInput->biWidth)
            {
                return false;
            }

            pRowStartY[nDstColIndex] = pRowStartSrc[nSrcColIndex * 2] >> 1;

            nSrcColIndex++;
            nDstColIndex++;
        }

        nSrcRowIndex++;
        nDstRowIndex++;
    }

    // step 2, process U and V values
    nSrcRowIndex = 0;
    nDstRowIndex = 0;
    nRowsToSkip = ROWSTOSKIP;
    while ((DWORD)nDstRowIndex < dwFrameHeightHalf) // dest is only half as many rows as src
    {
        // don't process odd numbered rows
        if (nSrcRowIndex % 2)
        {
            // if we were supposed to skip this src row anyway, make sure
            // we update our decrement
            if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0))
            {
                nRowsToSkip--;
            }
            nSrcRowIndex++;
            continue;
        }

        // do we need to skip this row ?
        if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0))
        {
            nRowsToSkip--;
            nSrcRowIndex++;
            continue;
        }

        // The odd-row drop plus the skip pattern can use up the source
        // before the destination is full; stop instead of reading on.
        if (nSrcRowIndex >= lpbiInput->biHeight)
        {
            return false;
        }

        pRowStartU = UPlane + (pitch * nDstRowIndex);
        pRowStartV = VPlane + (pitch * nDstRowIndex);
        pRowStartSrc = pImage + (lpbiInput->biWidth * nSrcRowIndex * 2) + 0;

        // Copy the U and V values of the input row into the destination row
        nSrcColIndex = 0;
        nDstColIndex = 0;
        nColsToSkip = COLUMNSTOSKIP; // reset column skip count

        while ((DWORD)nDstColIndex < dwFrameWidthHalf)
        {
            // skip odd numbered columns
            if (nSrcColIndex % 2)
            {
                // if we were supposed to skip this src column anyway, make
                // sure we update our decrement
                if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0))
                {
                    nColsToSkip--;
                }
                nSrcColIndex++;
                continue;
            }

            // do we need to skip this column ?
            if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0))
            {
                nSrcColIndex++;
                nColsToSkip--;
                continue;
            }

            // U is at byte offset and V at offset + 2, so the whole pixel
            // pair (columns n and n + 1) must lie inside the input row.
            if (nSrcColIndex + 1 >= lpbiInput->biWidth)
            {
                return false;
            }

            offset = nSrcColIndex * 2;
            pRowStartU[nDstColIndex] = pRowStartSrc[offset] >> 1;
            pRowStartV[nDstColIndex] = pRowStartSrc[offset + 2] >> 1;

            nSrcColIndex++;
            nDstColIndex++;
        }

        nSrcRowIndex++;
        nDstRowIndex++;
    }

    // and we are done!
    return true;
}
// IA_H26X_UYVYtoYUV12
//
// Hand-scheduled inline-assembler conversion of packed UYVY input (byte
// order U0 Y0 V0 Y1 ...) into separate Y, U and V planes.  Every sample
// stored in the planes is the source byte shifted right by one.
//
// The function is __declspec(naked): it pushes ebp/ebx/esi/edi itself,
// reserves LOCALSIZE bytes of locals, and reaches its arguments and
// locals through esp-relative offsets.  No offsets are #defined inside
// this function; it uses the LOCALSIZE, FRAME_WIDTH, ... macros that are
// still in effect from earlier in the file.
//
// Arguments:
//   lpbiInput   - input bitmap header; biWidth and biHeight are read.
//   BGR24Image  - despite the name, the packed UYVY input image (this is
//                 the "lpInput" slot at LP_INPUT).
//   YPlane, UPlane, VPlane - destination planes.
//   FrameWidth, FrameHeight - output frame size.
//   pitch       - line pitch applied to all three destination planes.
//
// Structure: LumaIters groups of 'mark' (12 - stretch) lines each.  Even
// lines of a group write Y, U and V; odd lines write Y only.  When
// 'stretch' is set, each group is followed by one additional Y line that
// averages two input lines.  Input lines are visited from higher to
// lower addresses (iBackTwoLines is negative).
//
// NOTE(review): the LumaIters loop at L1 exits on "jnz", i.e. only when
// the counter reaches exactly zero, although the pseudo-code says
// "i > 0".  This presumably relies on FrameHeight being a multiple of
// 48 - confirm against callers.
// NOTE(review): the stretch path stores the U/V write pointers into the
// UPLANE/VPLANE argument slots, overwriting the caller-supplied values.
__declspec(naked)
_STATIC void IA_H26X_UYVYtoYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * BGR24Image,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue)
//	| pitch			|  + 96
//	| FrameHeight	|  + 92
//	| FrameWidth	|  + 88
//	| VPlane		|  + 84
//	| UPlane		|  + 80
//	| YPlane		|  + 76
//	| lpInput		|  + 72
//	| lpbiInput		|  + 68
//	----------------------------
//	| return addr	|  + 64
//	| saved ebp		|  + 60
//	| saved ebx		|  + 56
//	| saved esi		|  + 52
//	| saved edi		|  + 48
//	| pyprev		|  + 44
//	| pyspace		|  + 40
//	| pynext		|  + 36
//	| peol			|  + 32
//	| j				|  + 28
//	| k				|  + 24
//	| iBackTwoLines	|  + 20
//	| stretch		|  + 16
//	| mark			|  + 12
//	| LumaIters		|  +  8
//	| ypitch_adj	|  +  4
//	| uvpitch_adj	|  +  0
	_asm {
	// Prologue: save callee-save registers and reserve the locals.
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub		esp, LOCALSIZE
	// assign (ebx, lpbiInput)
	mov		ebx, [esp + LPBI_INPUT]
	// ypitch_adj = pitch - FrameWidth
	// assign (ecx, FrameWidth)
	// assign (edx, pitch)
	mov		ecx, [esp + FRAME_WIDTH]
	mov		edx, [esp + PITCH_PARM]
	mov		eax, edx
	sub		eax, ecx
	mov		[esp + YPITCH_ADJ], eax
	// uvpitch_adj = pitch - (FrameWidth >> 1)
	// kill (edx, pitch)
	mov		ebp, ecx
	shr		ebp, 1
	sub		edx, ebp
	mov		[esp + UVPITCH_ADJ], edx
	// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
	// (the branch below is jnz, so the loop ends only on exactly zero)
	// assign (edx, LumaIters)
	xor		edx, edx
	mov		eax, [esp + FRAME_HEIGHT]
L1:
	lea		edx, [edx + 4]
	sub		eax, 48
	jnz		L1
	// width_adj = lpbiInput->biWidth - FrameWidth;
	// assign (esi, width_adj)
	mov		esi, (LPBITMAPINFOHEADER)[ebx].biWidth
	sub		esi, [esp + FRAME_WIDTH]
	// aspect = (width_adj ? LumaIters : 0)
	// assign (edi, aspect)
	// kill (edx, LumaIters)
	mov		[esp + LUMA_ITERS], edx
	xor		edi, edi
	test	esi, esi
	jz		L2
	mov		edi, edx
	// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
	// assign (edx, height_adj)
L2:
	mov		edx, (LPBITMAPINFOHEADER)[ebx].biHeight
	sub		edx, [esp + FRAME_HEIGHT]
	add		edx, edi
	shr		edx, 1
	// stretch = (height_adj ? 1 : 0)
	xor		eax, eax
	test	edx, edx
	jz		L3
	inc		eax
L3:
	mov		[esp + STRETCH], eax
	// mark = 12 - stretch
	mov		ebp, 12
	sub		ebp, eax
	mov		[esp + MARK], ebp
	// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
	// (added after a line has been consumed: undoes the FrameWidth*2
	//  bytes just read and steps back one full input line)
	mov		ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
	add		ebp, [esp + FRAME_WIDTH]
	shl		ebp, 1
	neg		ebp
	mov		[esp + BACK_TWO_LINES], ebp
	// pnext = lpInput +
	//            ((lpbiInput->biWidth << 1) *
	//             ((FrameHeight - aspect - 1) + height_adj)) +
	//            width_adj
	// kill (ebx, lpbiInput)
	// kill (ecx, FrameWidth)
	// kill (edx, height_adj)
	// kill (esi, width_adj)
	// kill (edi, aspect)
	// assign (esi, pnext)
	// The one-operand imul leaves the low 32 bits of the product in eax
	// and overwrites edx.
	mov		eax, (LPBITMAPINFOHEADER)[ebx].biWidth
	shl		eax, 1
	mov		ebx, [esp + FRAME_HEIGHT]
	sub		ebx, edi
	dec		ebx
	add		ebx, edx
	imul	ebx
	add		esi, eax
	add		esi, [esp + LP_INPUT]
	// assign (edi, YPlane)
	// assign (edx, UPlane)
	// assign (ebp, VPlane)
	mov		edi, [esp + YPLANE]
	mov		edx, [esp + UPLANE]
	mov		ebp, [esp + VPLANE]
	// for (j = 0; j < LumaIters; j++)
	xor		eax, eax
	mov		[esp + LOOP_J], eax
L4:
	// for (k = 0; k < mark; k++)
	xor		eax, eax
	mov		[esp + LOOP_K], eax
L5:
	// for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8)
	mov		ecx, [esp + FRAME_WIDTH]
	shl		ecx, 1
	add		ecx, esi
	mov		[esp + PEOL], ecx
	// if (0 == (k & 1)) {
	mov		eax, [esp + LOOP_K]
	test	eax, 1
	jnz		L6
	//   *(YPlane+0) = *(pnext+ 1) >> 1; *(YPlane+1) = *(pnext+ 3) >> 1
	//   *(YPlane+2) = *(pnext+ 5) >> 1; *(YPlane+3) = *(pnext+ 7) >> 1
	//   *(YPlane+4) = *(pnext+ 9) >> 1; *(YPlane+5) = *(pnext+11) >> 1
	//   *(YPlane+6) = *(pnext+13) >> 1; *(YPlane+7) = *(pnext+15) >> 1
	//   *(UPlane+0) = *(pnext+ 0) >> 1; *(UPlane+1) = *(pnext+ 4) >> 1
	//   *(UPlane+2) = *(pnext+ 8) >> 1; *(UPlane+3) = *(pnext+12) >> 1
	//   *(VPlane+0) = *(pnext+ 2) >> 1; *(VPlane+1) = *(pnext+ 6) >> 1
	//   *(VPlane+2) = *(pnext+10) >> 1; *(VPlane+3) = *(pnext+14) >> 1
	// or graphically
	//           *************************************************************************************************
	// Values    * U 0 * Y 0 * V 0 * Y 1 * U 1 * Y 2 * V 1 * Y 3 * U 2 * Y 4 * V 2 * Y 5 * U 3 * Y 6 * V 3 * Y 7 *
	//           *************************************************************************************************
	// Y Offsets          1           3           5           7           9           11          13          15
	// U Offsets    0                       4                       8                       12
	// V Offsets                2                       6                       10                      14
	// Register usage:
	// eax - accumulate Y values
	// ebx - accumulate U values
	// ecx - accumulate V values
	// esi - ptr to interlaced (UYVY) input
	// edi - ptr for writing Y values
	// edx - ptr for writing U values
	// ebp - ptr for writing V values
	// Four bytes are gathered into each 32-bit register, halved with one
	// shr, and masked with 7F7F7F7Fh to clear the bit shifted in from the
	// neighbouring byte.  The "; n" markers number the instruction pairs
	// (presumably Pentium pairing slots - confirm).
L7:
	; 1
	mov		al, [esi+5]		; Y2
	mov		bl, [esi+8]		; U2
	; 2
	mov		ah, [esi+7]		; Y3
	mov		bh, [esi+12]	; U3
	; 3
	shl		eax, 16
	mov		cl, [esi+10]	; V2
	; 4
	shl		ebx, 16
	mov		ch, [esi+14]	; V3
	; 5
	shl		ecx, 16
	mov		al, [esi+1]		; Y0
	; 6
	mov		bh, [esi+4]		; U1
	mov		ah, [esi+3]		; Y1
	; 7
	shr		eax, 1
	mov		bl, [esi]		; U0
	; 8
	shr		ebx, 1
	mov		ch, [esi+6]		; V1
	; 9
	and		eax, 07F7F7F7FH
	mov		cl, [esi+2]		; V0
	; 10
	shr		ecx, 1
	and		ebx, 07F7F7F7FH
	; 11
	mov		[edi], eax
	and		ecx, 07F7F7F7FH
	; 12
	mov		al, [esi+13]	; Y6
	mov		[edx], ebx
	; 13
	mov		ah, [esi+15]	; Y7
	mov		[ebp], ecx
	; 14
	shl		eax, 16
	mov		ecx, [esp + PEOL]
	; 15
	mov		al, [esi+9]		; Y4
	lea		edi, [edi+8]
	; 16
	mov		ah, [esi+11]	; Y5
	lea		edx, [edx+4]
	; 17
	shr		eax, 1
	lea		ebp, [ebp+4]
	; 18
	and		eax, 07F7F7F7FH
	lea		esi, [esi+16]
	; 19
	mov		[edi-4], eax
	cmp		esi, ecx
	; 20
	jl		L7
	jmp		L8
	// } else {
	//   *(YPlane+0) = *(pnext+ 1) >> 1; *(YPlane+1) = *(pnext+ 3) >> 1
	//   *(YPlane+2) = *(pnext+ 5) >> 1; *(YPlane+3) = *(pnext+ 7) >> 1
	//   *(YPlane+4) = *(pnext+ 9) >> 1; *(YPlane+5) = *(pnext+11) >> 1
	//   *(YPlane+6) = *(pnext+13) >> 1; *(YPlane+7) = *(pnext+15) >> 1
	// }
	// Register usage:
	// eax, ebx - accumulate Y values
	// ecx - peol (still holds the value computed at L5)
	// esi - ptr to interlaced (UYVY) input
	// edi - ptr for writing Y values
L6:
	; 1
	mov		al, [esi+5]		; Y2
	mov		bl, [esi+13]	; Y6
	; 2
	mov		ah, [esi+7]		; Y3
	mov		bh, [esi+15]	; Y7
	; 3
	shl		eax, 16
	lea		edi, [edi+8]
	; 4
	shl		ebx, 16
	mov		al, [esi+1]		; Y0
	; 5
	mov		ah, [esi+3]		; Y1
	mov		bh, [esi+11]	; Y5
	; 6
	shr		eax, 1
	mov		bl, [esi+9]		; Y4
	; 7
	shr		ebx, 1
	and		eax, 07F7F7F7FH
	; 8
	mov		[edi-8], eax
	and		ebx, 07F7F7F7FH
	; 9
	mov		[edi-8+4], ebx
	lea		esi, [esi+16]
	; 10
	cmp		esi, ecx
	jl		L6
L8:
	// pnext += iBackTwoLines
	add		esi, [esp + BACK_TWO_LINES]
	// YPlane += ypitch_adj
	add		edi, [esp + YPITCH_ADJ]
	// if (0 == (k&1))
	mov		eax, [esp + LOOP_K]
	test	eax, 1
	jnz		L9
	// UPlane += uvpitch_adj
	add		edx, [esp + UVPITCH_ADJ]
	// VPlane += uvpitch_adj
	add		ebp, [esp + UVPITCH_ADJ]
L9:
	// k++; loop while k < mark
	mov		eax, [esp + LOOP_K]
	inc		eax
	mov		[esp + LOOP_K], eax
	cmp		eax, [esp + MARK]
	jl		L5
	// if (stretch)
	mov		eax, [esp + STRETCH]
	test	eax, eax
	jz		L10
	// Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average.
	// (the argument slots on the stack are reused as the save area)
	mov		[esp + UPLANE], edx
	mov		[esp + VPLANE], ebp
	// plast = pnext - (lpbiInput->biWidth << 1)
	// assign (plast, edx)
	mov		edx, esi
	mov		eax, [esp + LPBI_INPUT]
	mov		eax, (LPBITMAPINFOHEADER)[eax].biWidth
	shl		eax, 1
	sub		edx, eax
	// pbn = pnext
	// assign (pbn, ebp)
	mov		ebp, esi
	// for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8)
	mov		ecx, [esp + FRAME_WIDTH]
	shl		ecx, 1
	add		ecx, ebp
	//   *(YPlane+0) = ((*(plast+1) >> 1) + (*(pbn+1) >> 1)) >> 1
	//   *(YPlane+1) = ((*(plast+3) >> 1) + (*(pbn+3) >> 1)) >> 1
	//   *(YPlane+2) = ((*(plast+5) >> 1) + (*(pbn+5) >> 1)) >> 1
	//   *(YPlane+3) = ((*(plast+7) >> 1) + (*(pbn+7) >> 1)) >> 1
	// The first loads of each iteration are issued ahead of the loop and
	// again at the bottom of the loop body, so the last pass also loads
	// bytes belonging to the group after the final one written.
	mov		al, [edx+5]
	mov		bl, [ebp+5]
	mov		bh, [ebp+7]
	shl		ebx, 16
L11:
	; 1
	mov		ah, [edx+7]
	mov		bl, [ebp+1]
	; 2
	shl		eax, 16
	mov		bh, [ebp+3]
	; 3
	mov		al, [edx+1]
	lea		edi, [edi+4]
	; 4
	mov		ah, [edx+3]
	lea		edx, [edx+8]
	; 5
	and		eax, 0xFEFEFEFE
	lea		ebp, [ebp+8]
	; 6
	shr		eax, 1
	and		ebx, 0xFEFEFEFE
	; 7
	shr		ebx, 1
	nop
	; 8
	add		eax, ebx
	mov		bl, [ebp+5]
	; 9
	shr		eax, 1
	mov		bh, [ebp+7]
	; 10
	shl		ebx, 16
	and		eax, 0x7F7F7F7F
	; 11
	mov		[edi-4], eax
	mov		al, [edx+5]
	; 12
	cmp		ebp, ecx
	jl		L11
	// YPlane += ypitch_adj;
	add		edi, [esp + YPITCH_ADJ]
	// Recover ptrs to UPlane and VPlane
	mov		edx, [esp + UPLANE]
	mov		ebp, [esp + VPLANE]
L10:
	// j++; loop while j < LumaIters
	mov		eax, [esp + LOOP_J]
	inc		eax
	mov		[esp + LOOP_J], eax
	cmp		eax, [esp + LUMA_ITERS]
	jl		L4
	// Epilogue: release the locals and restore callee-save registers.
	add		esp, LOCALSIZE
	pop		edi
	pop		esi
	pop		ebx
	pop		ebp
	ret
	}
}
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef PEOL
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/*************************************************************
* Name: colorCnvtFrame * Description: Color convert and copy input frame. ************************************************************/ void colorCnvtFrame( T_H263EncoderCatalog * EC, LPCODINST lpCompInst, ICCOMPRESS * lpicComp, U8 * YPlane, U8 * UPlane, U8 * VPlane ) { U8 *RGBCursor = (U8 *) lpicComp->lpInput; LPBITMAPINFOHEADER lpbiInput = lpicComp->lpbiInput;
/* The Connectix Quick Cam requires RGB to YUV12 conversion.
* The B/W camera generates palette versions (8 and 4 bit). * The color camera generates RGB24 for million colors and * RGB16555 for thousands colors. */
if (BI_RGB == lpicComp->lpbiInput->biCompression) { if (24 == lpicComp->lpbiInput->biBitCount) { #if 0
if ((128 == lpbiInput->biWidth) && (96 == lpbiInput->biHeight)) { U8 YTest[12288]; U8 UTest[6144]; U8 VTest[6144]; int i, j, k; U8 R,G,B; C_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YTest, UTest, VTest, EC->FrameWidth, EC->FrameHeight, 128); for (i = 0; i < 96; i++) { for (j = 0; j < 128; j++) { k = (i*128)+j; if (1 < abs(YPlane[(i*384)+j]-YTest[(i*128)+j])) { B = RGBCursor[(((95-i)*128)+j)*3]; G = RGBCursor[(((95-i)*128)+j)*3+1]; R = RGBCursor[(((95-i)*128)+j)*3+2]; } if ((0 == (i%2)) && (0 == (j%2))) { k = ((i>>1)*128)+(j>>1); if (1 < abs(UPlane[((i>>1)*384)+(j>>1)]-UTest[((i>>1)*128)+(j>>1)])) { B = RGBCursor[(((95-i)*128)+j)*3]; G = RGBCursor[(((95-i)*128)+j)*3+1]; R = RGBCursor[(((95-i)*128)+j)*3+2]; } if (1 < abs(VPlane[((i>>1)*384)+(j>>1)] != VTest[((i>>1)*128)+(j>>1)])) { B = RGBCursor[(((95-i)*128)+j)*3]; G = RGBCursor[(((95-i)*128)+j)*3+1]; R = RGBCursor[(((95-i)*128)+j)*3+2]; } } } } } #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else
IA_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else if(16 == lpicComp->lpbiInput->biBitCount) { // To use a common routine for all possible combinations of RGB16,
// a bitfield number is passed. This number identifies the proper bit shift
// and masking values to extract the color information
// from the 16-bit pixel words.
//
// number shift mask
// B, G, R
// ------ ----------- ----------------
// 555 2, 3, 8 0x7C, 0x7C, 0x7C
// 664 3, 3, 9 0x78, 0x7E, 0x7E
// 565 2, 4, 9 0x7C, 0x7E, 0x7C
// 655 2, 3, 9 0x7C, 0x7C, 0x7E
//
// Only 555 falls under BI_RGB. The others are specified using the
// BI_BITFIELDS compression specification. For BI_BITFIELDS, call
// Build16bitModeID to get the actual bitfield number. This routine requires the
// three array elements in the bmiColors field of a BITMAPINFO object.
//
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_BGR16toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, 555, PITCH); #else
IA_H26X_BGR16555toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else if(8 == lpicComp->lpbiInput->biBitCount) { #if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_CLUTtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, 8, PITCH); #else
IA_H26X_CLUT8toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else if(4 == lpicComp->lpbiInput->biBitCount) { #if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_CLUTtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, 4, PITCH); #else
IA_H26X_CLUT4toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else { DBOUT("ERROR: Unexpected input format detected."); } } else if (FOURCC_YVU9 == lpicComp->lpbiInput->biCompression) { #if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else
IA_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else if ((FOURCC_YUV12 == lpicComp->lpbiInput->biCompression) || (FOURCC_IYUV == lpicComp->lpbiInput->biCompression)) { #if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else
IA_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else if (FOURCC_YUY2 == lpicComp->lpbiInput->biCompression) { #if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
#if 0
C_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else
IA_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else if (FOURCC_UYVY == lpicComp->lpbiInput->biCompression) { #if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif
UYVY_to_YUV12_Flip(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH);
// IA_H26X_UYVYtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
// EC->FrameWidth, EC->FrameHeight, PITCH);
#if defined(_CODEC_STATS)
if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif
} else { DBOUT("ERROR: Unexpected input format detected."); } }
|