/* ************************************************************************* ** INTEL Corporation Proprietary Information ** ** This listing is supplied under the terms of a license ** agreement with INTEL Corporation and may not be copied ** nor disclosed except in accordance with the terms of ** that agreement. ** ** Copyright (c) 1995 Intel Corporation. ** All Rights Reserved. ** ** ************************************************************************* */ //////////////////////////////////////////////////////////////////////////// // // $Author: MDUDA $ // $Date: 21 Nov 1996 17:33:56 $ // $Archive: S:\h26x\src\enc\excolcnv.cpv $ // $Header: S:\h26x\src\enc\excolcnv.cpv 1.45 21 Nov 1996 17:33:56 MDUDA $ // $Log: S:\h26x\src\enc\excolcnv.cpv $ // // Rev 1.45 21 Nov 1996 17:33:56 MDUDA // Added more non-compressed YUV12 support (RGB16 and RGB24). // Also rewrote IA_YUV12toEncYUV12 to be more readable. // // Rev 1.44 31 Oct 1996 10:05:48 KLILLEVO // changed from DBOUT to DbgLog // // Rev 1.43 22 Oct 1996 16:44:22 MDUDA // Added IA support for YUY2 input color conversion and cleaned up C version. // Now using IA version. // // Rev 1.42 18 Oct 1996 14:31:32 MDUDA // // Added a C-version of YUY2 input color conversion. // // Rev 1.41 11 Oct 1996 16:04:50 MDUDA // Using new RGB to YUV lookup tables. // // Rev 1.40 03 Oct 1996 10:43:58 AGUPTA2 // Got rid of segment directives; made tables read-only. // // Rev 1.39 13 Sep 1996 13:34:04 MDUDA // Fixed YVU9 bug where input = output frame size was not colored // (U and V planes) properly. // // Rev 1.38 11 Sep 1996 15:45:06 MDUDA // Modified RGB look-up tables and added C_H26X_YUV12toEncYUV12 and // IA_H26X_YUV12toEncYUV12. // // Rev 1.37 03 Sep 1996 14:54:46 MDUDA // Fixed problem causing VC++ 4.1 internal compiler error. Replaced // inline assembler constructs such as [ebx.biWidth] with // (LPBITMAPINFOHEADER)[ebx].biWidth. 
//
//    Rev 1.36   29 Aug 1996 16:31:14   MDUDA
// Added Pentium assembler versions for all RGB conversion routines.
// Also, rewrote YVU9 support to allow input frame sizes other
// than 160x120 and 240x180.
//
//    Rev 1.35   16 Aug 1996 12:17:48   MDUDA
// Fixed bug where U and V values in the BGR converters were treated as unsigned
// values. Also did some general cleanup of BGR converters in preparation for
// doing Pentium assembler version.
//
//    Rev 1.34   13 Aug 1996 10:35:38   MDUDA
// Added support for RGB4. Generalized RGB LUT support for 4-bit and
// 8-bit pixels into a single routine.
//
//    Rev 1.33   09 Aug 1996 09:45:02   MDUDA
// Added support for RGB16 format on input. This is for the color
// Quick Cam. Also, generalized RGB16 for other bit combinations.
// However, these can only be specified under BI_BITFIELDS format.
//
//    Rev 1.32   02 Aug 1996 13:44:48   MDUDA
// modified H26X_BGR24toYUV12 to crop and stretch 240x180 and 160x120
// frames
//
//    Rev 1.31   01 Aug 1996 14:03:50   MDUDA
//
// Optimized H26X_YVU9toYUV12 by rewriting function in assembler code. Used in
// _asm. Also re-arranged functions so that colorCnvtFrame is at the end of
// the file.
//
//    Rev 1.30   22 Jul 1996 13:28:22   BECHOLS
// Added a CLUT8 to YUV12 color convertor (CC). This CC crops and stretches
// either the 240x180 or the 160x120 image size to produce QCIF and SubQCIF
// image sizes respectively.
//
//    Rev 1.29   11 Jul 1996 15:47:02   MDUDA
//
// Modified H263_YVU9toYUV12 to create subQCIF and QCIF from
// 160x120 and 240x180 images, respectively. To fit the new
// formats, the original images are cropped and stretched using a
// dither pattern for the color planes.
//
//    Rev 1.28   14 May 1996 12:04:08   KLILLEVO
// changed RGB->YUV color conversion to use the inverse
// of the output YUV->RGB conversion instead of the conversion
// "recommended by the CCIR".
Compression performance for RGB // input was significantly improved (33% less bits for same // fixed QP) // // Rev 1.27 04 May 1996 21:55:20 BECHOLS // For RGB24 to YVU12 conversion, I unrolled the inner loop by 8 and changed // the writes to DWORD vs. BYTE writes. This resulted in a 30% reduction in // the execution time. // // Rev 1.26 10 Apr 1996 16:44:14 RHAZRA // Fixed a bug in 320x240 mode for the H26X_YUV12toEncYUV12() function. // DWORD should be and-ed with 0x7f7f7f7f and not 0x7f7f7f. // // Rev 1.25 27 Mar 1996 15:10:08 SCDAY // Optimized H26X_YUV12toEncYUV12 'C' code to read/write DWORDs // // Rev 1.24 08 Jan 1996 17:46:14 unknown // // Correct logic on bIs320x240 check // // Rev 1.23 05 Jan 1996 17:34:38 RMCKENZX // corrected chroma pad value to 0x40 to achieve black padding // // Rev 1.22 05 Jan 1996 17:29:46 RMCKENZX // Added code to pad out 320x240 stills to 352x288 // full CIF images. // // Rev 1.21 04 Jan 1996 18:37:20 TRGARDOS // Added code to permit 320x240 input and then set a boolean // bIs320x240. // // Rev 1.20 02 Jan 1996 17:09:04 TRGARDOS // Moved colorCnvFrame into this file and made the // color convertor functions static. // // Rev 1.19 27 Dec 1995 15:32:56 RMCKENZX // Added copyright notice // // Rev 1.18 06 Dec 1995 09:35:42 TRGARDOS // Added Brian's fix to the input color convertor to avoid // overflow of the chars. // // Rev 1.17 27 Nov 1995 16:09:04 TRGARDOS // Removed two unused variables to get rid of compiler warnings. // // Rev 1.16 30 Oct 1995 14:34:12 TRGARDOS // Fixed 240x180 to center clip. // // Rev 1.15 30 Oct 1995 12:03:16 TRGARDOS // Added color convertor support for YUV9 240x180. // // Rev 1.14 28 Oct 1995 15:39:28 TRGARDOS // Fixed color conversion problem from YVU9 to YVU12. // // Rev 1.13 12 Oct 1995 17:40:12 TRGARDOS // Fixed YUV12 input color convertor. // // Rev 1.12 12 Oct 1995 12:04:16 TRGARDOS // Changed some variable names in YUV12 convertor. 
// // Rev 1.11 10 Oct 1995 16:34:12 TRGARDOS // Added YUV12 input support. // // Rev 1.10 28 Sep 1995 17:02:36 DBRUCKS // fix colorIn to not swap left to right // // Rev 1.9 15 Sep 1995 16:37:38 TRGARDOS // // // Rev 1.8 13 Sep 1995 17:09:22 TRGARDOS // // Finished adding encoder support for YVU9 160x120 frames. // // Rev 1.7 11 Sep 1995 11:14:06 DBRUCKS // add h261 ifdef // // Rev 1.6 07 Sep 1995 09:27:54 TRGARDOS // Added YVU9 to YVU12 color convertor. // // Rev 1.5 05 Sep 1995 15:50:46 TRGARDOS // Added color back in to convertors. // // Rev 1.4 01 Sep 1995 17:51:42 TRGARDOS // Fixed bugs in color converter. // // Rev 1.3 01 Sep 1995 10:13:42 TRGARDOS // Debugging bit stream errors. // // Rev 1.2 30 Aug 1995 12:42:26 TRGARDOS // Fixed bugs in intra AC coef VLC coding. // // Rev 1.1 02 Aug 1995 17:28:06 TRGARDOS // // Cleaned up stuff to get stub working under new // version control system. // // Rev 1.0 31 Jul 1995 13:07:10 DBRUCKS // Initial revision. // // Rev 1.0 17 Jul 1995 14:46:16 CZHU // Initial revision. // // Rev 1.0 17 Jul 1995 14:14:22 CZHU // Initial revision. ;//////////////////////////////////////////////////////////////////////////// /* CCIR 601 Specifies a conversion from RGB to YCrCb. For what we call U and V, they are equivalent as U = Cb, V = Cr. From CCIR 601-2 Annex II, we can go from RGB with values in the range of 0-255, to YUV values in the same range by the equation: Y = ( 77*R + 150*G + 29*B ) >> 8; V = ( 131*R - 110*G - 21*B ) >> 8 + 128; // Cr U = ( (-44)*R - 87*G + 131*B ) >> 8 + 128; // Cb Has now changed to the inverse of the YUV->RGB on the output, since the old version produced way too many bits. 
The new version is: Y = ( 16836*R + 33056*G + 6416*B ) >> 16 + 16; V = ( 28777*R - 24117*G - 4660*B ) >> 16 + 128; // Cr U = ( (-9726)*R - 19064*G + 28790*B ) >> 16 + 128; // Cb */ #include "precomp.h" #if !defined(H263P) && !defined(USE_BILINEAR_MSH26X) // { H263P #if defined(_CODEC_STATS) static const double RDTSC_SHIFT_32 = 4294967296.0; static double PENTIUM_TIMER() { unsigned long int a, b; double temp1, temp2, result; __asm { _emit 0x0f _emit 0x31 mov a, eax mov b, edx } temp1 = (double) a; temp2 = (double) (b & 0xFFFF); if (RDTSC_CLOCK_FREQ) { result = (temp1 + temp2 * RDTSC_SHIFT_32) / RDTSC_CLOCK_FREQ; } else { result = 0.0; } return( result * 1000.0 ); } #endif // Set all local functions to "static", and then set it accordingly if // VTune statistics are to be collected. VTune doesn't recognize static functions // so we need some way to turn off the static attribute if VTune is to be run // on the executable. For now, simply use a define of _VTUNE to build the driver. #if defined(_VTUNE) #define _STATIC #else #define _STATIC static #endif // These are the look-up tables for the RGB converters. They are 8 bytes/entry // to allow addressing via the scale by 8 indexed addressing mode. A pseudo-SIMD // arrangement is used in these tables. Since all R, G and B contributions to the // Y value are positive and fit in 15 bits, these are stored in the lower 16-bits // of the YU word. In some cases, the U contribution is negative so it is placed // in the upper 16 bits of the YU word. When a Y value is calculated, the U value // is calculated in parallel. The V contribution is negative in some cases, but it // gets its own word. // This is the code that was used to generate the tables. 
#if 0
// Stand-alone generator for the three tables below (kept for reference only).
// It emits 128 entries per table, two per output line. Entry k is computed
// for the odd component value x = 2*k + 1, i.e. the midpoint of the pair of
// 8-bit values that share the 7-bit index (value >> 1). Every contribution is
// (coefficient * x) >> 9.
#define YRCoef 16836
#define YGCoef 33056
#define YBCoef  6416
#define URCoef  9726
#define UGCoef 19064
#define UBCoef 28790
#define VRCoef 28777
#define VGCoef 24117
#define VBCoef  4660

#include <stdio.h>

int main(void)
{
    int i,j;

    printf("struct YUV {\n");
    printf(" int YU;\n");
    printf(" int V;\n");
    printf("};\n\n");

    // Red: positive Y and V contributions, negative U contribution.
    // The negated U term is converted to unsigned before the shift so that
    // no negative value is ever left-shifted.
    printf("struct YUV RYUV[] = {\n");
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            printf("{0x%.8x, 0x%.8x}, ",
                ((YRCoef*((i*4)+j+1))>>9) | ((unsigned)(-(((URCoef*((i*4)+j+1)))>>9))<<16),
                ((VRCoef*((i*4)+j+1))>>9));
        }
        printf("\n");
    }
    printf("};\n");

    // Green: positive Y contribution, negative U and V contributions.
    printf("struct YUV GYUV[] = {\n");
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            printf("{0x%.8x, 0x%.8x}, ",
                ((YGCoef*((i*4)+j+1))>>9) | ((unsigned)(-(((UGCoef*((i*4)+j+1)))>>9))<<16),
                -((VGCoef*((i*4)+j+1))>>9));
        }
        printf("\n");
    }
    printf("};\n");

    // Blue: positive Y and U contributions, negative V contribution.
    printf("struct YUV BYUV[] = {\n");
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            printf("{0x%.8x, 0x%.8x}, ",
                ((YBCoef*((i*4)+j+1))>>9) | (((UBCoef*((i*4)+j+1))>>9)<<16),
                -((VBCoef*((i*4)+j+1))>>9));
        }
        printf("\n");
    }
    printf("};\n");

    return 0;
}
#endif

// One look-up entry (8 bytes). YU packs the Y contribution in the low 16 bits
// and the signed U contribution in the high 16 bits, so one 32-bit add
// accumulates both; V holds the signed V contribution on its own.
struct YUV {
    int YU;
    int V;
};

// Red contributions, indexed by (red component >> 1); 128 entries.
const struct YUV RYUV[] = {
{0xffee0020, 0x00000038}, {0xffc80062, 0x000000a8},
{0xffa200a4, 0x00000119}, {0xff7c00e6, 0x00000189},
{0xff560127, 0x000001f9}, {0xff300169, 0x0000026a},
{0xff0a01ab, 0x000002da}, {0xfee401ed, 0x0000034b},
{0xfebe022f, 0x000003bb}, {0xfe980270, 0x0000042b},
{0xfe7202b2, 0x0000049c}, {0xfe4c02f4, 0x0000050c},
{0xfe260336, 0x0000057d}, {0xfe000377, 0x000005ed},
{0xfdda03b9, 0x0000065d}, {0xfdb403fb, 0x000006ce},
{0xfd8e043d, 0x0000073e}, {0xfd68047e, 0x000007af},
{0xfd4204c0, 0x0000081f}, {0xfd1c0502, 0x0000088f},
{0xfcf60544, 0x00000900}, {0xfcd00585, 0x00000970},
{0xfcaa05c7, 0x000009e1}, {0xfc840609, 0x00000a51},
{0xfc5e064b, 0x00000ac2}, {0xfc38068d, 0x00000b32},
{0xfc1206ce, 0x00000ba2}, {0xfbec0710, 0x00000c13},
{0xfbc60752, 0x00000c83}, {0xfba00794, 0x00000cf4},
{0xfb7a07d5, 0x00000d64}, {0xfb540817, 0x00000dd4},
{0xfb2e0859, 0x00000e45}, {0xfb08089b, 0x00000eb5},
{0xfae208dc, 0x00000f26}, {0xfabc091e, 0x00000f96},
{0xfa960960, 0x00001006}, {0xfa7009a2, 0x00001077},
{0xfa4a09e3, 0x000010e7}, {0xfa240a25, 0x00001158},
{0xf9fe0a67, 0x000011c8}, {0xf9d80aa9, 0x00001239},
{0xf9b20aeb, 0x000012a9}, {0xf98c0b2c, 0x00001319},
{0xf9660b6e, 0x0000138a}, {0xf9400bb0, 0x000013fa},
{0xf91a0bf2, 0x0000146b}, {0xf8f40c33, 0x000014db},
{0xf8ce0c75, 0x0000154b}, {0xf8a80cb7, 0x000015bc},
{0xf8820cf9, 0x0000162c}, {0xf85c0d3a, 0x0000169d},
{0xf8360d7c, 0x0000170d}, {0xf8100dbe, 0x0000177d},
{0xf7ea0e00, 0x000017ee}, {0xf7c40e41, 0x0000185e},
{0xf79e0e83, 0x000018cf}, {0xf7780ec5, 0x0000193f},
{0xf7520f07, 0x000019af}, {0xf72c0f49, 0x00001a20},
{0xf7060f8a, 0x00001a90}, {0xf6e00fcc, 0x00001b01},
{0xf6ba100e, 0x00001b71}, {0xf6941050, 0x00001be2},
{0xf66e1091, 0x00001c52}, {0xf64810d3, 0x00001cc2},
{0xf6221115, 0x00001d33}, {0xf5fc1157, 0x00001da3},
{0xf5d61198, 0x00001e14}, {0xf5b011da, 0x00001e84},
{0xf58a121c, 0x00001ef4}, {0xf564125e, 0x00001f65},
{0xf53e12a0, 0x00001fd5}, {0xf51812e1, 0x00002046},
{0xf4f21323, 0x000020b6}, {0xf4cc1365, 0x00002126},
{0xf4a613a7, 0x00002197}, {0xf48013e8, 0x00002207},
{0xf45a142a, 0x00002278}, {0xf434146c, 0x000022e8},
{0xf40e14ae, 0x00002359}, {0xf3e814ef, 0x000023c9},
{0xf3c21531, 0x00002439}, {0xf39c1573, 0x000024aa},
{0xf37615b5, 0x0000251a}, {0xf35015f6, 0x0000258b},
{0xf32a1638, 0x000025fb}, {0xf304167a, 0x0000266b},
{0xf2de16bc, 0x000026dc}, {0xf2b816fe, 0x0000274c},
{0xf292173f, 0x000027bd}, {0xf26c1781, 0x0000282d},
{0xf24617c3, 0x0000289d}, {0xf2201805, 0x0000290e},
{0xf1fa1846, 0x0000297e}, {0xf1d41888, 0x000029ef},
{0xf1ae18ca, 0x00002a5f}, {0xf188190c, 0x00002acf},
{0xf162194d, 0x00002b40}, {0xf13c198f, 0x00002bb0},
{0xf11619d1, 0x00002c21}, {0xf0f01a13, 0x00002c91},
{0xf0ca1a54, 0x00002d02}, {0xf0a41a96, 0x00002d72},
{0xf07e1ad8, 0x00002de2}, {0xf0581b1a, 0x00002e53},
{0xf0321b5c, 0x00002ec3}, {0xf00c1b9d, 0x00002f34},
{0xefe61bdf, 0x00002fa4}, {0xefc01c21, 0x00003014},
{0xef9a1c63, 0x00003085}, {0xef741ca4, 0x000030f5},
{0xef4e1ce6, 0x00003166}, {0xef281d28, 0x000031d6},
{0xef021d6a, 0x00003246}, {0xeedc1dab, 0x000032b7},
{0xeeb61ded, 0x00003327}, {0xee901e2f, 0x00003398},
{0xee6a1e71, 0x00003408}, {0xee441eb2, 0x00003479},
{0xee1e1ef4, 0x000034e9}, {0xedf81f36, 0x00003559},
{0xedd21f78, 0x000035ca}, {0xedac1fba, 0x0000363a},
{0xed861ffb, 0x000036ab}, {0xed60203d, 0x0000371b},
{0xed3a207f, 0x0000378b}, {0xed1420c1, 0x000037fc},
};

// Green contributions, indexed by (green component >> 1); 128 entries.
const struct YUV GYUV[] = {
{0xffdb0040, 0xffffffd1}, {0xff9100c1, 0xffffff73},
{0xff460142, 0xffffff15}, {0xfefc01c3, 0xfffffeb7},
{0xfeb10245, 0xfffffe59}, {0xfe6702c6, 0xfffffdfa},
{0xfe1c0347, 0xfffffd9c}, {0xfdd203c8, 0xfffffd3e},
{0xfd880449, 0xfffffce0}, {0xfd3d04ca, 0xfffffc82},
{0xfcf3054b, 0xfffffc23}, {0xfca805cc, 0xfffffbc5},
{0xfc5e064e, 0xfffffb67}, {0xfc1306cf, 0xfffffb09},
{0xfbc90750, 0xfffffaaa}, {0xfb7e07d1, 0xfffffa4c},
{0xfb340852, 0xfffff9ee}, {0xfae908d3, 0xfffff990},
{0xfa9f0954, 0xfffff932}, {0xfa5409d5, 0xfffff8d3},
{0xfa0a0a57, 0xfffff875}, {0xf9bf0ad8, 0xfffff817},
{0xf9750b59, 0xfffff7b9}, {0xf92a0bda, 0xfffff75b},
{0xf8e00c5b, 0xfffff6fc}, {0xf8960cdc, 0xfffff69e},
{0xf84b0d5d, 0xfffff640}, {0xf8010dde, 0xfffff5e2},
{0xf7b60e60, 0xfffff584}, {0xf76c0ee1, 0xfffff525},
{0xf7210f62, 0xfffff4c7}, {0xf6d70fe3, 0xfffff469},
{0xf68c1064, 0xfffff40b}, {0xf64210e5, 0xfffff3ad},
{0xf5f71166, 0xfffff34e}, {0xf5ad11e7, 0xfffff2f0},
{0xf5621269, 0xfffff292}, {0xf51812ea, 0xfffff234},
{0xf4cd136b, 0xfffff1d6}, {0xf48313ec, 0xfffff177},
{0xf439146d, 0xfffff119}, {0xf3ee14ee, 0xfffff0bb},
{0xf3a4156f, 0xfffff05d}, {0xf35915f0, 0xffffeffe},
{0xf30f1672, 0xffffefa0}, {0xf2c416f3, 0xffffef42},
{0xf27a1774, 0xffffeee4}, {0xf22f17f5, 0xffffee86},
{0xf1e51876, 0xffffee27}, {0xf19a18f7, 0xffffedc9},
{0xf1501978, 0xffffed6b}, {0xf10519f9, 0xffffed0d},
{0xf0bb1a7b, 0xffffecaf}, {0xf0701afc, 0xffffec50},
{0xf0261b7d, 0xffffebf2}, {0xefdb1bfe, 0xffffeb94},
{0xef911c7f, 0xffffeb36}, {0xef471d00, 0xffffead8},
{0xeefc1d81, 0xffffea79}, {0xeeb21e02, 0xffffea1b},
{0xee671e84, 0xffffe9bd}, {0xee1d1f05, 0xffffe95f},
{0xedd21f86, 0xffffe901}, {0xed882007, 0xffffe8a2},
{0xed3d2088, 0xffffe844}, {0xecf32109, 0xffffe7e6},
{0xeca8218a, 0xffffe788}, {0xec5e220b, 0xffffe72a},
{0xec13228d, 0xffffe6cb}, {0xebc9230e, 0xffffe66d},
{0xeb7e238f, 0xffffe60f}, {0xeb342410, 0xffffe5b1},
{0xeaea2491, 0xffffe552}, {0xea9f2512, 0xffffe4f4},
{0xea552593, 0xffffe496}, {0xea0a2614, 0xffffe438},
{0xe9c02696, 0xffffe3da}, {0xe9752717, 0xffffe37b},
{0xe92b2798, 0xffffe31d}, {0xe8e02819, 0xffffe2bf},
{0xe896289a, 0xffffe261}, {0xe84b291b, 0xffffe203},
{0xe801299c, 0xffffe1a4}, {0xe7b62a1d, 0xffffe146},
{0xe76c2a9f, 0xffffe0e8}, {0xe7212b20, 0xffffe08a},
{0xe6d72ba1, 0xffffe02c}, {0xe68c2c22, 0xffffdfcd},
{0xe6422ca3, 0xffffdf6f}, {0xe5f82d24, 0xffffdf11},
{0xe5ad2da5, 0xffffdeb3}, {0xe5632e26, 0xffffde55},
{0xe5182ea8, 0xffffddf6}, {0xe4ce2f29, 0xffffdd98},
{0xe4832faa, 0xffffdd3a}, {0xe439302b, 0xffffdcdc},
{0xe3ee30ac, 0xffffdc7e}, {0xe3a4312d, 0xffffdc1f},
{0xe35931ae, 0xffffdbc1}, {0xe30f322f, 0xffffdb63},
{0xe2c432b1, 0xffffdb05}, {0xe27a3332, 0xffffdaa6},
{0xe22f33b3, 0xffffda48}, {0xe1e53434, 0xffffd9ea},
{0xe19b34b5, 0xffffd98c}, {0xe1503536, 0xffffd92e},
{0xe10635b7, 0xffffd8cf}, {0xe0bb3638, 0xffffd871},
{0xe07136ba, 0xffffd813}, {0xe026373b, 0xffffd7b5},
{0xdfdc37bc, 0xffffd757}, {0xdf91383d, 0xffffd6f8},
{0xdf4738be, 0xffffd69a}, {0xdefc393f, 0xffffd63c},
{0xdeb239c0, 0xffffd5de}, {0xde673a41, 0xffffd580},
{0xde1d3ac3, 0xffffd521}, {0xddd23b44, 0xffffd4c3},
{0xdd883bc5, 0xffffd465}, {0xdd3d3c46, 0xffffd407},
{0xdcf33cc7, 0xffffd3a9}, {0xdca93d48, 0xffffd34a},
{0xdc5e3dc9, 0xffffd2ec}, {0xdc143e4a, 0xffffd28e},
{0xdbc93ecc, 0xffffd230}, {0xdb7f3f4d, 0xffffd1d2},
{0xdb343fce, 0xffffd173}, {0xdaea404f, 0xffffd115},
};

// Blue contributions, indexed by (blue component >> 1); 128 entries.
const struct YUV BYUV[] = {
{0x0038000c, 0xfffffff7}, {0x00a80025, 0xffffffe5},
{0x0119003e, 0xffffffd3}, {0x01890057, 0xffffffc1},
{0x01fa0070, 0xffffffaf}, {0x026a0089, 0xffffff9c},
{0x02da00a2, 0xffffff8a}, {0x034b00bb, 0xffffff78},
{0x03bb00d5, 0xffffff66}, {0x042c00ee, 0xffffff54},
{0x049c0107, 0xffffff41}, {0x050d0120, 0xffffff2f},
{0x057d0139, 0xffffff1d}, {0x05ee0152, 0xffffff0b},
{0x065e016b, 0xfffffef9}, {0x06cf0184, 0xfffffee6},
{0x073f019d, 0xfffffed4}, {0x07b001b6, 0xfffffec2},
{0x082001cf, 0xfffffeb0}, {0x089001e8, 0xfffffe9e},
{0x09010201, 0xfffffe8b}, {0x0971021a, 0xfffffe79},
{0x09e20233, 0xfffffe67}, {0x0a52024c, 0xfffffe55},
{0x0ac30266, 0xfffffe43}, {0x0b33027f, 0xfffffe30},
{0x0ba40298, 0xfffffe1e}, {0x0c1402b1, 0xfffffe0c},
{0x0c8502ca, 0xfffffdfa}, {0x0cf502e3, 0xfffffde8},
{0x0d6602fc, 0xfffffdd5}, {0x0dd60315, 0xfffffdc3},
{0x0e46032e, 0xfffffdb1}, {0x0eb70347, 0xfffffd9f},
{0x0f270360, 0xfffffd8c}, {0x0f980379, 0xfffffd7a},
{0x10080392, 0xfffffd68}, {0x107903ab, 0xfffffd56},
{0x10e903c4, 0xfffffd44}, {0x115a03dd, 0xfffffd31},
{0x11ca03f7, 0xfffffd1f}, {0x123b0410, 0xfffffd0d},
{0x12ab0429, 0xfffffcfb}, {0x131c0442, 0xfffffce9},
{0x138c045b, 0xfffffcd6}, {0x13fc0474, 0xfffffcc4},
{0x146d048d, 0xfffffcb2}, {0x14dd04a6, 0xfffffca0},
{0x154e04bf, 0xfffffc8e}, {0x15be04d8, 0xfffffc7b},
{0x162f04f1, 0xfffffc69}, {0x169f050a, 0xfffffc57},
{0x17100523, 0xfffffc45}, {0x1780053c, 0xfffffc33},
{0x17f10555, 0xfffffc20}, {0x1861056e, 0xfffffc0e},
{0x18d20588, 0xfffffbfc}, {0x194205a1, 0xfffffbea},
{0x19b205ba, 0xfffffbd8}, {0x1a2305d3, 0xfffffbc5},
{0x1a9305ec, 0xfffffbb3}, {0x1b040605, 0xfffffba1},
{0x1b74061e, 0xfffffb8f}, {0x1be50637, 0xfffffb7d},
{0x1c550650, 0xfffffb6a}, {0x1cc60669, 0xfffffb58},
{0x1d360682, 0xfffffb46}, {0x1da7069b, 0xfffffb34},
{0x1e1706b4, 0xfffffb22}, {0x1e8806cd, 0xfffffb0f},
{0x1ef806e6, 0xfffffafd}, {0x1f6806ff, 0xfffffaeb},
{0x1fd90719, 0xfffffad9}, {0x20490732, 0xfffffac7},
{0x20ba074b, 0xfffffab4}, {0x212a0764, 0xfffffaa2},
{0x219b077d, 0xfffffa90}, {0x220b0796, 0xfffffa7e},
{0x227c07af, 0xfffffa6c}, {0x22ec07c8, 0xfffffa59},
{0x235d07e1, 0xfffffa47}, {0x23cd07fa, 0xfffffa35},
{0x243e0813, 0xfffffa23}, {0x24ae082c, 0xfffffa11},
{0x251e0845, 0xfffff9fe}, {0x258f085e, 0xfffff9ec},
{0x25ff0877, 0xfffff9da}, {0x26700890, 0xfffff9c8},
{0x26e008aa, 0xfffff9b6}, {0x275108c3, 0xfffff9a3},
{0x27c108dc, 0xfffff991}, {0x283208f5, 0xfffff97f},
{0x28a2090e, 0xfffff96d}, {0x29130927, 0xfffff95b},
{0x29830940, 0xfffff948}, {0x29f40959, 0xfffff936},
{0x2a640972, 0xfffff924}, {0x2ad4098b, 0xfffff912},
{0x2b4509a4, 0xfffff8ff}, {0x2bb509bd, 0xfffff8ed},
{0x2c2609d6, 0xfffff8db}, {0x2c9609ef, 0xfffff8c9},
{0x2d070a08, 0xfffff8b7}, {0x2d770a21, 0xfffff8a4},
{0x2de80a3b, 0xfffff892}, {0x2e580a54, 0xfffff880},
{0x2ec90a6d, 0xfffff86e}, {0x2f390a86, 0xfffff85c},
{0x2faa0a9f, 0xfffff849}, {0x301a0ab8, 0xfffff837},
{0x308a0ad1, 0xfffff825}, {0x30fb0aea, 0xfffff813},
{0x316b0b03, 0xfffff801}, {0x31dc0b1c, 0xfffff7ee},
{0x324c0b35, 0xfffff7dc}, {0x32bd0b4e, 0xfffff7ca},
{0x332d0b67, 0xfffff7b8}, {0x339e0b80, 0xfffff7a6},
{0x340e0b99, 0xfffff793}, {0x347f0bb2, 0xfffff781},
{0x34ef0bcc, 0xfffff76f}, {0x35600be5, 0xfffff75d},
{0x35d00bfe, 0xfffff74b}, {0x36400c17, 0xfffff738},
{0x36b10c30, 0xfffff726}, {0x37210c49, 0xfffff714},
{0x37920c62, 0xfffff702}, {0x38020c7b, 0xfffff6f0},
};

// Right shift applied to an accumulated table sum to obtain the output sample.
#define COEF_WIDTH   8
#define SHIFT_WIDTH  COEF_WIDTH

//
// All of the RGB converters follow the template given below. The converters make
// some assumptions about the frame size. All output frame sizes are assumed to
// have a frame height that is a multiple of 48. Also, the output frame width
// is assumed to be a multiple of 8. If the input frame size is equal
// to the output frame size, no stretching or cropping is done. Otherwise, the
// image is cropped and stretched for an 11:12 aspect ratio.
//
// NOTE: the template below is pseudo-code and is never compiled (#if 0). It
// is abbreviated relative to the real converters in this file, which also
// test "stretch" before interpolating and halve each operand (>> 1) after
// masking so that the packed byte sums cannot overflow.
#if 0
void rgb_color_converter() {
    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = FrameWidth; i > 0; i -= m, pnext += n) {
                compute m Y values using look-up tables
                if (0 == (k&1)) {
                    compute m/2 U,V values using look-up tables
                }
            }
            if ((0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8 {
                    t = *pyprev++ & 0xFEFEFEFE;
                    t += *pynext++ & 0xFEFEFEFE;
                    *pyspace++ = t;
                    t = *pyprev++ & 0xFEFEFEFE;
                    t += *pynext++ & 0xFEFEFEFE;
                    *pyspace++ = t;
                }
            }
            pnext += iBackTwoLines;
            py += ypitch_adj;
            if (0 == (k&1)) {
                pu += uvpitch_adj;
                pv += uvpitch_adj;
            }
        }
        if (stretch) {
            pyprev = py - pitch;
            pyspace = py;
            pynext = py + pitch;
        }
    }
    if (stretch) {
        for (i = FrameWidth; i > 0; i -= 4 {
            *pyspace++ = *pyprev++;
        }
    }
}
#endif

//
// For the IA versions, the strategy is to compute the Y value for an odd RGB value
// followed by computing the Y value for the corresponding even RGB value. The registers
// are then set with the proper values to compute U and V values for the even RGB
// value. This avoids repeating the shifting and masking needed to extract the Red,
// Green and Blue components.
//

/*****************************************************************************
 *
 *  H26X_BGR24toYUV12()
 *
 *  Convert from BGR24 to YUV12 (YCrCb 4:2:0) and copy to destination memory
 *  with pitch defined by the constant PITCH. The input data is stored in
 *  the order B,G,R,B,G,R...
 *
 *  All variants below share the same structure:
 *    - The source is read from its last line backwards ("upside down"
 *      input), four pixels (three DWORDs, 12 bytes) per inner iteration:
 *          pnext[0] = | B2 | R1 | G1 | B1 |
 *          pnext[1] = | G3 | B3 | R2 | G2 |
 *          pnext[2] = | R4 | G4 | B4 | R3 |
 *    - Each 8-bit component is reduced to a 7-bit index (component >> 1)
 *      into BYUV / GYUV / RYUV. Adding the three .YU words yields Y in the
 *      low 16 bits and U in the high 16 bits of one accumulator.
 *    - Four Y samples are written per iteration. On even lines (k even) one
 *      U and one V sample are written for the first and for the third pixel
 *      of the group.
 *    - When the input must be stretched (height_adj != 0), only 11 source
 *      lines ("mark") are converted for every 12 output lines. One output
 *      line per group is skipped and later filled with the per-byte average
 *      of the lines above and below it; the last skipped line is filled
 *      with a copy of the line above it.
 *
 *  Parameters:
 *    lpbiInput    - input bitmap header; biWidth and biHeight are read.
 *    lpInput      - input BGR24 pixel data.
 *    YPlane,
 *    UPlane,
 *    VPlane       - destination planes.
 *    FrameWidth,
 *    FrameHeight  - output frame size. The loops assume FrameHeight is a
 *                   multiple of 48 and FrameWidth a multiple of 8.
 *    pitch        - distance in bytes between output lines.
 *
 */

#if defined(_CODEC_STATS)

// Shift used by the NOC variant only; one bit less than SHIFT_WIDTH, so its
// samples are at twice the scale of those from the C variant further below.
#define NOC_SHIFT_WIDTH 7

// Statistics-build (_CODEC_STATS) variant of the converter.
// Samples are produced as Y = (sum >> 7) + 16, U = (sum >> 23) + 128 and
// V = (vsum >> 7) + 128.
// NOTE(review): ypitch_adj and uvpitch_adj are both 0 here, so successive
// output lines are written back to back, yet the stretch bookkeeping still
// steps by "pitch". That looks self-consistent only when pitch equals
// FrameWidth - confirm against the caller.
void NOC_H26X_BGR24toYUV12(
    LPBITMAPINFOHEADER	lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN  FrameWidth,
    UN  FrameHeight,
    const int pitch)
{
    U32 *pnext, *pyprev, *pyspace, *pynext;
    U32 tm;
    int t;
    int i, j, k;
    int iBackTwoLines;
    int stretch, mark, aspect;
    int height_adj, width_adj;
    int LumaIters = 0;
    int ypitch_adj = 0;
    int uvpitch_adj = 0;

    // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }
    // Horizontal crop on each side, converted from pixels to bytes (x3).
    width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
    width_adj += (width_adj << 1);
    // When cropping horizontally, one source line is dropped per 12 output lines.
    aspect = (width_adj ? LumaIters : 0);
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj ? 1 : 0);
    mark = 12 - stretch;
    // The input image is upside down - process the lines in reverse order.

    // Move from end of line N to beginning of line N-1
    // (computed in DWORD units: three quarters of the combined pixel widths).
    iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 2);
    iBackTwoLines += (iBackTwoLines << 1);

    // Point to the beginning of the last line.
    pnext =	(U32 *)
                (lpInput +
                ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *
                    ((FrameHeight - aspect - 1) + height_adj)) +
                width_adj);

    for ( j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = FrameWidth; i > 0; i -= 4, pnext += 3) {
                // Pixel 2 (B2 in pnext[0]; G2, R2 in pnext[1]).
                tm = pnext[0];
                t = BYUV[tm>>25].YU;
                tm = pnext[1];
                t += (GYUV[(tm>>1)&0x7F].YU + RYUV[(tm>>9)&0x7F].YU);
                *(YPlane+1) = (U8)((t>>NOC_SHIFT_WIDTH)+16);
                // Pixel 1 (B1, G1, R1 all in pnext[0]).
                tm = pnext[0];
                t = (BYUV[(tm>>1)&0x7F].YU + GYUV[(tm>>9)&0x7F].YU + RYUV[(tm>>17)&0x7F].YU);
                *YPlane = (U8)((t>>NOC_SHIFT_WIDTH)+16);
                if (0 == (k&1)) {
                    // U comes from the high half of the pixel-1 accumulator.
                    *UPlane++ = (U8)((t>>23)+128);
                    t = (RYUV[(tm>>17)&0x7F].V + GYUV[(tm>>9)&0x7F].V + BYUV[(tm>>1)&0x7F].V);
                    *VPlane++ = (U8)((t>>NOC_SHIFT_WIDTH)+128);
                }
                // Pixel 4 (B4, G4, R4 all in pnext[2]).
                tm = pnext[2];
                t = (BYUV[(tm>>9)&0x7F].YU + GYUV[(tm>>17)&0x7F].YU + RYUV[tm>>25].YU);
                *(YPlane+3) = (U8)((t>>NOC_SHIFT_WIDTH)+16);
                // Pixel 3 (B3, G3 in pnext[1]; R3 in pnext[2]).
                tm = pnext[1];
                t = BYUV[(tm>>17)&0x7F].YU + GYUV[tm>>25].YU;
                tm = pnext[2];
                t += RYUV[(tm>>1)&0x7F].YU;
                *(YPlane+2) = (U8)((t>>NOC_SHIFT_WIDTH)+16);
                YPlane += 4;
                if (0 == (k&1)) {
                    // U comes from the high half of the pixel-3 accumulator.
                    *UPlane++ = (U8)((t>>23)+128);
                    t = RYUV[(tm>>1)&0x7F].V;
                    tm = pnext[1];
                    t += GYUV[tm>>25].V + BYUV[(tm>>17)&0x7F].V;
                    *VPlane++ = (U8)((t>>NOC_SHIFT_WIDTH)+128);
                }
            }
            // After the first line of every group but the first, fill the line
            // skipped at the end of the previous group with the average of its
            // neighbours: four packed bytes at a time, each operand halved first.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }
            pnext += iBackTwoLines;
            YPlane += ypitch_adj;
            // Increment after even lines.
            if(0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        } // end of for k
        if (stretch) {
            // Leave one output line empty and remember it and its neighbours.
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j
    if (stretch) {
        // The final skipped line has no line below it: duplicate the one above.
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of NOC_H26X_BGR24toYUV12()

#endif

// C version of the converter; compiled out (#if 0).
// It follows the same algorithm as the NOC variant above, but produces
// Y = (sum >> SHIFT_WIDTH) + 8, U = (sum >> 24) + 64 and
// V = (vsum >> SHIFT_WIDTH) + 64, and advances the output pointers by real
// pitch adjustments at the end of every line.
#if 0
_STATIC void C_H26X_BGR24toYUV12(
    LPBITMAPINFOHEADER	lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN  FrameWidth,
    UN  FrameHeight,
    const int pitch)
{
    U32 *pnext, *pyprev, *pyspace, *pynext;
    U32 tm;
    int t;
    int i, j, k;
    int iBackTwoLines;
    int stretch, mark, aspect;
    int height_adj, width_adj;
    int LumaIters = 0;
    // Bytes to skip at the end of each written luma / chroma line.
    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);

    // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }
    // Horizontal crop on each side, converted from pixels to bytes (x3).
    width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
    width_adj += (width_adj << 1);
    aspect = (width_adj ? LumaIters : 0);
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj ? 1 : 0);
    mark = 12 - stretch;
    // The input image is upside down - process the lines in reverse order.

    // Move from end of line N to beginning of line N-1
    iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 2);
    iBackTwoLines += (iBackTwoLines << 1);

    // Point to the beginning of the last line.
    pnext =	(U32 *)
                (lpInput +
                ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *
                    ((FrameHeight - aspect - 1) + height_adj)) +
                width_adj);

    for ( j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = FrameWidth; i > 0; i -= 4, pnext += 3) {
                // Pixel 2 (B2 in pnext[0]; G2, R2 in pnext[1]).
                tm = pnext[0];
                t = BYUV[tm>>25].YU;
                tm = pnext[1];
                t += (GYUV[(tm>>1)&0x7F].YU + RYUV[(tm>>9)&0x7F].YU);
                *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                // Pixel 1 (B1, G1, R1 all in pnext[0]).
                tm = pnext[0];
                t = (BYUV[(tm>>1)&0x7F].YU + GYUV[(tm>>9)&0x7F].YU + RYUV[(tm>>17)&0x7F].YU);
                *YPlane = (U8)((t>>SHIFT_WIDTH)+8);
                if (0 == (k&1)) {
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>17)&0x7F].V + GYUV[(tm>>9)&0x7F].V + BYUV[(tm>>1)&0x7F].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                }
                // Pixel 4 (B4, G4, R4 all in pnext[2]).
                tm = pnext[2];
                t = (BYUV[(tm>>9)&0x7F].YU + GYUV[(tm>>17)&0x7F].YU + RYUV[tm>>25].YU);
                *(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);
                // Pixel 3 (B3, G3 in pnext[1]; R3 in pnext[2]).
                tm = pnext[1];
                t = BYUV[(tm>>17)&0x7F].YU + GYUV[tm>>25].YU;
                tm = pnext[2];
                t += RYUV[(tm>>1)&0x7F].YU;
                *(YPlane+2) = (U8)((t>>SHIFT_WIDTH)+8);
                YPlane += 4;
                if (0 == (k&1)) {
                    *UPlane++ = (U8)((t>>24)+64);
                    t = RYUV[(tm>>1)&0x7F].V;
                    tm = pnext[1];
                    t += GYUV[tm>>25].V + BYUV[(tm>>17)&0x7F].V;
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                }
            }
            // Fill the line skipped at the end of the previous group with the
            // per-byte average of the lines above and below it.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }
            pnext += iBackTwoLines;
            YPlane += ypitch_adj;
            // Increment after even lines.
            if(0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        } // end of for k
        if (stretch) {
            // Leave one output line empty and remember it and its neighbours.
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j
    if (stretch) {
        // The final skipped line has no line below it: duplicate the one above.
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of C_H26X_BGR24toYUV12()
#endif

// Inline-assembler version of the converter. The function is declared naked,
// so the register saves and the local stack frame below are set up by hand.
// The second parameter is named BGR24Image here but is referred to as
// lpInput in the stack layout and in the comments that follow.
__declspec(naked)
_STATIC void IA_H26X_BGR24toYUV12(
    LPBITMAPINFOHEADER	lpbiInput,
    U8 * BGR24Image,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN  FrameWidth,
    UN  FrameHeight,
    const int pitch)
{
    // Permanent (callee-save) registers - ebx, esi, edi, ebp
    // Temporary (caller-save) registers - eax, ecx, edx
    //
    // Stack frame layout
    //	| pitch			|  + 96
    //	| FrameHeight	|  + 92
    //	| FrameWidth	|  + 88
    //	| VPlane		|  + 84
    //	| UPlane		|  + 80
    //	| YPlane		|  + 76
    //	| lpInput		|  + 72
    //	| lpbiInput		|  + 68
    //	----------------------------
    //	| return addr	|  + 64
    //	| saved ebp		|  + 60
    //	| saved ebx		|  + 56
    //	| saved esi		|  + 52
    //	| saved edi		|  + 48

    //	| pyprev		|  + 44
    //	| pyspace		|  + 40
    //	| pynext		|  + 36
    //	| i				|  + 32
    //	| j				|  + 28
    //	| k				|  + 24
    //	| iBackTwoLines	|  + 20
    //	| stretch		|  + 16
    //	| mark			|  + 12
    //	| LumaIters		|  +  8
    //	| ypitch_adj	|  +  4
    //	| uvpitch_adj	|  +  0

// Offsets from esp once the prologue below has run. LOCALSIZE covers the
// twelve DWORD locals at offsets 0..44 in the layout above.
#define LOCALSIZE		 48

#define PITCH_PARM		 96
#define FRAME_HEIGHT	 92
#define FRAME_WIDTH		 88
#define VPLANE			 84
#define UPLANE			 80
#define YPLANE			 76
#define LP_INPUT		 72
#define LPBI_INPUT		 68

#define PYPREV			 44
#define PYSPACE			 40
#define PYNEXT			 36
#define LOOP_I			 32
#define LOOP_J			 28
#define LOOP_K			 24
#define BACK_TWO_LINES	 20
#define STRETCH			 16
#define MARK			 12
#define LUMA_ITERS		  8
#define YPITCH_ADJ		  4
#define UVPITCH_ADJ		  0

    _asm {

    push	ebp
    push 	ebx
    push 	esi
    push 	edi
    sub 	esp, LOCALSIZE

    // assign (ebx, lpbiInput)
    mov 	ebx, [esp + LPBI_INPUT]
    // ypitch_adj = pitch - FrameWidth
    // assign (ecx, FrameWidth)
    // assign (edx, pitch)
    mov 	ecx, [esp + FRAME_WIDTH]
    mov 	edx, [esp + PITCH_PARM]
    mov 	eax, edx
    sub 	eax, ecx
    mov 	[esp + YPITCH_ADJ], eax
    // uvpitch_adj = pitch - (FrameWidth >> 1)
    // kill (edx, pitch)
    mov
ebp, ecx shr ebp, 1 sub edx, ebp mov [esp + UVPITCH_ADJ], edx // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4 // assign (edx, LumaIters) xor edx, edx mov eax, [esp + FRAME_HEIGHT] L1: lea edx, [edx + 4] sub eax, 48 jnz L1 // width_adj = (lpbiInput->biWidth - FrameWidth) >> 1 // width_adj += width_adj << 1 // assign (esi, width_adj) mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth sub esi, [esp + FRAME_WIDTH] mov eax, esi shr eax, 1 add esi, eax // aspect = (width_adj ? LumaIters : 0) // assign (edi, aspect) // kill (edx, LumaIters) mov [esp + LUMA_ITERS], edx xor edi, edi test esi, esi jz L2 mov edi, edx // height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1 // assign (edx, height_adj) L2: mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight sub edx, [esp + FRAME_HEIGHT] add edx, edi shr edx, 1 // stretch = (height_adj ? 1 : 0) xor eax, eax test edx, edx jz L3 inc eax L3: mov [esp + STRETCH], eax // mark = 12 - stretch mov ebp, 12 sub ebp, eax mov [esp + MARK], ebp // iBackTwoLines = -(lpbiInput->biWidth + FrameWidth) // iBackTwoLines += (iBackTwoLines << 1) mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth add ebp, [esp + FRAME_WIDTH] neg ebp mov eax, ebp shl eax, 1 add ebp, eax mov [esp + BACK_TWO_LINES], ebp // pnext = lpInput + // ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) * // ((FrameHeight - aspect - 1) + height_adj)) + // width_adj // kill (ebx, lpbiInput) // kill (ecx, FrameWidth) // kill (edx, height_adj) // kill (esi, width_adj) // kill (edi, aspect) // assign (esi, pnext) mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth shl eax, 1 add eax, (LPBITMAPINFOHEADER)[ebx].biWidth mov ebx, [esp + FRAME_HEIGHT] sub ebx, edi dec ebx add ebx, edx imul ebx add esi, eax add esi, [esp + LP_INPUT] // assign (edi, YPlane) mov edi, [esp + YPLANE] // for (j = 0; j < LumaIters; j++) xor eax, eax mov [esp + LOOP_J], eax // for (k = 0; k < mark; k++) L4: xor eax, eax mov [esp + LOOP_K], eax // for (i = FrameWidth; i > 0; i -= 4, pnext += 12) L5: mov eax, [esp + FRAME_WIDTH] 
mov [esp + LOOP_I], eax // This jump is here to make sure the following loop starts in the U pipe jmp L6 L6: // --------------------- // | B2 | R1 | G1 | B1 | pnext[0] // --------------------- // | G3 | B3 | R2 | G2 | pnext[1] // --------------------- // | R4 | G4 | B4 | R3 | pnext[2] // --------------------- // t0 = pnext[0] // t1 = pnext[1] // t = ( BYUV[t0>>25].YU + // GYUV[(t1>> 1)&0x7F].YU + // RYUV[(t1>> 9)&0x7F].YU ) // *(YPlane+1) = ((t>>8)+8) // t = ( BYUV[(t0>> 1)&0x7F].YU + // GYUV[(t0>> 9)&0x7F].YU + // RYUV[(t0>>17)&0x7F].YU ) // *YPlane = ((t>>8)+8) // assign(eax: B2,Y1,Y2,U) // assign(ebx: B1,V) // assign(ecx: G2,G1) // assign(edx: R2,R1) // assign(ebp: B1) // 1 mov eax, [esi] mov ecx, [esi + 4] // 2 mov ebx, eax mov edx, ecx // 3 shr eax, 25 and ecx, 0xFE // 4 shr ecx, 1 and edx, 0xFE00 // 5 shr edx, 9 and ebx, 0xFEFEFE // 6 mov eax, [BYUV+eax*8].YU nop // 7 add eax, [GYUV+ecx*8].YU mov ecx, ebx // 8 add eax, [RYUV+edx*8].YU mov edx, ebx // 9 and ebx, 0xFE add eax, 0x800 // 10 sar eax, 8 nop // 11 shr ebx, 1 nop // 12 shr ecx, 9 mov [edi + 1], al // 13 shr edx, 17 and ecx, 0x7F // 14 mov eax, [BYUV+ebx*8].YU and edx, 0x7F // 15 add eax, [GYUV+ecx*8].YU mov ebp, ebx // 16 add eax, [RYUV+edx*8].YU nop // 17 sar eax, 8 mov ebx, [esp + LOOP_K] // 18 add eax, 8 and ebx, 1 // 19 mov [edi], al jnz L9 // At this point, ebp: B1, ecx: G1, edx: R1 // t0 = pnext[0] // *UPlane++ = ((t>>24)+64) // t = ( RYUV[(t0>>17)&0x7F].V + // GYUV[(t0>> 9)&0x7F].V + // BYUV[(t0>> 1)&0x7F].V ) // *VPlane++ = ((t>>8)+64) // 20 mov ebx, [RYUV+edx*8].V mov edx, [esp + UPLANE] // 21 sar eax, 16 add ebx, [GYUV+ecx*8].V // 22 add eax, 64 add ebx, [BYUV+ebp*8].V // 23 mov [edx], al inc edx // 24 mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 25 sar ebx, 8 inc edx // 26 add ebx, 64 mov [esp + VPLANE], edx // 27 mov [edx - 1], bl nop L9: // --------------------- // | B2 | R1 | G1 | B1 | pnext[0] // --------------------- // | G3 | B3 | R2 | G2 | pnext[1] // --------------------- // 
| R4 | G4 | B4 | R3 | pnext[2] // --------------------- // t1 = pnext[1] // t2 = pnext[2] // t = ( BYUV[(t2>> 9)&0x7F].YU + // GYUV[(t2>>17)&0x7F].YU + // RYUV[t2>>25].YR ) // *(YPlane+3) = ((t>>8)+8) // t = ( BYUV[(t1>>17)&0x7F].YU + // GYUV[t1>>25].YU + // RYUV[(t2>> 1)&0x7F].YU ) // *(YPlane+2) = ((t>>8)+8) // YPlane += 4 // assign(eax: B4,Y3,Y4,U) // assign(ebx: R3,V) // assign(ecx: G4,G3) // assign(edx: R4/B3) // assign(ebp: R3) // 28 mov ebp, [esi + 4] mov ebx, [esi + 8] // 29 mov eax, ebx mov ecx, ebx // 30 shr eax, 9 mov edx, ebx // 31 shr ecx, 17 and eax, 0x7F // 32 shr edx, 25 and ecx, 0x7F // 33 mov eax, [BYUV+eax*8].YU nop // 34 add eax, [GYUV+ecx*8].YU and ebx, 0xFE // 35 add eax, [RYUV+edx*8].YU mov ecx, ebp // 36 shr ebx, 1 add eax, 0x800 // 37 sar eax, 8 mov edx, ebp // 38 shr edx, 17 mov [edi + 3], al // 39 shr ecx, 25 and edx, 0x7F // 40 mov eax, [RYUV+ebx*8].YU mov ebp, ebx // 41 add eax, [GYUV+ecx*8].YU nop // 42 add eax, [BYUV+edx*8].YU nop // 43 sar eax, 8 mov ebx, [esp + LOOP_K] // 44 add eax, 8 and ebx, 1 // 45 mov [edi + 2], al jnz L16 // At this point, ebp: R3, ecx: G3, edx: B3 // t1 = pnext[1] // t2 = pnext[2] // *UPlane++ = ((t>>16)+64) // t = ( RYUV[(t2>> 1)&0x7F].V + // GYUV[t1>>25].V + // BYUV[(t1>>17)&0x7F].V ) // *VPlane++ = ((t>>8)+64) // 46 mov ebx, [BYUV+edx*8].V mov edx, [esp + UPLANE] // 47 sar eax, 16 add ebx, [GYUV+ecx*8].V // 48 add eax, 64 add ebx, [RYUV+ebp*8].V // 49 mov [edx], al inc edx // 50 mov [esp + UPLANE], edx mov edx, [esp + VPLANE] // 51 sar ebx, 8 inc edx // 52 add ebx, 64 mov [esp + VPLANE], edx // 53 mov [edx - 1], bl nop L16: // 54 mov eax, [esp + LOOP_I] lea esi, [esi + 12] // 55 sub eax, 4 lea edi, [edi + 4] // 56 mov [esp + LOOP_I], eax jnz L6 // if (stretch && (0 == k) && j) mov eax, [esp + STRETCH] test eax, eax jz L21 mov eax, [esp + LOOP_K] test eax, eax jnz L21 mov eax, [esp + LOOP_J] test eax, eax jz L21 // spill YPlane ptr mov [esp + YPLANE], edi nop // for (i = FrameWidth; i > 0; i -= 8) // assign 
(ebx, pyprev) // assign (ecx, t) // assign (edx, pynext) // assign (edi, pyspace) // assign (ebp, i) // make sure offsets are such that there are no bank conflicts here mov ebx, [esp + PYPREV] mov edi, [esp + PYSPACE] mov edx, [esp + PYNEXT] mov ebp, [esp + FRAME_WIDTH] // t = (*pyprev++ & 0xFEFEFEFE) >> 1 // t += (*pynext++ & 0xFEFEFEFE) >> 1 // *pyspace++ = t // t = (*pyprev++ & 0xFEFEFEFE) >> 1 // t += (*pynext++ & 0xFEFEFEFE) >> 1 // *pyspace++ = t L22: // 1 mov eax, [ebx] lea ebx, [ebx + 4] // 2 mov ecx, [edx] lea edx, [edx + 4] // 3 shr ecx, 1 and eax, 0xFEFEFEFE // 4 shr eax, 1 and ecx, 0x7F7F7F7F // 5 add eax, ecx mov ecx, [ebx] // 6 shr ecx, 1 mov [edi], eax // 7 mov eax, [edx] and ecx, 0x7F7F7F7F // 8 shr eax, 1 lea edi, [edi + 4] // 9 and eax, 0x7F7F7F7F lea ebx, [ebx + 4] // 10 lea edx, [edx + 4] add eax, ecx // 11 mov [edi], eax lea edi, [edi + 4] // 12 sub ebp, 8 jnz L22 // kill (ebx, pyprev) // kill (ecx, t) // kill (edx, pynext) // kill (edi, pyspace) // kill (ebp, i) // restore YPlane mov edi, [esp + YPLANE] // pnext += iBackTwoLines L21: add esi, [esp + BACK_TWO_LINES] // YPlane += ypitch_adj; add edi, [esp + YPITCH_ADJ] // if(0 == (k&1)) mov eax, [esp + LOOP_K] and eax, 1 jnz L23 // UPlane += uvpitch_adj; // VPlane += uvpitch_adj; mov eax, [esp + UVPITCH_ADJ] add [esp + UPLANE], eax add [esp + VPLANE], eax L23: inc DWORD PTR [esp + LOOP_K] mov eax, [esp + LOOP_K] cmp eax, [esp + MARK] jl L5 // if (stretch) cmp DWORD PTR [esp + STRETCH], 0 je L24 // pyprev = YPlane - pitch mov eax, edi sub eax, [esp + PITCH_PARM] mov [esp + PYPREV], eax // pyspace = YPlane mov [esp + PYSPACE], edi // pynext = (YPlane += pitch) add edi, [esp + PITCH_PARM] mov [esp + PYNEXT], edi L24: inc DWORD PTR [esp + LOOP_J] mov eax, [esp + LOOP_J] cmp eax, [esp + LUMA_ITERS] jl L4 // kill (esi, pnext) // kill (edi, YPlane) // if (stretch) mov esi, [esp + PYPREV] cmp DWORD PTR [esp + STRETCH], 0 je L26 // for (i = FrameWidth; i > 0; i -= 4) // assign (esi, pyprev) // assign 
(edi, pyspace) // assign (ebp, i) mov ebp, [esp + FRAME_WIDTH] mov edi, [esp + PYSPACE] L25: mov ecx, [esi] lea esi, [esi + 4] mov [edi], ecx lea edi, [edi + 4] sub ebp, 4 jnz L25 // kill (esi, pyprev) // kill (edi, pyspace) // kill (ebp, i) L26: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret } } #undef LOCALSIZE #undef PITCH_PARM #undef FRAME_HEIGHT #undef FRAME_WIDTH #undef VPLANE #undef UPLANE #undef YPLANE #undef LP_INPUT #undef LPBI_INPUT #undef PYPREV #undef PYSPACE #undef PYNEXT #undef LOOP_I #undef LOOP_J #undef LOOP_K #undef BACK_TWO_LINES #undef STRETCH #undef MARK #undef LUMA_ITERS #undef YPITCH_ADJ #undef UVPITCH_ADJ #if defined(_CODEC_STATS) #define NOC_SHIFT_WIDTH 7 void NOC_H26X_BGR16toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * lpInput, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { U32 *pnext, *pyprev, *pyspace, *pynext; U32 tm; int t; int i, j, k; int iBackTwoLines; int stretch, mark, aspect; int width_adj, height_adj; int LumaIters = 0; int ypitch_adj = 0; int uvpitch_adj = 0; // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12). for (i = FrameHeight; i > 0; i -= 48) { LumaIters += 4; } width_adj = lpbiInput->biWidth - FrameWidth; aspect = (width_adj ? LumaIters : 0); height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1; stretch = (height_adj ? 1 : 0); mark = 12 - stretch; // The input image is upside down - process the lines in reverse order. // Move from end of line N to beginning of line N-1 iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 1); // Point to the beginning of the last line. 
pnext = (U32 *)(lpInput + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj)) + width_adj); for ( j = 0; j < LumaIters; j++) { for (k = 0; k < mark; k++) { for (i = FrameWidth; i > 0; i -= 2, pnext++) { tm = *pnext; // 555 2, 3, 8 0x7C, 0x7C, 0x7C t = (BYUV[(tm>>14)&0x7C].YU + GYUV[(tm>>19)&0x7C].YU + RYUV[(tm>>24)&0x7C].YU); *(YPlane+1) = (U8)((t>>NOC_SHIFT_WIDTH)+16); t = (BYUV[(tm<<2)&0x7C].YU + GYUV[(tm>>3)&0x7C].YU + RYUV[(tm>>8)&0x7C].YU); *(YPlane) = (U8)((t>>NOC_SHIFT_WIDTH)+16); YPlane += 2; if (0 == (k&1)) { // 555 2, 3, 8 0x7C, 0x7C, 0x7C *UPlane++ = (U8)((t>>23)+128); t = (RYUV[(tm>>8)&0x7C].V + GYUV[(tm>>3)&0x7C].V + BYUV[(tm<<2)&0x7C].V); *VPlane++ = (U8)((t>>NOC_SHIFT_WIDTH)+128); } } if (stretch && (0 == k) && j) { for (i = FrameWidth; i > 0; i -= 8) { tm = ((*pyprev++ & 0xFEFEFEFE) >> 1); tm += ((*pynext++ & 0xFEFEFEFE) >> 1); *pyspace++ = tm; tm = ((*pyprev++ & 0xFEFEFEFE) >> 1); tm += ((*pynext++ & 0xFEFEFEFE) >> 1); *pyspace++ = tm; } } pnext += iBackTwoLines; YPlane += ypitch_adj; // Increment after even lines. if(0 == (k&1)) { UPlane += uvpitch_adj; VPlane += uvpitch_adj; } } // end of for k if (stretch) { pyprev = (U32 *)(YPlane - pitch); pyspace = (U32 *)YPlane; pynext = (U32 *)(YPlane += pitch); } } // end of for j if (stretch) { for (i = FrameWidth; i > 0; i -= 4) { *pyspace++ = *pyprev++; } } } // end of NOC_H26X_BGR16toYUV12 #endif #if 0 _STATIC void C_H26X_BGR16toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * lpInput, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, UN bitfield, const int pitch) { U32 *pnext, *pyprev, *pyspace, *pynext; U32 tm; int t; int i, j, k; int iBackTwoLines; int stretch, mark, aspect; int width_adj, height_adj; int LumaIters = 0; int ypitch_adj = pitch - FrameWidth; int uvpitch_adj = pitch - (FrameWidth >> 1); // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12). 
for (i = FrameHeight; i > 0; i -= 48) { LumaIters += 4; } width_adj = lpbiInput->biWidth - FrameWidth; aspect = (width_adj ? LumaIters : 0); height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1; stretch = (height_adj ? 1 : 0); mark = 12 - stretch; // The input image is upside down - process the lines in reverse order. // Move from end of line N to beginning of line N-1 iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 1); // Point to the beginning of the last line. pnext = (U32 *)(lpInput + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj)) + width_adj); for ( j = 0; j < LumaIters; j++) { for (k = 0; k < mark; k++) { for (i = FrameWidth; i > 0; i -= 2, pnext++) { tm = *pnext; switch (bitfield) { // 555 2, 3, 8 0x7C, 0x7C, 0x7C case 555: t = (BYUV[(tm>>14)&0x7C].YU + GYUV[(tm>>19)&0x7C].YU + RYUV[(tm>>24)&0x7C].YU); *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8); t = (BYUV[(tm<<2)&0x7C].YU + GYUV[(tm>>3)&0x7C].YU + RYUV[(tm>>8)&0x7C].YU); *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8); YPlane += 2; break; #if 0 // Beware - untested code ahead // 664 3, 3, 9 0x78, 0x7E, 0x7E case 664: t = (BYUV[(tm>>13)&0x78].YU + GYUV[(tm>>19)&0x7E].YU + RYUV[(tm>>25)&0x7E].YU); *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8); t = (BYUV[(tm<<3)&0x78].YU + GYUV[(tm>>3)&0x7E].YU + RYUV[(tm>>9)&0x7E].YU); *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8); YPlane += 2; break; // 565 2, 4, 9 0x7C, 0x7E, 0x7C case 565: t = (BYUV[(tm>>14)&0x7C].YU + GYUV[(tm>>20)&0x7E].YU + RYUV[(tm>>25)&0x7C].YU); *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8); t = (BYUV[(tm<<2)&0x7C].YU + GYUV[(tm>>4)&0x7E].YU + RYUV[(tm>>9)&0x7C].YU); *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8); YPlane += 2; break; // 655 2, 3, 9 0x7C, 0x7C, 0x7E case 655: t = (BYUV[(tm>>14)&0x7C].YU + GYUV[(tm>>19)&0x7C].YU + RYUV[(tm>>25)&0x7E].YU); *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8); t = (BYUV[(tm<<2)&0x7C].YU + GYUV[(tm>>3)&0x7C].YU + RYUV[(tm>>9)&0x7E].YU); *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8); YPlane += 2; break; #endif } if (0 == (k&1)) 
{ switch (bitfield) { // 555 2, 3, 8 0x7C, 0x7C, 0x7C case 555: *UPlane++ = (U8)((t>>24)+64); t = (RYUV[(tm>>8)&0x7C].V + GYUV[(tm>>3)&0x7C].V + BYUV[(tm<<2)&0x7C].V); *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64); break; #if 0 // Beware - untested code ahead // 664 3, 3, 9 0x78, 0x7E, 0x7E case 664: *UPlane++ = (U8)((t>>24)+64); t = (RYUV[(tm>>9)&0x7E].V + GYUV[(tm>>3)&0x7E].V + BYUV[(tm<<3)&0x78].V); *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64); break; // 565 2, 4, 9 0x7C, 0x7E, 0x7C case 565: *UPlane++ = (U8)((t>>24)+64); t = (RYUV[(tm>>9)&0x7C].V + GYUV[(tm>>4)&0x7E].V + BYUV[(tm<<2)&0x7C].V); *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64); break; // 655 2, 3, 9 0x7C, 0x7C, 0x7E case 655: *UPlane++ = (U8)((t>>24)+64); t = (RYUV[(tm>>9)&0x7E].V + GYUV[(tm>>3)&0x7C].V + BYUV[(tm<<2)&0x7C].V); *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64); break; #endif } } } if (stretch && (0 == k) && j) { for (i = FrameWidth; i > 0; i -= 8) { tm = ((*pyprev++ & 0xFEFEFEFE) >> 1); tm += ((*pynext++ & 0xFEFEFEFE) >> 1); *pyspace++ = tm; tm = ((*pyprev++ & 0xFEFEFEFE) >> 1); tm += ((*pynext++ & 0xFEFEFEFE) >> 1); *pyspace++ = tm; } } pnext += iBackTwoLines; YPlane += ypitch_adj; // Increment after even lines. 
if(0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        } // end of for k
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j
    if (stretch) {
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of C_H26X_BGR16toYUV12
#endif

//
// IA_H26X_BGR16555toYUV12 - inline-assembly converter for 16-bit 5:5:5 input.
//
// Each iteration of the pixel loop loads one dword (two 16-bit pixels) from
// the input, extracts the three 5-bit fields of each pixel (scaled by 4 and
// masked with 0x7C), sums the corresponding BYUV/GYUV/RYUV table entries and
// stores two Y bytes.  On even values of k it also stores one U and one V
// byte derived from the low pixel of the pair.
//
// The input is walked from the last used line towards the first: pnext is
// initialised to the start of that line and BACK_TWO_LINES (a negative byte
// count) is added after every row.
//
// When stretch is set only 11 rows of every 12 are converted (mark = 11);
// after each band one output row is skipped and later filled with the
// average of the rows above and below it.  The last skipped row is filled
// with a copy of the row above it.
//
// The function is naked: it builds and tears down its own frame and
// addresses parameters and locals relative to esp using the offsets
// defined below.
//
// NOTE: every loop in this routine exits on an exact zero (jnz), so
// FrameHeight must be a positive multiple of 48, FrameWidth a positive
// multiple of 2, and FrameWidth a multiple of 8 when stretching.
//
__declspec(naked)
_STATIC void IA_H26X_BGR16555toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
    // Permanent (callee-save) registers - ebx, esi, edi, ebp
    // Temporary (caller-save) registers - eax, ecx, edx
    //
    // Stack frame layout
    //    | pitch            |  + 96
    //    | FrameHeight      |  + 92
    //    | FrameWidth       |  + 88
    //    | VPlane           |  + 84
    //    | UPlane           |  + 80
    //    | YPlane           |  + 76
    //    | lpInput          |  + 72
    //    | lpbiInput        |  + 68
    //    ----------------------------
    //    | return addr      |  + 64
    //    | saved ebp        |  + 60
    //    | saved ebx        |  + 56
    //    | saved esi        |  + 52
    //    | saved edi        |  + 48
    //    | pyprev           |  + 44
    //    | pyspace          |  + 40
    //    | pynext           |  + 36
    //    | i                |  + 32
    //    | j                |  + 28
    //    | k                |  + 24
    //    | iBackTwoLines    |  + 20
    //    | stretch          |  + 16
    //    | mark             |  + 12
    //    | LumaIters        |  +  8
    //    | ypitch_adj       |  +  4
    //    | uvpitch_adj      |  +  0

#define LOCALSIZE        48

#define PITCH_PARM       96
#define FRAME_HEIGHT     92
#define FRAME_WIDTH      88
#define VPLANE           84
#define UPLANE           80
#define YPLANE           76
#define LP_INPUT         72
#define LPBI_INPUT       68

#define PYPREV           44
#define PYSPACE          40
#define PYNEXT           36
#define LOOP_I           32
#define LOOP_J           28
#define LOOP_K           24
#define BACK_TWO_LINES   20
#define STRETCH          16
#define MARK             12
#define LUMA_ITERS        8
#define YPITCH_ADJ        4
#define UVPITCH_ADJ       0

    _asm {
        push ebp
        push ebx
        push esi
        push edi
        sub esp, LOCALSIZE

        // assign (ebx, lpbiInput)
        mov ebx, [esp + LPBI_INPUT]
        // ypitch_adj = pitch - FrameWidth
        // assign (ecx, FrameWidth)
        // assign (edx, pitch)
        mov ecx, [esp + FRAME_WIDTH]
        mov edx, [esp + PITCH_PARM]
        mov eax, edx
        sub eax, ecx
        mov [esp + YPITCH_ADJ], eax
        // uvpitch_adj = pitch - (FrameWidth >> 1)
        // kill (edx, pitch)
        mov ebp, ecx
        shr ebp, 1
        sub edx, ebp
        mov [esp + UVPITCH_ADJ], edx
        // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
        // (the loop below exits only when the count reaches exactly zero)
        // assign (edx, LumaIters)
        xor edx, edx
        mov eax, [esp + FRAME_HEIGHT]
    L1:
        lea edx, [edx + 4]
        sub eax, 48
        jnz L1
        // width_adj = lpbiInput->biWidth - FrameWidth
        // assign (esi, width_adj)
        mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
        sub esi, [esp + FRAME_WIDTH]
        // aspect = (width_adj ? LumaIters : 0)
        // assign (edi, aspect)
        // kill (edx, LumaIters)
        mov [esp + LUMA_ITERS], edx
        xor edi, edi
        test esi, esi
        jz L2
        mov edi, edx
        // height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
        // (logical shift)
        // assign (edx, height_adj)
    L2:
        mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
        sub edx, [esp + FRAME_HEIGHT]
        add edx, edi
        shr edx, 1
        // stretch = (height_adj ? 1 : 0)
        xor eax, eax
        test edx, edx
        jz L3
        inc eax
    L3:
        mov [esp + STRETCH], eax
        // mark = 12 - stretch
        mov ebp, 12
        sub ebp, eax
        mov [esp + MARK], ebp
        // iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
        // (a byte count: two bytes per pixel)
        mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
        add ebp, [esp + FRAME_WIDTH]
        shl ebp, 1
        neg ebp
        mov [esp + BACK_TWO_LINES], ebp
        // pnext = lpInput +
        //            ((lpbiInput->biWidth << 1) *
        //             ((FrameHeight - aspect - 1) + height_adj)) +
        //            width_adj
        // kill (ebx, lpbiInput)
        // kill (ecx, FrameWidth)
        // kill (edx, height_adj)
        // kill (esi, width_adj)
        // kill (edi, aspect)
        // assign (esi, pnext)
        mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
        shl eax, 1
        mov ebx, [esp + FRAME_HEIGHT]
        sub ebx, edi
        dec ebx
        add ebx, edx
        // imul leaves the low 32 bits of the product in eax and overwrites edx
        imul ebx
        add esi, eax
        add esi, [esp + LP_INPUT]
        // assign (edi, YPlane)
        mov edi, [esp + YPLANE]
        // for (j = 0; j < LumaIters; j++)
        xor eax, eax
        mov [esp + LOOP_J], eax
        // for (k = 0; k < mark; k++)
    L4:
        xor eax, eax
        mov [esp + LOOP_K], eax
        // for (i = FrameWidth; i > 0; i -= 2, pnext += 4)
    L5:
        mov eax, [esp + FRAME_WIDTH]
        mov [esp + LOOP_I], eax
        // This jump is here to make sure the following loop starts on the U pipe
        jmp L6
    L6:
        // tm = pnext[0]
        // t = ( BYUV[(tm>>14)&0x7C].YU +
        //       GYUV[(tm>>19)&0x7C].YU +
        //       RYUV[(tm>>24)&0x7C].YU )
        // *(YPlane+1) = (U8)((t>>8)+8)
        // t = ( BYUV[(tm<< 2)&0x7C].YU +
        //       GYUV[(tm>> 3)&0x7C].YU +
        //       RYUV[(tm>> 8)&0x7C].YU )
        // *YPlane = (U8)((t>>8)+8)
        // YPlane += 2
        // assign(eax: B2/Y1/Y2/U)
        // assign(ebx: B1/V)
        // assign(ecx: G2/G1)
        // assign(edx: R2/R1)
        // assign(ebp: B1)
        // 1
        mov eax, [esi]
        nop
        // 2
        mov ebx, eax
        mov ecx, eax
        // 3
        shr eax, 14
        mov edx, ebx
        // 4
        shr ecx, 19
        and eax, 0x7C
        // 5
        shr edx, 24
        and ecx, 0x7C
        // 6
        mov eax, [BYUV+eax*8].YU
        and edx, 0x7C
        // 7
        add eax, [GYUV+ecx*8].YU
        mov ecx, ebx
        // 8
        add eax, [RYUV+edx*8].YU
        mov edx, ebx
        // 9
        sar eax, 8
        and ebx, 0x1F
        // 10
        shl ebx, 2
        add eax, 8
        // 11
        shr ecx, 3
        mov [edi + 1], al
        // 12
        shr edx, 8
        and ecx, 0x7C
        // 13
        mov eax, [BYUV+ebx*8].YU
        and edx, 0x7C
        // 14
        add eax, [GYUV+ecx*8].YU
        mov ebp, ebx
        // 15
        add eax, [RYUV+edx*8].YU
        nop
        // 16
        sar eax, 8
        mov ebx, [esp + LOOP_K]
        // 17
        add eax, 8
        and ebx, 1
        // 18 (the store leaves the flags from "and ebx, 1" intact for jnz:
        //     odd k skips the U/V output)
        mov [edi], al
        jnz L9

        // At this point, ebp: B1, ecx: G1, edx: R1
        // eax still holds (t>>8)+8 of the low pixel; U is taken from it:
        // *UPlane++ = (U8)((eax>>16)+64)
        // t = ( RYUV[(tm>> 8)&0x7C].V +
        //       GYUV[(tm>> 3)&0x7C].V +
        //       BYUV[(tm<< 2)&0x7C].V )
        // *VPlane++ = (U8)((t>>8)+64)
        // 19
        mov ebx, [RYUV+edx*8].V
        mov edx, [esp + UPLANE]
        // 20
        sar eax, 16
        add ebx, [GYUV+ecx*8].V
        // 21
        add eax, 64
        add ebx, [BYUV+ebp*8].V
        // 22
        mov [edx], al
        inc edx
        // 23
        mov [esp + UPLANE], edx
        mov edx, [esp + VPLANE]
        // 24
        sar ebx, 8
        inc edx
        // 25
        add ebx, 64
        mov [esp + VPLANE], edx
        // 26
        mov [edx - 1], bl
        nop
    L9:
        // 27
        mov eax, [esp + LOOP_I]
        lea esi, [esi + 4]
        // 28
        sub eax, 2
        lea edi, [edi + 2]
        // 29
        mov [esp + LOOP_I], eax
        jnz L6

        // if (stretch && (0 == k) && j)
        mov eax, [esp + STRETCH]
        test eax, eax
        jz L14
        mov eax, [esp + LOOP_K]
        test eax, eax
        jnz L14
        mov eax, [esp + LOOP_J]
        test eax, eax
        jz L14

        // spill YPlane ptr (reuses the YPlane parameter slot)
        mov [esp + YPLANE], edi
        nop

        // for (i = FrameWidth; i > 0; i -= 8)
        // assign (ebx, pyprev)
        // assign (ecx, t)
        // assign (edx, pynext)
        // assign (edi, pyspace)
        // assign (ebp, i)

        // make sure offsets are such that there are no bank conflicts here
        mov ebx, [esp + PYPREV]
        mov edi, [esp + PYSPACE]

        mov edx, [esp + PYNEXT]
        mov ebp, [esp + FRAME_WIDTH]

        // t = (*pyprev++ & 0xFEFEFEFE) >> 1
        // t += (*pynext++ & 0xFEFEFEFE) >> 1
        // *pyspace++ = t
        // t = (*pyprev++ & 0xFEFEFEFE) >> 1
        // t += (*pynext++ & 0xFEFEFEFE) >> 1
        // *pyspace++ = t
    L15:
        // 1
        mov eax, [ebx]
        lea ebx, [ebx + 4]
        // 2
        mov ecx, [edx]
        lea edx, [edx + 4]
        // 3
        shr ecx, 1
        and eax, 0xFEFEFEFE
        // 4
        shr eax, 1
        and ecx, 0x7F7F7F7F
        // 5
        add eax, ecx
        mov ecx, [ebx]
        // 6
        shr ecx, 1
        mov [edi], eax
        // 7
        mov eax, [edx]
        and ecx, 0x7F7F7F7F
        // 8
        shr eax, 1
        lea edi, [edi + 4]
        // 9
        and eax, 0x7F7F7F7F
        lea ebx, [ebx + 4]
        // 10
        lea edx, [edx + 4]
        add eax, ecx
        // 11
        mov [edi], eax
        lea edi, [edi + 4]
        // 12
        sub ebp, 8
        jnz L15
        // kill (ebx, pyprev)
        // kill (ecx, t)
        // kill (edx, pynext)
        // kill (edi, pyspace)
        // kill (ebp, i)

        // restore YPlane
        mov edi, [esp + YPLANE]

        // pnext += iBackTwoLines
    L14:
        add esi, [esp + BACK_TWO_LINES]
        // YPlane += ypitch_adj;
        add edi, [esp + YPITCH_ADJ]
        // if(0 == (k&1))
        mov eax, [esp + LOOP_K]
        and eax, 1
        jnz L16
        // UPlane += uvpitch_adj;
        // VPlane += uvpitch_adj;
        mov eax, [esp + UVPITCH_ADJ]
        add [esp + UPLANE], eax
        add [esp + VPLANE], eax

    L16:
        inc DWORD PTR [esp + LOOP_K]
        mov eax, [esp + LOOP_K]
        cmp eax, [esp + MARK]
        jl L5

        // if (stretch)
        cmp DWORD PTR [esp + STRETCH], 0
        je L17
        // pyprev = YPlane - pitch
        mov eax, edi
        sub eax, [esp + PITCH_PARM]
        mov [esp + PYPREV], eax
        // pyspace = YPlane
        mov [esp + PYSPACE], edi
        // pynext = (YPlane += pitch)
        add edi, [esp + PITCH_PARM]
        mov [esp + PYNEXT], edi

    L17:
        inc DWORD PTR [esp + LOOP_J]
        mov eax, [esp + LOOP_J]
        cmp eax, [esp + LUMA_ITERS]
        jl L4

        // kill (esi, pnext)
        // kill (edi, YPlane)

        // if (stretch)
        mov esi, [esp + PYPREV]
        cmp DWORD PTR [esp + STRETCH], 0
        je L19

        // Fill the last skipped row with a copy of the row above it.
        // for (i = FrameWidth; i > 0; i -= 4)
        // assign (esi, pyprev)
        // assign (edi, pyspace)
        // assign (ebp, i)
        mov ebp, [esp + FRAME_WIDTH]
        mov edi, [esp + PYSPACE]
    L18:
        mov ecx, [esi]
        lea esi, [esi + 4]
        mov [edi], ecx
        lea edi, [edi + 4]
        sub ebp, 4
        jnz L18
        // kill (esi, pyprev)
        // kill (edi, pyspace)
        // kill (ebp, i)

    L19:
        add esp, LOCALSIZE
        pop edi
        pop esi
        pop ebx
        pop ebp
        ret
    }
}
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ

/*****************************************************************************
 *
 *  H26X_CLUTtoYUV12()
 *
 *  Convert from CLUT8/CLUT4 to YUV12 (YCrCb 4:2:0) and copy to destination memory
 *  with pitch defined by the constant PITCH.
 *
 *  This is needed to support the quickcam.
 */
#if 0
_STATIC void C_H26X_CLUTtoYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    UN pixel_bits,
    const int pitch)
{
    U32 *pnext, *pyprev, *pyspace, *pynext;
    U32 tm, tn;
    int t;
    int i, j, k, m, n;
    int iNextLine, iBackTwoLines;
    int stretch, mark, aspect;
    int width_adj, height_adj;
    int yshift, uvshift;
    int pixel_mask, loop_cnt, loop_limit;
    RGBQUAD *lpCEntry, *lpCTable = (RGBQUAD *)((U8 *)lpbiInput + sizeof(BITMAPINFOHEADER));
    int LumaIters = 0;
    int ypitch_adj = (pitch - FrameWidth);
    int uvpitch_adj = (pitch - (FrameWidth >> 1));

    ASSERT((8 == pixel_bits) || (4 == pixel_bits));
    // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }
    width_adj = ((lpbiInput->biWidth - FrameWidth) >> 1);
    aspect = (width_adj ? LumaIters : 0);
    height_adj = ((lpbiInput->biHeight - (FrameHeight - aspect)) >> 1);
    stretch = (height_adj ?
1 : 0); mark = 12 - stretch; iNextLine = lpbiInput->biWidth; iBackTwoLines = -((iNextLine + (int)FrameWidth) >> 2); if (8 == pixel_bits) { yshift = 8; uvshift = 16; pixel_mask = 0xFF; loop_cnt = 2; loop_limit = 4; } else { yshift = 4; uvshift = 8; pixel_mask = 0xF; loop_cnt = 1; loop_limit = 8; width_adj >>= 1; iNextLine >>= 1; iBackTwoLines >>= 1; } // The input image is upside down - process the lines in reverse order. // Point to the beginning of the last line. pnext = (U32 *)(lpInput + (iNextLine * ((FrameHeight - aspect - 1) + height_adj)) + width_adj); for (j = 0; j < LumaIters; j++) { for (k = 0; k < mark; k++) { for (i = FrameWidth; i > 0; i -= 8) { for (n = 0; n < loop_cnt; n++) { tm = *pnext++; tm = ((4 == pixel_bits) ? ( ((tm >> 4) & 0x0F0F0F0F) | ((tm << 4) & 0xF0F0F0F0) ) : tm); tn = tm; for (m = 0; m < loop_limit; m += 4) { lpCEntry = &lpCTable[tm&pixel_mask]; t = ( BYUV[lpCEntry->rgbBlue>>1].YU + GYUV[lpCEntry->rgbGreen>>1].YU + RYUV[lpCEntry->rgbRed>>1].YU ); *YPlane++ = (U8)((t>>8)+8); tm >>= yshift; lpCEntry = &lpCTable[tm&pixel_mask]; t = ( BYUV[lpCEntry->rgbBlue>>1].YU + GYUV[lpCEntry->rgbGreen>>1].YU + RYUV[lpCEntry->rgbRed>>1].YU ); *YPlane++ = (U8)((t>>8)+8); tm >>= yshift; lpCEntry = &lpCTable[tm&pixel_mask]; t = ( BYUV[lpCEntry->rgbBlue>>1].YU + GYUV[lpCEntry->rgbGreen>>1].YU + RYUV[lpCEntry->rgbRed>>1].YU ); *YPlane++ = (U8)((t>>8)+8); tm >>= yshift; lpCEntry = &lpCTable[tm&pixel_mask]; t = ( BYUV[lpCEntry->rgbBlue>>1].YU + GYUV[lpCEntry->rgbGreen>>1].YU + RYUV[lpCEntry->rgbRed>>1].YU ); *YPlane++ = (U8)((t>>8)+8); tm >>= yshift; } if (0 == (k&1)) { for (m = 0; m < loop_limit; m += 2, tn >>= uvshift) { lpCEntry = &lpCTable[tn&pixel_mask]; t = ( BYUV[lpCEntry->rgbBlue>>1].YU + RYUV[lpCEntry->rgbRed>>1].YU + GYUV[lpCEntry->rgbGreen>>1].YU ); *UPlane++ = (U8)((t>>24)+64); t = ( RYUV[lpCEntry->rgbRed>>1].V + GYUV[lpCEntry->rgbGreen>>1].V + BYUV[lpCEntry->rgbBlue>>1].V ); *VPlane++ = (U8)((t>>8)+64); } } } } if (stretch && (0 == k) && j) { for 
(i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }
            pnext += iBackTwoLines;
            YPlane += ypitch_adj;
            // Increment after even lines.
            if(0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        }
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    }
    if (stretch) {
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of H26X_CLUTtoYUV12()
#endif

//
// IA_H26X_CLUT8toYUV12 - inline-assembly converter for 8-bit palettised input.
//
// The colour table is taken to start immediately after the BITMAPINFOHEADER
// that lpbiInput points to (lpCTable = lpbiInput + TYPE BITMAPINFOHEADER).
// Each iteration of the pixel loop reads two index bytes, looks up their
// RGBQUAD entries, clears the low bit of each colour component, sums the
// matching BYUV/GYUV/RYUV table entries and stores two Y bytes.  On even
// values of k it also stores one U and one V byte derived from the first
// pixel of the pair.
//
// The input is walked from the last used line towards the first: pnext is
// initialised to the start of that line and BACK_TWO_LINES (a negative byte
// count) is added after every row.
//
// When stretch is set only 11 rows of every 12 are converted (mark = 11);
// after each band one output row is skipped and later filled with the
// average of the rows above and below it.  The last skipped row is filled
// with a copy of the row above it.
//
// The function is naked: it builds and tears down its own frame and
// addresses parameters and locals relative to esp using the offsets
// defined below.
//
// NOTE: every loop in this routine exits on an exact zero (jnz), so
// FrameHeight must be a positive multiple of 48, FrameWidth a positive
// multiple of 2, and FrameWidth a multiple of 8 when stretching.
//
__declspec(naked)
_STATIC void IA_H26X_CLUT8toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
    // Permanent (callee-save) registers - ebx, esi, edi, ebp
    // Temporary (caller-save) registers - eax, ecx, edx
    //
    // Stack frame layout
    //    | pitch            |  +100
    //    | FrameHeight      |  + 96
    //    | FrameWidth       |  + 92
    //    | VPlane           |  + 88
    //    | UPlane           |  + 84
    //    | YPlane           |  + 80
    //    | lpInput          |  + 76
    //    | lpbiInput        |  + 72
    //    ----------------------------
    //    | return addr      |  + 68
    //    | saved ebp        |  + 64
    //    | saved ebx        |  + 60
    //    | saved esi        |  + 56
    //    | saved edi        |  + 52
    //    | pyprev           |  + 48
    //    | pyspace          |  + 44
    //    | pynext           |  + 40
    //    | i                |  + 36
    //    | j                |  + 32
    //    | k                |  + 28
    //    | iBackTwoLines    |  + 24
    //    | stretch          |  + 20
    //    | mark             |  + 16
    //    | LumaIters        |  + 12
    //    | lpCTable         |  +  8
    //    | ypitch_adj       |  +  4
    //    | uvpitch_adj      |  +  0

#define LOCALSIZE        52

#define PITCH_PARM      100
#define FRAME_HEIGHT     96
#define FRAME_WIDTH      92
#define VPLANE           88
#define UPLANE           84
#define YPLANE           80
#define LP_INPUT         76
#define LPBI_INPUT       72

#define PYPREV           48
#define PYSPACE          44
#define PYNEXT           40
#define LOOP_I           36
#define LOOP_J           32
#define LOOP_K           28
#define BACK_TWO_LINES   24
#define STRETCH          20
#define MARK             16
#define LUMA_ITERS       12
#define LPCTABLE          8
#define YPITCH_ADJ        4
#define UVPITCH_ADJ       0

    _asm {
        push ebp
        push ebx
        push esi
        push edi
        sub esp, LOCALSIZE

        // lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)
        // assign (ebx, lpbiInput)
        mov eax, [esp + LPBI_INPUT]
        mov ebx, eax
        add eax, TYPE BITMAPINFOHEADER
        mov [esp + LPCTABLE], eax
        // ypitch_adj = pitch - FrameWidth
        // assign (ecx, FrameWidth)
        // assign (edx, pitch)
        mov ecx, [esp + FRAME_WIDTH]
        mov edx, [esp + PITCH_PARM]
        mov eax, edx
        sub eax, ecx
        mov [esp + YPITCH_ADJ], eax
        // uvpitch_adj = pitch - (FrameWidth >> 1)
        // kill (ecx, FrameWidth)
        // kill (edx, pitch)
        shr ecx, 1
        sub edx, ecx
        mov [esp + UVPITCH_ADJ], edx
        // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
        // (the loop below exits only when the count reaches exactly zero)
        // assign (ecx, LumaIters)
        xor ecx, ecx
        mov eax, [esp + FRAME_HEIGHT]
    L1:
        lea ecx, [ecx + 4]
        sub eax, 48
        jnz L1
        // width_adj = ((lpbiInput->biWidth - FrameWidth) >> 1)
        // assign (edx, width_adj)
        mov edx, (LPBITMAPINFOHEADER)[ebx].biWidth
        sub edx, [esp + FRAME_WIDTH]
        shr edx, 1
        // aspect = (width_adj ? LumaIters : 0)
        // assign (esi, aspect)
        // kill (ecx, LumaIters)
        mov [esp + LUMA_ITERS], ecx
        xor esi, esi
        test edx, edx
        jz L2
        mov esi, ecx
        // height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
        // (logical shift)
        // assign (ecx, height_adj)
    L2:
        mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
        sub ecx, [esp + FRAME_HEIGHT]
        add ecx, esi
        shr ecx, 1
        // stretch = (height_adj ? 1 : 0)
        xor eax, eax
        test ecx, ecx
        jz L3
        inc eax
    L3:
        mov [esp + STRETCH], eax
        // mark = 12 - stretch
        mov edi, 12
        sub edi, eax
        mov [esp + MARK], edi
        // iNextLine = lpbiInput->biWidth
        // kill (ebx, lpbiInput)
        // assign (ebx, iNextLine)
        mov ebx, (LPBITMAPINFOHEADER)[ebx].biWidth
        // iBackTwoLines = -(iNextline + FrameWidth)
        // (a byte count: one byte per pixel)
        mov edi, [esp + FRAME_WIDTH]
        add edi, ebx
        neg edi
        mov [esp + BACK_TWO_LINES], edi
        // pnext = lpInput +
        //            (iNextLine*((FrameHeight-aspect-1) + height_adj)) +
        //            width_adj
        // kill (ebx, iNextLine)
        // kill (ecx, height_adj)
        // kill (edx, width_adj)
        // kill (esi, aspect)
        // assign (esi, pnext)
        mov eax, [esp + FRAME_HEIGHT]
        sub eax, esi
        dec eax
        add eax, ecx
        mov esi, [esp + LP_INPUT]
        add esi, edx
        // imul leaves the low 32 bits of the product in eax and overwrites
        // edx, which is why width_adj was added to esi beforehand
        imul ebx
        add esi, eax
        // assign (edi, YPlane)
        mov edi, [esp + YPLANE]
        // for (j = 0; j < LumaIters; j++)
        xor eax, eax
        mov [esp + LOOP_J], eax
        // for (k = 0; k < mark; k++)
    L4:
        xor eax, eax
        mov [esp + LOOP_K], eax
        // for (i = FrameWidth; i > 0; i -= 2, pnext += 2)
    L5:
        mov eax, [esp + FRAME_WIDTH]
        mov [esp + LOOP_I], eax
        // This jump is here to make sure the following loop starts on the U pipe
        jmp L6
    L6:
        // lpCEntry = &lpCTable[*(pnext+1)]
        // t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
        //       GYUV[lpCEntry->rgbGreen>>1].YU +
        //       RYUV[lpCEntry->rgbRed>>1].YU )
        // *(YPlane+1) = (U8)((t>>8)+8)
        // lpCEntry = &lpCTable[*pnext]
        // t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
        //       GYUV[lpCEntry->rgbGreen>>1].YU +
        //       RYUV[lpCEntry->rgbRed>>1].YU )
        // *YPlane = (U8)((t>>8)+8)
        // YPlane += 2
        // *UPlane++ = (U8)((t>>24)+64)
        // t = ( RYUV[lpCEntry->rgbRed>>1].V +
        //       GYUV[lpCEntry->rgbGreen>>1].V +
        //       BYUV[lpCEntry->rgbBlue>>1].V )
        // *VPlane++ = (U8)((t>>8)+64)
        //
        // Table addressing: a component with its low bit cleared (c & 0xFE),
        // scaled by 4, gives the same byte offset as (c>>1) scaled by 8.
        //
        // assign (ebp: lpCEntry,B1)
        // assign (eax: P2,B2,Y2,Y1,U)
        // assign (ebx: B1,V)
        // assign (ecx: G2,G1)
        // assign (edx: R2,R1)
        // 1
        xor eax, eax
        mov ebp, [esp + LPCTABLE]
        // 2
        mov al, [esi + 1]
        xor ecx, ecx
        // 3
        lea ebx, [ebp+eax*4]
        xor edx, edx
        // 4
        mov al, (LPRGBQUAD)[ebx].rgbBlue
        nop
        // 5
        mov cl, (LPRGBQUAD)[ebx].rgbGreen
        and al, 0xFE
        // 6
        mov dl, (LPRGBQUAD)[ebx].rgbRed
        and cl, 0xFE
        // 7
        mov eax, [BYUV+eax*4].YU
        and dl, 0xFE
        // 8
        add eax, [GYUV+ecx*4].YU
        xor ebx, ebx
        // 9
        add eax, [RYUV+edx*4].YU
        mov bl, [esi]
        // 10
        sar eax, 8
        lea ebp, [ebp+ebx*4]
        // 11
        add eax, 8
        nop
        // 12
        mov [edi + 1], al
        mov bl, (LPRGBQUAD)[ebp].rgbBlue
        // 13
        mov cl, (LPRGBQUAD)[ebp].rgbGreen
        and bl, 0xFE
        // 14
        mov dl, (LPRGBQUAD)[ebp].rgbRed
        and cl, 0xFE
        // 15
        mov eax, [BYUV+ebx*4].YU
        and dl, 0xFE
        // 16
        add eax, [GYUV+ecx*4].YU
        mov ebp, ebx
        // 17
        add eax, [RYUV+edx*4].YU
        nop
        // 18
        sar eax, 8
        mov ebx, [esp + LOOP_K]
        // 19
        add eax, 8
        and ebx, 1
        // 20 (the store leaves the flags from "and ebx, 1" intact for jnz:
        //     odd k skips the U/V output)
        mov [edi], al
        jnz L9

        // eax still holds (t>>8)+8 of the first pixel; U is (eax>>16)+64.
        // 21
        mov ebx, [RYUV+edx*4].V
        mov edx, [esp + UPLANE]
        // 22
        sar eax, 16
        add ebx, [GYUV+ecx*4].V
        // 23
        add eax, 64
        add ebx, [BYUV+ebp*4].V
        // 24
        mov [edx], al
        inc edx
        // 25
        mov [esp + UPLANE], edx
        mov edx, [esp + VPLANE]
        // 26
        sar ebx, 8
        inc edx
        // 27
        add ebx, 64
        mov [esp + VPLANE], edx
        // 28
        mov [edx - 1], bl
        nop
    L9:
        // 29
        mov eax, [esp + LOOP_I]
        lea esi, [esi + 2]
        // 30
        sub eax, 2
        lea edi, [edi + 2]
        // 31
        mov [esp + LOOP_I], eax
        jnz L6

        // esi (pnext) and edi (YPlane) are live at this point (after line loop)
        // if (stretch && (0 == k) && j)
        mov eax, [esp + STRETCH]
        test eax, eax
        jz L14
        mov eax, [esp + LOOP_K]
        test eax, eax
        jnz L14
        mov eax, [esp + LOOP_J]
        test eax, eax
        jz L14

        // spill YPlane ptr (reuses the YPlane parameter slot)
        mov [esp + YPLANE], edi
        nop

        // for (i = FrameWidth; i > 0; i -= 8)
        // assign (ebx, pyprev)
        // assign (ecx, t)
        // assign (edx, pynext)
        // assign (edi, pyspace)
        // assign (ebp, i)

        // make sure offsets are such that there are no bank conflicts here
        mov ebx, [esp + PYPREV]
        mov edi, [esp + PYSPACE]

        mov edx, [esp + PYNEXT]
        mov ebp, [esp + FRAME_WIDTH]

        // t = (*pyprev++ & 0xFEFEFEFE) >> 1
        // t += (*pynext++ & 0xFEFEFEFE) >> 1
        // *pyspace++ = t
        // t = (*pyprev++ & 0xFEFEFEFE) >> 1
        // t += (*pynext++ & 0xFEFEFEFE) >> 1
        // *pyspace++ = t
    L15:
        // 1
        mov eax, [ebx]
        lea ebx, [ebx + 4]
        // 2
        mov ecx, [edx]
        lea edx, [edx + 4]
        // 3
        shr ecx, 1
        and eax, 0xFEFEFEFE
        // 4
        shr eax, 1
        and ecx, 0x7F7F7F7F
        // 5
        add eax, ecx
        mov ecx, [ebx]
        // 6
        shr ecx, 1
        mov [edi], eax
        // 7
        mov eax, [edx]
        and ecx, 0x7F7F7F7F
        // 8
        shr eax, 1
        lea edi, [edi + 4]
        // 9
        and eax, 0x7F7F7F7F
        lea ebx, [ebx + 4]
        // 10
        lea edx, [edx + 4]
        add eax, ecx
        // 11
        mov [edi], eax
        lea edi, [edi + 4]
        // 12
        sub ebp, 8
        jnz L15
        // kill (ebx, pyprev)
        // kill (ecx, t)
        // kill (edx, pynext)
        // kill (edi, pyspace)
        // kill (ebp, i)

        // restore YPlane
        mov edi, [esp + YPLANE]

        // pnext += iBackTwoLines
    L14:
        add esi, [esp + BACK_TWO_LINES]
        // YPlane += ypitch_adj;
        add edi, [esp + YPITCH_ADJ]
        // if(0 == (k&1))
        mov eax, [esp + LOOP_K]
        and eax, 1
        jnz L16
        // UPlane += uvpitch_adj;
        // VPlane += uvpitch_adj;
        mov eax, [esp + UVPITCH_ADJ]
        add [esp + UPLANE], eax
        add [esp + VPLANE], eax

    L16:
        inc DWORD PTR [esp + LOOP_K]
        mov eax, [esp + LOOP_K]
        cmp eax, [esp + MARK]
        jl L5

        // if (stretch)
        cmp DWORD PTR [esp + STRETCH], 0
        je L17
        // pyprev = YPlane - pitch
        mov eax, edi
        sub eax, [esp + PITCH_PARM]
        mov [esp + PYPREV], eax
        // pyspace = YPlane
        mov [esp + PYSPACE], edi
        // pynext = (YPlane += pitch)
        add edi, [esp + PITCH_PARM]
        mov [esp + PYNEXT], edi

    L17:
        inc DWORD PTR [esp + LOOP_J]
        mov eax, [esp + LOOP_J]
        cmp eax, [esp + LUMA_ITERS]
        jl L4

        // kill (esi, pnext)
        // kill (edi, YPlane)

        // if (stretch)
        mov esi, [esp + PYPREV]
        cmp DWORD PTR [esp + STRETCH], 0
        je L19

        // Fill the last skipped row with a copy of the row above it.
        // for (i = FrameWidth; i > 0; i -= 4)
        // assign (esi, pyprev)
        // assign (edi, pyspace)
        // assign (ebp, i)
        mov ebp, [esp + FRAME_WIDTH]
        mov edi, [esp + PYSPACE]
    L18:
        mov ecx, [esi]
        lea esi, [esi + 4]
        mov [edi], ecx
        lea edi, [edi + 4]
        sub ebp, 4
        jnz L18
        // kill (esi, pyprev)
        // kill (edi, pyspace)
        // kill (ebp, i)

    L19:
        add esp, LOCALSIZE
        pop edi
        pop esi
        pop ebx
        pop ebp
        ret
    }
}
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef
LPCTABLE
#undef YPITCH_ADJ
#undef UVPITCH_ADJ

/***************************************************
 * IA_H26X_CLUT4toYUV12()
 *  Inline-assembler conversion of a 4-bit palettized
 *  bitmap into separate Y, U and V output planes.
 *
 *  lpbiInput   - input bitmap header. The RGBQUAD colour table is
 *                taken to start immediately after the header.
 *  lpInput     - input pixels, two per byte. The high nibble supplies
 *                the pixel stored at YPlane[0], the low nibble the
 *                pixel stored at YPlane[1].
 *  YPlane, UPlane, VPlane - output planes.
 *  FrameWidth, FrameHeight - output frame size.
 *  pitch       - byte distance between output lines; the same value
 *                is used for all three planes.
 *
 *  Y is written for every pixel. U and V are written once per pixel
 *  pair, and only on lines whose in-group index k is even.
 *
 *  The input is cropped symmetrically (width_adj / height_adj) and
 *  read from the highest-addressed row of the cropped window back
 *  towards the first row (iBackTwoLines is negative).
 *  NOTE(review): that matches a bottom-up DIB - confirm with caller.
 *
 *  When height_adj is non-zero ("stretch"), each group of 12 output
 *  lines consists of 11 converted lines plus one line synthesized
 *  as the sum of the halved lines above and below it. The last
 *  synthesized line has no line below it and is filled with a copy
 *  of the line above.
 *
 *  Every loop here exits on an exact zero count (jnz), so:
 *    - FrameHeight must be a positive multiple of 48;
 *    - FrameWidth must be a positive multiple of 2, and a multiple
 *      of 8 when stretching.
 ***************************************************/
__declspec(naked)
_STATIC void IA_H26X_CLUT4toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue)
//  | pitch         | +100
//  | FrameHeight   | + 96
//  | FrameWidth    | + 92
//  | VPlane        | + 88
//  | UPlane        | + 84
//  | YPlane        | + 80
//  | lpInput       | + 76
//  | lpbiInput     | + 72
//  ----------------------------
//  | return addr   | + 68
//  | saved ebp     | + 64
//  | saved ebx     | + 60
//  | saved esi     | + 56
//  | saved edi     | + 52
//  | pyprev        | + 48
//  | pyspace       | + 44
//  | pynext        | + 40
//  | i             | + 36
//  | j             | + 32
//  | k             | + 28
//  | iBackTwoLines | + 24
//  | stretch       | + 20
//  | mark          | + 16
//  | LumaIters     | + 12   (this slot holds LumaIters; see LUMA_ITERS)
//  | lpCTable      | +  8
//  | ypitch_adj    | +  4
//  | uvpitch_adj   | +  0

#define LOCALSIZE 52

#define PITCH_PARM 100
#define FRAME_HEIGHT 96
#define FRAME_WIDTH 92
#define VPLANE 88
#define UPLANE 84
#define YPLANE 80
#define LP_INPUT 76
#define LPBI_INPUT 72

#define PYPREV 48
#define PYSPACE 44
#define PYNEXT 40
#define LOOP_I 36
#define LOOP_J 32
#define LOOP_K 28
#define BACK_TWO_LINES 24
#define STRETCH 20
#define MARK 16
#define LUMA_ITERS 12
#define LPCTABLE 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0

    _asm {

    push ebp
    push ebx
    push esi
    push edi
    sub esp, LOCALSIZE

    // lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)
    // assign (ebx, lpbiInput)
    mov eax, [esp + LPBI_INPUT]
    mov ebx, eax
    add eax, TYPE BITMAPINFOHEADER
    mov [esp + LPCTABLE], eax
    // ypitch_adj = pitch - FrameWidth
    // assign (ecx, FrameWidth)
    // assign (edx, pitch)
    mov ecx, [esp + FRAME_WIDTH]
    mov edx, [esp + PITCH_PARM]
    mov eax, edx
    sub eax, ecx
    mov [esp + YPITCH_ADJ], eax
    // uvpitch_adj = pitch - (FrameWidth >> 1)
    // kill (ecx, FrameWidth)
    // kill (edx, pitch)
    shr ecx, 1
    sub edx, ecx
    mov [esp + UVPITCH_ADJ], edx
    // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
    // assign (ecx, LumaIters)
    // NOTE: unlike the pseudo-code, this loop runs at least once and only
    // exits when the count reaches exactly zero (jnz).
    xor ecx, ecx
    mov eax, [esp + FRAME_HEIGHT]
L1:
    lea ecx, [ecx + 4]
    sub eax, 48
    jnz L1
    // width_adj = (lpbiInput->biWidth - FrameWidth) >> 2
    // (byte offset of the left crop: two pixels per input byte)
    // assign (edx, width_adj)
    mov edx, (LPBITMAPINFOHEADER)[ebx].biWidth
    sub edx, [esp + FRAME_WIDTH]
    shr edx, 2
    // aspect = (width_adj ? LumaIters : 0)
    // assign (esi, aspect)
    // kill (ecx, LumaIters)
    mov [esp + LUMA_ITERS], ecx
    xor esi, esi
    test edx, edx
    jz L2
    mov esi, ecx
    // height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
    // assign (ecx, height_adj)
L2:
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
    sub ecx, [esp + FRAME_HEIGHT]
    add ecx, esi
    shr ecx, 1
    // stretch = (height_adj ? 1 : 0)
    xor eax, eax
    test ecx, ecx
    jz L3
    inc eax
L3:
    mov [esp + STRETCH], eax
    // mark = 12 - stretch
    mov edi, 12
    sub edi, eax
    mov [esp + MARK], edi
    // iNextLine = lpbiInput->biWidth >> 1   (bytes per input row)
    // kill (ebx, lpbiInput)
    // assign (ebx, iNextLine)
    mov ebx, (LPBITMAPINFOHEADER)[ebx].biWidth
    shr ebx, 1
    // iBackTwoLines = -(iNextLine + (FrameWidth >> 1))
    // (undo the bytes consumed on this row, then step back one full row)
    mov edi, [esp + FRAME_WIDTH]
    shr edi, 1
    add edi, ebx
    neg edi
    mov [esp + BACK_TWO_LINES], edi
    // pnext = lpInput+(iNextLine*((FrameHeight-aspect-1)+height_adj))+ width_adj
    // kill (ebx, iNextLine)
    // kill (ecx, height_adj)
    // kill (edx, width_adj)
    // kill (esi, aspect)
    // assign (esi, pnext)
    mov eax, [esp + FRAME_HEIGHT]
    sub eax, esi
    dec eax
    add eax, ecx
    mov esi, [esp + LP_INPUT]
    add esi, edx
    // one-operand imul: edx:eax = eax * ebx (edx is overwritten, width_adj
    // has already been consumed above)
    imul ebx
    add esi, eax
    // assign (edi, YPlane)
    mov edi, [esp + YPLANE]
    // for (j = 0; j < LumaIters; j++)
    xor eax, eax
    mov [esp + LOOP_J], eax
    // for (k = 0; k < mark; k++)
L4:
    xor eax, eax
    mov [esp + LOOP_K], eax
    // for (i = FrameWidth; i > 0; i -= 2, pnext++)
L5:
    mov eax, [esp + FRAME_WIDTH]
    mov [esp + LOOP_I], eax
    // This jump is here to make sure the following loop starts on the U pipe
    jmp L6
L6:
    // lpCEntry = &lpCTable[*pnext&0xF]
    // t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
    //       GYUV[lpCEntry->rgbGreen>>1].YU +
    //       RYUV[lpCEntry->rgbRed>>1].YU )
    // *(YPlane+1) = (U8)((t>>8)+8)
    // lpCEntry = &lpCTable[(*pnext>>4)&0xF]
    // t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
    //       GYUV[lpCEntry->rgbGreen>>1].YU +
    //       RYUV[lpCEntry->rgbRed>>1].YU )
    // *YPlane = (U8)((t>>8)+8)
    // YPlane += 2
    // *UPlane++ = (U8)((t>>24)+64)
    // t = ( RYUV[lpCEntry->rgbRed>>1].V +
    //       GYUV[lpCEntry->rgbGreen>>1].V +
    //       BYUV[lpCEntry->rgbBlue>>1].V )
    // *VPlane++ = (U8)((t>>8)+64)
    //
    // The table index is formed as (c & 0xFE) scaled by 4, which equals
    // (c >> 1) scaled by 8.
    // NOTE(review): assumes BYUV/GYUV/RYUV entries are 8 bytes (YU + V) -
    // confirm against the table declaration.
    //
    // The numbered comments group the instructions two at a time.
    // NOTE(review): presumably each group is one Pentium U/V pipe pair.
    //
    // assign (ebp: lpCEntry,B1)
    // assign (eax: P2,B2,Y2,Y1,U)
    // assign (ebx: B1,V)
    // assign (ecx: G2,G1)
    // assign (edx: R2,R1)
    // 1
    mov al, [esi]
    mov ebp, [esp + LPCTABLE]
    // 2  (the mask also clears the upper bits of eax left from earlier use)
    and eax, 0xF
    xor ecx, ecx
    // 3
    lea ebx, [ebp+eax*4]
    xor edx, edx
    // 4
    mov al, (LPRGBQUAD)[ebx].rgbBlue
    nop
    // 5
    mov cl, (LPRGBQUAD)[ebx].rgbGreen
    and al, 0xFE
    // 6
    mov dl, (LPRGBQUAD)[ebx].rgbRed
    and cl, 0xFE
    // 7
    mov eax, [BYUV+eax*4].YU
    and dl, 0xFE
    // 8
    add eax, [GYUV+ecx*4].YU
    mov bl, [esi]
    // 9
    add eax, [RYUV+edx*4].YU
    and ebx, 0xF0
    // (unnumbered pair) reduce the masked high nibble to a 0..15 index
    shr ebx, 4
    nop
    // 10
    shr eax, 8
    lea ebp, [ebp+ebx*4]
    // 11
    add eax, 8
    nop
    // 12
    mov [edi + 1], al
    mov bl, (LPRGBQUAD)[ebp].rgbBlue
    // 13
    mov cl, (LPRGBQUAD)[ebp].rgbGreen
    and bl, 0xFE
    // 14
    mov dl, (LPRGBQUAD)[ebp].rgbRed
    and cl, 0xFE
    // 15
    mov eax, [BYUV+ebx*4].YU
    and dl, 0xFE
    // 16  (ebp keeps the blue index for the V lookup in step 23)
    add eax, [GYUV+ecx*4].YU
    mov ebp, ebx
    // 17
    add eax, [RYUV+edx*4].YU
    nop
    // 18
    shr eax, 8
    mov ebx, [esp + LOOP_K]
    // 19
    add eax, 8
    and ebx, 1
    // 20  (jnz tests the flags of "and ebx, 1"; mov leaves flags unchanged.
    //      Odd k: skip the U and V stores)
    mov [edi], al
    jnz L9
    // 21
    mov ebx, [RYUV+edx*4].V
    mov edx, [esp + UPLANE]
    // 22
    sar eax, 16
    add ebx, [GYUV+ecx*4].V
    // 23
    add eax, 64
    add ebx, [BYUV+ebp*4].V
    // 24
    mov [edx], al
    inc edx
    // 25
    mov [esp + UPLANE], edx
    mov edx, [esp + VPLANE]
    // 26
    sar ebx, 8
    inc edx
    // 27
    add ebx, 64
    mov [esp + VPLANE], edx
    // 28
    mov [edx - 1], bl
    nop
L9:
    // 32
    mov eax, [esp + LOOP_I]
    lea esi, [esi + 1]
    // 33
    sub eax, 2
    lea edi, [edi + 2]
    // 34  (jnz tests the flags of "sub eax, 2"; lea and mov leave flags unchanged)
    mov [esp + LOOP_I], eax
    jnz L6
    // only esi (pnext) is live at this point (after line loop)
    // (edi still holds YPlane; it is spilled below before being reused)
    // if (stretch && (0 == k) && j)
    mov eax, [esp + STRETCH]
    test eax, eax
    jz L14
    mov eax, [esp + LOOP_K]
    test eax, eax
    jnz L14
    mov eax, [esp + LOOP_J]
    test eax, eax
    jz L14
    // The line just written is pynext; fill the gap line (pyspace) left at
    // the end of the previous j iteration from pyprev and pynext.
    // spill YPlane ptr
    mov [esp + YPLANE], edi
    nop
    // for (i = FrameWidth; i > 0; i -= 8)
    // assign (ebx, pyprev)
    // assign (ecx, t)
    // assign (edx, pynext)
    // assign (edi, pyspace)
    // assign (ebp, i)
    // make sure offsets are such that there are no bank conflicts here
    mov ebx, [esp + PYPREV]
    mov edi, [esp + PYSPACE]
    mov edx, [esp + PYNEXT]
    mov ebp, [esp + FRAME_WIDTH]
    // t = (*pyprev++ & 0xFEFEFEFE) >> 1
    // t += (*pynext++ & 0xFEFEFEFE) >> 1
    // *pyspace++ = t
    // t = (*pyprev++ & 0xFEFEFEFE) >> 1
    // t += (*pynext++ & 0xFEFEFEFE) >> 1
    // *pyspace++ = t
    // ("shr 1 / and 0x7F7F7F7F" below is the same per-byte halving as
    //  "and 0xFEFEFEFE / shr 1")
L15:
    // 1
    mov eax, [ebx]
    lea ebx, [ebx + 4]
    // 2
    mov ecx, [edx]
    lea edx, [edx + 4]
    // 3
    shr ecx, 1
    and eax, 0xFEFEFEFE
    // 4
    shr eax, 1
    and ecx, 0x7F7F7F7F
    // 5
    add eax, ecx
    mov ecx, [ebx]
    // 6
    shr ecx, 1
    mov [edi], eax
    // 7
    mov eax, [edx]
    and ecx, 0x7F7F7F7F
    // 8
    shr eax, 1
    lea edi, [edi + 4]
    // 9
    and eax, 0x7F7F7F7F
    lea ebx, [ebx + 4]
    // 10
    lea edx, [edx + 4]
    add eax, ecx
    // 11
    mov [edi], eax
    lea edi, [edi + 4]
    // 12
    sub ebp, 8
    jnz L15
    // kill (ebx, pyprev)
    // kill (ecx, t)
    // kill (edx, pynext)
    // kill (edi, pyspace)
    // kill (ebp, i)

    // restore YPlane
    mov edi, [esp + YPLANE]

    // pnext += iBackTwoLines
L14:
    add esi, [esp + BACK_TWO_LINES]
    // YPlane += ypitch_adj;
    add edi, [esp + YPITCH_ADJ]
    // if(0 == (k&1))
    mov eax, [esp + LOOP_K]
    and eax, 1
    jnz L16
    // UPlane += uvpitch_adj;
    // VPlane += uvpitch_adj;
    mov eax, [esp + UVPITCH_ADJ]
    add [esp + UPLANE], eax
    add [esp + VPLANE], eax

L16:
    inc DWORD PTR [esp + LOOP_K]
    mov eax, [esp + LOOP_K]
    cmp eax, [esp + MARK]
    jl L5

    // if (stretch)
    cmp DWORD PTR [esp + STRETCH], 0
    je L17
    // Leave one output line empty (pyspace); it is filled after the first
    // line of the next group, or by the copy loop at the end.
    // pyprev = YPlane - pitch
    mov eax, edi
    sub eax, [esp + PITCH_PARM]
    mov [esp + PYPREV], eax
    // pyspace = YPlane
    mov [esp + PYSPACE], edi
    // pynext = (YPlane += pitch)
    add edi, [esp + PITCH_PARM]
    mov [esp + PYNEXT], edi

L17:
    inc DWORD PTR [esp + LOOP_J]
    mov eax, [esp + LOOP_J]
    cmp eax, [esp + LUMA_ITERS]
    jl L4

    // kill(esi, pnext)
    // if (stretch)
    mov esi, [esp + PYPREV]
    cmp DWORD PTR [esp + STRETCH], 0
    je L19
    // The last gap line has no following line: copy the previous line into it.
    // for (i = FrameWidth; i > 0; i -= 4)
    // assign (esi, pyprev)
    // assign (edi, pyspace)
    // assign (ebp, i)
    mov edi, [esp + PYSPACE]
    mov ebp, [esp + FRAME_WIDTH]
L18:
    mov ecx, [esi]
    lea esi, [esi + 4]
    mov [edi], ecx
    lea edi, [edi + 4]
    sub ebp, 4
    jnz L18
    // kill (esi, pyprev)
    // kill (edi, pyspace)
    // kill (ebp, i)

L19:
    add esp, LOCALSIZE
    pop edi
    pop esi
    pop ebx
    pop ebp
    ret

    }
}

#undef LOCALSIZE

#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT

#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef LPCTABLE
#undef YPITCH_ADJ
#undef UVPITCH_ADJ

/***************************************************
 * H26X_YVU9toYUV12()
 *  Convert from YVU9 to YUV12
 *  and copy to destination memory with the pitch
 *  given by the pitch parameter.
 *
 * uv_plane_common()
 *  Helper function to convert V and U plane information.
 *  Since the process is similar for both planes, the
 *  conversion code was included in this subroutine.
* ***************************************************/

#if 0
// C reference versions of the YVU9 converters. They are compiled out; the
// assembler routines below quote these statements in their comments.

// val = the dword at src with every byte halved ((b & 0xFE) >> 1).
#define READ_DWORD_AND_SHIFT(val,src) \
 (((val) = *((unsigned int *)(src))), ((val) &= 0xFEFEFEFE), ((val) >>= 1))
// 64-bit form of READ_DWORD_AND_SHIFT.
#define READ_QWORD_AND_SHIFT(val,src) \
 (((val) = *((unsigned __int64 *)(src))), ((val) &= 0xFEFEFEFEFEFEFEFE), ((val) >>= 1))
#define WRITE_DWORD(dest,val) ((*(unsigned int *)(dest)) = (val))
#define WRITE_QWORD(dest,val) ((*(unsigned __int64 *)(dest)) = (val))
// out = per-byte (in1 + in2) / 2; the mask stops the low bit of each byte
// from shifting into its neighbour.
#define AVERAGE_DWORDS(out,in1,in2) ((out) = ((((in1) + (in2)) & 0xFEFEFEFE) >> 1))
// Store bytes b0,b0,b1,b1 at dest, where b0 and b1 are the two low bytes of val.
#define DUP_LOWER_TWO_BYTES(dest,val) \
 (*((unsigned int *)(dest)) = (((val) & 0x000000FF) | (((val) << 8) & 0x0000FF00) | \
 (((val) << 8) & 0x00FF0000) | (((val) << 16) & 0xFF000000)))
// Store bytes b2,b2,b3,b3 at dest, where b2 and b3 are the two high bytes of val.
#define DUP_UPPER_TWO_BYTES(dest,val) \
 (*((unsigned int *)(dest)) = ((((val) >> 16) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \
 (((val) >> 8) & 0x00FF0000) | ((val) & 0xFF000000)))

// C reference for IA_uv_plane_common: expands one chroma plane by two in
// each direction.
//   psrc             - first source byte to convert
//   Plane            - destination plane
//   pitch            - byte distance between destination lines
//   OutputFrameWidth - bytes written per destination line
//   ChromaIters      - number of outer (j) iterations
//   spitch_adj       - source bytes skipped after each source line;
//                      non-zero also enables "stretch"
// Each source line yields two output lines: one copy of the line and one
// average of the line with the next source line. Every source byte is
// halved and written twice horizontally. With stretch, each j iteration
// also emits one extra line blended from the last source line pair.
_STATIC void C_uv_plane_common(
    U8 *psrc,
    U8 *Plane,
    UN pitch,
    UN OutputFrameWidth,
    UN ChromaIters,
    UN spitch_adj) {

    U8* pnext = psrc + (OutputFrameWidth>>1) + spitch_adj;
    U8* pdest_copy = Plane;
    U8* pdest_avg = Plane + pitch;
    int dpitch_adj = pitch - OutputFrameWidth;
    int stretch = (spitch_adj ? 1 : 0);
    int mark = 6 - stretch;
    int flag = stretch;
    int i, j, k;
    UN t1,t2;

    for (j = ChromaIters; j > 0; j--) {
        // With stretch the group length alternates between 6 and 5 source
        // lines, because flag is incremented once per j iteration.
        for (k = mark + (flag & 1); k > 0; k--) {
            // Last source line and no stretch: there is no next line, so
            // average the line with itself.
            if (!stretch && (1 == j) && (1 == k)) {
                pnext = psrc;
            }
            for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4,
                 pdest_copy += 8, pdest_avg += 8) {
                READ_DWORD_AND_SHIFT(t1,psrc);
                DUP_LOWER_TWO_BYTES(pdest_copy,t1);
                DUP_UPPER_TWO_BYTES((pdest_copy+4),t1);
                READ_DWORD_AND_SHIFT(t2,pnext);
                AVERAGE_DWORDS(t1,t1,t2);
                DUP_LOWER_TWO_BYTES(pdest_avg,t1);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),t1);
            }
            psrc += spitch_adj;
            pnext += spitch_adj;
            pdest_copy = pdest_avg + dpitch_adj;
            pdest_avg = pdest_copy + pitch;
        }
        if (stretch) {
            // Step back one source line and emit one extra output line that
            // is averaged twice with pnext (weighted towards the next line).
            psrc -= ((OutputFrameWidth>>1) + spitch_adj);
            pnext -= ((OutputFrameWidth>>1) + spitch_adj);
            pdest_avg = pdest_copy;
            for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4, pdest_avg += 8) {
                READ_DWORD_AND_SHIFT(t1,psrc);
                READ_DWORD_AND_SHIFT(t2,pnext);
                AVERAGE_DWORDS(t1,t1,t2);
                AVERAGE_DWORDS(t1,t1,t2);
                DUP_LOWER_TWO_BYTES(pdest_avg,t1);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),t1);
            }
            psrc += spitch_adj;
            pnext += spitch_adj;
            pdest_copy = pdest_avg + dpitch_adj;
            pdest_avg = pdest_copy + pitch;
            flag++;
        }
    }
}

// C reference for IA_H26X_YVU9toYUV12.
// Copies the luma plane with every byte halved, cropping the input
// symmetrically by width_adj / height_adj. When height_adj is non-zero
// ("stretch"), an extra averaged line is added after every 11 copied lines.
// The V and then U source planes, located after the luma plane, are
// expanded by C_uv_plane_common.
_STATIC void C_H26X_YVU9toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch) {

    U8 *pnext, *plast, *pbn;
    U8 *pvsrc, *pusrc;
    int width_adj, height_adj;
    int stretch, mark, aspect;
    int iNextLine;
    int i, j, k, t;
    int LumaIters = 0;
    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);

    // Four outer iterations per 48 output lines, i.e. 12 lines each.
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }
    width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
    aspect = (width_adj ? LumaIters : 0);
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj ? 1 : 0);
    mark = 12 - stretch;
    // Bytes to skip between the end of one cropped row and the start of the next.
    iNextLine = width_adj << 1;
    pnext = lpInput + (lpbiInput->biWidth * height_adj) + width_adj;
    for (j = LumaIters; j > 0; j--) {
        for (k = mark; k > 0; k--) {
            for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, pnext += 4) {
                *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;
            }
            pnext += iNextLine;
            YPlane += ypitch_adj;
        }
        if (stretch) {
            // Extra line: average of the last copied row and the next source row.
            plast = pnext - lpbiInput->biWidth;
            pbn = pnext;
            for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, plast += 4, pbn += 4) {
                *(U32 *)YPlane =
                    ( ( ((*(U32 *)plast & 0xFEFEFEFE) >> 1) +
                        ((*(U32 *)pbn & 0xFEFEFEFE) >> 1) ) & 0xFEFEFEFE ) >> 1;
            }
            YPlane += ypitch_adj;
        }
    }

    // Source V plane follows the luma plane; source U plane follows V.
    pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight);
    pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2));
    // Offset of the cropped window inside a chroma plane.
    t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2);
    pvsrc += t;
    pusrc += t;
    C_uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
    C_uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
}
#endif

// IA_uv_plane_common()
//  Assembler version of the disabled C_uv_plane_common reference: expands
//  one chroma plane by two in each direction.
//
//  psrc             - first source byte to convert
//  Plane            - destination plane
//  pitch            - byte distance between destination lines
//  OutputFrameWidth - bytes written per destination line
//  ChromaIters      - number of outer (j) iterations
//  spitch_adj       - source bytes skipped after each source line;
//                     non-zero also enables "stretch"
//
//  The loops are do-while style and exit on an exact zero count (jnz), so
//  OutputFrameWidth must be a positive multiple of 8 and ChromaIters must be
//  at least 1.
__declspec(naked)
_STATIC void IA_uv_plane_common(
    U8 *psrc,
    U8 *Plane,
    UN pitch,
    UN OutputFrameWidth,
    UN ChromaIters,
    UN spitch_adj)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue)
//  | spitch_adj      | + 64
//  | ChromaIters     | + 60
//  | OutputFrameWidth| + 56
//  | pitch           | + 52
//  | Plane           | + 48
//  | psrc            | + 44
//  -----------------------------
//  | return addr     | + 40
//  | saved ebp       | + 36
//  | saved ebx       | + 32
//  | saved esi       | + 28
//  | saved edi       | + 24
//  | dpitch_adj      | + 20
//  | stretch         | + 16
//  | mark            | + 12
//  | flag            | +  8
//  | j               | +  4
//  | k               | +  0

#define LOCALSIZE 24

#define SPITCH_ADJ 64
#define CHROMA_ITERS 60
#define OUTPUT_FRAME_WIDTH 56
#define PITCH_PARM 52
#define PLANE 48
#define PSRC 44

#define DPITCH_ADJ 20
#define STRETCH 16
#define MARK 12
#define FLAG 8
#define LOOP_J 4
#define LOOP_K 0

    _asm {

    push ebp
    push ebx
    push esi
    push edi
    sub esp, LOCALSIZE

    // pnext = psrc + (OutputFrameWidth>>1) + spitch_adj
    // pdest_copy = Plane
    // pdest_avg = Plane + pitch
    // assign (esi, psrc)
    // assign (ecx, pnext)
    // assign (edi, pdest_copy)
    // assign (edx, pdest_avg)
    // assign (ebp, i)
    mov esi, [esp + PSRC]
    mov ecx, esi
    mov eax, [esp + OUTPUT_FRAME_WIDTH]
    shr eax, 1
    add eax, [esp + SPITCH_ADJ]
    add ecx, eax
    mov edi, [esp + PLANE]
    mov edx, edi
    add edx, [esp + PITCH_PARM]
    // dpitch_adj = pitch - OutputFrameWidth
    mov eax, [esp + PITCH_PARM]
    sub eax, [esp + OUTPUT_FRAME_WIDTH]
    mov [esp + DPITCH_ADJ], eax
    // stretch = (spitch_adj ? 1 : 0)
    xor ebx, ebx
    mov eax, [esp + SPITCH_ADJ]
    test eax, eax
    jz L1
    inc ebx
L1:
    mov [esp + STRETCH], ebx
    // mark = 6 - stretch
    mov eax, 6
    sub eax, ebx
    mov [esp + MARK], eax
    // flag = stretch
    mov DWORD PTR [esp + FLAG], ebx

    // for (j = ChromaIters; j > 0; j--)
    mov eax, [esp + CHROMA_ITERS]
    mov [esp + LOOP_J], eax
L2:
    // for (k = mark + (flag & 1); k > 0; k--)
    mov eax, [esp + FLAG]
    and eax, 1
    add eax, [esp + MARK]
    mov [esp + LOOP_K], eax
L3:
    // if (!stretch && (1 == j) && (1 == k))
    // (last source line without stretch: average the line with itself)
    mov eax, [esp + STRETCH]
    test eax, eax
    jnz L4
    mov eax, [esp + LOOP_J]
    cmp eax, 1
    jne L4
    mov eax, [esp + LOOP_K]
    cmp eax, 1
    jne L4
    // pnext = psrc
    mov ecx, esi
L4:
    // for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4,
    //      pdest_copy += 8, pdest_avg += 8)
    mov ebp, [esp + OUTPUT_FRAME_WIDTH]
    // Pentium pipeline scheduling has not been performed on the following loop code yet
L5:
    // READ_DWORD_AND_SHIFT(t1,psrc)
    mov eax, [esi]
    and eax, 0xFEFEFEFE
    shr eax, 1
    // DUP_LOWER_TWO_BYTES(pdest_copy,t1)
    // (ebx is scratch: build ah,ah in the upper half, then al,al in the lower)
    mov bl, ah
    mov bh, ah
    shl ebx, 16
    mov bl, al
    mov bh, al
    mov [edi], ebx
    // DUP_UPPER_TWO_BYTES((pdest_copy+4),t1)
    shr eax, 16
    mov bl, ah
    mov bh, ah
    shl ebx, 16
    mov bl, al
    mov bh, al
    mov [edi+4], ebx
    // READ_DWORD_AND_SHIFT(t2,pnext)
    // AVERAGE_DWORDS(t1,t1,t2)
    // (t1 was destroyed by the shift above, so it is read again)
    mov eax, [esi]
    and eax, 0xFEFEFEFE
    shr eax, 1
    mov ebx, [ecx]
    and ebx, 0xFEFEFEFE
    shr ebx, 1
    add eax, ebx
    and eax, 0xFEFEFEFE
    shr eax, 1
    // DUP_LOWER_TWO_BYTES(pdest_avg,t1)
    mov bl, ah
    mov bh, ah
    shl ebx, 16
    mov bl, al
    mov bh, al
    mov [edx], ebx
    // DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)
    shr eax, 16
    mov bl, ah
    mov bh, ah
    shl ebx, 16
    mov bl, al
    mov bh, al
    mov [edx+4], ebx
    // end of i loop
    lea esi, [esi + 4]
    lea ecx, [ecx + 4]
    lea edi, [edi + 8]
    lea edx, [edx + 8]
    sub ebp, 8
    jnz L5

    // psrc += spitch_adj
    // pnext += spitch_adj
    // pdest_copy = pdest_avg + dpitch_adj
    // pdest_avg = pdest_copy + pitch
    add esi, [esp + SPITCH_ADJ]
    add ecx, [esp + SPITCH_ADJ]
    mov eax, edx
    add eax, [esp + DPITCH_ADJ]
    mov edi, eax
    mov edx, edi
    add edx, [esp + PITCH_PARM]
    // end of k loop
    dec DWORD PTR [esp + LOOP_K]
    jnz L3

    // if (stretch)
    cmp DWORD PTR [esp + STRETCH], 0
    jz L6
    // psrc -= ((OutputFrameWidth>>1)+spitch_adj)
    // pnext -= ((OutputFrameWidth>>1)+spitch_adj)
    // pdest_avg = pdest_copy
    mov eax, [esp + OUTPUT_FRAME_WIDTH]
    shr eax, 1
    add eax, [esp + SPITCH_ADJ]
    sub esi, eax
    sub ecx, eax
    mov edx, edi
    // for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4, pdest_avg += 8)
    mov ebp, [esp + OUTPUT_FRAME_WIDTH]
    // Pentium pipeline scheduling has not been performed on the following loop code yet
L7:
    // READ_DWORD_AND_SHIFT(t1,psrc)
    mov eax, [esi]
    and eax, 0xFEFEFEFE
    shr eax, 1
    // READ_DWORD_AND_SHIFT(t2,pnext)
    mov ebx, [ecx]
    and ebx, 0xFEFEFEFE
    shr ebx, 1
    // AVERAGE_DWORDS(t1,t1,t2)
    // AVERAGE_DWORDS(t1,t1,t2)
    add eax, ebx
    and eax, 0xFEFEFEFE
    shr eax, 1
    add eax, ebx
    and eax, 0xFEFEFEFE
    shr eax, 1
    // DUP_LOWER_TWO_BYTES(pdest_avg,t1)
    mov bl, ah
    mov bh, ah
    shl ebx, 16
    mov bl, al
    mov bh, al
    mov [edx], ebx
    // DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)
    shr eax, 16
    mov bl, ah
    mov bh, ah
    shl ebx, 16
    mov bl, al
    mov bh, al
    mov [edx+4], ebx
    // end of i loop
    lea esi, [esi + 4]
    lea ecx, [ecx + 4]
    lea edx, [edx + 8]
    sub ebp, 8
    jnz L7
    // psrc += spitch_adj
    // pnext += spitch_adj
    // pdest_copy = pdest_avg + dpitch_adj
    // pdest_avg = pdest_copy + pitch
    // flag++
    add esi, [esp + SPITCH_ADJ]
    add ecx, [esp + SPITCH_ADJ]
    mov eax, edx
    add eax, [esp + DPITCH_ADJ]
    mov edi, eax
    mov edx, edi
    add edx, [esp + PITCH_PARM]
    inc DWORD PTR [esp + FLAG]

    // end of j loop
L6:
    dec DWORD PTR [esp + LOOP_J]
    jnz L2

    add esp, LOCALSIZE
    pop edi
    pop esi
    pop ebx
    pop ebp
    ret

    }
}

#undef LOCALSIZE

#undef SPITCH_ADJ
#undef CHROMA_ITERS
#undef OUTPUT_FRAME_WIDTH
#undef PITCH_PARM
#undef PLANE
#undef PSRC

#undef DPITCH_ADJ
#undef STRETCH
#undef MARK
#undef FLAG
#undef LOOP_J
#undef LOOP_K

// IA_H26X_YVU9toYUV12()
//  Assembler version of the disabled C_H26X_YVU9toYUV12 reference.
//
//  lpbiInput   - input bitmap header (biWidth and biHeight are read)
//  lpInput     - input data: luma plane, then the V plane, then the U plane
//  YPlane, UPlane, VPlane - output planes
//  FrameWidth, FrameHeight - output frame size
//  pitch       - byte distance between output lines, all three planes
//
//  Luma bytes are copied halved ((b & 0xFE) >> 1), cropped symmetrically by
//  width_adj / height_adj. When height_adj is non-zero ("stretch"), each
//  group of 11 copied lines is followed by one line averaged from the last
//  copied row and the next source row. The chroma planes are expanded by
//  two calls to IA_uv_plane_common.
//
//  The loops exit on an exact zero count (jnz), so FrameHeight must be a
//  positive multiple of 48 and FrameWidth a positive multiple of 4.
__declspec(naked)
_STATIC void IA_H26X_YVU9toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue)
//  | pitch         | + 88
//  | FrameHeight   | + 84
//  | FrameWidth    | + 80
//  | VPlane        | + 76
//  | UPlane        | + 72
//  | YPlane        | + 68
//  | lpInput       | + 64
//  | lpbiInput     | + 60
//  -----------------------------
//  | return addr   | + 56
//  | saved ebp     | + 52
//  | saved ebx     | + 48
//  | saved esi     | + 44
//  | saved edi     | + 40
//  | width_adj     | + 36
//  | height_adj    | + 32
//  | stretch       | + 28
//  | mark          | + 24
//  | iNextLine     | + 20
//  | j             | + 16
//  | k             | + 12
//  | LumaIters     | +  8
//  | ypitch_adj    | +  4
//  | uvpitch_adj   | +  0

#define LOCALSIZE 40

#define PITCH_PARM 88
#define FRAME_HEIGHT 84
#define FRAME_WIDTH 80
#define VPLANE 76
#define UPLANE 72
#define YPLANE 68
#define LP_INPUT 64
#define LPBI_INPUT 60

#define WIDTH_ADJ 36
#define HEIGHT_ADJ 32
#define STRETCH 28
#define MARK 24
#define NEXT_LINE 20
#define LOOP_J 16
#define LOOP_K 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0

    _asm {

    push ebp
    push ebx
    push esi
    push edi
    sub esp, LOCALSIZE

    // assign (ebx, lpbiInput)
    mov ebx, [esp + LPBI_INPUT]
    // ypitch_adj = pitch - FrameWidth
    // assign (ecx, FrameWidth)
    // assign (edx, pitch)
    mov ecx, [esp + FRAME_WIDTH]
    mov edx, [esp + PITCH_PARM]
    mov eax, edx
    sub eax, ecx
    mov [esp + YPITCH_ADJ], eax
    // uvpitch_adj = pitch - (FrameWidth >> 1)
    // kill (edx, pitch)
    mov ebp, ecx
    shr ebp, 1
    sub edx, ebp
    mov [esp + UVPITCH_ADJ], edx
    // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
    // assign (edx, LumaIters)
    // NOTE: unlike the pseudo-code, this loop runs at least once and only
    // exits when the count reaches exactly zero (jnz).
    xor edx, edx
    mov eax, [esp + FRAME_HEIGHT]
L1:
    lea edx, [edx + 4]
    sub eax, 48
    jnz L1
    // width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
    // assign (esi, width_adj)
    mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
    sub esi, [esp + FRAME_WIDTH]
    shr esi, 1
    mov [esp + WIDTH_ADJ], esi
    // aspect = (width_adj ? LumaIters : 0)
    // assign (edi, aspect)
    // kill (edx, LumaIters)
    mov [esp + LUMA_ITERS], edx
    xor edi, edi
    test esi, esi
    jz L2
    mov edi, edx
    // height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
    // assign (edx, height_adj)
L2:
    mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
    sub edx, [esp + FRAME_HEIGHT]
    add edx, edi
    shr edx, 1
    mov [esp + HEIGHT_ADJ], edx
    // stretch = (height_adj ? 1 : 0)
    xor eax, eax
    test edx, edx
    jz L3
    inc eax
L3:
    mov [esp + STRETCH], eax
    // mark = 12 - stretch
    mov ebp, 12
    sub ebp, eax
    mov [esp + MARK], ebp
    // iNextLine = width_adj << 1
    mov ebp, esi
    shl ebp, 1
    mov [esp + NEXT_LINE], ebp
    // pnext = lpInput + (lpbiInput->biWidth * height_adj) + width_adj
    // kill (ebx, lpbiInput)
    // kill (ecx, FrameWidth)
    // kill (edx, height_adj)
    // kill (esi, width_adj)
    // kill (edi, aspect)
    // assign (esi, pnext)
    mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
    mov ebx, edx
    // one-operand imul: edx:eax = eax * ebx (edx is overwritten)
    imul ebx
    add esi, eax
    add esi, [esp + LP_INPUT]
    // assign (edi, YPlane)
    mov edi, [esp + YPLANE]
    // for (j = LumaIters; j > 0; j--)
    mov eax, [esp + LUMA_ITERS]
    mov [esp + LOOP_J], eax
    // for (k = mark; k > 0; k--)
L4:
    mov eax, [esp + MARK]
    mov [esp + LOOP_K], eax
    // for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, pnext += 4)
    // assign (ebp, i)
L5:
    mov ebp, [esp + FRAME_WIDTH]
    // This jump is here to make sure the following loop starts on the U pipe
    jmp L6
L6:
    // *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;
    // 1
    mov eax, [esi]
    lea esi, [esi + 4]
    // 2
    and eax, 0xFEFEFEFE
    lea edi, [edi + 4]
    // 3
    shr eax, 1
    sub ebp, 4
    // 4  (jnz tests the flags of "sub ebp, 4"; mov leaves flags unchanged)
    mov [edi - 4], eax
    jnz L6
    // pnext += iNextLine
    // YPlane += ypitch_adj
    add esi, [esp + NEXT_LINE]
    add edi, [esp + YPITCH_ADJ]
    // end of k loop
    mov eax, [esp + LOOP_K]
    sub eax, 1
    mov [esp + LOOP_K], eax
    jnz L5

    // if (stretch)
    mov eax, [esp + STRETCH]
    test eax, eax
    jz L7
    // plast = pnext - lpbiInput->biWidth
    // pbn = pnext
    // assign (ecx, plast)
    // assign (edx, pbn)
    mov ecx, esi
    mov eax, [esp + LPBI_INPUT]
    sub ecx, (LPBITMAPINFOHEADER)[eax].biWidth
    mov edx, esi
    // for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, plast += 4, pbn += 4)
    // assign (ebp, i)
    mov ebp, [esp + FRAME_WIDTH]
    // This jump is here just to make sure the loop code starts with the U pipe
    jmp L8
L8:
    // *(U32 *)YPlane =
    //   ( ( ((*(U32 *)plast & 0xFEFEFEFE) >> 1) +
    //       ((*(U32 *)pbn & 0xFEFEFEFE) >> 1) ) & 0xFEFEFEFE ) >> 1
    // ("shr 1 / and 0x7F7F7F7F" below is the same per-byte halving as
    //  "and 0xFEFEFEFE / shr 1")
    // 1
    mov eax, [ecx]
    lea ecx, [ecx + 4]
    // 2
    shr eax, 1
    // 3
    and eax, 0x7F7F7F7F
    mov ebx, [edx]
    // 4
    shr ebx, 1
    lea edi, [edi + 4]
    // 5
    and ebx, 0x7F7F7F7F
    // 6
    add eax, ebx
    // 7
    and eax, 0xFEFEFEFE
    // 8
    shr eax, 1
    // 9
    mov [edi - 4], eax
    sub ebp, 4
    // 10  (jnz tests the flags of "sub ebp, 4"; lea leaves flags unchanged)
    lea edx, [edx + 4]
    jnz L8

    // YPlane += ypitch_adj
    add edi, [esp + YPITCH_ADJ]
L7:
    // end of the LumaIters loop
    dec DWORD PTR [esp + LOOP_J]
    jnz L4

    // pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight)
    // assign (esi, pvsrc)
    mov eax, [esp + LPBI_INPUT]
    mov ebx, (LPBITMAPINFOHEADER)[eax].biWidth
    mov eax, (LPBITMAPINFOHEADER)[eax].biHeight
    imul ebx
    add eax, [esp + LP_INPUT]
    mov esi, eax
    // pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2))
    // assign (edi, pusrc)
    mov eax, [esp + LPBI_INPUT]
    mov ecx, (LPBITMAPINFOHEADER)[eax].biWidth
    shr ecx, 2
    mov eax, (LPBITMAPINFOHEADER)[eax].biHeight
    shr eax, 2
    imul ecx
    add eax, esi
    mov edi, eax
    // t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2)
    // assign (eax, t)
    mov eax, [esp + LPBI_INPUT]
    mov eax, (LPBITMAPINFOHEADER)[eax].biWidth
    shr eax, 2
    mov ebx, [esp + HEIGHT_ADJ]
    shr ebx, 2
    imul ebx
    mov ebx, [esp + WIDTH_ADJ]
    shr ebx, 2
    add eax, ebx
    // pvsrc += t
    // pusrc += t
    add esi, eax
    add edi, eax
    // uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)
    // ebp snapshots esp so the frame offsets stay valid while the arguments
    // are pushed (last argument first); the caller then drops the 6 dwords.
    mov ebp, esp
    mov eax, [ebp + WIDTH_ADJ]
    shr eax, 1
    push eax
    mov eax, [ebp + LUMA_ITERS]
    shr eax, 1
    push eax
    mov eax, [ebp + FRAME_WIDTH]
    shr eax, 1
    push eax
    push DWORD PTR [ebp + PITCH_PARM]
    push DWORD PTR [ebp + UPLANE]
    push edi
    call IA_uv_plane_common
    lea esp, [esp + 24]
    // uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)
    // (esi is callee-saved by IA_uv_plane_common, so pvsrc survives the first call)
    mov ebp, esp
    mov eax, [ebp + WIDTH_ADJ]
    shr eax, 1
    push eax
    mov eax, [ebp + LUMA_ITERS]
    shr eax, 1
    push eax
    mov eax, [ebp + FRAME_WIDTH]
    shr eax, 1
    push eax
    push DWORD PTR [ebp + PITCH_PARM]
    push DWORD PTR [ebp + VPLANE]
    push esi
    call IA_uv_plane_common
    lea esp, [esp + 24]

    add esp, LOCALSIZE
    pop edi
    pop esi
    pop ebx
    pop ebp
    ret

    }
}

#undef LOCALSIZE

#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT

#undef WIDTH_ADJ
#undef HEIGHT_ADJ
#undef STRETCH
#undef MARK
#undef NEXT_LINE
#undef LOOP_J
#undef LOOP_K
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ

/***************************************************
 * H26X_YUV12toEncYUV12()
 *  Copy YUV12 data to encoder memory at the
 *  appropriate location. It is assumed that the input
 *  data is stored as rows of Y, followed by rows of U,
 *  then rows of V.
* ***************************************************/

#if 0
// C reference version (compiled out).
// Reads the input sequentially through pnext: all Y rows, then all U rows,
// then all V rows. Every byte is halved ((b >> 1) & 0x7F) as it is copied.
// Where the output frame is wider or taller than the input, the extra
// columns and rows are filled with 0 in the Y plane and with 0x40 bytes in
// the U and V planes.
// Every inner loop handles 8 bytes per pass.
// NOTE(review): no branch here handles an input larger than the output
// frame (negative *_diff values) - confirm that callers never pass one.
_STATIC void C_H26X_YUV12toEncYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    UN FrameWidth,
    UN FrameHeight,
    const int pitch) {

    int i, j;
    U32 *pnext = (U32 *)lpInput;
    int ypitch_adj = pitch - FrameWidth;
    int yinput_height = lpbiInput->biHeight;
    int yinput_width = lpbiInput->biWidth;
    int yheight_diff = FrameHeight - yinput_height;
    int ywidth_diff = FrameWidth - yinput_width;
    int uvpitch_adj = pitch - (FrameWidth >> 1);
    int uvoutput_width = FrameWidth >> 1;
    int uvinput_height = yinput_height >> 1;
    int uvinput_width = yinput_width >> 1;
    int uvheight_diff = yheight_diff >> 1;
    int uvwidth_diff = ywidth_diff >> 1;

    // Y plane: copy the input rows, padding each row on the right with 0.
    for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj) {
        for (i = yinput_width; i > 0; i -= 8) {
            *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4;
            *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4;
        }
        for (i = ywidth_diff; i > 0; i -= 8) {
            *(U32 *)YPlane = 0; YPlane += 4;
            *(U32 *)YPlane = 0; YPlane += 4;
        }
    }
    // Y plane: rows below the input are filled with 0.
    for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj) {
        for (i = FrameWidth; i > 0; i -= 8) {
            *(U32 *)YPlane = 0; YPlane += 4;
            *(U32 *)YPlane = 0; YPlane += 4;
        }
    }
    // U plane: copy the input rows, padding each row on the right with 0x40.
    for (j = uvinput_height; j > 0; j--, UPlane += uvpitch_adj) {
        for (i = uvinput_width; i > 0; i -= 8) {
            *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4;
            *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4;
        }
        for (i = uvwidth_diff; i > 0; i -= 8) {
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
        }
    }
    // U plane: rows below the input are filled with 0x40.
    for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj) {
        for (i = uvoutput_width; i > 0; i -= 8) {
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
            *(U32 *)UPlane = 0x40404040; UPlane += 4;
        }
    }
    // V plane: same treatment as the U plane.
    for (j = uvinput_height; j > 0; j--, VPlane += uvpitch_adj) {
        for (i = uvinput_width; i > 0; i -= 8) {
            *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4;
            *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4;
        }
        for (i = uvwidth_diff; i > 0; i -= 8) {
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
        }
    }
    for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj) {
        for (i = uvoutput_width; i > 0; i -= 8) {
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
            *(U32 *)VPlane = 0x40404040; VPlane += 4;
        }
    }
}
#endif
__declspec(naked) _STATIC void IA_H26X_YUV12toEncYUV12( LPBITMAPINFOHEADER lpbiInput, U8 *lpInput, U8 *YPlane, U8 *UPlane, U8 *VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp // Temporary (caller-save) registers - eax, ecx, edx // // Stack frame layout // | pitch | + 92 // | FrameHeight | + 88 // | FrameWidth | + 84 // | VPlane | + 80 // | UPlane | + 76 // | YPlane | + 72 // | lpInput | + 68 // | lpbiInput | + 64 // ----------------------------- // | return addr | + 60 // | saved ebp | + 56 // | saved ebx | + 52 // | saved esi | + 48 // | saved edi | + 44 // | ypitch_adj | + 40 // | yinput_height | + 36 // | yinput_width | + 32 // | yheight_diff | + 28 // | ywidth_diff | + 24 // | uvpitch_adj | + 20 // | uvoutput_width | + 16 // | uvinput_height | + 12 // | uvinput_width | + 8 // | uvheight_diff | + 4 // | uvwidth_diff | + 0 #define LOCALSIZE 44 #define PITCH_PARM 92 #define FRAME_HEIGHT 88 #define FRAME_WIDTH 84 #define VPLANE 80 #define UPLANE 76 #define YPLANE 72 #define LP_INPUT 68 #define LPBI_INPUT 64 #define YPITCH_ADJ 40 #define YINPUT_HEIGHT 36 #define YINPUT_WIDTH 32 #define YHEIGHT_DIFF 28 #define YWIDTH_DIFF 24 #define UVPITCH_ADJ 20 #define UVOUTPUT_WIDTH 16 #define UVINPUT_HEIGHT 12 #define UVINPUT_WIDTH 8 #define UVHEIGHT_DIFF 4 #define UVWIDTH_DIFF 0 _asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE mov ebx, [esp + FRAME_HEIGHT] mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] // ypitch_adj = pitch - FrameWidth mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvoutput_width = FrameWidth >> 1 mov ebp, ecx shr ebp, 1 mov [esp + UVOUTPUT_WIDTH],
ebp // uvpitch_adj = pitch - (FrameWidth >> 1) sub edx, ebp mov [esp + UVPITCH_ADJ], edx // yinput_height = lpbiInput->biHeight // uvinput_height = yinput_height >> 1 // yinput_width = lpbiInput->biWidth // uvinput_width = yinput_width >> 1 mov ebx, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[ebx].biHeight mov [esp + YINPUT_HEIGHT], eax shr eax, 1 mov [esp + UVINPUT_HEIGHT], eax mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth mov [esp + YINPUT_WIDTH], eax shr eax, 1 mov [esp + UVINPUT_WIDTH], eax // yheight_diff = FrameHeight - yinput_height // uvheight_diff = yheight_diff >> 1; mov eax, [esp + FRAME_HEIGHT] mov ebx, eax sub eax, [esp + YINPUT_HEIGHT] jns NoCrop0 xor eax, eax mov [esp + YINPUT_HEIGHT], ebx shr ebx, 1 mov [esp + UVINPUT_HEIGHT], ebx NoCrop0: mov [esp + YHEIGHT_DIFF], eax shr eax, 1 mov [esp + UVHEIGHT_DIFF], eax // ywidth_diff = FrameWidth - yinput_width // uvwidth_diff = ywidth_diff >> 1; mov eax, [esp + FRAME_WIDTH] xor ebx, ebx sub eax, [esp + YINPUT_WIDTH] jns NoCrop1 mov eax, [esp + FRAME_WIDTH] mov ebx, [esp + YINPUT_WIDTH] sub ebx, eax mov [esp + YINPUT_WIDTH], eax shr eax, 1 mov [esp + UVINPUT_WIDTH], eax xor eax, eax NoCrop1: mov [esp + YWIDTH_DIFF], eax shr eax, 1 mov [esp + UVWIDTH_DIFF], eax // assign (esi, lpInput) mov esi, [esp + LP_INPUT] // assign (edi, YPlane) mov edi, [esp + YPLANE] // for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj) // assign (ecx, j) mov ecx, [esp + YINPUT_HEIGHT] L1: // for (i = yinput_width; i > 0; i -= 8) // assign (ebp, i) mov ebp, [esp + YINPUT_WIDTH] L2: // *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4 // *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4 // 1 mov eax, [esi] mov edx, [esi + 4] // 2 shr eax, 1 and edx, 0xFEFEFEFE // 3 shr edx, 1 and eax, 0x7F7F7F7F // 4 lea esi, [esi + 8] mov [edi], eax // 5 sub ebp, 8 mov [edi + 4], edx // 6 lea edi, [edi + 8] jnz L2 // for (i = ywidth_diff; i > 0; i -= 8) // *(U32 *)YPlane = 0; YPlane += 4; // *(U32 *)YPlane = 0; YPlane += 4; 
// assign (ebp, i) mov ebp, [esp + YWIDTH_DIFF] test ebp, ebp jz L3 L4: // 1 xor eax, eax sub ebp, 8 // 2 mov [edi], eax mov [edi + 4], eax // 3 lea edi, [edi + 8] jnz L4 // j--, YPlane += ypitch_adj L3: mov eax, [esp + YPITCH_ADJ] add edi, eax add esi, ebx dec ecx jnz L1 // for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj) // assign (ecx, j) mov ecx, [esp + YHEIGHT_DIFF] test ecx, ecx jz L7 L5: // for (i = FrameWidth; i > 0; i -= 8) // *(U32 *)YPlane = 0; YPlane += 4; // *(U32 *)YPlane = 0; YPlane += 4; // assign (ebp, i) mov ebp, [esp + FRAME_WIDTH] L6: // 1 xor eax, eax sub ebp, 8 // 2 mov [edi], eax mov [edi + 4], eax // 3 lea edi, [edi + 8] jnz L6 // j--, YPlane += ypitch_adj mov eax, [esp + YPITCH_ADJ] add edi, eax dec ecx jnz L5 L7: // recompute start of input U plane mov edx, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[edx].biHeight mov ecx, (LPBITMAPINFOHEADER)[edx].biWidth imul eax, ecx // assign (esi, lpInput) mov esi, [esp + LP_INPUT] add esi, eax // assign (edi, UPlane) mov edi, [esp + UPLANE] shr ebx, 1 // for (j = uvinput_height; j > 0; j--, UPlane += ypitch_adj) // assign (ecx, j) mov ecx, [esp + UVINPUT_HEIGHT] L8: // for (i = uvinput_width; i > 0; i -= 8) // assign (ebp, i) mov ebp, [esp + UVINPUT_WIDTH] L9: // *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4 // *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4 // 1 mov eax, [esi] mov edx, [esi + 4] // 2 shr eax, 1 and edx, 0xFEFEFEFE // 3 shr edx, 1 and eax, 0x7F7F7F7F // 4 lea esi, [esi + 8] mov [edi], eax // 5 sub ebp, 8 mov [edi + 4], edx // 6 lea edi, [edi + 8] jnz L9 // for (i = uvwidth_diff; i > 0; i -= 8) // *(U32 *)UPlane = 0x40404040; UPlane += 4; // *(U32 *)UPlane = 0x40404040; UPlane += 4; // assign (ebp, i) mov ebp, [esp + UVWIDTH_DIFF] test ebp, ebp jz L11 L10: // 1 mov eax, 040404040H sub ebp, 8 // 2 mov [edi], eax mov [edi + 4], eax // 3 lea edi, [edi + 8] jnz L10 // j--, UPlane += uvpitch_adj L11: mov eax, [esp + UVPITCH_ADJ] add edi, eax add esi, 
ebx dec ecx jnz L8 // for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj) // assign (ecx, j) mov ecx, [esp + UVHEIGHT_DIFF] test ecx, ecx jz L14 L12: // for (i = uvoutput_width; i > 0; i -= 8) // *(U32 *)UPlane = 0x40404040; UPlane += 4; // *(U32 *)UPlane = 0x40404040; UPlane += 4; // assign (ebp, i) mov ebp, [esp + UVOUTPUT_WIDTH] L13: // 1 mov eax, 040404040H sub ebp, 8 // 2 mov [edi], eax mov [edi + 4], eax // 3 lea edi, [edi + 8] jnz L13 // j--, UPlane += uvpitch_adj mov eax, [esp + UVPITCH_ADJ] add edi, eax dec ecx jnz L12 L14: // recompute start of input V plane mov edx, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[edx].biHeight mov ecx, (LPBITMAPINFOHEADER)[edx].biWidth imul eax, ecx // assign (esi, lpInput) mov esi, [esp + LP_INPUT] add esi, eax shr eax, 2 add esi, eax // assign (edi, VPlane) mov edi, [esp + VPLANE] // for (j = uvinput_height; j > 0; j--, VPlane += ypitch_adj) // assign (ecx, j) mov ecx, [esp + UVINPUT_HEIGHT] L15: // for (i = uvinput_width; i > 0; i -= 8) // assign (ebp, i) mov ebp, [esp + UVINPUT_WIDTH] L16: // *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4 // *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4 // 1 mov eax, [esi] mov edx, [esi + 4] // 2 shr eax, 1 and edx, 0xFEFEFEFE // 3 shr edx, 1 and eax, 0x7F7F7F7F // 4 lea esi, [esi + 8] mov [edi], eax // 5 sub ebp, 8 mov [edi + 4], edx // 6 lea edi, [edi + 8] jnz L16 // for (i = uvwidth_diff; i > 0; i -= 8) // *(U32 *)VPlane = 0x40404040; VPlane += 4; // *(U32 *)VPlane = 0x40404040; VPlane += 4; // assign (ebp, i) mov ebp, [esp + UVWIDTH_DIFF] test ebp, ebp jz L18 L17: // 1 mov eax, 040404040H sub ebp, 8 // 2 mov [edi], eax mov [edi + 4], eax // 3 lea edi, [edi + 8] jnz L17 // j--, VPlane += uvpitch_adj L18: mov eax, [esp + UVPITCH_ADJ] add edi, eax add esi, ebx dec ecx jnz L15 // for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj) // assign (ecx, j) mov ecx, [esp + UVHEIGHT_DIFF] test ecx, ecx jz L21 L19: // for (i = uvoutput_width; i > 0; i -= 
8) // *(U32 *)VPlane = 0x40404040; VPlane += 4; // *(U32 *)VPlane = 0x40404040; VPlane += 4; // assign (ebp, i) mov ebp, [esp + UVOUTPUT_WIDTH] L20: // 1 mov eax, 040404040H sub ebp, 8 // 2 mov [edi], eax mov [edi + 4], eax // 3 lea edi, [edi + 8] jnz L20 // j--, VPlane += uvpitch_adj mov eax, [esp + UVPITCH_ADJ] add edi, eax dec ecx jnz L19 L21: add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret } } #undef LOCALSIZE #undef PITCH_PARM #undef FRAME_HEIGHT #undef FRAME_WIDTH #undef VPLANE #undef UPLANE #undef YPLANE #undef LP_INPUT #undef LPBI_INPUT #undef YPITCH_ADJ #undef YINPUT_HEIGHT #undef YINPUT_WIDTH #undef YHEIGHT_DIFF #undef YWIDTH_DIFF #undef UVPITCH_ADJ #undef UVOUTPUT_WIDTH #undef UVINPUT_HEIGHT #undef UVINPUT_WIDTH #undef UVHEIGHT_DIFF #undef UVWIDTH_DIFF #if defined(_CODEC_STATS) void NOC_H26X_YUY2toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 *lpInput, U8 *YPlane, U8 *UPlane, U8 *VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { U8 *pnext, *plast, *pbn, *peol; int width_adj, height_adj; int stretch, mark, aspect; int iBackTwoLines; int j, k; int LumaIters = 0; int ypitch_adj = 0; int uvpitch_adj = 0; int nextline = -(lpbiInput->biWidth << 1); for (j = FrameHeight; j > 0; j -= 48) { LumaIters += 4; } width_adj = lpbiInput->biWidth - FrameWidth; aspect = (width_adj ? LumaIters : 0); height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1; stretch = (height_adj ? 1 : 0); mark = 12 - stretch; // Move from end of line N to beginning of line N-1 iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) << 1); // Point to the beginning of the last line. 
pnext = lpInput + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj)) + width_adj; for (j = LumaIters; j > 0; j--) { for (k = 0; k < mark; k++) { for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8) { if (0 == (k & 1)) { *(YPlane+0) = *(pnext+ 0); *(YPlane+1) = *(pnext+ 2); *(YPlane+2) = *(pnext+ 4); *(YPlane+3) = *(pnext+ 6); *(YPlane+4) = *(pnext+ 8); *(YPlane+5) = *(pnext+10); *(YPlane+6) = *(pnext+12); *(YPlane+7) = *(pnext+14); *(UPlane+0) = ((*(pnext+ 1)>>1) + (*(pnext+ 1+nextline)>>1)); *(UPlane+1) = ((*(pnext+ 5)>>1) + (*(pnext+ 5+nextline)>>1)); *(UPlane+2) = ((*(pnext+ 9)>>1) + (*(pnext+ 9+nextline)>>1)); *(UPlane+3) = ((*(pnext+13)>>1) + (*(pnext+13+nextline)>>1)); *(VPlane+0) = ((*(pnext+ 3)>>1) + (*(pnext+ 3+nextline)>>1)); *(VPlane+1) = ((*(pnext+ 7)>>1) + (*(pnext+ 7+nextline)>>1)); *(VPlane+2) = ((*(pnext+11)>>1) + (*(pnext+11+nextline)>>1)); *(VPlane+3) = ((*(pnext+15)>>1) + (*(pnext+15+nextline)>>1)); UPlane += 4; VPlane += 4; } else { *(YPlane+0) = *(pnext+ 0); *(YPlane+1) = *(pnext+ 2); *(YPlane+2) = *(pnext+ 4); *(YPlane+3) = *(pnext+ 6); *(YPlane+4) = *(pnext+ 8); *(YPlane+5) = *(pnext+10); *(YPlane+6) = *(pnext+12); *(YPlane+7) = *(pnext+14); } } pnext += iBackTwoLines; YPlane += ypitch_adj; if (0 == (k & 1)) { UPlane += uvpitch_adj; VPlane += uvpitch_adj; } } if (stretch) { plast = pnext - (lpbiInput->biWidth << 1); pbn = pnext; for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8) { *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)); *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)); *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)); *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)); } YPlane += ypitch_adj; } } } #endif #if 0 void C_H26X_YUY2toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 *lpInput, U8 *YPlane, U8 *UPlane, U8 *VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { U8 *pnext, *plast, *pbn, *peol; int width_adj, height_adj; int stretch, mark, 
aspect; int iBackTwoLines; int j, k; int LumaIters = 0; int ypitch_adj = pitch - FrameWidth; int uvpitch_adj = pitch - (FrameWidth >> 1); int nextline = -(lpbiInput->biWidth << 1); for (j = FrameHeight; j > 0; j -= 48) { LumaIters += 4; } width_adj = lpbiInput->biWidth - FrameWidth; aspect = (width_adj ? LumaIters : 0); height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1; stretch = (height_adj ? 1 : 0); mark = 12 - stretch; // Move from end of line N to beginning of line N-1 iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) << 1); // Point to the beginning of the last line. pnext = lpInput + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj)) + width_adj; for (j = LumaIters; j > 0; j--) { for (k = 0; k < mark; k++) { for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8) { if (0 == (k & 1)) { *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1; *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1; *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1; *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1; *(UPlane+0) = ((*(pnext+ 1)>>1) + (*(pnext+ 1+nextline)>>1)) >> 1; *(UPlane+1) = ((*(pnext+ 5)>>1) + (*(pnext+ 5+nextline)>>1)) >> 1; *(UPlane+2) = ((*(pnext+ 9)>>1) + (*(pnext+ 9+nextline)>>1)) >> 1; *(UPlane+3) = ((*(pnext+13)>>1) + (*(pnext+13+nextline)>>1)) >> 1; *(VPlane+0) = ((*(pnext+ 3)>>1) + (*(pnext+ 3+nextline)>>1)) >> 1; *(VPlane+1) = ((*(pnext+ 7)>>1) + (*(pnext+ 7+nextline)>>1)) >> 1; *(VPlane+2) = ((*(pnext+11)>>1) + (*(pnext+11+nextline)>>1)) >> 1; *(VPlane+3) = ((*(pnext+15)>>1) + (*(pnext+15+nextline)>>1)) >> 1; UPlane += 4; VPlane += 4; } else { *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1; *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1; *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1; *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1; } } pnext += iBackTwoLines; YPlane += 
ypitch_adj; if (0 == (k & 1)) { UPlane += uvpitch_adj; VPlane += uvpitch_adj; } } if (stretch) { plast = pnext - (lpbiInput->biWidth << 1); pbn = pnext; for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8) { *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1; *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1; *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1; *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1; } YPlane += ypitch_adj; } } } #endif __declspec(naked) _STATIC void IA_H26X_YUY2toYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * BGR24Image, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp // Temporary (caller-save) registers - eax, ecx, edx // // Stack frame layout // | pitch | + 96 // | FrameHeight | + 92 // | FrameWidth | + 88 // | VPlane | + 84 // | UPlane | + 80 // | YPlane | + 76 // | lpInput | + 72 // | lpbiInput | + 68 // ---------------------------- // | return addr | + 64 // | saved ebp | + 60 // | saved ebx | + 56 // | saved esi | + 52 // | saved edi | + 48 // | pyprev | + 44 // | pyspace | + 40 // | pynext | + 36 // | peol | + 32 // | j | + 28 // | k | + 24 // | iBackTwoLines | + 20 // | stretch | + 16 // | mark | + 12 // | LumaIters | + 8 // | ypitch_adj | + 4 // | uvpitch_adj | + 0 #define LOCALSIZE 48 #define PITCH_PARM 96 #define FRAME_HEIGHT 92 #define FRAME_WIDTH 88 #define VPLANE 84 #define UPLANE 80 #define YPLANE 76 #define LP_INPUT 72 #define LPBI_INPUT 68 #define PYPREV 44 #define PYSPACE 40 #define PYNEXT 36 #define PEOL 32 #define LOOP_J 28 #define LOOP_K 24 #define BACK_TWO_LINES 20 #define STRETCH 16 #define MARK 12 #define LUMA_ITERS 8 #define YPITCH_ADJ 4 #define UVPITCH_ADJ 0 _asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE // assign (ebx, lpbiInput) mov ebx, [esp + LPBI_INPUT] // ypitch_adj = pitch - FrameWidth // assign (ecx, FrameWidth) // assign (edx, pitch) mov 
ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvpitch_adj = pitch - (FrameWidth >> 1) // kill (edx, pitch) mov ebp, ecx shr ebp, 1 sub edx, ebp mov [esp + UVPITCH_ADJ], edx // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4 // assign (edx, LumaIters) xor edx, edx mov eax, [esp + FRAME_HEIGHT] L1: lea edx, [edx + 4] sub eax, 48 jnz L1 // width_adj = lpbiInput->biWidth - FrameWidth; // assign (esi, width_adj) mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth sub esi, [esp + FRAME_WIDTH] // aspect = (width_adj ? LumaIters : 0) // assign (edi, aspect) // kill (edx, LumaIters) mov [esp + LUMA_ITERS], edx xor edi, edi test esi, esi jz L2 mov edi, edx // height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1 // assign (edx, height_adj) L2: mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight sub edx, [esp + FRAME_HEIGHT] add edx, edi shr edx, 1 // stretch = (height_adj ? 1 : 0) xor eax, eax test edx, edx jz L3 inc eax L3: mov [esp + STRETCH], eax // mark = 12 - stretch mov ebp, 12 sub ebp, eax mov [esp + MARK], ebp // iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1) mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth add ebp, [esp + FRAME_WIDTH] shl ebp, 1 neg ebp mov [esp + BACK_TWO_LINES], ebp // pnext = lpInput + // ((lpbiInput->biWidth << 1) * // ((FrameHeight - aspect - 1) + height_adj)) + // width_adj // kill (ebx, lpbiInput) // kill (ecx, FrameWidth) // kill (edx, height_adj) // kill (esi, width_adj) // kill (edi, aspect) // assign (esi, pnext) mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth shl eax, 1 mov ebx, [esp + FRAME_HEIGHT] sub ebx, edi dec ebx add ebx, edx imul ebx add esi, eax add esi, [esp + LP_INPUT] // assign (edi, YPlane) // assign (edx, UPlane) // assign (ebp, VPlane) mov edi, [esp + YPLANE] mov edx, [esp + UPLANE] mov ebp, [esp + VPLANE] // for (j = 0; j < LumaIters; j++) xor eax, eax mov [esp + LOOP_J], eax L4: // for (k = 0; k < mark; k++) xor eax, eax mov [esp + LOOP_K], eax L5: // for ( 
peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8) mov ecx, [esp + FRAME_WIDTH] shl ecx, 1 add ecx, esi mov [esp + PEOL], ecx // if (0 == (k & 1)) { mov eax, [esp + LOOP_K] test eax, 1 jnz L6 // *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1 // *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1 // *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1 // *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1 // *(UPlane+0) = *(pnext+ 1) >> 1; *(UPlane+1) = *(pnext+ 5) >> 1 // *(UPlane+2) = *(pnext+ 9) >> 1; *(UPlane+3) = *(pnext+13) >> 1 // *(VPlane+0) = *(pnext+ 3) >> 1; *(VPlane+1) = *(pnext+ 7) >> 1 // *(VPlane+2) = *(pnext+11) >> 1; *(VPlane+3) = *(pnext+15) >> 1 // or graphically // ************************************************************************************************* // Values * Y 0 * U 0 * Y 1 * V 0 * Y 2 * U 1 * Y 3 * V 1 * Y 4 * U 2 * Y 5 * V 2 * Y 6 * U 3 * Y 7 * V 3 * // ************************************************************************************************* // Y Offsets 0 2 4 6 8 10 12 14 // U Offsets 1 5 9 13 // Y Offsets 3 7 11 15 // Register usage: // eax - accumulate Y values // ebx - accumulate U values // ecx - accumulate V values // esi - ptr to interlaced (VYUY) input // edi - ptr for writing Y values // edx - ptr for writing U values // ebp - ptr for writing V values L7: ; 1 mov al, [esi+4] ; Y2 mov bl, [esi+9] ; U2 ; 2 mov ah, [esi+6] ; Y3 mov bh, [esi+13] ; U3 ; 3 shl eax, 16 mov cl, [esi+11] ; V2 ; 4 shl ebx, 16 mov ch, [esi+15] ; V3 ; 5 shl ecx, 16 mov al, [esi] ; Y0 ; 6 mov bh, [esi+5] ; U1 mov ah, [esi+2] ; Y1 ; 7 shr eax, 1 mov bl, [esi+1] ; U0 ; 8 shr ebx, 1 mov ch, [esi+7] ; V1 ; 9 and eax, 07F7F7F7FH mov cl, [esi+3] ; V0 ; 10 shr ecx, 1 and ebx, 07F7F7F7FH ; 11 mov [edi], eax and ecx, 07F7F7F7FH ; 12 mov al, [esi+12] ; Y6 mov [edx], ebx ; 13 mov ah, [esi+14] ; Y7 mov [ebp], ecx ; 14 shl eax, 16 mov ecx, [esp + PEOL] ; 15 mov al, [esi+8] ; Y4 lea 
edi, [edi+8] ; 16 mov ah, [esi+10] ; Y5 lea edx, [edx+4] ; 17 shr eax, 1 lea ebp, [ebp+4] ; 18 and eax, 07F7F7F7FH lea esi, [esi+16] ; 19 mov [edi-4], eax cmp esi, ecx ; 20 jl L7 jmp L8 // } else { // *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1 // *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1 // *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1 // *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1 // } // Register usage: // eax, ebx - accumulate Y values // ecx - peol // esi - ptr to interlaced (VYUY) input // edi - ptr for writing Y values L6: ; 1 mov al, [esi+4] ; Y2 mov bl, [esi+12] ; Y6 ; 2 mov ah, [esi+6] ; Y3 mov bh, [esi+14] ; Y7 ; 3 shl eax, 16 lea edi, [edi+8] ; 4 shl ebx, 16 mov al, [esi] ; Y0 ; 5 mov ah, [esi+2] ; Y1 mov bh, [esi+10] ; Y5 ; 6 shr eax, 1 mov bl, [esi+8] ; Y4 ; 7 shr ebx, 1 and eax, 07F7F7F7FH ; 8 mov [edi-8], eax and ebx, 07F7F7F7FH ; 9 mov [edi-8+4], ebx lea esi, [esi+16] ; 10 cmp esi, ecx jl L6 L8: // pnext += iBackTwoLines add esi, [esp + BACK_TWO_LINES] // YPlane += ypitch_adj add edi, [esp + YPITCH_ADJ] // if (0 == (k&1)) mov eax, [esp + LOOP_K] test eax, 1 jnz L9 // UPlane += uvpitch_adj add edx, [esp + UVPITCH_ADJ] // VPlane += uvpitch_adj add ebp, [esp + UVPITCH_ADJ] L9: mov eax, [esp + LOOP_K] inc eax mov [esp + LOOP_K], eax cmp eax, [esp + MARK] jl L5 // if (stretch) mov eax, [esp + STRETCH] test eax, eax jz L10 // Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average. 
mov [esp + UPLANE], edx mov [esp + VPLANE], ebp // plast = pnext - (lpbiInput->biWidth << 1) // assign (plast, edx) mov edx, esi mov eax, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[eax].biWidth shl eax, 1 sub edx, eax // pbn = pnext // assign (pbn, ebp) mov ebp, esi // for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8) mov ecx, [esp + FRAME_WIDTH] shl ecx, 1 add ecx, ebp // *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1 // *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1 // *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1 // *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1 mov al, [edx+4] mov bl, [ebp+4] mov bh, [ebp+6] shl ebx, 16 L11: ; 1 mov ah, [edx+6] mov bl, [ebp] ; 2 shl eax, 16 mov bh, [ebp+2] ; 3 mov al, [edx] lea edi, [edi+4] ; 4 mov ah, [edx+2] lea edx, [edx+8] ; 5 and eax, 0xFEFEFEFE lea ebp, [ebp+8] ; 6 shr eax, 1 and ebx, 0xFEFEFEFE ; 7 shr ebx, 1 nop ; 8 add eax, ebx mov bl, [ebp+4] ; 9 shr eax, 1 mov bh, [ebp+6] ; 10 shl ebx, 16 and eax, 0x7F7F7F7F ; 11 mov [edi-4], eax mov al, [edx+4] ; 12 cmp ebp, ecx jl L11 // YPlane += ypitch_adj; add edi, [esp + YPITCH_ADJ] // Recover pts to UPlane and VPlane mov edx, [esp + UPLANE] mov ebp, [esp + VPLANE] L10: mov eax, [esp + LOOP_J] inc eax mov [esp + LOOP_J], eax cmp eax, [esp + LUMA_ITERS] jl L4 add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret } } bool UYVY_to_YUV12_Flip( LPBITMAPINFOHEADER lpbiInput, U8 * pImage, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { DWORD dwFrameWidthHalf, dwFrameHeightHalf; BYTE *pRowStartY, *pRowStartSrc, *pRowStartU, *pRowStartV; int offset; int nRowsToSkip=0, nColsToSkip=0, nRowSkipDelta=0xffffff, nColSkipDelta=0xffffff; int nSrcRowIndex, nDstRowIndex, nSrcColIndex, nDstColIndex, COLUMNSTOSKIP=0, ROWSTOSKIP=0; if ((FrameWidth != (DWORD)(lpbiInput->biWidth)) || (FrameHeight != (DWORD)(lpbiInput->biHeight))) { nColsToSkip = COLUMNSTOSKIP = lpbiInput->biWidth 
- FrameWidth; nRowsToSkip = ROWSTOSKIP = lpbiInput->biHeight - FrameHeight; if ((nColsToSkip < 0) || (nRowsToSkip < 0)) { return false; } // nXXXSkipDelta dictate how often we "skip" a row or col if (nRowsToSkip) { nRowSkipDelta = (lpbiInput->biHeight + (nRowsToSkip - 1)) / nRowsToSkip; } if (nColsToSkip) { nColSkipDelta = (lpbiInput->biWidth + (nColsToSkip - 1)) / nColsToSkip; } } // quick check to make sure we're processing CIF, QCIF, or SQCIF if ((FrameWidth % 4) || (FrameHeight % 4)) { return false; } dwFrameWidthHalf = FrameWidth / 2; dwFrameHeightHalf = FrameHeight / 2; nSrcRowIndex = 0; nDstRowIndex = 0; // step 1, convert the Y values over while ((DWORD)nDstRowIndex < FrameHeight) { // ASSERT(nSrcRowIndex < lpbiInput->biHeight); pRowStartY = YPlane + (pitch * nDstRowIndex); pRowStartSrc = pImage + (lpbiInput->biWidth * nSrcRowIndex * 2) + 1; // do we need to skip this row ? if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0)) { nRowsToSkip--; nSrcRowIndex++; continue; } // Copy the Y values of the input row into the destination row nSrcColIndex = 0; nDstColIndex = 0; nColsToSkip = COLUMNSTOSKIP; while ((DWORD)nDstColIndex < FrameWidth) { // ASSERT(nSrcColIndex < lpbiInput->biWidth); // do we need to skip this column ? 
if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0)) { nColsToSkip--; nSrcColIndex++; continue; } pRowStartY[nDstColIndex] = pRowStartSrc[nSrcColIndex * 2] >> 1; nSrcColIndex++; nDstColIndex++; } nSrcRowIndex++; nDstRowIndex++; } nSrcRowIndex = 0; nDstRowIndex = 0; nRowsToSkip = ROWSTOSKIP; // step 2, process U and V values while ((DWORD)nDstRowIndex < dwFrameHeightHalf) // dest is only half as many rows as src { // ASSERT(nSrcRowIndex < lpbiInput->biHeight); // don't process odd numbered rows if (nSrcRowIndex % 2) { // if we were supposed to skip this src row anyway, make sure // we update our decrement if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0)) { nRowsToSkip--; } nSrcRowIndex++; continue; } // do we need to skip this row ? if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0)) { nRowsToSkip--; nSrcRowIndex++; continue; } pRowStartU = UPlane + (pitch * nDstRowIndex); pRowStartV = VPlane + (pitch * nDstRowIndex); pRowStartSrc = pImage + (lpbiInput->biWidth * nSrcRowIndex * 2) + 0; // Copy the U and V values of the input row into the destination row nSrcColIndex = 0; nDstColIndex = 0; nColsToSkip = COLUMNSTOSKIP; // reset column skip count while ((DWORD)nDstColIndex < dwFrameWidthHalf) { // ASSERT(nSrcColIndex < lpbiInput->biWidth); // skip odd numbered columns if (nSrcColIndex % 2) { // if we were supposed to skip this src row anyway, make sure // we update our decrement if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0)) { nColsToSkip--; } nSrcColIndex++; continue; } // do we need to skip this column ? if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0)) { nSrcColIndex++; nColsToSkip--; continue; } offset = nSrcColIndex * 2; pRowStartU[nDstColIndex] = pRowStartSrc[offset] >> 1; pRowStartV[nDstColIndex] = pRowStartSrc[offset+2] >> 1; nSrcColIndex++; nDstColIndex++; } nSrcRowIndex++; nDstRowIndex++; } // and we are done! 
return true; } __declspec(naked) _STATIC void IA_H26X_UYVYtoYUV12( LPBITMAPINFOHEADER lpbiInput, U8 * BGR24Image, U8 * YPlane, U8 * UPlane, U8 * VPlane, UN FrameWidth, UN FrameHeight, const int pitch) { // Permanent (callee-save) registers - ebx, esi, edi, ebp // Temporary (caller-save) registers - eax, ecx, edx // // Stack frame layout // | pitch | + 96 // | FrameHeight | + 92 // | FrameWidth | + 88 // | VPlane | + 84 // | UPlane | + 80 // | YPlane | + 76 // | lpInput | + 72 // | lpbiInput | + 68 // ---------------------------- // | return addr | + 64 // | saved ebp | + 60 // | saved ebx | + 56 // | saved esi | + 52 // | saved edi | + 48 // | pyprev | + 44 // | pyspace | + 40 // | pynext | + 36 // | peol | + 32 // | j | + 28 // | k | + 24 // | iBackTwoLines | + 20 // | stretch | + 16 // | mark | + 12 // | LumaIters | + 8 // | ypitch_adj | + 4 // | uvpitch_adj | + 0 _asm { push ebp push ebx push esi push edi sub esp, LOCALSIZE // assign (ebx, lpbiInput) mov ebx, [esp + LPBI_INPUT] // ypitch_adj = pitch - FrameWidth // assign (ecx, FrameWidth) // assign (edx, pitch) mov ecx, [esp + FRAME_WIDTH] mov edx, [esp + PITCH_PARM] mov eax, edx sub eax, ecx mov [esp + YPITCH_ADJ], eax // uvpitch_adj = pitch - (FrameWidth >> 1) // kill (edx, pitch) mov ebp, ecx shr ebp, 1 sub edx, ebp mov [esp + UVPITCH_ADJ], edx // for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4 // assign (edx, LumaIters) xor edx, edx mov eax, [esp + FRAME_HEIGHT] L1: lea edx, [edx + 4] sub eax, 48 jnz L1 // width_adj = lpbiInput->biWidth - FrameWidth; // assign (esi, width_adj) mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth sub esi, [esp + FRAME_WIDTH] // aspect = (width_adj ? 
LumaIters : 0) // assign (edi, aspect) // kill (edx, LumaIters) mov [esp + LUMA_ITERS], edx xor edi, edi test esi, esi jz L2 mov edi, edx // height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1 // assign (edx, height_adj) L2: mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight sub edx, [esp + FRAME_HEIGHT] add edx, edi shr edx, 1 // stretch = (height_adj ? 1 : 0) xor eax, eax test edx, edx jz L3 inc eax L3: mov [esp + STRETCH], eax // mark = 12 - stretch mov ebp, 12 sub ebp, eax mov [esp + MARK], ebp // iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1) mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth add ebp, [esp + FRAME_WIDTH] shl ebp, 1 neg ebp mov [esp + BACK_TWO_LINES], ebp // pnext = lpInput + // ((lpbiInput->biWidth << 1) * // ((FrameHeight - aspect - 1) + height_adj)) + // width_adj // kill (ebx, lpbiInput) // kill (ecx, FrameWidth) // kill (edx, height_adj) // kill (esi, width_adj) // kill (edi, aspect) // assign (esi, pnext) mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth shl eax, 1 mov ebx, [esp + FRAME_HEIGHT] sub ebx, edi dec ebx add ebx, edx imul ebx add esi, eax add esi, [esp + LP_INPUT] // assign (edi, YPlane) // assign (edx, UPlane) // assign (ebp, VPlane) mov edi, [esp + YPLANE] mov edx, [esp + UPLANE] mov ebp, [esp + VPLANE] // for (j = 0; j < LumaIters; j++) xor eax, eax mov [esp + LOOP_J], eax L4: // for (k = 0; k < mark; k++) xor eax, eax mov [esp + LOOP_K], eax L5: // for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8) mov ecx, [esp + FRAME_WIDTH] shl ecx, 1 add ecx, esi mov [esp + PEOL], ecx // if (0 == (k & 1)) { mov eax, [esp + LOOP_K] test eax, 1 jnz L6 // *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1 // *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1 // *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1 // *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1 // *(UPlane+0) = *(pnext+ 1) >> 1; *(UPlane+1) = *(pnext+ 5) >> 1 // *(UPlane+2) = *(pnext+ 9) >> 
1; *(UPlane+3) = *(pnext+13) >> 1 // *(VPlane+0) = *(pnext+ 3) >> 1; *(VPlane+1) = *(pnext+ 7) >> 1 // *(VPlane+2) = *(pnext+11) >> 1; *(VPlane+3) = *(pnext+15) >> 1 // or graphically // ************************************************************************************************* // Values * U 0 * Y 0 * V 0 * Y 1 * U 1 * Y 2 * V 1 * Y 3 * U 2 * Y 4 * V 2 * Y 5 * U 3 * Y 6 * V 3 * Y 7 * // ************************************************************************************************* // Y Offsets 1 3 5 7 9 11 13 15 // U Offsets 0 4 8 12 // Y Offsets 2 6 10 14 // Register usage: // eax - accumulate Y values // ebx - accumulate U values // ecx - accumulate V values // esi - ptr to interlaced (VYUY) input // edi - ptr for writing Y values // edx - ptr for writing U values // ebp - ptr for writing V values L7: ; 1 mov al, [esi+5] ; Y2 mov bl, [esi+8] ; U2 ; 2 mov ah, [esi+7] ; Y3 mov bh, [esi+12] ; U3 ; 3 shl eax, 16 mov cl, [esi+10] ; V2 ; 4 shl ebx, 16 mov ch, [esi+14] ; V3 ; 5 shl ecx, 16 mov al, [esi+1] ; Y0 ; 6 mov bh, [esi+4] ; U1 mov ah, [esi+3] ; Y1 ; 7 shr eax, 1 mov bl, [esi] ; U0 ; 8 shr ebx, 1 mov ch, [esi+6] ; V1 ; 9 and eax, 07F7F7F7FH mov cl, [esi+2] ; V0 ; 10 shr ecx, 1 and ebx, 07F7F7F7FH ; 11 mov [edi], eax and ecx, 07F7F7F7FH ; 12 mov al, [esi+13] ; Y6 mov [edx], ebx ; 13 mov ah, [esi+15] ; Y7 mov [ebp], ecx ; 14 shl eax, 16 mov ecx, [esp + PEOL] ; 15 mov al, [esi+9] ; Y4 lea edi, [edi+8] ; 16 mov ah, [esi+11] ; Y5 lea edx, [edx+4] ; 17 shr eax, 1 lea ebp, [ebp+4] ; 18 and eax, 07F7F7F7FH lea esi, [esi+16] ; 19 mov [edi-4], eax cmp esi, ecx ; 20 jl L7 jmp L8 // } else { // *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1 // *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1 // *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1 // *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1 // } // Register usage: // eax, ebx - accumulate Y values // ecx - peol // esi - ptr to interlaced (VYUY) input // edi 
- ptr for writing Y values L6: ; 1 mov al, [esi+5] ; Y2 mov bl, [esi+13] ; Y6 ; 2 mov ah, [esi+7] ; Y3 mov bh, [esi+15] ; Y7 ; 3 shl eax, 16 lea edi, [edi+8] ; 4 shl ebx, 16 mov al, [esi+1] ; Y0 ; 5 mov ah, [esi+3] ; Y1 mov bh, [esi+11] ; Y5 ; 6 shr eax, 1 mov bl, [esi+9] ; Y4 ; 7 shr ebx, 1 and eax, 07F7F7F7FH ; 8 mov [edi-8], eax and ebx, 07F7F7F7FH ; 9 mov [edi-8+4], ebx lea esi, [esi+16] ; 10 cmp esi, ecx jl L6 L8: // pnext += iBackTwoLines add esi, [esp + BACK_TWO_LINES] // YPlane += ypitch_adj add edi, [esp + YPITCH_ADJ] // if (0 == (k&1)) mov eax, [esp + LOOP_K] test eax, 1 jnz L9 // UPlane += uvpitch_adj add edx, [esp + UVPITCH_ADJ] // VPlane += uvpitch_adj add ebp, [esp + UVPITCH_ADJ] L9: mov eax, [esp + LOOP_K] inc eax mov [esp + LOOP_K], eax cmp eax, [esp + MARK] jl L5 // if (stretch) mov eax, [esp + STRETCH] test eax, eax jz L10 // Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average. mov [esp + UPLANE], edx mov [esp + VPLANE], ebp // plast = pnext - (lpbiInput->biWidth << 1) // assign (plast, edx) mov edx, esi mov eax, [esp + LPBI_INPUT] mov eax, (LPBITMAPINFOHEADER)[eax].biWidth shl eax, 1 sub edx, eax // pbn = pnext // assign (pbn, ebp) mov ebp, esi // for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8) mov ecx, [esp + FRAME_WIDTH] shl ecx, 1 add ecx, ebp // *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1 // *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1 // *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1 // *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1 mov al, [edx+5] mov bl, [ebp+5] mov bh, [ebp+7] shl ebx, 16 L11: ; 1 mov ah, [edx+7] mov bl, [ebp+1] ; 2 shl eax, 16 mov bh, [ebp+3] ; 3 mov al, [edx+1] lea edi, [edi+4] ; 4 mov ah, [edx+3] lea edx, [edx+8] ; 5 and eax, 0xFEFEFEFE lea ebp, [ebp+8] ; 6 shr eax, 1 and ebx, 0xFEFEFEFE ; 7 shr ebx, 1 nop ; 8 add eax, ebx mov bl, [ebp+5] ; 9 shr eax, 1 mov bh, [ebp+7] ; 10 shl ebx, 16 and eax, 0x7F7F7F7F ; 11 mov 
[edi-4], eax mov al, [edx+5] ; 12 cmp ebp, ecx jl L11 // YPlane += ypitch_adj; add edi, [esp + YPITCH_ADJ] // Recover pts to UPlane and VPlane mov edx, [esp + UPLANE] mov ebp, [esp + VPLANE] L10: mov eax, [esp + LOOP_J] inc eax mov [esp + LOOP_J], eax cmp eax, [esp + LUMA_ITERS] jl L4 add esp, LOCALSIZE pop edi pop esi pop ebx pop ebp ret } } #undef LOCALSIZE #undef PITCH_PARM #undef FRAME_HEIGHT #undef FRAME_WIDTH #undef VPLANE #undef UPLANE #undef YPLANE #undef LP_INPUT #undef LPBI_INPUT #undef PYPREV #undef PYSPACE #undef PYNEXT #undef PEOL #undef LOOP_J #undef LOOP_K #undef BACK_TWO_LINES #undef STRETCH #undef MARK #undef LUMA_ITERS #undef YPITCH_ADJ #undef UVPITCH_ADJ /************************************************************* * Name: colorCnvtFrame * Description: Color convert and copy input frame. ************************************************************/ void colorCnvtFrame( T_H263EncoderCatalog * EC, LPCODINST lpCompInst, ICCOMPRESS * lpicComp, U8 * YPlane, U8 * UPlane, U8 * VPlane ) { U8 *RGBCursor = (U8 *) lpicComp->lpInput; LPBITMAPINFOHEADER lpbiInput = lpicComp->lpbiInput; bool bRet; FX_ENTRY("colorCnvtFrame") /* The Connectix Quick Cam requires RGB to YUV12 conversion. * The B/W camera generates palette versions (8 and 4 bit). * The color camera generates RGB24 for million colors and * RGB16555 for thousands colors. 
*/ if (BI_RGB == lpicComp->lpbiInput->biCompression) { if (24 == lpicComp->lpbiInput->biBitCount) { #if 0 if ((128 == lpbiInput->biWidth) && (96 == lpbiInput->biHeight)) { U8 YTest[12288]; U8 UTest[6144]; U8 VTest[6144]; int i, j, k; U8 R,G,B; C_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YTest, UTest, VTest, EC->FrameWidth, EC->FrameHeight, 128); for (i = 0; i < 96; i++) { for (j = 0; j < 128; j++) { k = (i*128)+j; if (1 < abs(YPlane[(i*384)+j]-YTest[(i*128)+j])) { B = RGBCursor[(((95-i)*128)+j)*3]; G = RGBCursor[(((95-i)*128)+j)*3+1]; R = RGBCursor[(((95-i)*128)+j)*3+2]; } if ((0 == (i%2)) && (0 == (j%2))) { k = ((i>>1)*128)+(j>>1); if (1 < abs(UPlane[((i>>1)*384)+(j>>1)]-UTest[((i>>1)*128)+(j>>1)])) { B = RGBCursor[(((95-i)*128)+j)*3]; G = RGBCursor[(((95-i)*128)+j)*3+1]; R = RGBCursor[(((95-i)*128)+j)*3+2]; } if (1 < abs(VPlane[((i>>1)*384)+(j>>1)] != VTest[((i>>1)*128)+(j>>1)])) { B = RGBCursor[(((95-i)*128)+j)*3]; G = RGBCursor[(((95-i)*128)+j)*3+1]; R = RGBCursor[(((95-i)*128)+j)*3+2]; } } } } } #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else IA_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else if(16 == lpicComp->lpbiInput->biBitCount) { // To use a common routine for all possible combinations of RGB16, // a bitfield number is passed. This number identifies the proper bit shift // and masking values to extract the color information // from the 16-bit pixel words. 
// // number shift mask // B, G, R // ------ ----------- ---------------- // 555 2, 3, 8 0x7C, 0x7C, 0x7C // 664 3, 3, 9 0x78, 0x7E, 0x7E // 565 2, 4, 9 0x7C, 0x7E, 0x7C // 655 2, 3, 9 0x7C, 0x7C, 0x7E // // Only 555 falls under BI_RGB. The others are specified using the // BI_BITFIELDS compression specification. For BI_BITFIELDS, call // Build16bitModeID to get the actual bitfield number. This routine requires the // three array elements in the bmiColors field of a BITMAPINFO object. // #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_BGR16toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, 555, PITCH); #else IA_H26X_BGR16555toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else if(8 == lpicComp->lpbiInput->biBitCount) { #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_CLUTtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, 8, PITCH); #else IA_H26X_CLUT8toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else if(4 == lpicComp->lpbiInput->biBitCount) { #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_CLUTtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, 4, PITCH); #else IA_H26X_CLUT4toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { 
pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else { ERRORMESSAGE(("%s: Unexpected input format detected\r\n", _fx_)); } } else if (FOURCC_YVU9 == lpicComp->lpbiInput->biCompression) { #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else IA_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else if ((FOURCC_YUV12 == lpicComp->lpbiInput->biCompression) || (FOURCC_IYUV == lpicComp->lpbiInput->biCompression)) { #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else IA_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else if (FOURCC_YUY2 == lpicComp->lpbiInput->biCompression) { #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif #if 0 C_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #else IA_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); #endif #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else if (FOURCC_UYVY == lpicComp->lpbiInput->biCompression) { #if 
defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER(); } #endif // UYVY images are upside-down in relation to I420 // call the "flipped" version of the UYVY-I420 translator. bRet = UYVY_to_YUV12_Flip(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, EC->FrameWidth, EC->FrameHeight, PITCH); // IA_H26X_UYVYtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane, // EC->FrameWidth, EC->FrameHeight, PITCH); #if defined(_CODEC_STATS) if (pEncoderStats) { pEncoderStats->color_convertor_time = PENTIUM_TIMER() - pEncoderStats->color_convertor_time; } #endif } else { ERRORMESSAGE(("%s: Unexpected input format detected\r\n", _fx_)); } } #endif // } H263P