Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

5753 lines
141 KiB

/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
////////////////////////////////////////////////////////////////////////////
//
// $Author: MDUDA $
// $Date: 21 Nov 1996 17:33:56 $
// $Archive: S:\h26x\src\enc\excolcnv.cpv $
// $Header: S:\h26x\src\enc\excolcnv.cpv 1.45 21 Nov 1996 17:33:56 MDUDA $
// $Log: S:\h26x\src\enc\excolcnv.cpv $
//
// Rev 1.45 21 Nov 1996 17:33:56 MDUDA
// Added more non-compressed YUV12 support (RGB16 and RGB24).
// Also rewrote IA_YUV12toEncYUV12 to be more readable.
//
// Rev 1.44 31 Oct 1996 10:05:48 KLILLEVO
// changed from DBOUT to DbgLog
//
// Rev 1.43 22 Oct 1996 16:44:22 MDUDA
// Added IA support for YUY2 input color conversion and cleaned up C version.
// Now using IA version.
//
// Rev 1.42 18 Oct 1996 14:31:32 MDUDA
//
// Added a C-version of YUY2 input color conversion.
//
// Rev 1.41 11 Oct 1996 16:04:50 MDUDA
// Using new RGB to YUV lookup tables.
//
// Rev 1.40 03 Oct 1996 10:43:58 AGUPTA2
// Got rid of segment directives; made tables read-only.
//
// Rev 1.39 13 Sep 1996 13:34:04 MDUDA
// Fixed YVU9 bug where input = output frame size was not colored
// (U and V planes) properly.
//
// Rev 1.38 11 Sep 1996 15:45:06 MDUDA
// Modified RGB look-up tables and added C_H26X_YUV12toEncYUV12 and
// IA_H26X_YUV12toEncYUV12.
//
// Rev 1.37 03 Sep 1996 14:54:46 MDUDA
// Fixed problem causing VC++ 4.1 internal compiler error. Replaced
// inline assembler constructs such as [ebx.biWidth] with
// (LPBITMAPINFOHEADER)[ebx].biWidth.
//
// Rev 1.36 29 Aug 1996 16:31:14 MDUDA
// Added Pentium assembler versions for all RGB conversion routines.
// Also, rewrote YVU9 support to allow input frame sizes other
// than 160x120 and 240x180.
//
// Rev 1.35 16 Aug 1996 12:17:48 MDUDA
// Fixed bug where U and V values in the BGR converters were treated as unsigned
// values. Also did some general cleanup of BGR converters in preparation for
// doing Pentium assembler version.
//
// Rev 1.34 13 Aug 1996 10:35:38 MDUDA
// Added support for RGB4. Generalized RGB LUT support for 4-bit and
// 8-bit pixels into a single routine.
//
// Rev 1.33 09 Aug 1996 09:45:02 MDUDA
// Added support for RGB16 format on input. This is for the color
// Quick Cam. Also, generalized RGB16 for other bit combinations.
// However, these can only be specified under BI_BITFIELDS format.
//
// Rev 1.32 02 Aug 1996 13:44:48 MDUDA
// modified H26X_BGR24toYUV12 to crop and stretch 240x180 and 160x120
// frames
//
// Rev 1.31 01 Aug 1996 14:03:50 MDUDA
//
// Optimized H26X_YVU9toYUV12 by rewriting function in assembler code. Used in
// _asm. Also re-arranged functions so that colorCnvtFrame is at the end of
// the file.
//
// Rev 1.30 22 Jul 1996 13:28:22 BECHOLS
// Added a CLUT8 to YUV12 color convertor (CC). This CC crops and stretches
// either the 240x180 or the 160x120 image size to produce QCIF and SubQCIF
// image sizes respectively.
//
// Rev 1.29 11 Jul 1996 15:47:02 MDUDA
//
// Modified H263_YVU9toYUV12 to create subQCIF and QCIF from
// 160x120 and 240x180 images, respectively. To fit the new
// formats, the original images are cropped and stretched using a
// dither pattern for the color planes.
//
// Rev 1.28 14 May 1996 12:04:08 KLILLEVO
// changed RGB->YUV color conversion to use the inverse
// of the output YUV->RGB conversion instead of the conversion
// "recommended by the CCIR". Compression performance for RGB
// input was significantly improved (33% fewer bits for same
// fixed QP)
//
// Rev 1.27 04 May 1996 21:55:20 BECHOLS
// For RGB24 to YVU12 conversion, I unrolled the inner loop by 8 and changed
// the writes to DWORD vs. BYTE writes. This resulted in a 30% reduction in
// the execution time.
//
// Rev 1.26 10 Apr 1996 16:44:14 RHAZRA
// Fixed a bug in 320x240 mode for the H26X_YUV12toEncYUV12() function.
// DWORD should be and-ed with 0x7f7f7f7f and not 0x7f7f7f.
//
// Rev 1.25 27 Mar 1996 15:10:08 SCDAY
// Optimized H26X_YUV12toEncYUV12 'C' code to read/write DWORDs
//
// Rev 1.24 08 Jan 1996 17:46:14 unknown
//
// Correct logic on bIs320x240 check
//
// Rev 1.23 05 Jan 1996 17:34:38 RMCKENZX
// corrected chroma pad value to 0x40 to achieve black padding
//
// Rev 1.22 05 Jan 1996 17:29:46 RMCKENZX
// Added code to pad out 320x240 stills to 352x288
// full CIF images.
//
// Rev 1.21 04 Jan 1996 18:37:20 TRGARDOS
// Added code to permit 320x240 input and then set a boolean
// bIs320x240.
//
// Rev 1.20 02 Jan 1996 17:09:04 TRGARDOS
// Moved colorCnvFrame into this file and made the
// color convertor functions static.
//
// Rev 1.19 27 Dec 1995 15:32:56 RMCKENZX
// Added copyright notice
//
// Rev 1.18 06 Dec 1995 09:35:42 TRGARDOS
// Added Brian's fix to the input color convertor to avoid
// overflow of the chars.
//
// Rev 1.17 27 Nov 1995 16:09:04 TRGARDOS
// Removed two unused variables to get rid of compiler warnings.
//
// Rev 1.16 30 Oct 1995 14:34:12 TRGARDOS
// Fixed 240x180 to center clip.
//
// Rev 1.15 30 Oct 1995 12:03:16 TRGARDOS
// Added color convertor support for YUV9 240x180.
//
// Rev 1.14 28 Oct 1995 15:39:28 TRGARDOS
// Fixed color conversion problem from YVU9 to YVU12.
//
// Rev 1.13 12 Oct 1995 17:40:12 TRGARDOS
// Fixed YUV12 input color convertor.
//
// Rev 1.12 12 Oct 1995 12:04:16 TRGARDOS
// Changed some variable names in YUV12 convertor.
//
// Rev 1.11 10 Oct 1995 16:34:12 TRGARDOS
// Added YUV12 input support.
//
// Rev 1.10 28 Sep 1995 17:02:36 DBRUCKS
// fix colorIn to not swap left to right
//
// Rev 1.9 15 Sep 1995 16:37:38 TRGARDOS
//
//
// Rev 1.8 13 Sep 1995 17:09:22 TRGARDOS
//
// Finished adding encoder support for YVU9 160x120 frames.
//
// Rev 1.7 11 Sep 1995 11:14:06 DBRUCKS
// add h261 ifdef
//
// Rev 1.6 07 Sep 1995 09:27:54 TRGARDOS
// Added YVU9 to YVU12 color convertor.
//
// Rev 1.5 05 Sep 1995 15:50:46 TRGARDOS
// Added color back in to convertors.
//
// Rev 1.4 01 Sep 1995 17:51:42 TRGARDOS
// Fixed bugs in color converter.
//
// Rev 1.3 01 Sep 1995 10:13:42 TRGARDOS
// Debugging bit stream errors.
//
// Rev 1.2 30 Aug 1995 12:42:26 TRGARDOS
// Fixed bugs in intra AC coef VLC coding.
//
// Rev 1.1 02 Aug 1995 17:28:06 TRGARDOS
//
// Cleaned up stuff to get stub working under new
// version control system.
//
// Rev 1.0 31 Jul 1995 13:07:10 DBRUCKS
// Initial revision.
//
// Rev 1.0 17 Jul 1995 14:46:16 CZHU
// Initial revision.
//
// Rev 1.0 17 Jul 1995 14:14:22 CZHU
// Initial revision.
;////////////////////////////////////////////////////////////////////////////
/*
CCIR 601 specifies a conversion from RGB to YCrCb. What we
call U and V are equivalent to Cb and Cr respectively:
U = Cb, V = Cr.
From CCIR 601-2 Annex II, we can go from RGB with values
in the range of 0-255, to YUV values in the same range
by the equations:
Y = ( 77*R + 150*G + 29*B ) >> 8;
V = (( 131*R - 110*G - 21*B ) >> 8) + 128; // Cr
U = (( (-44)*R - 87*G + 131*B ) >> 8) + 128; // Cb
This has now been changed to the inverse of the YUV->RGB conversion
used on the output, since the old version produced way too many bits.
The new version is:
Y = (( 16836*R + 33056*G + 6416*B ) >> 16) + 16;
V = (( 28777*R - 24117*G - 4660*B ) >> 16) + 128; // Cr
U = (( (-9726)*R - 19064*G + 28790*B ) >> 16) + 128; // Cb
*/
#include "precomp.h"
#if defined(_CODEC_STATS)
// Weight of the high dword of the time-stamp counter (2^32).
static const double RDTSC_SHIFT_32 = 4294967296.0;

// Sample the processor time-stamp counter and return it divided by
// RDTSC_CLOCK_FREQ and multiplied by 1000. Returns 0.0 when RDTSC_CLOCK_FREQ
// is zero.
// NOTE(review): presumably RDTSC_CLOCK_FREQ is in ticks per second, making the
// result milliseconds - confirm against its definition.
static double PENTIUM_TIMER()
{
    unsigned long int tscLow, tscHigh;
    double elapsed = 0.0;

    // The two emitted bytes (0x0F 0x31) are the RDTSC opcode; the counter is
    // then picked up from eax (low dword) and edx (high dword).
    __asm
    {
        _emit 0x0f
        _emit 0x31
        mov tscLow, eax
        mov tscHigh, edx
    }

    if (RDTSC_CLOCK_FREQ) {
        // Only the low 16 bits of the high dword take part in the result.
        const double lowPart = (double) tscLow;
        const double highPart = (double) (tscHigh & 0xFFFF);

        elapsed = (lowPart + highPart * RDTSC_SHIFT_32) / RDTSC_CLOCK_FREQ;
    }

    return( elapsed * 1000.0 );
}
#endif
// Linkage control for the file-local converter functions.
// Local functions are declared with _STATIC, which normally expands to
// "static". VTune doesn't recognize static functions, so when VTune statistics
// are to be collected the driver is built with _VTUNE defined and _STATIC
// expands to nothing, which turns off the static attribute.
#if defined(_VTUNE)
#define _STATIC
#else
#define _STATIC static
#endif
// These are the look-up tables for the RGB converters. They are 8 bytes/entry
// to allow addressing via the scale by 8 indexed addressing mode. A pseudo-SIMD
// arrangement is used in these tables. Since all R, G and B contributions to the
// Y value are positive and fit in 15 bits, these are stored in the lower 16-bits
// of the YU word. In some cases, the U contribution is negative so it is placed
// in the upper 16 bits of the YU word. When a Y value is calculated, the U value
// is calculated in parallel. The V contribution is negative in some cases, but it
// gets its own word.
// This is the code that was used to generate the tables.
#if 0
// Stand-alone generator for the RYUV, GYUV and BYUV look-up tables below.
// It is compiled out of the driver; build it on its own and paste its output.
// The text it prints is identical to that of the original three-loop version;
// the loops were folded into one helper, main() was given a standard
// signature, and the packing is done on unsigned values so that no negative
// value is ever left-shifted.
#define YRCoef 16836
#define YGCoef 33056
#define YBCoef 6416
#define URCoef 9726
#define UGCoef 19064
#define UBCoef 28790
#define VRCoef 28777
#define VGCoef 24117
#define VBCoef 4660
#include <stdio.h>

// Print one 128-entry table called 'name'.
//   ycoef        - Y coefficient (contribution is always positive).
//   ucoef, usign - U coefficient magnitude and its sign (+1 or -1).
//   vcoef, vsign - V coefficient magnitude and its sign (+1 or -1).
// Entries are produced for the odd component values 1, 3, ..., 255. Each
// contribution is (coef * value) >> 9; the sign is applied after the shift.
// Y goes in the low 16 bits of the first word, U in its high 16 bits, and V
// in the second word.
static void print_table(const char *name,
                        int ycoef, int ucoef, int usign,
                        int vcoef, int vsign)
{
    int i, j;

    printf("struct YUV %s[] = {\n", name);
    for (i = 0; i < 64; i++) {
        for (j = 0; j < 4; j += 2) {
            const int c = (i * 4) + j + 1;
            const int y = (ycoef * c) >> 9;
            const int u = usign * ((ucoef * c) >> 9);
            const int v = vsign * ((vcoef * c) >> 9);

            // Converting to unsigned before shifting keeps the same bit
            // pattern while avoiding a left shift of a negative int.
            printf("{0x%.8x, 0x%.8x}, ",
                   (unsigned)y | ((unsigned)u << 16),
                   (unsigned)v);
        }
        printf("\n");
    }
    printf("};\n");
}

int main(void) {
    printf("struct YUV {\n");
    printf(" int YU;\n");
    printf(" int V;\n");
    printf("};\n\n");
    print_table("RYUV", YRCoef, URCoef, -1, VRCoef, 1);
    print_table("GYUV", YGCoef, UGCoef, -1, VGCoef, -1);
    print_table("BYUV", YBCoef, UBCoef, 1, VBCoef, -1);
    return 0;
}
#endif
// One look-up table entry: the contribution of a single colour component
// value to Y, U and V. Two ints per entry (8 bytes with 32-bit int) so the
// assembler can address entries with the scale-by-8 indexed addressing mode.
struct YUV {
// Low 16 bits: Y contribution. High 16 bits: U contribution.
int YU;
// V contribution.
int V;
};
// Red-component contributions. 128 entries; the converters index this table
// with the top 7 bits of the 8-bit red value. Per the generator above, entry i
// is computed for the odd value 2*i+1 as (coef * value) >> 9, with the Y and V
// parts positive and the U part negative.
const struct YUV RYUV[] = {
{0xffee0020, 0x00000038}, {0xffc80062, 0x000000a8},
{0xffa200a4, 0x00000119}, {0xff7c00e6, 0x00000189},
{0xff560127, 0x000001f9}, {0xff300169, 0x0000026a},
{0xff0a01ab, 0x000002da}, {0xfee401ed, 0x0000034b},
{0xfebe022f, 0x000003bb}, {0xfe980270, 0x0000042b},
{0xfe7202b2, 0x0000049c}, {0xfe4c02f4, 0x0000050c},
{0xfe260336, 0x0000057d}, {0xfe000377, 0x000005ed},
{0xfdda03b9, 0x0000065d}, {0xfdb403fb, 0x000006ce},
{0xfd8e043d, 0x0000073e}, {0xfd68047e, 0x000007af},
{0xfd4204c0, 0x0000081f}, {0xfd1c0502, 0x0000088f},
{0xfcf60544, 0x00000900}, {0xfcd00585, 0x00000970},
{0xfcaa05c7, 0x000009e1}, {0xfc840609, 0x00000a51},
{0xfc5e064b, 0x00000ac2}, {0xfc38068d, 0x00000b32},
{0xfc1206ce, 0x00000ba2}, {0xfbec0710, 0x00000c13},
{0xfbc60752, 0x00000c83}, {0xfba00794, 0x00000cf4},
{0xfb7a07d5, 0x00000d64}, {0xfb540817, 0x00000dd4},
{0xfb2e0859, 0x00000e45}, {0xfb08089b, 0x00000eb5},
{0xfae208dc, 0x00000f26}, {0xfabc091e, 0x00000f96},
{0xfa960960, 0x00001006}, {0xfa7009a2, 0x00001077},
{0xfa4a09e3, 0x000010e7}, {0xfa240a25, 0x00001158},
{0xf9fe0a67, 0x000011c8}, {0xf9d80aa9, 0x00001239},
{0xf9b20aeb, 0x000012a9}, {0xf98c0b2c, 0x00001319},
{0xf9660b6e, 0x0000138a}, {0xf9400bb0, 0x000013fa},
{0xf91a0bf2, 0x0000146b}, {0xf8f40c33, 0x000014db},
{0xf8ce0c75, 0x0000154b}, {0xf8a80cb7, 0x000015bc},
{0xf8820cf9, 0x0000162c}, {0xf85c0d3a, 0x0000169d},
{0xf8360d7c, 0x0000170d}, {0xf8100dbe, 0x0000177d},
{0xf7ea0e00, 0x000017ee}, {0xf7c40e41, 0x0000185e},
{0xf79e0e83, 0x000018cf}, {0xf7780ec5, 0x0000193f},
{0xf7520f07, 0x000019af}, {0xf72c0f49, 0x00001a20},
{0xf7060f8a, 0x00001a90}, {0xf6e00fcc, 0x00001b01},
{0xf6ba100e, 0x00001b71}, {0xf6941050, 0x00001be2},
{0xf66e1091, 0x00001c52}, {0xf64810d3, 0x00001cc2},
{0xf6221115, 0x00001d33}, {0xf5fc1157, 0x00001da3},
{0xf5d61198, 0x00001e14}, {0xf5b011da, 0x00001e84},
{0xf58a121c, 0x00001ef4}, {0xf564125e, 0x00001f65},
{0xf53e12a0, 0x00001fd5}, {0xf51812e1, 0x00002046},
{0xf4f21323, 0x000020b6}, {0xf4cc1365, 0x00002126},
{0xf4a613a7, 0x00002197}, {0xf48013e8, 0x00002207},
{0xf45a142a, 0x00002278}, {0xf434146c, 0x000022e8},
{0xf40e14ae, 0x00002359}, {0xf3e814ef, 0x000023c9},
{0xf3c21531, 0x00002439}, {0xf39c1573, 0x000024aa},
{0xf37615b5, 0x0000251a}, {0xf35015f6, 0x0000258b},
{0xf32a1638, 0x000025fb}, {0xf304167a, 0x0000266b},
{0xf2de16bc, 0x000026dc}, {0xf2b816fe, 0x0000274c},
{0xf292173f, 0x000027bd}, {0xf26c1781, 0x0000282d},
{0xf24617c3, 0x0000289d}, {0xf2201805, 0x0000290e},
{0xf1fa1846, 0x0000297e}, {0xf1d41888, 0x000029ef},
{0xf1ae18ca, 0x00002a5f}, {0xf188190c, 0x00002acf},
{0xf162194d, 0x00002b40}, {0xf13c198f, 0x00002bb0},
{0xf11619d1, 0x00002c21}, {0xf0f01a13, 0x00002c91},
{0xf0ca1a54, 0x00002d02}, {0xf0a41a96, 0x00002d72},
{0xf07e1ad8, 0x00002de2}, {0xf0581b1a, 0x00002e53},
{0xf0321b5c, 0x00002ec3}, {0xf00c1b9d, 0x00002f34},
{0xefe61bdf, 0x00002fa4}, {0xefc01c21, 0x00003014},
{0xef9a1c63, 0x00003085}, {0xef741ca4, 0x000030f5},
{0xef4e1ce6, 0x00003166}, {0xef281d28, 0x000031d6},
{0xef021d6a, 0x00003246}, {0xeedc1dab, 0x000032b7},
{0xeeb61ded, 0x00003327}, {0xee901e2f, 0x00003398},
{0xee6a1e71, 0x00003408}, {0xee441eb2, 0x00003479},
{0xee1e1ef4, 0x000034e9}, {0xedf81f36, 0x00003559},
{0xedd21f78, 0x000035ca}, {0xedac1fba, 0x0000363a},
{0xed861ffb, 0x000036ab}, {0xed60203d, 0x0000371b},
{0xed3a207f, 0x0000378b}, {0xed1420c1, 0x000037fc},
};
// Green-component contributions. 128 entries; the converters index this table
// with the top 7 bits of the 8-bit green value. Per the generator above, entry
// i is computed for the odd value 2*i+1 as (coef * value) >> 9, with the Y
// part positive and both the U and V parts negative.
const struct YUV GYUV[] = {
{0xffdb0040, 0xffffffd1}, {0xff9100c1, 0xffffff73},
{0xff460142, 0xffffff15}, {0xfefc01c3, 0xfffffeb7},
{0xfeb10245, 0xfffffe59}, {0xfe6702c6, 0xfffffdfa},
{0xfe1c0347, 0xfffffd9c}, {0xfdd203c8, 0xfffffd3e},
{0xfd880449, 0xfffffce0}, {0xfd3d04ca, 0xfffffc82},
{0xfcf3054b, 0xfffffc23}, {0xfca805cc, 0xfffffbc5},
{0xfc5e064e, 0xfffffb67}, {0xfc1306cf, 0xfffffb09},
{0xfbc90750, 0xfffffaaa}, {0xfb7e07d1, 0xfffffa4c},
{0xfb340852, 0xfffff9ee}, {0xfae908d3, 0xfffff990},
{0xfa9f0954, 0xfffff932}, {0xfa5409d5, 0xfffff8d3},
{0xfa0a0a57, 0xfffff875}, {0xf9bf0ad8, 0xfffff817},
{0xf9750b59, 0xfffff7b9}, {0xf92a0bda, 0xfffff75b},
{0xf8e00c5b, 0xfffff6fc}, {0xf8960cdc, 0xfffff69e},
{0xf84b0d5d, 0xfffff640}, {0xf8010dde, 0xfffff5e2},
{0xf7b60e60, 0xfffff584}, {0xf76c0ee1, 0xfffff525},
{0xf7210f62, 0xfffff4c7}, {0xf6d70fe3, 0xfffff469},
{0xf68c1064, 0xfffff40b}, {0xf64210e5, 0xfffff3ad},
{0xf5f71166, 0xfffff34e}, {0xf5ad11e7, 0xfffff2f0},
{0xf5621269, 0xfffff292}, {0xf51812ea, 0xfffff234},
{0xf4cd136b, 0xfffff1d6}, {0xf48313ec, 0xfffff177},
{0xf439146d, 0xfffff119}, {0xf3ee14ee, 0xfffff0bb},
{0xf3a4156f, 0xfffff05d}, {0xf35915f0, 0xffffeffe},
{0xf30f1672, 0xffffefa0}, {0xf2c416f3, 0xffffef42},
{0xf27a1774, 0xffffeee4}, {0xf22f17f5, 0xffffee86},
{0xf1e51876, 0xffffee27}, {0xf19a18f7, 0xffffedc9},
{0xf1501978, 0xffffed6b}, {0xf10519f9, 0xffffed0d},
{0xf0bb1a7b, 0xffffecaf}, {0xf0701afc, 0xffffec50},
{0xf0261b7d, 0xffffebf2}, {0xefdb1bfe, 0xffffeb94},
{0xef911c7f, 0xffffeb36}, {0xef471d00, 0xffffead8},
{0xeefc1d81, 0xffffea79}, {0xeeb21e02, 0xffffea1b},
{0xee671e84, 0xffffe9bd}, {0xee1d1f05, 0xffffe95f},
{0xedd21f86, 0xffffe901}, {0xed882007, 0xffffe8a2},
{0xed3d2088, 0xffffe844}, {0xecf32109, 0xffffe7e6},
{0xeca8218a, 0xffffe788}, {0xec5e220b, 0xffffe72a},
{0xec13228d, 0xffffe6cb}, {0xebc9230e, 0xffffe66d},
{0xeb7e238f, 0xffffe60f}, {0xeb342410, 0xffffe5b1},
{0xeaea2491, 0xffffe552}, {0xea9f2512, 0xffffe4f4},
{0xea552593, 0xffffe496}, {0xea0a2614, 0xffffe438},
{0xe9c02696, 0xffffe3da}, {0xe9752717, 0xffffe37b},
{0xe92b2798, 0xffffe31d}, {0xe8e02819, 0xffffe2bf},
{0xe896289a, 0xffffe261}, {0xe84b291b, 0xffffe203},
{0xe801299c, 0xffffe1a4}, {0xe7b62a1d, 0xffffe146},
{0xe76c2a9f, 0xffffe0e8}, {0xe7212b20, 0xffffe08a},
{0xe6d72ba1, 0xffffe02c}, {0xe68c2c22, 0xffffdfcd},
{0xe6422ca3, 0xffffdf6f}, {0xe5f82d24, 0xffffdf11},
{0xe5ad2da5, 0xffffdeb3}, {0xe5632e26, 0xffffde55},
{0xe5182ea8, 0xffffddf6}, {0xe4ce2f29, 0xffffdd98},
{0xe4832faa, 0xffffdd3a}, {0xe439302b, 0xffffdcdc},
{0xe3ee30ac, 0xffffdc7e}, {0xe3a4312d, 0xffffdc1f},
{0xe35931ae, 0xffffdbc1}, {0xe30f322f, 0xffffdb63},
{0xe2c432b1, 0xffffdb05}, {0xe27a3332, 0xffffdaa6},
{0xe22f33b3, 0xffffda48}, {0xe1e53434, 0xffffd9ea},
{0xe19b34b5, 0xffffd98c}, {0xe1503536, 0xffffd92e},
{0xe10635b7, 0xffffd8cf}, {0xe0bb3638, 0xffffd871},
{0xe07136ba, 0xffffd813}, {0xe026373b, 0xffffd7b5},
{0xdfdc37bc, 0xffffd757}, {0xdf91383d, 0xffffd6f8},
{0xdf4738be, 0xffffd69a}, {0xdefc393f, 0xffffd63c},
{0xdeb239c0, 0xffffd5de}, {0xde673a41, 0xffffd580},
{0xde1d3ac3, 0xffffd521}, {0xddd23b44, 0xffffd4c3},
{0xdd883bc5, 0xffffd465}, {0xdd3d3c46, 0xffffd407},
{0xdcf33cc7, 0xffffd3a9}, {0xdca93d48, 0xffffd34a},
{0xdc5e3dc9, 0xffffd2ec}, {0xdc143e4a, 0xffffd28e},
{0xdbc93ecc, 0xffffd230}, {0xdb7f3f4d, 0xffffd1d2},
{0xdb343fce, 0xffffd173}, {0xdaea404f, 0xffffd115},
};
// Blue-component contributions. 128 entries; the converters index this table
// with the top 7 bits of the 8-bit blue value. Per the generator above, entry
// i is computed for the odd value 2*i+1 as (coef * value) >> 9, with the Y and
// U parts positive and the V part negative.
const struct YUV BYUV[] = {
{0x0038000c, 0xfffffff7}, {0x00a80025, 0xffffffe5},
{0x0119003e, 0xffffffd3}, {0x01890057, 0xffffffc1},
{0x01fa0070, 0xffffffaf}, {0x026a0089, 0xffffff9c},
{0x02da00a2, 0xffffff8a}, {0x034b00bb, 0xffffff78},
{0x03bb00d5, 0xffffff66}, {0x042c00ee, 0xffffff54},
{0x049c0107, 0xffffff41}, {0x050d0120, 0xffffff2f},
{0x057d0139, 0xffffff1d}, {0x05ee0152, 0xffffff0b},
{0x065e016b, 0xfffffef9}, {0x06cf0184, 0xfffffee6},
{0x073f019d, 0xfffffed4}, {0x07b001b6, 0xfffffec2},
{0x082001cf, 0xfffffeb0}, {0x089001e8, 0xfffffe9e},
{0x09010201, 0xfffffe8b}, {0x0971021a, 0xfffffe79},
{0x09e20233, 0xfffffe67}, {0x0a52024c, 0xfffffe55},
{0x0ac30266, 0xfffffe43}, {0x0b33027f, 0xfffffe30},
{0x0ba40298, 0xfffffe1e}, {0x0c1402b1, 0xfffffe0c},
{0x0c8502ca, 0xfffffdfa}, {0x0cf502e3, 0xfffffde8},
{0x0d6602fc, 0xfffffdd5}, {0x0dd60315, 0xfffffdc3},
{0x0e46032e, 0xfffffdb1}, {0x0eb70347, 0xfffffd9f},
{0x0f270360, 0xfffffd8c}, {0x0f980379, 0xfffffd7a},
{0x10080392, 0xfffffd68}, {0x107903ab, 0xfffffd56},
{0x10e903c4, 0xfffffd44}, {0x115a03dd, 0xfffffd31},
{0x11ca03f7, 0xfffffd1f}, {0x123b0410, 0xfffffd0d},
{0x12ab0429, 0xfffffcfb}, {0x131c0442, 0xfffffce9},
{0x138c045b, 0xfffffcd6}, {0x13fc0474, 0xfffffcc4},
{0x146d048d, 0xfffffcb2}, {0x14dd04a6, 0xfffffca0},
{0x154e04bf, 0xfffffc8e}, {0x15be04d8, 0xfffffc7b},
{0x162f04f1, 0xfffffc69}, {0x169f050a, 0xfffffc57},
{0x17100523, 0xfffffc45}, {0x1780053c, 0xfffffc33},
{0x17f10555, 0xfffffc20}, {0x1861056e, 0xfffffc0e},
{0x18d20588, 0xfffffbfc}, {0x194205a1, 0xfffffbea},
{0x19b205ba, 0xfffffbd8}, {0x1a2305d3, 0xfffffbc5},
{0x1a9305ec, 0xfffffbb3}, {0x1b040605, 0xfffffba1},
{0x1b74061e, 0xfffffb8f}, {0x1be50637, 0xfffffb7d},
{0x1c550650, 0xfffffb6a}, {0x1cc60669, 0xfffffb58},
{0x1d360682, 0xfffffb46}, {0x1da7069b, 0xfffffb34},
{0x1e1706b4, 0xfffffb22}, {0x1e8806cd, 0xfffffb0f},
{0x1ef806e6, 0xfffffafd}, {0x1f6806ff, 0xfffffaeb},
{0x1fd90719, 0xfffffad9}, {0x20490732, 0xfffffac7},
{0x20ba074b, 0xfffffab4}, {0x212a0764, 0xfffffaa2},
{0x219b077d, 0xfffffa90}, {0x220b0796, 0xfffffa7e},
{0x227c07af, 0xfffffa6c}, {0x22ec07c8, 0xfffffa59},
{0x235d07e1, 0xfffffa47}, {0x23cd07fa, 0xfffffa35},
{0x243e0813, 0xfffffa23}, {0x24ae082c, 0xfffffa11},
{0x251e0845, 0xfffff9fe}, {0x258f085e, 0xfffff9ec},
{0x25ff0877, 0xfffff9da}, {0x26700890, 0xfffff9c8},
{0x26e008aa, 0xfffff9b6}, {0x275108c3, 0xfffff9a3},
{0x27c108dc, 0xfffff991}, {0x283208f5, 0xfffff97f},
{0x28a2090e, 0xfffff96d}, {0x29130927, 0xfffff95b},
{0x29830940, 0xfffff948}, {0x29f40959, 0xfffff936},
{0x2a640972, 0xfffff924}, {0x2ad4098b, 0xfffff912},
{0x2b4509a4, 0xfffff8ff}, {0x2bb509bd, 0xfffff8ed},
{0x2c2609d6, 0xfffff8db}, {0x2c9609ef, 0xfffff8c9},
{0x2d070a08, 0xfffff8b7}, {0x2d770a21, 0xfffff8a4},
{0x2de80a3b, 0xfffff892}, {0x2e580a54, 0xfffff880},
{0x2ec90a6d, 0xfffff86e}, {0x2f390a86, 0xfffff85c},
{0x2faa0a9f, 0xfffff849}, {0x301a0ab8, 0xfffff837},
{0x308a0ad1, 0xfffff825}, {0x30fb0aea, 0xfffff813},
{0x316b0b03, 0xfffff801}, {0x31dc0b1c, 0xfffff7ee},
{0x324c0b35, 0xfffff7dc}, {0x32bd0b4e, 0xfffff7ca},
{0x332d0b67, 0xfffff7b8}, {0x339e0b80, 0xfffff7a6},
{0x340e0b99, 0xfffff793}, {0x347f0bb2, 0xfffff781},
{0x34ef0bcc, 0xfffff76f}, {0x35600be5, 0xfffff75d},
{0x35d00bfe, 0xfffff74b}, {0x36400c17, 0xfffff738},
{0x36b10c30, 0xfffff726}, {0x37210c49, 0xfffff714},
{0x37920c62, 0xfffff702}, {0x38020c7b, 0xfffff6f0},
};
// Right shift that the C converter applies to a sum of look-up table values
// before adding the output offset, e.g. (t>>SHIFT_WIDTH)+8 for a Y sample.
#define COEF_WIDTH 8
#define SHIFT_WIDTH COEF_WIDTH
//
// All of the RGB converters follow the template given below. The converters make
// some assumptions about the frame size. All output frame sizes are assumed to
// have a frame height that is a multiple of 48. Also, the output frame width
// is assumed to be a multiple of 8. If the input frame size is equal
// to the output frame size, no stretching or cropping is done. Otherwise, the
// image is cropped and stretched for an 11:12 aspect ratio.
//
#if 0
void rgb_color_converter() {
for (j = 0; j < LumaIters; j++) {
for (k = 0; k < mark; k++) {
for (i = FrameWidth; i > 0; i -= m, pnext += n) {
compute m Y values using look-up tables
if (0 == (k&1)) {
compute m/2 U,V values using look-up tables
}
}
if (stretch && (0 == k) && j) {
for (i = FrameWidth; i > 0; i -= 8) {
t = (*pyprev++ & 0xFEFEFEFE) >> 1;
t += (*pynext++ & 0xFEFEFEFE) >> 1;
*pyspace++ = t;
t = (*pyprev++ & 0xFEFEFEFE) >> 1;
t += (*pynext++ & 0xFEFEFEFE) >> 1;
*pyspace++ = t;
}
}
pnext += iBackTwoLines;
py += ypitch_adj;
if (0 == (k&1)) {
pu += uvpitch_adj;
pv += uvpitch_adj;
}
}
if (stretch) {
pyprev = py - pitch;
pyspace = py;
pynext = py + pitch;
}
}
if (stretch) {
for (i = FrameWidth; i > 0; i -= 4) {
*pyspace++ = *pyprev++;
}
}
}
#endif
//
// For the IA versions, the strategy is to compute the Y value for an odd RGB value
// followed by computing the Y value for the corresponding even RGB value. The registers
// are then set with the proper values to compute U and V values for the even RGB
// value. This avoids repeating the shifting and masking needed to extract the Red,
// Green and Blue components.
//
/*****************************************************************************
*
* H26X_BGR24toYUV12()
*
* Convert from BGR24 to YUV12 (YCrCb 4:2:0) and copy to destination memory
* with the row pitch given by the 'pitch' parameter. The input data is stored
* in the order B,G,R,B,G,R...
*
*/
#if 0
// C reference version of the BGR24 -> YUV12 input colour converter. It is
// compiled out (#if 0).
//
// lpbiInput    - input bitmap header; only biWidth and biHeight are read.
// lpInput      - input pixels, 3 bytes each; lines are consumed starting from
//                the last line used and moving back one input line at a time.
// YPlane       - luma output, one byte per pixel.
// UPlane       - U output; one byte per pixel pair, written on even k only.
// VPlane       - V output; one byte per pixel pair, written on even k only.
// FrameWidth   - output width; four pixels are converted per inner iteration.
// FrameHeight  - output height; LumaIters is derived in steps of 48 lines.
// pitch        - distance in bytes between output lines; all three plane
//                pointers advance by this amount per line they write.
//
// If the input width differs from FrameWidth the input is cropped
// symmetrically (width_adj, height_adj). When stretch is set, each pass of the
// j loop converts 11 input lines and leaves one output line free; that line is
// filled with the average of the lines above and below it, and the final free
// line is a copy of the line above it.
// NOTE(review): stretch is derived from height_adj while the number of input
// lines consumed is derived from width_adj (via aspect); the two are assumed
// to agree for the supported frame sizes - confirm.
_STATIC void C_H26X_BGR24toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * lpInput,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// pnext walks the input as DWORDs; pyprev/pyspace/pynext are only assigned and
// used when stretch is set.
U32 *pnext, *pyprev, *pyspace, *pynext;
U32 tm;
int t;
int i, j, k;
int iBackTwoLines;
int stretch, mark, aspect;
int height_adj, width_adj;
int LumaIters = 0;
// Bytes to skip at the end of each output line to reach the next one.
int ypitch_adj = pitch - FrameWidth;
int uvpitch_adj = pitch - (FrameWidth >> 1);
// This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
for (i = FrameHeight; i > 0; i -= 48) {
LumaIters += 4;
}
// Byte offset (3 bytes per pixel) of half the width difference.
width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
width_adj += (width_adj << 1);
// Number of output lines that have no input line of their own.
aspect = (width_adj ? LumaIters : 0);
height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
stretch = (height_adj ? 1 : 0);
// Input lines converted per pass of the j loop: 12, or 11 when stretching.
mark = 12 - stretch;
// The input image is upside down - process the lines in reverse order.
// Move from end of line N to beginning of line N-1
// (expressed in DWORDs: -3 * (biWidth + FrameWidth) / 4).
iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 2);
iBackTwoLines += (iBackTwoLines << 1);
// Point to the beginning of the last line.
pnext = (U32 *)
(lpInput +
((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *
((FrameHeight - aspect - 1) + height_adj)) +
width_adj);
for ( j = 0; j < LumaIters; j++) {
for (k = 0; k < mark; k++) {
// Four pixels (three DWORDs) per iteration. Every table index is the top
// 7 bits of one colour byte.
for (i = FrameWidth; i > 0; i -= 4, pnext += 3) {
// Second pixel: B is the top byte of pnext[0], G and R are the two low
// bytes of pnext[1].
tm = pnext[0];
t = BYUV[tm>>25].YU;
tm = pnext[1];
t += (GYUV[(tm>>1)&0x7F].YU +
RYUV[(tm>>9)&0x7F].YU);
*(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
// First pixel: B, G, R are the three low bytes of pnext[0].
tm = pnext[0];
t = (BYUV[(tm>>1)&0x7F].YU +
GYUV[(tm>>9)&0x7F].YU +
RYUV[(tm>>17)&0x7F].YU);
*YPlane = (U8)((t>>SHIFT_WIDTH)+8);
if (0 == (k&1)) {
// Chroma is taken from the first pixel of the pair. The U sum sits in
// the upper 16 bits of t, hence the shift by 24.
*UPlane++ = (U8)((t>>24)+64);
t = (RYUV[(tm>>17)&0x7F].V +
GYUV[(tm>>9)&0x7F].V +
BYUV[(tm>>1)&0x7F].V);
*VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
}
// Fourth pixel: B, G, R are the three high bytes of pnext[2].
tm = pnext[2];
t = (BYUV[(tm>>9)&0x7F].YU +
GYUV[(tm>>17)&0x7F].YU +
RYUV[tm>>25].YU);
*(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);
// Third pixel: B and G are the two high bytes of pnext[1], R is the low
// byte of pnext[2].
tm = pnext[1];
t = BYUV[(tm>>17)&0x7F].YU + GYUV[tm>>25].YU;
tm = pnext[2];
t += RYUV[(tm>>1)&0x7F].YU;
*(YPlane+2) = (U8)((t>>SHIFT_WIDTH)+8);
YPlane += 4;
if (0 == (k&1)) {
// Chroma for the second pair is taken from the third pixel.
*UPlane++ = (U8)((t>>24)+64);
t = RYUV[(tm>>1)&0x7F].V;
tm = pnext[1];
t += GYUV[tm>>25].V + BYUV[(tm>>17)&0x7F].V;
*VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
}
}
// The first line of a new pass has just been written at pynext, so the line
// left free by the previous pass can now be filled with the average of its
// two neighbours (four luma bytes per DWORD, each byte halved before adding).
if (stretch && (0 == k) && j) {
for (i = FrameWidth; i > 0; i -= 8) {
tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
*pyspace++ = tm;
tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
*pyspace++ = tm;
}
}
pnext += iBackTwoLines;
YPlane += ypitch_adj;
// Increment after even lines.
if(0 == (k&1)) {
UPlane += uvpitch_adj;
VPlane += uvpitch_adj;
}
} // end of for k
if (stretch) {
// Remember the line just written, leave the current output line free, and
// continue writing one line further down.
pyprev = (U32 *)(YPlane - pitch);
pyspace = (U32 *)YPlane;
pynext = (U32 *)(YPlane += pitch);
}
} // end of for j
// The last free line has no line below it, so it is a copy of the line above.
if (stretch) {
for (i = FrameWidth; i > 0; i -= 4) {
*pyspace++ = *pyprev++;
}
}
} // end of C_H26X_BGR24toYUV12()
#endif
__declspec(naked)
_STATIC void IA_H26X_BGR24toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * BGR24Image,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | + 96
// | FrameHeight | + 92
// | FrameWidth | + 88
// | VPlane | + 84
// | UPlane | + 80
// | YPlane | + 76
// | lpInput | + 72
// | lpbiInput | + 68
// ----------------------------
// | return addr | + 64
// | saved ebp | + 60
// | saved ebx | + 56
// | saved esi | + 52
// | saved edi | + 48
// | pyprev | + 44
// | pyspace | + 40
// | pynext | + 36
// | i | + 32
// | j | + 28
// | k | + 24
// | iBackTwoLines | + 20
// | stretch | + 16
// | mark | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 48
#define PITCH_PARM 96
#define FRAME_HEIGHT 92
#define FRAME_WIDTH 88
#define VPLANE 84
#define UPLANE 80
#define YPLANE 76
#define LP_INPUT 72
#define LPBI_INPUT 68
#define PYPREV 44
#define PYSPACE 40
#define PYNEXT 36
#define LOOP_I 32
#define LOOP_J 28
#define LOOP_K 24
#define BACK_TWO_LINES 20
#define STRETCH 16
#define MARK 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT]
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx
shr ebp, 1
sub edx, ebp
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
xor edx, edx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea edx, [edx + 4]
sub eax, 48
jnz L1
// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
// width_adj += width_adj << 1
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
sub esi, [esp + FRAME_WIDTH]
mov eax, esi
shr eax, 1
add esi, eax
// aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx
xor edi, edi
test esi, esi
jz L2
mov edi, edx
// height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2:
mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub edx, [esp + FRAME_HEIGHT]
add edx, edi
shr edx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test edx, edx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov ebp, 12
sub ebp, eax
mov [esp + MARK], ebp
// iBackTwoLines = -(lpbiInput->biWidth + FrameWidth)
// iBackTwoLines += (iBackTwoLines << 1)
mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
add ebp, [esp + FRAME_WIDTH]
neg ebp
mov eax, ebp
shl eax, 1
add ebp, eax
mov [esp + BACK_TWO_LINES], ebp
// pnext = lpInput +
// ((lpbiInput->biWidth + (lpbiInput->biWidth << 1)) *
// ((FrameHeight - aspect - 1) + height_adj)) +
// width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
shl eax, 1
add eax, (LPBITMAPINFOHEADER)[ebx].biWidth
mov ebx, [esp + FRAME_HEIGHT]
sub ebx, edi
dec ebx
add ebx, edx
imul ebx
add esi, eax
add esi, [esp + LP_INPUT]
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4:
xor eax, eax
mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 4, pnext += 12)
L5:
mov eax, [esp + FRAME_WIDTH]
mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts in the U pipe
jmp L6
L6:
// ---------------------
// | B2 | R1 | G1 | B1 | pnext[0]
// ---------------------
// | G3 | B3 | R2 | G2 | pnext[1]
// ---------------------
// | R4 | G4 | B4 | R3 | pnext[2]
// ---------------------
// t0 = pnext[0]
// t1 = pnext[1]
// t = ( BYUV[t0>>25].YU +
// GYUV[(t1>> 1)&0x7F].YU +
// RYUV[(t1>> 9)&0x7F].YU )
// *(YPlane+1) = ((t>>8)+8)
// t = ( BYUV[(t0>> 1)&0x7F].YU +
// GYUV[(t0>> 9)&0x7F].YU +
// RYUV[(t0>>17)&0x7F].YU )
// *YPlane = ((t>>8)+8)
// assign(eax: B2,Y1,Y2,U)
// assign(ebx: B1,V)
// assign(ecx: G2,G1)
// assign(edx: R2,R1)
// assign(ebp: B1)
// 1
mov eax, [esi]
mov ecx, [esi + 4]
// 2
mov ebx, eax
mov edx, ecx
// 3
shr eax, 25
and ecx, 0xFE
// 4
shr ecx, 1
and edx, 0xFE00
// 5
shr edx, 9
and ebx, 0xFEFEFE
// 6
mov eax, [BYUV+eax*8].YU
nop
// 7
add eax, [GYUV+ecx*8].YU
mov ecx, ebx
// 8
add eax, [RYUV+edx*8].YU
mov edx, ebx
// 9
and ebx, 0xFE
add eax, 0x800
// 10
sar eax, 8
nop
// 11
shr ebx, 1
nop
// 12
shr ecx, 9
mov [edi + 1], al
// 13
shr edx, 17
and ecx, 0x7F
// 14
mov eax, [BYUV+ebx*8].YU
and edx, 0x7F
// 15
add eax, [GYUV+ecx*8].YU
mov ebp, ebx
// 16
add eax, [RYUV+edx*8].YU
nop
// 17
sar eax, 8
mov ebx, [esp + LOOP_K]
// 18
add eax, 8
and ebx, 1
// 19
mov [edi], al
jnz L9
// At this point, ebp: B1, ecx: G1, edx: R1
// t0 = pnext[0]
// *UPlane++ = ((t>>24)+64)
// t = ( RYUV[(t0>>17)&0x7F].V +
// GYUV[(t0>> 9)&0x7F].V +
// BYUV[(t0>> 1)&0x7F].V )
// *VPlane++ = ((t>>8)+64)
// 20
mov ebx, [RYUV+edx*8].V
mov edx, [esp + UPLANE]
// 21
sar eax, 16
add ebx, [GYUV+ecx*8].V
// 22
add eax, 64
add ebx, [BYUV+ebp*8].V
// 23
mov [edx], al
inc edx
// 24
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 25
sar ebx, 8
inc edx
// 26
add ebx, 64
mov [esp + VPLANE], edx
// 27
mov [edx - 1], bl
nop
L9:
// ---------------------
// | B2 | R1 | G1 | B1 | pnext[0]
// ---------------------
// | G3 | B3 | R2 | G2 | pnext[1]
// ---------------------
// | R4 | G4 | B4 | R3 | pnext[2]
// ---------------------
// t1 = pnext[1]
// t2 = pnext[2]
// t = ( BYUV[(t2>> 9)&0x7F].YU +
// GYUV[(t2>>17)&0x7F].YU +
// RYUV[t2>>25].YR )
// *(YPlane+3) = ((t>>8)+8)
// t = ( BYUV[(t1>>17)&0x7F].YU +
// GYUV[t1>>25].YU +
// RYUV[(t2>> 1)&0x7F].YU )
// *(YPlane+2) = ((t>>8)+8)
// YPlane += 4
// assign(eax: B4,Y3,Y4,U)
// assign(ebx: R3,V)
// assign(ecx: G4,G3)
// assign(edx: R4/B3)
// assign(ebp: R3)
// 28
mov ebp, [esi + 4]
mov ebx, [esi + 8]
// 29
mov eax, ebx
mov ecx, ebx
// 30
shr eax, 9
mov edx, ebx
// 31
shr ecx, 17
and eax, 0x7F
// 32
shr edx, 25
and ecx, 0x7F
// 33
mov eax, [BYUV+eax*8].YU
nop
// 34
add eax, [GYUV+ecx*8].YU
and ebx, 0xFE
// 35
add eax, [RYUV+edx*8].YU
mov ecx, ebp
// 36
shr ebx, 1
add eax, 0x800
// 37
sar eax, 8
mov edx, ebp
// 38
shr edx, 17
mov [edi + 3], al
// 39
shr ecx, 25
and edx, 0x7F
// 40
mov eax, [RYUV+ebx*8].YU
mov ebp, ebx
// 41
add eax, [GYUV+ecx*8].YU
nop
// 42
add eax, [BYUV+edx*8].YU
nop
// 43
sar eax, 8
mov ebx, [esp + LOOP_K]
// 44
add eax, 8
and ebx, 1
// 45
mov [edi + 2], al
jnz L16
// At this point, ebp: R3, ecx: G3, edx: B3
// t1 = pnext[1]
// t2 = pnext[2]
// *UPlane++ = ((t>>16)+64)
// t = ( RYUV[(t2>> 1)&0x7F].V +
// GYUV[t1>>25].V +
// BYUV[(t1>>17)&0x7F].V )
// *VPlane++ = ((t>>8)+64)
// 46
mov ebx, [BYUV+edx*8].V
mov edx, [esp + UPLANE]
// 47
sar eax, 16
add ebx, [GYUV+ecx*8].V
// 48
add eax, 64
add ebx, [RYUV+ebp*8].V
// 49
mov [edx], al
inc edx
// 50
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 51
sar ebx, 8
inc edx
// 52
add ebx, 64
mov [esp + VPLANE], edx
// 53
mov [edx - 1], bl
nop
L16:
// 54
mov eax, [esp + LOOP_I]
lea esi, [esi + 12]
// 55
sub eax, 4
lea edi, [edi + 4]
// 56
mov [esp + LOOP_I], eax
jnz L6
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH]
test eax, eax
jz L21
mov eax, [esp + LOOP_K]
test eax, eax
jnz L21
mov eax, [esp + LOOP_J]
test eax, eax
jz L21
// spill YPlane ptr
mov [esp + YPLANE], edi
nop
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV]
mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT]
mov ebp, [esp + FRAME_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
L22:
// 1
mov eax, [ebx]
lea ebx, [ebx + 4]
// 2
mov ecx, [edx]
lea edx, [edx + 4]
// 3
shr ecx, 1
and eax, 0xFEFEFEFE
// 4
shr eax, 1
and ecx, 0x7F7F7F7F
// 5
add eax, ecx
mov ecx, [ebx]
// 6
shr ecx, 1
mov [edi], eax
// 7
mov eax, [edx]
and ecx, 0x7F7F7F7F
// 8
shr eax, 1
lea edi, [edi + 4]
// 9
and eax, 0x7F7F7F7F
lea ebx, [ebx + 4]
// 10
lea edx, [edx + 4]
add eax, ecx
// 11
mov [edi], eax
lea edi, [edi + 4]
// 12
sub ebp, 8
jnz L22
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L21:
add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K]
and eax, 1
jnz L23
// UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ]
add [esp + UPLANE], eax
add [esp + VPLANE], eax
L23:
inc DWORD PTR [esp + LOOP_K]
mov eax, [esp + LOOP_K]
cmp eax, [esp + MARK]
jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0
je L24
// pyprev = YPlane - pitch
mov eax, edi
sub eax, [esp + PITCH_PARM]
mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM]
mov [esp + PYNEXT], edi
L24:
inc DWORD PTR [esp + LOOP_J]
mov eax, [esp + LOOP_J]
cmp eax, [esp + LUMA_ITERS]
jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// if (stretch)
mov esi, [esp + PYPREV]
cmp DWORD PTR [esp + STRETCH], 0
je L26
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH]
mov edi, [esp + PYSPACE]
L25:
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebp, 4
jnz L25
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L26:
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Drop the stack-frame offset macros used by the routine above.  The next
// naked routine in this file defines the same names again for its own frame.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
#if 0
/*****************************************************************************
 *
 *  C_H26X_BGR16toYUV12()
 *
 *  Reference C implementation (compiled out) of the 16-bit RGB to YUV12
 *  input colour conversion.  Only the 555 layout is enabled; the 664, 565
 *  and 655 variants are kept under a nested "#if 0" because they are
 *  untested.
 *
 *  lpbiInput    - header describing the source bitmap (biWidth / biHeight).
 *  lpInput      - source pixels, two 16-bit pixels per U32, bottom-up.
 *  YPlane       - destination luma plane.
 *  UPlane       - destination U plane (written on even source lines only).
 *  VPlane       - destination V plane (written on even source lines only).
 *  FrameWidth   - output width in pixels.
 *  FrameHeight  - output height in lines.
 *  bitfield     - source layout selector (555 is the only live case).
 *  pitch        - destination pitch in bytes, shared by all three planes.
 *
 *  When the source has fewer lines than the output ("stretch"), only 11
 *  source lines are consumed for every 12 output lines.  The 12th output
 *  line is left as a gap and later filled with the average of its
 *  neighbours; the very last gap is filled by copying the line above it.
 */
_STATIC void C_H26X_BGR16toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    UN bitfield,
    const int pitch)
{
    U32 *pnext;
    // Stretch bookkeeping.  These are only assigned at the end of a "j"
    // iteration, so start them at NULL to make "never assigned" detectable.
    U32 *pyprev = NULL;
    U32 *pyspace = NULL;
    U32 *pynext = NULL;
    U32 tm;
    int t = 0;
    int i, j, k;
    int iBackTwoLines;
    int stretch, mark, aspect;
    int width_adj, height_adj;
    int LumaIters = 0;
    int ypitch_adj = pitch - FrameWidth;
    int uvpitch_adj = pitch - (FrameWidth >> 1);

    // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }

    width_adj = lpbiInput->biWidth - FrameWidth;
    aspect = (width_adj ? LumaIters : 0);
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj ? 1 : 0);
    mark = 12 - stretch;

    // The input image is upside down - process the lines in reverse order.
    // Move from end of line N to beginning of line N-1
    // (pnext is a U32 pointer, hence the divide by two of the pixel count).
    iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) >> 1);

    // Point to the beginning of the last line.
    pnext = (U32 *)(lpInput +
        ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj)) +
        width_adj);

    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = FrameWidth; i > 0; i -= 2, pnext++) {
                tm = *pnext;

                // Luma for both pixels packed in tm.  The low pixel is done
                // last so that "t" still holds its YU sum for the chroma step.
                switch (bitfield) {
                // 555 2, 3, 8 0x7C, 0x7C, 0x7C
                case 555:
                    t = (BYUV[(tm>>14)&0x7C].YU +
                         GYUV[(tm>>19)&0x7C].YU +
                         RYUV[(tm>>24)&0x7C].YU);
                    *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<2)&0x7C].YU +
                         GYUV[(tm>>3)&0x7C].YU +
                         RYUV[(tm>>8)&0x7C].YU);
                    *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
#if 0
                // Beware - untested code ahead
                // 664 3, 3, 9 0x78, 0x7E, 0x7E
                case 664:
                    t = (BYUV[(tm>>13)&0x78].YU +
                         GYUV[(tm>>19)&0x7E].YU +
                         RYUV[(tm>>25)&0x7E].YU);
                    *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<3)&0x78].YU +
                         GYUV[(tm>>3)&0x7E].YU +
                         RYUV[(tm>>9)&0x7E].YU);
                    *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
                // 565 2, 4, 9 0x7C, 0x7E, 0x7C
                case 565:
                    t = (BYUV[(tm>>14)&0x7C].YU +
                         GYUV[(tm>>20)&0x7E].YU +
                         RYUV[(tm>>25)&0x7C].YU);
                    *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<2)&0x7C].YU +
                         GYUV[(tm>>4)&0x7E].YU +
                         RYUV[(tm>>9)&0x7C].YU);
                    *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
                // 655 2, 3, 9 0x7C, 0x7C, 0x7E
                case 655:
                    t = (BYUV[(tm>>14)&0x7C].YU +
                         GYUV[(tm>>19)&0x7C].YU +
                         RYUV[(tm>>25)&0x7E].YU);
                    *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                    t = (BYUV[(tm<<2)&0x7C].YU +
                         GYUV[(tm>>3)&0x7C].YU +
                         RYUV[(tm>>9)&0x7E].YU);
                    *(YPlane) = (U8)((t>>SHIFT_WIDTH)+8);
                    YPlane += 2;
                    break;
#endif
                default:
                    // Unsupported layout: nothing is written for this pair.
                    break;
                }

                // Chroma is sampled from the low pixel on even lines only.
                if (0 == (k&1)) {
                    switch (bitfield) {
                    // 555 2, 3, 8 0x7C, 0x7C, 0x7C
                    case 555:
                        *UPlane++ = (U8)((t>>24)+64);
                        t = (RYUV[(tm>>8)&0x7C].V +
                             GYUV[(tm>>3)&0x7C].V +
                             BYUV[(tm<<2)&0x7C].V);
                        *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                        break;
#if 0
                    // Beware - untested code ahead
                    // 664 3, 3, 9 0x78, 0x7E, 0x7E
                    case 664:
                        *UPlane++ = (U8)((t>>24)+64);
                        t = (RYUV[(tm>>9)&0x7E].V +
                             GYUV[(tm>>3)&0x7E].V +
                             BYUV[(tm<<3)&0x78].V);
                        *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                        break;
                    // 565 2, 4, 9 0x7C, 0x7E, 0x7C
                    case 565:
                        *UPlane++ = (U8)((t>>24)+64);
                        t = (RYUV[(tm>>9)&0x7C].V +
                             GYUV[(tm>>4)&0x7E].V +
                             BYUV[(tm<<2)&0x7C].V);
                        *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                        break;
                    // 655 2, 3, 9 0x7C, 0x7C, 0x7E
                    case 655:
                        *UPlane++ = (U8)((t>>24)+64);
                        t = (RYUV[(tm>>9)&0x7E].V +
                             GYUV[(tm>>3)&0x7C].V +
                             BYUV[(tm<<2)&0x7C].V);
                        *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                        break;
#endif
                    default:
                        break;
                    }
                }
            }

            // The first line of every group after the first closes a gap:
            // fill the skipped line with the average of its two neighbours.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }

            pnext += iBackTwoLines;
            YPlane += ypitch_adj;
            // Increment after even lines.
            if (0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        } // end of for k

        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j

    // The last gap has no following line to average with, so duplicate the
    // line above it.  Skip this when no group was processed (LumaIters == 0),
    // in which case the stretch pointers were never set.
    if (stretch && (NULL != pyspace)) {
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of C_H26X_BGR16toYUV12
#endif
/*****************************************************************************
 *
 *  IA_H26X_BGR16555toYUV12()
 *
 *  Hand-scheduled x86 implementation of the 555 case of the (compiled out)
 *  C_H26X_BGR16toYUV12 reference routine.  Converts 16-bit 5-5-5 RGB input
 *  to planar Y, U and V output, walking the source from its last line
 *  towards its first.
 *
 *  The routine is naked: it pushes ebp/ebx/esi/edi itself, reserves
 *  LOCALSIZE bytes of locals, and addresses both locals and arguments as
 *  offsets from esp (see the frame layout below).  The macros that name
 *  those offsets are defined inside the function and undefined after it.
 *
 *  The bare "// n" comments number groups of two instructions; they appear
 *  to mark intended instruction pairs (the code mentions the "U pipe").
 *
 *  NOTE(review): every loop below ends on "jnz" after a subtract, i.e. it
 *  stops only when the counter hits exactly zero.  That presumes FrameHeight
 *  is a multiple of 48 and FrameWidth a multiple of 8 - confirm with callers.
 */
__declspec(naked)
_STATIC void IA_H26X_BGR16555toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * lpInput,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue)
// | pitch | + 96
// | FrameHeight | + 92
// | FrameWidth | + 88
// | VPlane | + 84
// | UPlane | + 80
// | YPlane | + 76
// | lpInput | + 72
// | lpbiInput | + 68
// ----------------------------
// | return addr | + 64
// | saved ebp | + 60
// | saved ebx | + 56
// | saved esi | + 52
// | saved edi | + 48
// | pyprev | + 44
// | pyspace | + 40
// | pynext | + 36
// | i | + 32
// | j | + 28
// | k | + 24
// | iBackTwoLines | + 20
// | stretch | + 16
// | mark | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 48
#define PITCH_PARM 96
#define FRAME_HEIGHT 92
#define FRAME_WIDTH 88
#define VPLANE 84
#define UPLANE 80
#define YPLANE 76
#define LP_INPUT 72
#define LPBI_INPUT 68
#define PYPREV 44
#define PYSPACE 40
#define PYNEXT 36
#define LOOP_I 32
#define LOOP_J 28
#define LOOP_K 24
#define BACK_TWO_LINES 20
#define STRETCH 16
#define MARK 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
// Prologue: save the callee-save registers and reserve the locals.
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT]
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx
shr ebp, 1
sub edx, ebp
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
// NOTE(review): unlike the C form this is a do-while that exits only on an
// exact zero, so it always adds at least 4 and relies on FrameHeight being a
// non-zero multiple of 48.
xor edx, edx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea edx, [edx + 4]
sub eax, 48
jnz L1
// width_adj = lpbiInput->biWidth - FrameWidth
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
sub esi, [esp + FRAME_WIDTH]
// aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx
xor edi, edi
test esi, esi
jz L2
mov edi, edx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
// (the shift is a logical shr, so the difference is treated as unsigned)
L2:
mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub edx, [esp + FRAME_HEIGHT]
add edx, edi
shr edx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test edx, edx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov ebp, 12
sub ebp, eax
mov [esp + MARK], ebp
// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
// This is a byte count: after FrameWidth pixels (2 bytes each) have been
// consumed it moves pnext back to the start of the preceding source line.
mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
add ebp, [esp + FRAME_WIDTH]
shl ebp, 1
neg ebp
mov [esp + BACK_TWO_LINES], ebp
// pnext = lpInput +
// ((lpbiInput->biWidth << 1) *
// ((FrameHeight - aspect - 1) + height_adj)) +
// width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
// The one-operand imul leaves the product in edx:eax; only eax is used.
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
shl eax, 1
mov ebx, [esp + FRAME_HEIGHT]
sub ebx, edi
dec ebx
add ebx, edx
imul ebx
add esi, eax
add esi, [esp + LP_INPUT]
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4:
xor eax, eax
mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 2, pnext += 4)
L5:
mov eax, [esp + FRAME_WIDTH]
mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts on the U pipe
jmp L6
L6:
// One dword holds two 16-bit pixels: pixel 1 in the low half, pixel 2 in the
// high half.  Each 5-bit channel is extracted already multiplied by 4
// (mask 0x7C) and then scaled by 8 in the table address.
// tm = pnext[0]
// t = ( BYUV[(tm>>14)&0x7C].YU +
// GYUV[(tm>>19)&0x7C].YU +
// RYUV[(tm>>24)&0x7C].YU )
// *(YPlane+1) = (U8)((t>>8)+8)
// t = ( BYUV[(tm<< 2)&0x7C].YU +
// GYUV[(tm>> 3)&0x7C].YU +
// RYUV[(tm>> 8)&0x7C].YU )
// *YPlane = (U8)((t>>8)+8)
// YPlane += 2
// assign(eax: B2/Y1/Y2/U)
// assign(ebx: B1/V)
// assign(ecx: G2/G1)
// assign(edx: R2/R1)
// assign(ebp: B1)
// 1
mov eax, [esi]
nop
// 2
mov ebx, eax
mov ecx, eax
// 3
shr eax, 14
mov edx, ebx
// 4
shr ecx, 19
and eax, 0x7C
// 5
shr edx, 24
and ecx, 0x7C
// 6
mov eax, [BYUV+eax*8].YU
and edx, 0x7C
// 7
add eax, [GYUV+ecx*8].YU
mov ecx, ebx
// 8
add eax, [RYUV+edx*8].YU
mov edx, ebx
// 9
sar eax, 8
and ebx, 0x1F
// 10
shl ebx, 2
add eax, 8
// 11
shr ecx, 3
mov [edi + 1], al
// 12
shr edx, 8
and ecx, 0x7C
// 13
mov eax, [BYUV+ebx*8].YU
and edx, 0x7C
// 14
add eax, [GYUV+ecx*8].YU
mov ebp, ebx
// 15
add eax, [RYUV+edx*8].YU
nop
// 16
sar eax, 8
mov ebx, [esp + LOOP_K]
// 17
add eax, 8
and ebx, 1
// 18
// The mov does not touch the flags, so jnz tests (k & 1): odd lines skip
// the chroma output.
mov [edi], al
jnz L9
// At this point, ebp: B1, ecx: G1, edx: R1
// eax still holds ((t>>8)+8) for pixel 1; shifting it right by a further 16
// exposes the U part of the summed YU entries.
// *UPlane++ = (U8)((t>>24)+64)
// t = ( RYUV[(tm>> 8)&0x7C].V +
// GYUV[(tm>> 3)&0x7C].V +
// BYUV[(tm<< 2)&0x7C].V )
// *VPlane++ = (U8)((t>>8)+64)
// 19
mov ebx, [RYUV+edx*8].V
mov edx, [esp + UPLANE]
// 20
sar eax, 16
add ebx, [GYUV+ecx*8].V
// 21
add eax, 64
add ebx, [BYUV+ebp*8].V
// 22
mov [edx], al
inc edx
// 23
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 24
sar ebx, 8
inc edx
// 25
add ebx, 64
mov [esp + VPLANE], edx
// 26
// VPlane was already incremented, hence the -1.
mov [edx - 1], bl
nop
L9:
// Advance to the next pixel pair: 4 source bytes, 2 luma bytes.
// 27
mov eax, [esp + LOOP_I]
lea esi, [esi + 4]
// 28
sub eax, 2
lea edi, [edi + 2]
// 29
mov [esp + LOOP_I], eax
jnz L6
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH]
test eax, eax
jz L14
mov eax, [esp + LOOP_K]
test eax, eax
jnz L14
mov eax, [esp + LOOP_J]
test eax, eax
jz L14
// spill YPlane ptr (the YPlane argument slot is reused as scratch storage)
mov [esp + YPLANE], edi
nop
// Fill the luma line that was skipped at the end of the previous group with
// the average of the line before it (pyprev) and the line just written
// (pynext), eight bytes per iteration.
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV]
mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT]
mov ebp, [esp + FRAME_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// (shifting first and masking with 0x7F7F7F7F gives the same per-byte halves)
L15:
// 1
mov eax, [ebx]
lea ebx, [ebx + 4]
// 2
mov ecx, [edx]
lea edx, [edx + 4]
// 3
shr ecx, 1
and eax, 0xFEFEFEFE
// 4
shr eax, 1
and ecx, 0x7F7F7F7F
// 5
add eax, ecx
mov ecx, [ebx]
// 6
shr ecx, 1
mov [edi], eax
// 7
mov eax, [edx]
and ecx, 0x7F7F7F7F
// 8
shr eax, 1
lea edi, [edi + 4]
// 9
and eax, 0x7F7F7F7F
lea ebx, [ebx + 4]
// 10
lea edx, [edx + 4]
add eax, ecx
// 11
mov [edi], eax
lea edi, [edi + 4]
// 12
sub ebp, 8
jnz L15
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L14:
add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K]
and eax, 1
jnz L16
// UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ]
add [esp + UPLANE], eax
add [esp + VPLANE], eax
L16:
// k++; continue the line loop while k < mark
inc DWORD PTR [esp + LOOP_K]
mov eax, [esp + LOOP_K]
cmp eax, [esp + MARK]
jl L5
// if (stretch)
// Leave one luma line empty after this group and remember its neighbours.
cmp DWORD PTR [esp + STRETCH], 0
je L17
// pyprev = YPlane - pitch
mov eax, edi
sub eax, [esp + PITCH_PARM]
mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM]
mov [esp + PYNEXT], edi
L17:
// j++; continue the group loop while j < LumaIters
inc DWORD PTR [esp + LOOP_J]
mov eax, [esp + LOOP_J]
cmp eax, [esp + LUMA_ITERS]
jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// if (stretch)
// The final skipped line has no following line, so it is filled by copying
// the line above it.
mov esi, [esp + PYPREV]
cmp DWORD PTR [esp + STRETCH], 0
je L19
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH]
mov edi, [esp + PYSPACE]
L18:
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebp, 4
jnz L18
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L19:
// Epilogue: release the locals and restore the callee-save registers.
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Drop the frame-offset macros so that the next routine can define its own.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/*****************************************************************************
 *
 *  H26X_CLUTtoYUV12()
 *
 *  Convert from CLUT8/CLUT4 to YUV12 (YCrCb 4:2:0) and copy to destination
 *  memory using the pitch passed in the "pitch" parameter.
 *
 *  This is needed to support the quickcam.
 *
 *  The C version below is a compiled-out reference implementation.
 *
 *  lpbiInput    - bitmap header; the colour table is read from the bytes
 *                 immediately following it.
 *  lpInput      - palettised source pixels, bottom-up.
 *  YPlane       - destination luma plane.
 *  UPlane       - destination U plane (written on even source lines only).
 *  VPlane       - destination V plane (written on even source lines only).
 *  FrameWidth   - output width in pixels.
 *  FrameHeight  - output height in lines.
 *  pixel_bits   - bits per source pixel; must be 8 or 4.
 *  pitch        - destination pitch in bytes, shared by all three planes.
 *
 *  When the source has fewer lines than the output ("stretch"), only 11
 *  source lines are consumed for every 12 output lines.  The 12th output
 *  line is left as a gap and later filled with the average of its
 *  neighbours; the very last gap is filled by copying the line above it.
 */
#if 0
_STATIC void C_H26X_CLUTtoYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    U8 * lpInput,
    U8 * YPlane,
    U8 * UPlane,
    U8 * VPlane,
    UN FrameWidth,
    UN FrameHeight,
    UN pixel_bits,
    const int pitch)
{
    U32 *pnext;
    // Stretch bookkeeping.  These are only assigned at the end of a "j"
    // iteration, so start them at NULL to make "never assigned" detectable.
    U32 *pyprev = NULL;
    U32 *pyspace = NULL;
    U32 *pynext = NULL;
    U32 tm, tn;
    int t;
    int i, j, k, m, n;
    int iNextLine, iBackTwoLines;
    int stretch, mark, aspect;
    int width_adj, height_adj;
    int yshift, uvshift;
    int pixel_mask, loop_cnt, loop_limit;
    RGBQUAD *lpCEntry;
    RGBQUAD *lpCTable = (RGBQUAD *)((U8 *)lpbiInput + sizeof(BITMAPINFOHEADER));
    int LumaIters = 0;
    int ypitch_adj = (pitch - FrameWidth);
    int uvpitch_adj = (pitch - (FrameWidth >> 1));

    ASSERT((8 == pixel_bits) || (4 == pixel_bits));

    // This loop is here simply to avoid a divide. LumaIters = (FrameHeight/12).
    for (i = FrameHeight; i > 0; i -= 48) {
        LumaIters += 4;
    }

    width_adj = ((lpbiInput->biWidth - FrameWidth) >> 1);
    aspect = (width_adj ? LumaIters : 0);
    height_adj = ((lpbiInput->biHeight - (FrameHeight - aspect)) >> 1);
    stretch = (height_adj ? 1 : 0);
    mark = 12 - stretch;

    iNextLine = lpbiInput->biWidth;
    // pnext is a U32 pointer, hence the divide by four of the byte count.
    iBackTwoLines = -((iNextLine + (int)FrameWidth) >> 2);

    if (8 == pixel_bits) {
        // Four pixels per U32; two U32s cover one 8-pixel step.
        yshift = 8;
        uvshift = 16;
        pixel_mask = 0xFF;
        loop_cnt = 2;
        loop_limit = 4;
    } else {
        // Eight pixels per U32; all byte-based distances are halved.
        yshift = 4;
        uvshift = 8;
        pixel_mask = 0xF;
        loop_cnt = 1;
        loop_limit = 8;
        width_adj >>= 1;
        iNextLine >>= 1;
        iBackTwoLines >>= 1;
    }

    // The input image is upside down - process the lines in reverse order.
    // Point to the beginning of the last line.
    pnext = (U32 *)(lpInput +
        (iNextLine * ((FrameHeight - aspect - 1) + height_adj)) + width_adj);

    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = FrameWidth; i > 0; i -= 8) {
                for (n = 0; n < loop_cnt; n++) {
                    tm = *pnext++;
                    // For 4-bit input swap the nibbles of every byte so the
                    // leftmost pixel ends up in the low bits.
                    tm = ((4 == pixel_bits) ?
                        ( ((tm >> 4) & 0x0F0F0F0F) | ((tm << 4) & 0xF0F0F0F0) ) : tm);
                    tn = tm;

                    // Luma: one output byte per pixel held in this U32.
                    for (m = 0; m < loop_limit; m++) {
                        lpCEntry = &lpCTable[tm&pixel_mask];
                        t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
                              GYUV[lpCEntry->rgbGreen>>1].YU +
                              RYUV[lpCEntry->rgbRed>>1].YU );
                        *YPlane++ = (U8)((t>>8)+8);
                        tm >>= yshift;
                    }

                    // Chroma: every second pixel, on even lines only.
                    if (0 == (k&1)) {
                        for (m = 0; m < loop_limit; m += 2, tn >>= uvshift) {
                            lpCEntry = &lpCTable[tn&pixel_mask];
                            t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
                                  RYUV[lpCEntry->rgbRed>>1].YU +
                                  GYUV[lpCEntry->rgbGreen>>1].YU );
                            *UPlane++ = (U8)((t>>24)+64);
                            t = ( RYUV[lpCEntry->rgbRed>>1].V +
                                  GYUV[lpCEntry->rgbGreen>>1].V +
                                  BYUV[lpCEntry->rgbBlue>>1].V );
                            *VPlane++ = (U8)((t>>8)+64);
                        }
                    }
                }
            }

            // The first line of every group after the first closes a gap:
            // fill the skipped line with the average of its two neighbours.
            if (stretch && (0 == k) && j) {
                for (i = FrameWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }

            pnext += iBackTwoLines;
            YPlane += ypitch_adj;
            // Increment after even lines.
            if (0 == (k&1)) {
                UPlane += uvpitch_adj;
                VPlane += uvpitch_adj;
            }
        }

        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    }

    // The last gap has no following line to average with, so duplicate the
    // line above it.  Skip this when no group was processed (LumaIters == 0),
    // in which case the stretch pointers were never set.
    if (stretch && (NULL != pyspace)) {
        for (i = FrameWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of H26X_CLUTtoYUV12()
#endif
/*****************************************************************************
 *
 *  IA_H26X_CLUT8toYUV12()
 *
 *  Hand-scheduled x86 routine that converts 8-bit palettised input to planar
 *  Y, U and V output.  Each source byte indexes the RGBQUAD table located
 *  directly after the BITMAPINFOHEADER; the looked-up blue/green/red values
 *  then index the BYUV/GYUV/RYUV tables.  The source is walked from its last
 *  line towards its first.
 *
 *  The routine is naked: it pushes ebp/ebx/esi/edi itself, reserves
 *  LOCALSIZE bytes of locals, and addresses both locals and arguments as
 *  offsets from esp (see the frame layout below).  The macros that name
 *  those offsets are defined inside the function and undefined after it.
 *
 *  The bare "// n" comments number groups of two instructions; they appear
 *  to mark intended instruction pairs (the code mentions the "U pipe").
 *
 *  NOTE(review): every loop below ends on "jnz" after a subtract, i.e. it
 *  stops only when the counter hits exactly zero.  That presumes FrameHeight
 *  is a multiple of 48 and FrameWidth a multiple of 8 - confirm with callers.
 */
__declspec(naked)
_STATIC void IA_H26X_CLUT8toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * lpInput,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout (offsets from esp after the prologue)
// | pitch | +100
// | FrameHeight | + 96
// | FrameWidth | + 92
// | VPlane | + 88
// | UPlane | + 84
// | YPlane | + 80
// | lpInput | + 76
// | lpbiInput | + 72
// ----------------------------
// | return addr | + 68
// | saved ebp | + 64
// | saved ebx | + 60
// | saved esi | + 56
// | saved edi | + 52
// | pyprev | + 48
// | pyspace | + 44
// | pynext | + 40
// | i | + 36
// | j | + 32
// | k | + 28
// | iBackTwoLines | + 24
// | stretch | + 20
// | mark | + 16
// | LumaIters | + 12
// | lpCTable | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 52
#define PITCH_PARM 100
#define FRAME_HEIGHT 96
#define FRAME_WIDTH 92
#define VPLANE 88
#define UPLANE 84
#define YPLANE 80
#define LP_INPUT 76
#define LPBI_INPUT 72
#define PYPREV 48
#define PYSPACE 44
#define PYNEXT 40
#define LOOP_I 36
#define LOOP_J 32
#define LOOP_K 28
#define BACK_TWO_LINES 24
#define STRETCH 20
#define MARK 16
#define LUMA_ITERS 12
#define LPCTABLE 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
// Prologue: save the callee-save registers and reserve the locals.
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)
// assign (ebx, lpbiInput)
mov eax, [esp + LPBI_INPUT]
mov ebx, eax
add eax, TYPE BITMAPINFOHEADER
mov [esp + LPCTABLE], eax
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (ecx, FrameWidth)
// kill (edx, pitch)
shr ecx, 1
sub edx, ecx
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (ecx, LumaIters)
// NOTE(review): unlike the C form this is a do-while that exits only on an
// exact zero, so it always adds at least 4 and relies on FrameHeight being a
// non-zero multiple of 48.
xor ecx, ecx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea ecx, [ecx + 4]
sub eax, 48
jnz L1
// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
// assign (edx, width_adj)
mov edx, (LPBITMAPINFOHEADER)[ebx].biWidth
sub edx, [esp + FRAME_WIDTH]
shr edx, 1
// aspect = (width_adj ? LumaIters : 0)
// assign (esi, aspect)
// kill (ecx, LumaIters)
mov [esp + LUMA_ITERS], ecx
xor esi, esi
test edx, edx
jz L2
mov esi, ecx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (ecx, height_adj)
// (the shift is a logical shr, so the difference is treated as unsigned)
L2:
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub ecx, [esp + FRAME_HEIGHT]
add ecx, esi
shr ecx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test ecx, ecx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov edi, 12
sub edi, eax
mov [esp + MARK], edi
// iNextLine = lpbiInput->biWidth
// kill (ebx, lpbiInput)
// assign (ebx, iNextLine)
mov ebx, (LPBITMAPINFOHEADER)[ebx].biWidth
// iBackTwoLines = -(iNextline + FrameWidth)
// This is a byte count: after FrameWidth pixels (1 byte each) have been
// consumed it moves pnext back to the start of the preceding source line.
mov edi, [esp + FRAME_WIDTH]
add edi, ebx
neg edi
mov [esp + BACK_TWO_LINES], edi
// pnext = lpInput +
// (iNextLine*((FrameHeight-aspect-1) + height_adj)) +
// width_adj
// kill (ebx, iNextLine)
// kill (ecx, height_adj)
// kill (edx, width_adj)
// kill (esi, aspect)
// assign (esi, pnext)
// width_adj is folded into esi before the one-operand imul, because imul
// overwrites edx with the high half of the product.
mov eax, [esp + FRAME_HEIGHT]
sub eax, esi
dec eax
add eax, ecx
mov esi, [esp + LP_INPUT]
add esi, edx
imul ebx
add esi, eax
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4:
xor eax, eax
mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 2, pnext += 2)
L5:
mov eax, [esp + FRAME_WIDTH]
mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts on the U pipe
jmp L6
L6:
// Two source bytes (two pixels) are handled per iteration.  Each colour
// byte is masked with 0xFE and scaled by 4 in the table address, which is
// the same byte offset as (value >> 1) * 8.
// lpCEntry = &lpCTable[*(pnext+1)]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *(YPlane+1) = (U8)((t>>8)+8)
// lpCEntry = &lpCTable[*pnext]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *YPlane = (U8)((t>>8)+8)
// YPlane += 2
// *UPlane++ = (U8)((t>>24)+64)
// t = ( RYUV[lpCEntry->rgbRed>>1].V +
// GYUV[lpCEntry->rgbGreen>>1].V +
// BYUV[lpCEntry->rgbBlue>>1].V )
// *VPlane++ = (U8)((t>>8)+64)
// assign (ebp: lpCEntry,B1)
// assign (eax: P2,B2,Y2,Y1,U)
// assign (ebx: B1,V)
// assign (ecx: G2,G1)
// assign (edx: R2,R1)
// 1
xor eax, eax
mov ebp, [esp + LPCTABLE]
// 2
mov al, [esi + 1]
xor ecx, ecx
// 3
lea ebx, [ebp+eax*4]
xor edx, edx
// 4
mov al, (LPRGBQUAD)[ebx].rgbBlue
nop
// 5
mov cl, (LPRGBQUAD)[ebx].rgbGreen
and al, 0xFE
// 6
mov dl, (LPRGBQUAD)[ebx].rgbRed
and cl, 0xFE
// 7
mov eax, [BYUV+eax*4].YU
and dl, 0xFE
// 8
add eax, [GYUV+ecx*4].YU
xor ebx, ebx
// 9
add eax, [RYUV+edx*4].YU
mov bl, [esi]
// 10
sar eax, 8
lea ebp, [ebp+ebx*4]
// 11
add eax, 8
nop
// 12
mov [edi + 1], al
mov bl, (LPRGBQUAD)[ebp].rgbBlue
// 13
mov cl, (LPRGBQUAD)[ebp].rgbGreen
and bl, 0xFE
// 14
mov dl, (LPRGBQUAD)[ebp].rgbRed
and cl, 0xFE
// 15
mov eax, [BYUV+ebx*4].YU
and dl, 0xFE
// 16
add eax, [GYUV+ecx*4].YU
mov ebp, ebx
// 17
add eax, [RYUV+edx*4].YU
nop
// 18
sar eax, 8
mov ebx, [esp + LOOP_K]
// 19
add eax, 8
and ebx, 1
// 20
// The mov does not touch the flags, so jnz tests (k & 1): odd lines skip
// the chroma output.
mov [edi], al
jnz L9
// At this point, ebp: B1, ecx: G1, edx: R1 (each already masked with 0xFE).
// eax still holds ((t>>8)+8) for pixel 1; shifting it right by a further 16
// exposes the U part of the summed YU entries.
// 21
mov ebx, [RYUV+edx*4].V
mov edx, [esp + UPLANE]
// 22
sar eax, 16
add ebx, [GYUV+ecx*4].V
// 23
add eax, 64
add ebx, [BYUV+ebp*4].V
// 24
mov [edx], al
inc edx
// 25
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 26
sar ebx, 8
inc edx
// 27
add ebx, 64
mov [esp + VPLANE], edx
// 28
// VPlane was already incremented, hence the -1.
mov [edx - 1], bl
nop
L9:
// Advance to the next pixel pair: 2 source bytes, 2 luma bytes.
// 29
mov eax, [esp + LOOP_I]
lea esi, [esi + 2]
// 30
sub eax, 2
lea edi, [edi + 2]
// 31
mov [esp + LOOP_I], eax
jnz L6
// only esi (pnext) is live at this point (after line loop)
// NOTE(review): edi (YPlane) is also still in use below - it is spilled and
// advanced after this point.
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH]
test eax, eax
jz L14
mov eax, [esp + LOOP_K]
test eax, eax
jnz L14
mov eax, [esp + LOOP_J]
test eax, eax
jz L14
// spill YPlane ptr (the YPlane argument slot is reused as scratch storage)
mov [esp + YPLANE], edi
nop
// Fill the luma line that was skipped at the end of the previous group with
// the average of the line before it (pyprev) and the line just written
// (pynext), eight bytes per iteration.
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV]
mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT]
mov ebp, [esp + FRAME_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// (shifting first and masking with 0x7F7F7F7F gives the same per-byte halves)
L15:
// 1
mov eax, [ebx]
lea ebx, [ebx + 4]
// 2
mov ecx, [edx]
lea edx, [edx + 4]
// 3
shr ecx, 1
and eax, 0xFEFEFEFE
// 4
shr eax, 1
and ecx, 0x7F7F7F7F
// 5
add eax, ecx
mov ecx, [ebx]
// 6
shr ecx, 1
mov [edi], eax
// 7
mov eax, [edx]
and ecx, 0x7F7F7F7F
// 8
shr eax, 1
lea edi, [edi + 4]
// 9
and eax, 0x7F7F7F7F
lea ebx, [ebx + 4]
// 10
lea edx, [edx + 4]
add eax, ecx
// 11
mov [edi], eax
lea edi, [edi + 4]
// 12
sub ebp, 8
jnz L15
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L14:
add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K]
and eax, 1
jnz L16
// UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ]
add [esp + UPLANE], eax
add [esp + VPLANE], eax
L16:
// k++; continue the line loop while k < mark
inc DWORD PTR [esp + LOOP_K]
mov eax, [esp + LOOP_K]
cmp eax, [esp + MARK]
jl L5
// if (stretch)
// Leave one luma line empty after this group and remember its neighbours.
cmp DWORD PTR [esp + STRETCH], 0
je L17
// pyprev = YPlane - pitch
mov eax, edi
sub eax, [esp + PITCH_PARM]
mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM]
mov [esp + PYNEXT], edi
L17:
// j++; continue the group loop while j < LumaIters
inc DWORD PTR [esp + LOOP_J]
mov eax, [esp + LOOP_J]
cmp eax, [esp + LUMA_ITERS]
jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// if (stretch)
// The final skipped line has no following line, so it is filled by copying
// the line above it.
mov esi, [esp + PYPREV]
cmp DWORD PTR [esp + STRETCH], 0
je L19
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH]
mov edi, [esp + PYSPACE]
L18:
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebp, 4
jnz L18
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L19:
// Epilogue: release the locals and restore the callee-save registers.
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Drop the frame-offset macros so that the next routine can define its own.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef LPCTABLE
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
__declspec(naked)
_STATIC void IA_H26X_CLUT4toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * lpInput,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | +100
// | FrameHeight | + 96
// | FrameWidth | + 92
// | VPlane | + 88
// | UPlane | + 84
// | YPlane | + 80
// | lpInput | + 76
// | lpbiInput | + 72
// ----------------------------
// | return addr | + 68
// | saved ebp | + 64
// | saved ebx | + 60
// | saved esi | + 56
// | saved edi | + 52
// | pyprev | + 48
// | pyspace | + 44
// | pynext | + 40
// | i | + 36
// | j | + 32
// | k | + 28
// | iBackTwoLines | + 24
// | stretch | + 20
// | mark | + 16
// | lpCEntry | + 12
// | lpCTable | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 52
#define PITCH_PARM 100
#define FRAME_HEIGHT 96
#define FRAME_WIDTH 92
#define VPLANE 88
#define UPLANE 84
#define YPLANE 80
#define LP_INPUT 76
#define LPBI_INPUT 72
#define PYPREV 48
#define PYSPACE 44
#define PYNEXT 40
#define LOOP_I 36
#define LOOP_J 32
#define LOOP_K 28
#define BACK_TWO_LINES 24
#define STRETCH 20
#define MARK 16
#define LUMA_ITERS 12
#define LPCTABLE 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// lpCTable = lpbiInput + sizeof(BITMAPINFOHEADER)
// assign (ebx, lpbiInput)
mov eax, [esp + LPBI_INPUT]
mov ebx, eax
add eax, TYPE BITMAPINFOHEADER
mov [esp + LPCTABLE], eax
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (ecx, FrameWidth)
// kill (edx, pitch)
shr ecx, 1
sub edx, ecx
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (ecx, LumaIters)
xor ecx, ecx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea ecx, [ecx + 4]
sub eax, 48
jnz L1
// width_adj = ((lpbiInput->biWidth - FrameWidth) >> 2
// assign (edx, width_adj)
mov edx, (LPBITMAPINFOHEADER)[ebx].biWidth
sub edx, [esp + FRAME_WIDTH]
shr edx, 2
// aspect = (width_adj ? LumaIters : 0)
// assign (esi, aspect)
// kill (ecx, LumaIters)
mov [esp + LUMA_ITERS], ecx
xor esi, esi
test edx, edx
jz L2
mov esi, ecx
// height _adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (ecx, height_adj)
L2:
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub ecx, [esp + FRAME_HEIGHT]
add ecx, esi
shr ecx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test ecx, ecx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov edi, 12
sub edi, eax
mov [esp + MARK], edi
// iNextLine = lpbiInput->biWidth >> 1
// kill (ebx, lpbiInput)
// assign (ebx, iNextLine)
mov ebx, (LPBITMAPINFOHEADER)[ebx].biWidth
shr ebx, 1
// iBackTwoLines = -(iNextline + (FrameWidth >> 1))
mov edi, [esp + FRAME_WIDTH]
shr edi, 1
add edi, ebx
neg edi
mov [esp + BACK_TWO_LINES], edi
// pnext = lpInput+(iNextLine*((FrameHeight-aspect-1)+height_adj))+ width_adj
// kill (ebx, iNextLine)
// kill (ecx, height_adj)
// kill (edx, width_adj)
// kill (esi, aspect)
// assign (esi, pnext)
mov eax, [esp + FRAME_HEIGHT]
sub eax, esi
dec eax
add eax, ecx
mov esi, [esp + LP_INPUT]
add esi, edx
imul ebx
add esi, eax
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4:
xor eax, eax
mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 2, pnext++)
L5:
mov eax, [esp + FRAME_WIDTH]
mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts on the U pipe
jmp L6
L6:
// lpCEntry = &lpCTable[*pnext&0xF]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *(YPlane+1) = (U8)((t>>8)+8)
// lpCEntry = &lpCTable[(*pnext>>4)&0xF]
// t = ( BYUV[lpCEntry->rgbBlue>>1].YU +
// GYUV[lpCEntry->rgbGreen>>1].YU +
// RYUV[lpCEntry->rgbRed>>1].YU )
// *YPlane = (U8)((t>>8)+8)
// YPlane += 2
// *UPlane++ = (U8)((t>24)+64)
// t = ( RYUV[lpCEntry->rgbRed>>1].V +
// GYUV[lpCEntry->rgbGreen>>1].V +
// BYUV[lpCEntry->rgbBlue>>1].V )
// *VPlane++ = (U8)((t>>8)+64)
// assign (ebp: lpCEntry,B1)
// assign (eax: P2,B2,Y2,Y1,U)
// assign (ebx: B1,V)
// assign (ecx: G2,G1)
// assign (edx: R2,R1)
// 1
mov al, [esi]
mov ebp, [esp + LPCTABLE]
// 2
and eax, 0xF
xor ecx, ecx
// 3
lea ebx, [ebp+eax*4]
xor edx, edx
// 4
mov al, (LPRGBQUAD)[ebx].rgbBlue
nop
// 5
mov cl, (LPRGBQUAD)[ebx].rgbGreen
and al, 0xFE
// 6
mov dl, (LPRGBQUAD)[ebx].rgbRed
and cl, 0xFE
// 7
mov eax, [BYUV+eax*4].YU
and dl, 0xFE
// 8
add eax, [GYUV+ecx*4].YU
mov bl, [esi]
// 9
add eax, [RYUV+edx*4].YU
and ebx, 0xF0
//
shr ebx, 4
nop
// 10
shr eax, 8
lea ebp, [ebp+ebx*4]
// 11
add eax, 8
nop
// 12
mov [edi + 1], al
mov bl, (LPRGBQUAD)[ebp].rgbBlue
// 13
mov cl, (LPRGBQUAD)[ebp].rgbGreen
and bl, 0xFE
// 14
mov dl, (LPRGBQUAD)[ebp].rgbRed
and cl, 0xFE
// 15
mov eax, [BYUV+ebx*4].YU
and dl, 0xFE
// 16
add eax, [GYUV+ecx*4].YU
mov ebp, ebx
// 17
add eax, [RYUV+edx*4].YU
nop
// 18
shr eax, 8
mov ebx, [esp + LOOP_K]
// 19
add eax, 8
and ebx, 1
// 20
mov [edi], al
jnz L9
// 21
mov ebx, [RYUV+edx*4].V
mov edx, [esp + UPLANE]
// 22
sar eax, 16
add ebx, [GYUV+ecx*4].V
// 23
add eax, 64
add ebx, [BYUV+ebp*4].V
// 24
mov [edx], al
inc edx
// 25
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 26
sar ebx, 8
inc edx
// 27
add ebx, 64
mov [esp + VPLANE], edx
// 28
mov [edx - 1], bl
nop
L9:
// 32
mov eax, [esp + LOOP_I]
lea esi, [esi + 1]
// 33
sub eax, 2
lea edi, [edi + 2]
// 34
mov [esp + LOOP_I], eax
jnz L6
// only esi (pnext) is live at this point (after line loop)
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH]
test eax, eax
jz L14
mov eax, [esp + LOOP_K]
test eax, eax
jnz L14
mov eax, [esp + LOOP_J]
test eax, eax
jz L14
// spill YPlane ptr
mov [esp + YPLANE], edi
nop
// for (i = FrameWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV]
mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT]
mov ebp, [esp + FRAME_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
L15:
// 1
mov eax, [ebx]
lea ebx, [ebx + 4]
// 2
mov ecx, [edx]
lea edx, [edx + 4]
// 3
shr ecx, 1
and eax, 0xFEFEFEFE
// 4
shr eax, 1
and ecx, 0x7F7F7F7F
// 5
add eax, ecx
mov ecx, [ebx]
// 6
shr ecx, 1
mov [edi], eax
// 7
mov eax, [edx]
and ecx, 0x7F7F7F7F
// 8
shr eax, 1
lea edi, [edi + 4]
// 9
and eax, 0x7F7F7F7F
lea ebx, [ebx + 4]
// 10
lea edx, [edx + 4]
add eax, ecx
// 11
mov [edi], eax
lea edi, [edi + 4]
// 12
sub ebp, 8
jnz L15
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += iBackTwoLines
L14:
add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K]
and eax, 1
jnz L16
// UPlane += uvpitch_adj;
// VPlane += uvpitch_adj;
mov eax, [esp + UVPITCH_ADJ]
add [esp + UPLANE], eax
add [esp + VPLANE], eax
L16:
inc DWORD PTR [esp + LOOP_K]
mov eax, [esp + LOOP_K]
cmp eax, [esp + MARK]
jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0
je L17
// pyprev = YPlane - pitch
mov eax, edi
sub eax, [esp + PITCH_PARM]
mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM]
mov [esp + PYNEXT], edi
L17:
inc DWORD PTR [esp + LOOP_J]
mov eax, [esp + LOOP_J]
cmp eax, [esp + LUMA_ITERS]
jl L4
// kill(esi, pnext)
// if (stretch)
mov esi, [esp + PYPREV]
cmp DWORD PTR [esp + STRETCH], 0
je L19
// for (i = FrameWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov edi, [esp + PYSPACE]
mov ebp, [esp + FRAME_WIDTH]
L18:
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebp, 4
jnz L18
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L19:
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Release the stack-frame offset names used by the routine above so that
// the next routine can define its own [esp + N] layout with the same names.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef LPCTABLE
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/***************************************************
* H26X_YVU9toYUV12()
* Convert from YVU9 to YUV12
* and copy to destination memory using the row pitch
* passed in the pitch parameter.
*
* uv_plane_common()
* Helper function to convert V and U plane information.
* Since the process is similar for both planes, the
* conversion code was included in this subroutine.
*
***************************************************/
#if 0
// Helper macros for the C reference routines in this #if 0 section.
// All of them treat a 32-bit (or 64-bit) word as a pack of independent bytes.
//
// READ_*_AND_SHIFT: load a word from src into val and halve every byte.
// The low bit of each byte is masked off first so that the shift cannot
// move a bit into the neighbouring byte.
#define READ_DWORD_AND_SHIFT(val,src) \
(((val) = *((unsigned int *)(src))), ((val) &= 0xFEFEFEFE), ((val) >>= 1))
#define READ_QWORD_AND_SHIFT(val,src) \
(((val) = *((unsigned __int64 *)(src))), ((val) &= 0xFEFEFEFEFEFEFEFE), ((val) >>= 1))
// WRITE_*: store val to dest as one 32-bit / 64-bit word.
#define WRITE_DWORD(dest,val) ((*(unsigned int *)(dest)) = (val))
#define WRITE_QWORD(dest,val) ((*(unsigned __int64 *)(dest)) = (val))
// AVERAGE_DWORDS: per-byte (in1 + in2) / 2. The routines below only pass
// values already halved by READ_DWORD_AND_SHIFT (each byte <= 0x7F), so the
// per-byte sums cannot carry into the next byte.
#define AVERAGE_DWORDS(out,in1,in2) ((out) = ((((in1) + (in2)) & 0xFEFEFEFE) >> 1))
// DUP_LOWER_TWO_BYTES: with val = bytes b3:b2:b1:b0 (b0 least significant),
// store the dword b1:b1:b0:b0 at dest, i.e. each of the two low bytes twice.
#define DUP_LOWER_TWO_BYTES(dest,val) \
(*((unsigned int *)(dest)) = (((val) & 0x000000FF) | (((val) << 8) & 0x0000FF00) | \
(((val) << 8) & 0x00FF0000) | (((val) << 16) & 0xFF000000)))
// DUP_UPPER_TWO_BYTES: same as above for the two high bytes; stores
// b3:b3:b2:b2 at dest.
#define DUP_UPPER_TWO_BYTES(dest,val) \
(*((unsigned int *)(dest)) = ((((val) >> 16) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \
(((val) >> 8) & 0x00FF0000) | ((val) & 0xFF000000)))
// C reference for IA_uv_plane_common: expand one chroma plane.
//
// Every source byte is halved and written twice horizontally.  Every source
// row produces two destination rows: a plain copy (pdest_copy) and, one pitch
// further down, a row averaged with the following source row (pdest_avg).
// When spitch_adj is non-zero ("stretch") one extra blended row is emitted
// after each group of source rows; the group length alternates between
// mark + 1 and mark rows, driven by the low bit of flag.
//
//   psrc             - first source byte to read
//   Plane            - first destination byte
//   pitch            - distance in bytes between destination rows
//   OutputFrameWidth - bytes written per destination row (consumed 8 at a time)
//   ChromaIters      - number of row groups
//   spitch_adj       - source bytes to skip at the end of each source row
_STATIC void C_uv_plane_common(
U8 *psrc,
U8 *Plane,
UN pitch,
UN OutputFrameWidth,
UN ChromaIters,
UN spitch_adj) {
    int stretch = (spitch_adj ? 1 : 0);
    int mark = 6 - stretch;
    int flag = stretch;
    int dpitch_adj = pitch - OutputFrameWidth;
    U8 *pnext = psrc + (OutputFrameWidth>>1) + spitch_adj;
    U8 *pdest_copy = Plane;
    U8 *pdest_avg = Plane + pitch;
    int groups_left;
    int rows_left;
    int bytes_left;
    UN cur, below;

    groups_left = ChromaIters;
    while (groups_left > 0) {
        rows_left = mark + (flag & 1);
        while (rows_left > 0) {
            // On the very last row of an unstretched plane there is no row
            // below to blend with, so blend the row with itself.
            if ((0 == stretch) && (1 == groups_left) && (1 == rows_left)) {
                pnext = psrc;
            }
            bytes_left = OutputFrameWidth;
            while (bytes_left > 0) {
                // copy row: halved source bytes, each written twice
                READ_DWORD_AND_SHIFT(cur,psrc);
                DUP_LOWER_TWO_BYTES(pdest_copy,cur);
                DUP_UPPER_TWO_BYTES((pdest_copy+4),cur);
                // averaged row: mean of this row and the one below
                READ_DWORD_AND_SHIFT(below,pnext);
                AVERAGE_DWORDS(cur,cur,below);
                DUP_LOWER_TWO_BYTES(pdest_avg,cur);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),cur);
                bytes_left -= 8;
                psrc += 4;
                pnext += 4;
                pdest_copy += 8;
                pdest_avg += 8;
            }
            // step both source pointers to the next source row and both
            // destination pointers past the two rows just written
            psrc += spitch_adj;
            pnext += spitch_adj;
            pdest_copy = pdest_avg + dpitch_adj;
            pdest_avg = pdest_copy + pitch;
            rows_left--;
        }
        if (stretch) {
            // Back up one source row and emit a single extra row that is
            // blended twice towards the row below.
            psrc -= ((OutputFrameWidth>>1) + spitch_adj);
            pnext -= ((OutputFrameWidth>>1) + spitch_adj);
            pdest_avg = pdest_copy;
            bytes_left = OutputFrameWidth;
            while (bytes_left > 0) {
                READ_DWORD_AND_SHIFT(cur,psrc);
                READ_DWORD_AND_SHIFT(below,pnext);
                AVERAGE_DWORDS(cur,cur,below);
                AVERAGE_DWORDS(cur,cur,below);
                DUP_LOWER_TWO_BYTES(pdest_avg,cur);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),cur);
                bytes_left -= 8;
                psrc += 4;
                pnext += 4;
                pdest_avg += 8;
            }
            psrc += spitch_adj;
            pnext += spitch_adj;
            pdest_copy = pdest_avg + dpitch_adj;
            pdest_avg = pdest_copy + pitch;
            flag++;
        }
        groups_left--;
    }
}
// C reference for IA_H26X_YVU9toYUV12: convert a YVU9 image to the encoder's
// YUV12 planes.
//
// Luma samples are halved and copied in LumaIters groups of `mark` rows,
// starting at an offset (height_adj rows, width_adj bytes) into the input.
// When height_adj is non-zero ("stretch"), one extra row averaged from the
// rows before and after the group boundary is inserted after every group.
// The two chroma planes are then expanded by C_uv_plane_common.
_STATIC void C_H26X_YVU9toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch) {
    U8 *pnext, *plast, *pbn;
    U8 *pvsrc, *pusrc;
    int width_adj, height_adj;
    int aspect, stretch, mark;
    int iNextLine;
    int i, j, k, t;
    int LumaIters = 0;
    int ypitch_adj = pitch - FrameWidth;
    U32 above, below;

    // four row groups for every 48 output rows
    i = FrameHeight;
    while (i > 0) {
        LumaIters += 4;
        i -= 48;
    }

    width_adj = (lpbiInput->biWidth - FrameWidth) >> 1;
    aspect = (width_adj ? LumaIters : 0);
    height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
    stretch = (height_adj ? 1 : 0);
    mark = 12 - stretch;
    iNextLine = width_adj << 1;
    pnext = lpInput + (lpbiInput->biWidth * height_adj) + width_adj;

    j = LumaIters;
    while (j > 0) {
        // straight copy of `mark` rows, halving every sample
        k = mark;
        while (k > 0) {
            i = FrameWidth;
            while (i > 0) {
                *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;
                i -= 4;
                YPlane += 4;
                pnext += 4;
            }
            pnext += iNextLine;
            YPlane += ypitch_adj;
            k--;
        }
        if (stretch) {
            // extra row: mean of the last row copied and the next input row
            plast = pnext - lpbiInput->biWidth;
            pbn = pnext;
            i = FrameWidth;
            while (i > 0) {
                above = (*(U32 *)plast & 0xFEFEFEFE) >> 1;
                below = (*(U32 *)pbn & 0xFEFEFEFE) >> 1;
                *(U32 *)YPlane = ( ( above + below ) & 0xFEFEFEFE ) >> 1;
                i -= 4;
                YPlane += 4;
                plast += 4;
                pbn += 4;
            }
            YPlane += ypitch_adj;
        }
        j--;
    }

    // In the input the V plane follows the Y plane and the U plane follows V.
    // t is the offset of the first chroma sample used within either plane.
    t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2);
    pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight);
    pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2));
    pvsrc += t;
    pusrc += t;
    C_uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
    C_uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1);
}
#endif
// IA_uv_plane_common
// x86 inline-assembly counterpart of the C_uv_plane_common reference code.
// Expands one chroma plane: every source byte is halved ((b & 0xFE) >> 1) and
// written twice horizontally. Each source row produces two destination rows:
// a copy row (pdest_copy) and, one pitch below it, a row averaged with the
// next source row (pdest_avg). When spitch_adj is non-zero (stretch), one
// extra blended row is written after each group of source rows.
//
// The function is naked: it builds its own frame, reads its arguments at the
// [esp + N] offsets defined below and ends with a plain ret, so the caller
// removes the arguments from the stack.
//
// Loop counters are tested with jnz after sub/dec rather than "> 0", so
// OutputFrameWidth must be a non-zero multiple of 8 and ChromaIters must be
// non-zero for the loops to terminate.
__declspec(naked)
_STATIC void IA_uv_plane_common(
U8 *psrc,
U8 *Plane,
UN pitch,
UN OutputFrameWidth,
UN ChromaIters,
UN spitch_adj)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | spitch_adj | + 64
// | ChromaIters | + 60
// | OutputFrameWidth| + 56
// | pitch | + 52
// | Plane | + 48
// | psrc | + 44
// -----------------------------
// | return addr | + 40
// | saved ebp | + 36
// | saved ebx | + 32
// | saved esi | + 28
// | saved edi | + 24
// | dpitch_adj | + 20
// | stretch | + 16
// | mark | + 12
// | flag | + 8
// | j | + 4
// | k | + 0
#define LOCALSIZE 24
#define SPITCH_ADJ 64
#define CHROMA_ITERS 60
#define OUTPUT_FRAME_WIDTH 56
#define PITCH_PARM 52
#define PLANE 48
#define PSRC 44
#define DPITCH_ADJ 20
#define STRETCH 16
#define MARK 12
#define FLAG 8
#define LOOP_J 4
#define LOOP_K 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// pnext = psrc + (OutputFrameWidth>>1) + spitch_adj
// pdest_copy = Plane
// pdest_avg = Plane + pitch
// assign (esi, psrc)
// assign (ecx, pnext)
// assign (edi, pdest_copy)
// assign (edx, pdest_avg)
// assign (ebp, i)
mov esi, [esp + PSRC]
mov ecx, esi
mov eax, [esp + OUTPUT_FRAME_WIDTH]
shr eax, 1
add eax, [esp + SPITCH_ADJ]
add ecx, eax
mov edi, [esp + PLANE]
mov edx, edi
add edx, [esp + PITCH_PARM]
// dpitch_adj = pitch - OutputFrameWidth
mov eax, [esp + PITCH_PARM]
sub eax, [esp + OUTPUT_FRAME_WIDTH]
mov [esp + DPITCH_ADJ], eax
// stretch = (spitch_adj ? 1 : 0)
xor ebx, ebx
mov eax, [esp + SPITCH_ADJ]
test eax, eax
jz L1
inc ebx
L1:
mov [esp + STRETCH], ebx
// mark = 6 - stretch
mov eax, 6
sub eax, ebx
mov [esp + MARK], eax
// flag = stretch
mov DWORD PTR [esp + FLAG], ebx
// for (j = ChromaIters; j > 0; j--)
mov eax, [esp + CHROMA_ITERS]
mov [esp + LOOP_J], eax
L2:
// for (k = mark + (flag & 1); k > 0; k--)
mov eax, [esp + FLAG]
and eax, 1
add eax, [esp + MARK]
mov [esp + LOOP_K], eax
L3:
// if (!stretch && (1 == j) && (1 == k))
// (last row of the last group: j and k count down, so 1 is the final pass)
mov eax, [esp + STRETCH]
test eax, eax
jnz L4
mov eax, [esp + LOOP_J]
cmp eax, 1
jne L4
mov eax, [esp + LOOP_K]
cmp eax, 1
jne L4
// pnext = psrc
mov ecx, esi
L4:
// for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4,
// pdest_copy += 8, pdest_avg += 8)
mov ebp, [esp + OUTPUT_FRAME_WIDTH]
// Pentium pipeline scheduling has not been performed on the following loop code yet
L5:
// READ_DWORD_AND_SHIFT(t1,psrc)
mov eax, [esi]
and eax, 0xFEFEFEFE
shr eax, 1
// DUP_LOWER_TWO_BYTES(pdest_copy,t1)
// ebx is built as ah:ah:al:al (upper half first, then shifted up by 16)
mov bl, ah
mov bh, ah
shl ebx, 16
mov bl, al
mov bh, al
mov [edi], ebx
// DUP_UPPER_TWO_BYTES((pdest_copy+4),t1)
// bring the two high bytes of t1 down into ax and repeat the same sequence
shr eax, 16
mov bl, ah
mov bh, ah
shl ebx, 16
mov bl, al
mov bh, al
mov [edi+4], ebx
// READ_DWORD_AND_SHIFT(t2,pnext)
// AVERAGE_DWORDS(t1,t1,t2)
// t1 was destroyed by the shift above, so it is re-read from psrc first
mov eax, [esi]
and eax, 0xFEFEFEFE
shr eax, 1
mov ebx, [ecx]
and ebx, 0xFEFEFEFE
shr ebx, 1
add eax, ebx
and eax, 0xFEFEFEFE
shr eax, 1
// DUP_LOWER_TWO_BYTES(pdest_avg,t1)
mov bl, ah
mov bh, ah
shl ebx, 16
mov bl, al
mov bh, al
mov [edx], ebx
// DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)
shr eax, 16
mov bl, ah
mov bh, ah
shl ebx, 16
mov bl, al
mov bh, al
mov [edx+4], ebx
// end of i loop
// (lea does not change the flags, so jnz tests the result of sub ebp, 8)
lea esi, [esi + 4]
lea ecx, [ecx + 4]
lea edi, [edi + 8]
lea edx, [edx + 8]
sub ebp, 8
jnz L5
// psrc += spitch_adj
// pnext += spitch_adj
// pdest_copy = pdest_avg + dpitch_adj
// pdest_avg = pdest_copy + pitch
add esi, [esp + SPITCH_ADJ]
add ecx, [esp + SPITCH_ADJ]
mov eax, edx
add eax, [esp + DPITCH_ADJ]
mov edi, eax
mov edx, edi
add edx, [esp + PITCH_PARM]
// end of k loop
dec DWORD PTR [esp + LOOP_K]
jnz L3
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0
jz L6
// psrc -= ((OutputFrameWidth>>1)+spitch_adj)
// pnext -= ((OutputFrameWidth>>1)+spitch_adj)
// pdest_avg = pdest_copy
mov eax, [esp + OUTPUT_FRAME_WIDTH]
shr eax, 1
add eax, [esp + SPITCH_ADJ]
sub esi, eax
sub ecx, eax
mov edx, edi
// for (i = OutputFrameWidth; i > 0; i -= 8, psrc += 4, pnext += 4, pdest_avg += 8)
mov ebp, [esp + OUTPUT_FRAME_WIDTH]
// Pentium pipeline scheduling has not been performed on the following loop code yet
L7:
// READ_DWORD_AND_SHIFT(t1,psrc)
mov eax, [esi]
and eax, 0xFEFEFEFE
shr eax, 1
// READ_DWORD_AND_SHIFT(t2,pnext)
mov ebx, [ecx]
and ebx, 0xFEFEFEFE
shr ebx, 1
// AVERAGE_DWORDS(t1,t1,t2)
// AVERAGE_DWORDS(t1,t1,t2)
// (averaging twice with t2 weights the result towards the pnext row)
add eax, ebx
and eax, 0xFEFEFEFE
shr eax, 1
add eax, ebx
and eax, 0xFEFEFEFE
shr eax, 1
// DUP_LOWER_TWO_BYTES(pdest_avg,t1)
mov bl, ah
mov bh, ah
shl ebx, 16
mov bl, al
mov bh, al
mov [edx], ebx
// DUP_UPPER_TWO_BYTES((pdest_avg+4),t1)
shr eax, 16
mov bl, ah
mov bh, ah
shl ebx, 16
mov bl, al
mov bh, al
mov [edx+4], ebx
// end of i loop
lea esi, [esi + 4]
lea ecx, [ecx + 4]
lea edx, [edx + 8]
sub ebp, 8
jnz L7
// psrc += spitch_adj
// pnext += spitch_adj
// pdest_copy = pdest_avg + dpitch_adj
// pdest_avg = pdest_copy + pitch
// flag++
add esi, [esp + SPITCH_ADJ]
add ecx, [esp + SPITCH_ADJ]
mov eax, edx
add eax, [esp + DPITCH_ADJ]
mov edi, eax
mov edx, edi
add edx, [esp + PITCH_PARM]
inc DWORD PTR [esp + FLAG]
// end of j loop
L6:
dec DWORD PTR [esp + LOOP_J]
jnz L2
// release locals and restore the callee-save registers
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Release the frame-offset names so the next routine can define its own.
#undef LOCALSIZE
#undef SPITCH_ADJ
#undef CHROMA_ITERS
#undef OUTPUT_FRAME_WIDTH
#undef PITCH_PARM
#undef PLANE
#undef PSRC
#undef DPITCH_ADJ
#undef STRETCH
#undef MARK
#undef FLAG
#undef LOOP_J
#undef LOOP_K
// IA_H26X_YVU9toYUV12
// x86 inline-assembly counterpart of the C_H26X_YVU9toYUV12 reference code.
// Luma: input dwords are halved per byte and stored to YPlane in LumaIters
// groups of `mark` rows; when stretch is set, one extra row averaged from the
// rows on either side of the group boundary is inserted after each group.
// Chroma: IA_uv_plane_common is called once for the U plane and once for the
// V plane.
//
// The function is naked: arguments and locals are addressed as [esp + N]
// using the offsets defined below, and it returns with a plain ret.
//
// Loop counters are tested with jnz, so FrameHeight must be a non-zero
// multiple of 48 and FrameWidth a non-zero multiple of 4 for the loops to
// terminate.
__declspec(naked)
_STATIC void IA_H26X_YVU9toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | + 88
// | FrameHeight | + 84
// | FrameWidth | + 80
// | VPlane | + 76
// | UPlane | + 72
// | YPlane | + 68
// | lpInput | + 64
// | lpbiInput | + 60
// -----------------------------
// | return addr | + 56
// | saved ebp | + 52
// | saved ebx | + 48
// | saved esi | + 44
// | saved edi | + 40
// | width_adj | + 36
// | height_adj | + 32
// | stretch | + 28
// | mark | + 24
// | iNextLine | + 20
// | j | + 16
// | k | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
#define LOCALSIZE 40
#define PITCH_PARM 88
#define FRAME_HEIGHT 84
#define FRAME_WIDTH 80
#define VPLANE 76
#define UPLANE 72
#define YPLANE 68
#define LP_INPUT 64
#define LPBI_INPUT 60
#define WIDTH_ADJ 36
#define HEIGHT_ADJ 32
#define STRETCH 28
#define MARK 24
#define NEXT_LINE 20
#define LOOP_J 16
#define LOOP_K 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT]
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
// (stored here but not read again anywhere in this routine)
mov ebp, ecx
shr ebp, 1
sub edx, ebp
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
// (the loop exits on i == 0, not i <= 0)
xor edx, edx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea edx, [edx + 4]
sub eax, 48
jnz L1
// width_adj = (lpbiInput->biWidth - FrameWidth) >> 1
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
sub esi, [esp + FRAME_WIDTH]
shr esi, 1
mov [esp + WIDTH_ADJ], esi
// aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx
xor edi, edi
test esi, esi
jz L2
mov edi, edx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2:
mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub edx, [esp + FRAME_HEIGHT]
add edx, edi
shr edx, 1
mov [esp + HEIGHT_ADJ], edx
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test edx, edx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov ebp, 12
sub ebp, eax
mov [esp + MARK], ebp
// iNextLine = width_adj << 1
mov ebp, esi
shl ebp, 1
mov [esp + NEXT_LINE], ebp
// pnext = lpInput + (lpbiInput->biWidth * height_adj) + width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
// (one-operand imul leaves the product in edx:eax, so edx is overwritten;
// height_adj has already been saved to the frame)
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
mov ebx, edx
imul ebx
add esi, eax
add esi, [esp + LP_INPUT]
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = LumaIters; j > 0; j--)
mov eax, [esp + LUMA_ITERS]
mov [esp + LOOP_J], eax
// for (k = mark; k > 0; k--)
L4:
mov eax, [esp + MARK]
mov [esp + LOOP_K], eax
// for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, pnext += 4)
// assign (ebp, i)
L5:
mov ebp, [esp + FRAME_WIDTH]
// This jump is here to make sure the following loop starts on the U pipe
jmp L6
L6:
// *(U32 *)YPlane = (*(U32 *)pnext & 0xFEFEFEFE) >> 1;
// 1
mov eax, [esi]
lea esi, [esi + 4]
// 2
and eax, 0xFEFEFEFE
lea edi, [edi + 4]
// 3
shr eax, 1
sub ebp, 4
// 4
// (mov does not change the flags, so jnz tests the result of sub ebp, 4;
// edi has already been advanced, hence the - 4)
mov [edi - 4], eax
jnz L6
// pnext += iNextLine
// YPlane += ypitch_adj
add esi, [esp + NEXT_LINE]
add edi, [esp + YPITCH_ADJ]
// end of k loop
mov eax, [esp + LOOP_K]
sub eax, 1
mov [esp + LOOP_K], eax
jnz L5
// if (stretch)
mov eax, [esp + STRETCH]
test eax, eax
jz L7
// plast = pnext - lpbiInput->biWidth
// pbn = pnext
// assign (ecx, plast)
// assign (edx, pbn)
mov ecx, esi
mov eax, [esp + LPBI_INPUT]
sub ecx, (LPBITMAPINFOHEADER)[eax].biWidth
mov edx, esi
// for (i = FrameWidth; i > 0; i -= 4, YPlane += 4, plast += 4, pbn += 4)
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH]
// This jump is here just to make sure the loop code starts with the U pipe
jmp L8
L8:
// *(U32 *)YPlane =
// ( ( ((*(U32 *)plast & 0xFEFEFEFE) >> 1) +
// ((*(U32 *)pbn & 0xFEFEFEFE) >> 1) ) & 0xFEFEFEFE ) >> 1
// (here the halving is done as shift-then-mask with 0x7F7F7F7F, which gives
// the same per-byte result as mask-then-shift)
// 1
mov eax, [ecx]
lea ecx, [ecx + 4]
// 2
shr eax, 1
// 3
and eax, 0x7F7F7F7F
mov ebx, [edx]
// 4
shr ebx, 1
lea edi, [edi + 4]
// 5
and ebx, 0x7F7F7F7F
// 6
add eax, ebx
// 7
and eax, 0xFEFEFEFE
// 8
shr eax, 1
// 9
mov [edi - 4], eax
sub ebp, 4
// 10
lea edx, [edx + 4]
jnz L8
// YPlane += ypitch_adj
add edi, [esp + YPITCH_ADJ]
L7:
// end of the LumaIters loop
dec DWORD PTR [esp + LOOP_J]
jnz L4
// pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight)
// assign (esi, pvsrc)
mov eax, [esp + LPBI_INPUT]
mov ebx, (LPBITMAPINFOHEADER)[eax].biWidth
mov eax, (LPBITMAPINFOHEADER)[eax].biHeight
imul ebx
add eax, [esp + LP_INPUT]
mov esi, eax
// pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2))
// assign (edi, pusrc)
mov eax, [esp + LPBI_INPUT]
mov ecx, (LPBITMAPINFOHEADER)[eax].biWidth
shr ecx, 2
mov eax, (LPBITMAPINFOHEADER)[eax].biHeight
shr eax, 2
imul ecx
add eax, esi
mov edi, eax
// t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2)
// assign (eax, t)
mov eax, [esp + LPBI_INPUT]
mov eax, (LPBITMAPINFOHEADER)[eax].biWidth
shr eax, 2
mov ebx, [esp + HEIGHT_ADJ]
shr ebx, 2
imul ebx
mov ebx, [esp + WIDTH_ADJ]
shr ebx, 2
add eax, ebx
// pvsrc += t
// pusrc += t
add esi, eax
add edi, eax
// uv_plane_common(pusrc,UPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)
// ebp keeps a copy of esp so the frame offsets stay valid while the
// arguments are pushed (last argument first)
mov ebp, esp
mov eax, [ebp + WIDTH_ADJ]
shr eax, 1
push eax
mov eax, [ebp + LUMA_ITERS]
shr eax, 1
push eax
mov eax, [ebp + FRAME_WIDTH]
shr eax, 1
push eax
push DWORD PTR [ebp + PITCH_PARM]
push DWORD PTR [ebp + UPLANE]
push edi
call IA_uv_plane_common
// drop the six pushed arguments
lea esp, [esp + 24]
// uv_plane_common(pvsrc,VPlane,pitch,FrameWidth>>1,LumaIters>>1,width_adj>>1)
// (esi still holds pvsrc: IA_uv_plane_common saves and restores esi)
mov ebp, esp
mov eax, [ebp + WIDTH_ADJ]
shr eax, 1
push eax
mov eax, [ebp + LUMA_ITERS]
shr eax, 1
push eax
mov eax, [ebp + FRAME_WIDTH]
shr eax, 1
push eax
push DWORD PTR [ebp + PITCH_PARM]
push DWORD PTR [ebp + VPLANE]
push esi
call IA_uv_plane_common
lea esp, [esp + 24]
// release locals and restore the callee-save registers
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Release the frame-offset names so the next routine can define its own.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef WIDTH_ADJ
#undef HEIGHT_ADJ
#undef STRETCH
#undef MARK
#undef NEXT_LINE
#undef LOOP_J
#undef LOOP_K
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/***************************************************
* H26X_YUV12toEncYUV12()
* Copy YUV12 data to encoder memory at the
* appropriate location. It is assumed that the input
* data is stored as rows of Y, followed by rows of U,
* then rows of V.
*
***************************************************/
#if 0
// C reference for IA_H26X_YUV12toEncYUV12: copy planar YUV12 input (all Y
// rows, then all U rows, then all V rows) into the encoder's Y, U and V
// planes, halving every sample on the way.
//
//   lpbiInput   - describes the input image (biWidth, biHeight)
//   lpInput     - first byte of the input Y plane
//   YPlane,
//   UPlane,
//   VPlane      - destination planes; rows are `pitch` bytes apart
//   FrameWidth,
//   FrameHeight - size of the encoder frame in luma samples
//   pitch       - destination row pitch in bytes
//
// Input smaller than the frame: the uncovered part of each plane is padded
// with 0 (Y) or 0x40 (U and V).
// Input larger than the frame: only the top-left FrameWidth x FrameHeight
// region is copied and the rest of each input row is skipped. This matches
// the NoCrop0/NoCrop1 handling in IA_H26X_YUV12toEncYUV12; without it the
// copy loops would write past the end of each frame row and past the last
// frame row.
//
// Samples are moved eight bytes per loop step, so the copied and padded
// luma widths are expected to be multiples of 16 (8 after halving for
// chroma), as in the assembly version.
_STATIC void C_H26X_YUV12toEncYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch) {
int i, j;
U8 *pnext = lpInput;
int ypitch_adj = pitch - FrameWidth;
int yinput_height = lpbiInput->biHeight;
int yinput_width = lpbiInput->biWidth;
int yheight_diff = (int)FrameHeight - yinput_height;
int ywidth_diff = (int)FrameWidth - yinput_width;
int uvpitch_adj = pitch - (FrameWidth >> 1);
int uvoutput_width = FrameWidth >> 1;
int uvinput_height;
int uvinput_width;
int uvheight_diff;
int uvwidth_diff;
// Start of the input U and V planes. These use the uncropped input size.
U8 *pusrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight);
U8 *pvsrc = pusrc + ((lpbiInput->biWidth >> 1) * (lpbiInput->biHeight >> 1));
// Input bytes to skip at the end of every copied row when cropping.
int yskip = 0;
int uvskip;
// Input taller than the frame: copy only FrameHeight rows, no padding rows.
if (yheight_diff < 0) {
yinput_height = (int)FrameHeight;
yheight_diff = 0;
}
// Input wider than the frame: copy only FrameWidth bytes per row and skip
// the remainder of each input row.
if (ywidth_diff < 0) {
yskip = yinput_width - (int)FrameWidth;
yinput_width = (int)FrameWidth;
ywidth_diff = 0;
}
uvinput_height = yinput_height >> 1;
uvinput_width = yinput_width >> 1;
uvheight_diff = yheight_diff >> 1;
uvwidth_diff = ywidth_diff >> 1;
uvskip = yskip >> 1;
// Y plane: copied rows, each padded on the right with 0.
for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj, pnext += yskip) {
for (i = yinput_width; i > 0; i -= 8) {
*(U32 *)YPlane = (*(U32 *)pnext >> 1) & 0x7F7F7F7F; YPlane += 4; pnext += 4;
*(U32 *)YPlane = (*(U32 *)pnext >> 1) & 0x7F7F7F7F; YPlane += 4; pnext += 4;
}
for (i = ywidth_diff; i > 0; i -= 8) {
*(U32 *)YPlane = 0; YPlane += 4;
*(U32 *)YPlane = 0; YPlane += 4;
}
}
// Y plane: padding rows below the input.
for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj) {
for (i = FrameWidth; i > 0; i -= 8) {
*(U32 *)YPlane = 0; YPlane += 4;
*(U32 *)YPlane = 0; YPlane += 4;
}
}
// U plane: copied rows, each padded on the right with 0x40.
pnext = pusrc;
for (j = uvinput_height; j > 0; j--, UPlane += uvpitch_adj, pnext += uvskip) {
for (i = uvinput_width; i > 0; i -= 8) {
*(U32 *)UPlane = (*(U32 *)pnext >> 1) & 0x7F7F7F7F; UPlane += 4; pnext += 4;
*(U32 *)UPlane = (*(U32 *)pnext >> 1) & 0x7F7F7F7F; UPlane += 4; pnext += 4;
}
for (i = uvwidth_diff; i > 0; i -= 8) {
*(U32 *)UPlane = 0x40404040; UPlane += 4;
*(U32 *)UPlane = 0x40404040; UPlane += 4;
}
}
// U plane: padding rows.
for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj) {
for (i = uvoutput_width; i > 0; i -= 8) {
*(U32 *)UPlane = 0x40404040; UPlane += 4;
*(U32 *)UPlane = 0x40404040; UPlane += 4;
}
}
// V plane: copied rows, each padded on the right with 0x40.
pnext = pvsrc;
for (j = uvinput_height; j > 0; j--, VPlane += uvpitch_adj, pnext += uvskip) {
for (i = uvinput_width; i > 0; i -= 8) {
*(U32 *)VPlane = (*(U32 *)pnext >> 1) & 0x7F7F7F7F; VPlane += 4; pnext += 4;
*(U32 *)VPlane = (*(U32 *)pnext >> 1) & 0x7F7F7F7F; VPlane += 4; pnext += 4;
}
for (i = uvwidth_diff; i > 0; i -= 8) {
*(U32 *)VPlane = 0x40404040; VPlane += 4;
*(U32 *)VPlane = 0x40404040; VPlane += 4;
}
}
// V plane: padding rows.
for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj) {
for (i = uvoutput_width; i > 0; i -= 8) {
*(U32 *)VPlane = 0x40404040; VPlane += 4;
*(U32 *)VPlane = 0x40404040; VPlane += 4;
}
}
}
#endif
// IA_H26X_YUV12toEncYUV12
// x86 inline-assembly routine that copies planar YUV12 input into the
// encoder's Y, U and V planes, halving every sample on the way.
//
// Input smaller than the frame: the rest of each row and the remaining rows
// are filled with 0 (Y plane) or 0x40 (U and V planes).
// Input larger than the frame: the NoCrop0/NoCrop1 fall-through paths clamp
// the copied height/width to the frame size, and ebx holds the number of
// input bytes to skip at the end of every row.
//
// The function is naked: arguments and locals are addressed as [esp + N]
// using the offsets defined below, and it returns with a plain ret.
//
// All loops are of the do-while kind and test with jnz, so the copied and
// padded widths must be multiples of 8 and the copied row counts non-zero
// for the loops to terminate.
__declspec(naked)
_STATIC void IA_H26X_YUV12toEncYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | + 92
// | FrameHeight | + 88
// | FrameWidth | + 84
// | VPlane | + 80
// | UPlane | + 76
// | YPlane | + 72
// | lpInput | + 68
// | lpbiInput | + 64
// -----------------------------
// | return addr | + 60
// | saved ebp | + 56
// | saved ebx | + 52
// | saved esi | + 48
// | saved edi | + 44
// | ypitch_adj | + 40
// | yinput_height | + 36
// | yinput_width | + 32
// | yheight_diff | + 28
// | ywidth_diff | + 24
// | uvpitch_adj | + 20
// | uvoutput_width | + 16
// | uvinput_height | + 12
// | uvinput_width | + 8
// | uvheight_diff | + 4
// | uvwidth_diff | + 0
#define LOCALSIZE 44
#define PITCH_PARM 92
#define FRAME_HEIGHT 88
#define FRAME_WIDTH 84
#define VPLANE 80
#define UPLANE 76
#define YPLANE 72
#define LP_INPUT 68
#define LPBI_INPUT 64
#define YPITCH_ADJ 40
#define YINPUT_HEIGHT 36
#define YINPUT_WIDTH 32
#define YHEIGHT_DIFF 28
#define YWIDTH_DIFF 24
#define UVPITCH_ADJ 20
#define UVOUTPUT_WIDTH 16
#define UVINPUT_HEIGHT 12
#define UVINPUT_WIDTH 8
#define UVHEIGHT_DIFF 4
#define UVWIDTH_DIFF 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// note: the value loaded into ebx here is never read; ebx is reloaded with
// lpbiInput below before its first use
mov ebx, [esp + FRAME_HEIGHT]
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
// ypitch_adj = pitch - FrameWidth
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvoutput_width = FrameWidth >> 1
mov ebp, ecx
shr ebp, 1
mov [esp + UVOUTPUT_WIDTH], ebp
// uvpitch_adj = pitch - (FrameWidth >> 1)
sub edx, ebp
mov [esp + UVPITCH_ADJ], edx
// yinput_height = lpbiInput->biHeight
// uvinput_height = yinput_height >> 1
// yinput_width = lpbiInput->biWidth
// uvinput_width = yinput_width >> 1
mov ebx, [esp + LPBI_INPUT]
mov eax, (LPBITMAPINFOHEADER)[ebx].biHeight
mov [esp + YINPUT_HEIGHT], eax
shr eax, 1
mov [esp + UVINPUT_HEIGHT], eax
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
mov [esp + YINPUT_WIDTH], eax
shr eax, 1
mov [esp + UVINPUT_WIDTH], eax
// yheight_diff = FrameHeight - yinput_height
// uvheight_diff = yheight_diff >> 1;
// If the difference is negative (input taller than the frame), fall through:
// yheight_diff = 0, yinput_height = FrameHeight,
// uvinput_height = FrameHeight >> 1
mov eax, [esp + FRAME_HEIGHT]
mov ebx, eax
sub eax, [esp + YINPUT_HEIGHT]
jns NoCrop0
xor eax, eax
mov [esp + YINPUT_HEIGHT], ebx
shr ebx, 1
mov [esp + UVINPUT_HEIGHT], ebx
NoCrop0:
mov [esp + YHEIGHT_DIFF], eax
shr eax, 1
mov [esp + UVHEIGHT_DIFF], eax
// ywidth_diff = FrameWidth - yinput_width
// uvwidth_diff = ywidth_diff >> 1;
// ebx = input bytes to skip at the end of each luma row (0 when not cropping)
// If the difference is negative (input wider than the frame), fall through:
// ebx = yinput_width - FrameWidth, yinput_width = FrameWidth,
// uvinput_width = FrameWidth >> 1, ywidth_diff = 0
mov eax, [esp + FRAME_WIDTH]
xor ebx, ebx
sub eax, [esp + YINPUT_WIDTH]
jns NoCrop1
mov eax, [esp + FRAME_WIDTH]
mov ebx, [esp + YINPUT_WIDTH]
sub ebx, eax
mov [esp + YINPUT_WIDTH], eax
shr eax, 1
mov [esp + UVINPUT_WIDTH], eax
xor eax, eax
NoCrop1:
mov [esp + YWIDTH_DIFF], eax
shr eax, 1
mov [esp + UVWIDTH_DIFF], eax
// assign (esi, lpInput)
mov esi, [esp + LP_INPUT]
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = yinput_height; j > 0; j--, YPlane += ypitch_adj)
// assign (ecx, j)
mov ecx, [esp + YINPUT_HEIGHT]
L1:
// for (i = yinput_width; i > 0; i -= 8)
// assign (ebp, i)
mov ebp, [esp + YINPUT_WIDTH]
L2:
// *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4
// *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F; YPlane += 4
// (eax is halved as shift-then-mask, edx as mask-then-shift; the per-byte
// result is the same)
// 1
mov eax, [esi]
mov edx, [esi + 4]
// 2
shr eax, 1
and edx, 0xFEFEFEFE
// 3
shr edx, 1
and eax, 0x7F7F7F7F
// 4
lea esi, [esi + 8]
mov [edi], eax
// 5
// (mov and lea do not change the flags, so jnz tests the result of this sub)
sub ebp, 8
mov [edi + 4], edx
// 6
lea edi, [edi + 8]
jnz L2
// for (i = ywidth_diff; i > 0; i -= 8)
// *(U32 *)YPlane = 0; YPlane += 4;
// *(U32 *)YPlane = 0; YPlane += 4;
// assign (ebp, i)
mov ebp, [esp + YWIDTH_DIFF]
test ebp, ebp
jz L3
L4:
// 1
xor eax, eax
sub ebp, 8
// 2
mov [edi], eax
mov [edi + 4], eax
// 3
lea edi, [edi + 8]
jnz L4
// j--, YPlane += ypitch_adj
// (also skip the cropped tail of the input row: pnext += ebx)
L3:
mov eax, [esp + YPITCH_ADJ]
add edi, eax
add esi, ebx
dec ecx
jnz L1
// for (j = yheight_diff; j > 0; j--, YPlane += ypitch_adj)
// assign (ecx, j)
mov ecx, [esp + YHEIGHT_DIFF]
test ecx, ecx
jz L7
L5:
// for (i = FrameWidth; i > 0; i -= 8)
// *(U32 *)YPlane = 0; YPlane += 4;
// *(U32 *)YPlane = 0; YPlane += 4;
// assign (ebp, i)
mov ebp, [esp + FRAME_WIDTH]
L6:
// 1
xor eax, eax
sub ebp, 8
// 2
mov [edi], eax
mov [edi + 4], eax
// 3
lea edi, [edi + 8]
jnz L6
// j--, YPlane += ypitch_adj
mov eax, [esp + YPITCH_ADJ]
add edi, eax
dec ecx
jnz L5
L7:
// recompute start of input U plane
// (lpInput + biHeight * biWidth, using the uncropped input dimensions)
mov edx, [esp + LPBI_INPUT]
mov eax, (LPBITMAPINFOHEADER)[edx].biHeight
mov ecx, (LPBITMAPINFOHEADER)[edx].biWidth
imul eax, ecx
// assign (esi, lpInput)
mov esi, [esp + LP_INPUT]
add esi, eax
// assign (edi, UPlane)
mov edi, [esp + UPLANE]
// halve the per-row skip for the chroma planes (used for both U and V)
shr ebx, 1
// for (j = uvinput_height; j > 0; j--, UPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVINPUT_HEIGHT]
L8:
// for (i = uvinput_width; i > 0; i -= 8)
// assign (ebp, i)
mov ebp, [esp + UVINPUT_WIDTH]
L9:
// *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4
// *(U32 *)UPlane = (*pnext++ >> 1) & 0x7F7F7F7F; UPlane += 4
// 1
mov eax, [esi]
mov edx, [esi + 4]
// 2
shr eax, 1
and edx, 0xFEFEFEFE
// 3
shr edx, 1
and eax, 0x7F7F7F7F
// 4
lea esi, [esi + 8]
mov [edi], eax
// 5
sub ebp, 8
mov [edi + 4], edx
// 6
lea edi, [edi + 8]
jnz L9
// for (i = uvwidth_diff; i > 0; i -= 8)
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVWIDTH_DIFF]
test ebp, ebp
jz L11
L10:
// 1
mov eax, 040404040H
sub ebp, 8
// 2
mov [edi], eax
mov [edi + 4], eax
// 3
lea edi, [edi + 8]
jnz L10
// j--, UPlane += uvpitch_adj
// (also skip the cropped tail of the input row: pnext += ebx)
L11:
mov eax, [esp + UVPITCH_ADJ]
add edi, eax
add esi, ebx
dec ecx
jnz L8
// for (j = uvheight_diff; j > 0; j--, UPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVHEIGHT_DIFF]
test ecx, ecx
jz L14
L12:
// for (i = uvoutput_width; i > 0; i -= 8)
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// *(U32 *)UPlane = 0x40404040; UPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVOUTPUT_WIDTH]
L13:
// 1
mov eax, 040404040H
sub ebp, 8
// 2
mov [edi], eax
mov [edi + 4], eax
// 3
lea edi, [edi + 8]
jnz L13
// j--, UPlane += uvpitch_adj
mov eax, [esp + UVPITCH_ADJ]
add edi, eax
dec ecx
jnz L12
L14:
// recompute start of input V plane
// (lpInput + biHeight * biWidth + ((biHeight * biWidth) >> 2))
mov edx, [esp + LPBI_INPUT]
mov eax, (LPBITMAPINFOHEADER)[edx].biHeight
mov ecx, (LPBITMAPINFOHEADER)[edx].biWidth
imul eax, ecx
// assign (esi, lpInput)
mov esi, [esp + LP_INPUT]
add esi, eax
shr eax, 2
add esi, eax
// assign (edi, VPlane)
mov edi, [esp + VPLANE]
// for (j = uvinput_height; j > 0; j--, VPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVINPUT_HEIGHT]
L15:
// for (i = uvinput_width; i > 0; i -= 8)
// assign (ebp, i)
mov ebp, [esp + UVINPUT_WIDTH]
L16:
// *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4
// *(U32 *)VPlane = (*pnext++ >> 1) & 0x7F7F7F7F; VPlane += 4
// 1
mov eax, [esi]
mov edx, [esi + 4]
// 2
shr eax, 1
and edx, 0xFEFEFEFE
// 3
shr edx, 1
and eax, 0x7F7F7F7F
// 4
lea esi, [esi + 8]
mov [edi], eax
// 5
sub ebp, 8
mov [edi + 4], edx
// 6
lea edi, [edi + 8]
jnz L16
// for (i = uvwidth_diff; i > 0; i -= 8)
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVWIDTH_DIFF]
test ebp, ebp
jz L18
L17:
// 1
mov eax, 040404040H
sub ebp, 8
// 2
mov [edi], eax
mov [edi + 4], eax
// 3
lea edi, [edi + 8]
jnz L17
// j--, VPlane += uvpitch_adj
// (also skip the cropped tail of the input row: pnext += ebx)
L18:
mov eax, [esp + UVPITCH_ADJ]
add edi, eax
add esi, ebx
dec ecx
jnz L15
// for (j = uvheight_diff; j > 0; j--, VPlane += uvpitch_adj)
// assign (ecx, j)
mov ecx, [esp + UVHEIGHT_DIFF]
test ecx, ecx
jz L21
L19:
// for (i = uvoutput_width; i > 0; i -= 8)
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// *(U32 *)VPlane = 0x40404040; VPlane += 4;
// assign (ebp, i)
mov ebp, [esp + UVOUTPUT_WIDTH]
L20:
// 1
mov eax, 040404040H
sub ebp, 8
// 2
mov [edi], eax
mov [edi + 4], eax
// 3
lea edi, [edi + 8]
jnz L20
// j--, VPlane += uvpitch_adj
mov eax, [esp + UVPITCH_ADJ]
add edi, eax
dec ecx
jnz L19
L21:
// release locals and restore the callee-save registers
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Release the frame-offset names so the next routine can define its own.
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef YPITCH_ADJ
#undef YINPUT_HEIGHT
#undef YINPUT_WIDTH
#undef YHEIGHT_DIFF
#undef YWIDTH_DIFF
#undef UVPITCH_ADJ
#undef UVOUTPUT_WIDTH
#undef UVINPUT_HEIGHT
#undef UVINPUT_WIDTH
#undef UVHEIGHT_DIFF
#undef UVWIDTH_DIFF
#if 0
// C_H26X_YUY2toYUV12
//
// C reference version of the YUY2 (packed Y0 U Y1 V) to planar YUV12 input
// color converter. It is compiled out by the surrounding #if 0; the comments
// inside IA_H26X_YUY2toYUV12 quote statements from this routine.
//
// lpbiInput   - header of the input image; only biWidth and biHeight are read.
// lpInput     - packed input pixels, 2 bytes per pixel.
// YPlane      - destination for luma, advanced by 'pitch' bytes per output line.
// UPlane      - destination for U, written on every other output line.
// VPlane      - destination for V, written on every other output line.
// FrameWidth  - output width in pixels.
// FrameHeight - output height in lines.
// pitch       - distance in bytes between the starts of consecutive output lines.
//
// Every sample written is the source byte shifted right by one bit (7-bit
// range). The source is read starting from a line near the end of the buffer
// and stepping toward lower addresses while the output is written forward,
// so the image is flipped vertically on the way through.
void C_H26X_YUY2toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch) {
U8 *pnext, *plast, *pbn, *peol;
int width_adj, height_adj;
int stretch, mark, aspect;
int iBackTwoLines;
int j, k;
int LumaIters = 0;
// Bytes to add after writing one output line to reach the next one.
int ypitch_adj = pitch - FrameWidth;
int uvpitch_adj = pitch - (FrameWidth >> 1);
// Byte offset (negative) from a source line to the line one stride lower in memory.
int nextline = -(lpbiInput->biWidth << 1);
// LumaIters = 4 for every started group of 48 output lines. Each outer
// iteration below emits 12 output lines, so FrameHeight is presumably
// expected to be a multiple of 48 -- TODO confirm against callers.
for (j = FrameHeight; j > 0; j -= 48) {
LumaIters += 4;
}
width_adj = lpbiInput->biWidth - FrameWidth;
// 'aspect' is non-zero only when the input is wider than the output frame.
aspect = (width_adj ? LumaIters : 0);
height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1;
// When stretching, every 12th output line is interpolated instead of copied.
stretch = (height_adj ? 1 : 0);
mark = 12 - stretch;
// Move from end of line N to beginning of line N-1
iBackTwoLines = -((lpbiInput->biWidth + (int)FrameWidth) << 1);
// Point to the beginning of the last line.
// width_adj is added as a byte count, i.e. width_adj/2 pixels at 2 bytes per pixel.
pnext = lpInput + ((lpbiInput->biWidth << 1) * ((FrameHeight - aspect - 1) + height_adj))
+ width_adj;
for (j = LumaIters; j > 0; j--) {
// Copy 'mark' source lines (12, or 11 when stretching).
for (k = 0; k < mark; k++) {
// One pass handles 16 source bytes = 8 pixels.
for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8) {
if (0 == (k & 1)) {
// Even line of the group: luma plus chroma. Each chroma sample is the
// mean of this line's value and the value at the same offset in the
// line 'nextline' bytes away.
*(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1;
*(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1;
*(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1;
*(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1;
*(UPlane+0) = ((*(pnext+ 1)>>1) + (*(pnext+ 1+nextline)>>1)) >> 1;
*(UPlane+1) = ((*(pnext+ 5)>>1) + (*(pnext+ 5+nextline)>>1)) >> 1;
*(UPlane+2) = ((*(pnext+ 9)>>1) + (*(pnext+ 9+nextline)>>1)) >> 1;
*(UPlane+3) = ((*(pnext+13)>>1) + (*(pnext+13+nextline)>>1)) >> 1;
*(VPlane+0) = ((*(pnext+ 3)>>1) + (*(pnext+ 3+nextline)>>1)) >> 1;
*(VPlane+1) = ((*(pnext+ 7)>>1) + (*(pnext+ 7+nextline)>>1)) >> 1;
*(VPlane+2) = ((*(pnext+11)>>1) + (*(pnext+11+nextline)>>1)) >> 1;
*(VPlane+3) = ((*(pnext+15)>>1) + (*(pnext+15+nextline)>>1)) >> 1;
UPlane += 4; VPlane += 4;
} else {
// Odd line of the group: luma only.
*(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1;
*(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1;
*(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1;
*(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1;
}
}
// Step from the end of this source line to the start of the next one to read.
pnext += iBackTwoLines;
YPlane += ypitch_adj;
if (0 == (k & 1)) {
UPlane += uvpitch_adj;
VPlane += uvpitch_adj;
}
}
if (stretch) {
// Emit one extra luma line that averages the source line at pnext with
// the line one stride lower in memory. pnext itself is not advanced, so
// that source line is read again by the next outer iteration.
plast = pnext - (lpbiInput->biWidth << 1);
pbn = pnext;
// One pass handles 8 source bytes = 4 pixels.
for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4,
plast += 8,
pbn += 8) {
*(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1;
*(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1;
*(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1;
*(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1;
}
YPlane += ypitch_adj;
}
}
}
#endif
// IA_H26X_YUY2toYUV12
//
// x86 inline-assembly version of the YUY2 (packed Y0 U Y1 V) to planar YUV12
// input color converter. The routine is 'naked': it builds and tears down its
// own stack frame and addresses parameters and locals through the esp-relative
// offsets #defined below.
//
// lpbiInput   - header of the input image; biWidth and biHeight are read.
// BGR24Image  - despite the name, the packed YUY2 input (called lpInput in the
//               stack layout and addressed through LP_INPUT).
// YPlane      - destination for luma.
// UPlane      - destination for U.
// VPlane      - destination for V.
// FrameWidth  - output width in pixels.
// FrameHeight - output height in lines.
// pitch       - distance in bytes between consecutive output lines.
//
// Every sample written is the source byte shifted right by one bit. Source
// lines are read from a line near the end of the buffer toward lower
// addresses while output lines are written forward. Chroma is taken from the
// even lines of each group only; it is not averaged with the neighbouring
// line.
//
// The "; n" markers inside the loops number groups of two instructions,
// presumably Pentium U/V-pipe pairs -- do not reorder instructions without
// re-checking the schedule.
__declspec(naked)
_STATIC void IA_H26X_YUY2toYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * BGR24Image,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// (offsets are relative to esp after the four pushes and the
// "sub esp, LOCALSIZE" below)
// | pitch | + 96
// | FrameHeight | + 92
// | FrameWidth | + 88
// | VPlane | + 84
// | UPlane | + 80
// | YPlane | + 76
// | lpInput | + 72
// | lpbiInput | + 68
// ----------------------------
// | return addr | + 64
// | saved ebp | + 60
// | saved ebx | + 56
// | saved esi | + 52
// | saved edi | + 48
// | pyprev | + 44
// | pyspace | + 40
// | pynext | + 36
// | peol | + 32
// | j | + 28
// | k | + 24
// | iBackTwoLines | + 20
// | stretch | + 16
// | mark | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
//
// PYPREV, PYSPACE and PYNEXT are defined but not referenced by the code below.
// These macros stay defined past the end of this function and are reused by
// IA_H26X_UYVYtoYUV12; they are #undef'd after that routine.
#define LOCALSIZE 48
#define PITCH_PARM 96
#define FRAME_HEIGHT 92
#define FRAME_WIDTH 88
#define VPLANE 84
#define UPLANE 80
#define YPLANE 76
#define LP_INPUT 72
#define LPBI_INPUT 68
#define PYPREV 44
#define PYSPACE 40
#define PYNEXT 36
#define PEOL 32
#define LOOP_J 28
#define LOOP_K 24
#define BACK_TWO_LINES 20
#define STRETCH 16
#define MARK 12
#define LUMA_ITERS 8
#define YPITCH_ADJ 4
#define UVPITCH_ADJ 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT]
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx
shr ebp, 1
sub edx, ebp
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
// NOTE(review): the loop below exits on "jnz" (counter exactly zero), not on
// "i > 0" as the C statement above says. The two agree only when FrameHeight
// is a non-zero multiple of 48; any other value steps past zero and keeps
// looping.
xor edx, edx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea edx, [edx + 4]
sub eax, 48
jnz L1
// width_adj = lpbiInput->biWidth - FrameWidth;
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
sub esi, [esp + FRAME_WIDTH]
// aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx
xor edi, edi
test esi, esi
jz L2
mov edi, edx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2:
mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub edx, [esp + FRAME_HEIGHT]
add edx, edi
shr edx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test edx, edx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov ebp, 12
sub ebp, eax
mov [esp + MARK], ebp
// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
add ebp, [esp + FRAME_WIDTH]
shl ebp, 1
neg ebp
mov [esp + BACK_TWO_LINES], ebp
// pnext = lpInput +
// ((lpbiInput->biWidth << 1) *
// ((FrameHeight - aspect - 1) + height_adj)) +
// width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
// The one-operand imul leaves its result in edx:eax, so edx (height_adj) is
// destroyed here; it has already been added into ebx.
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
shl eax, 1
mov ebx, [esp + FRAME_HEIGHT]
sub ebx, edi
dec ebx
add ebx, edx
imul ebx
add esi, eax
add esi, [esp + LP_INPUT]
// assign (edi, YPlane)
// assign (edx, UPlane)
// assign (ebp, VPlane)
mov edi, [esp + YPLANE]
mov edx, [esp + UPLANE]
mov ebp, [esp + VPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
L4:
// for (k = 0; k < mark; k++)
xor eax, eax
mov [esp + LOOP_K], eax
L5:
// for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8)
// peol is kept both in ecx and in the PEOL stack slot: the even-line loop (L7)
// uses ecx for V bytes and reloads it from PEOL, the odd-line loop (L6) leaves
// ecx untouched.
mov ecx, [esp + FRAME_WIDTH]
shl ecx, 1
add ecx, esi
mov [esp + PEOL], ecx
// if (0 == (k & 1)) {
mov eax, [esp + LOOP_K]
test eax, 1
jnz L6
// *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1
// *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1
// *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1
// *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1
// *(UPlane+0) = *(pnext+ 1) >> 1; *(UPlane+1) = *(pnext+ 5) >> 1
// *(UPlane+2) = *(pnext+ 9) >> 1; *(UPlane+3) = *(pnext+13) >> 1
// *(VPlane+0) = *(pnext+ 3) >> 1; *(VPlane+1) = *(pnext+ 7) >> 1
// *(VPlane+2) = *(pnext+11) >> 1; *(VPlane+3) = *(pnext+15) >> 1
// or graphically
// *************************************************************************************************
// Values * Y 0 * U 0 * Y 1 * V 0 * Y 2 * U 1 * Y 3 * V 1 * Y 4 * U 2 * Y 5 * V 2 * Y 6 * U 3 * Y 7 * V 3 *
// *************************************************************************************************
// Y Offsets 0 2 4 6 8 10 12 14
// U Offsets 1 5 9 13
// V Offsets 3 7 11 15
// Register usage:
// eax - accumulate Y values
// ebx - accumulate U values
// ecx - accumulate V values
// esi - ptr to interlaced (YUY2: Y U Y V) input
// edi - ptr for writing Y values
// edx - ptr for writing U values
// ebp - ptr for writing V values
//
// Four bytes are packed into each register and halved with a single 32-bit
// "shr reg, 1"; the following "and reg, 07F7F7F7FH" clears the bit that
// shifted in from the neighbouring byte.
L7:
; 1
mov al, [esi+4] ; Y2
mov bl, [esi+9] ; U2
; 2
mov ah, [esi+6] ; Y3
mov bh, [esi+13] ; U3
; 3
shl eax, 16
mov cl, [esi+11] ; V2
; 4
shl ebx, 16
mov ch, [esi+15] ; V3
; 5
shl ecx, 16
mov al, [esi] ; Y0
; 6
mov bh, [esi+5] ; U1
mov ah, [esi+2] ; Y1
; 7
shr eax, 1
mov bl, [esi+1] ; U0
; 8
shr ebx, 1
mov ch, [esi+7] ; V1
; 9
and eax, 07F7F7F7FH
mov cl, [esi+3] ; V0
; 10
shr ecx, 1
and ebx, 07F7F7F7FH
; 11
mov [edi], eax
and ecx, 07F7F7F7FH
; 12
mov al, [esi+12] ; Y6
mov [edx], ebx
; 13
mov ah, [esi+14] ; Y7
mov [ebp], ecx
; 14
shl eax, 16
mov ecx, [esp + PEOL]
; 15
mov al, [esi+8] ; Y4
lea edi, [edi+8]
; 16
mov ah, [esi+10] ; Y5
lea edx, [edx+4]
; 17
shr eax, 1
lea ebp, [ebp+4]
; 18
and eax, 07F7F7F7FH
lea esi, [esi+16]
; 19
mov [edi-4], eax
cmp esi, ecx
; 20
jl L7
jmp L8
// } else {
// *(YPlane+0) = *(pnext+ 0) >> 1; *(YPlane+1) = *(pnext+ 2) >> 1
// *(YPlane+2) = *(pnext+ 4) >> 1; *(YPlane+3) = *(pnext+ 6) >> 1
// *(YPlane+4) = *(pnext+ 8) >> 1; *(YPlane+5) = *(pnext+10) >> 1
// *(YPlane+6) = *(pnext+12) >> 1; *(YPlane+7) = *(pnext+14) >> 1
// }
// Register usage:
// eax, ebx - accumulate Y values
// ecx - peol
// esi - ptr to interlaced (YUY2: Y U Y V) input
// edi - ptr for writing Y values
L6:
; 1
mov al, [esi+4] ; Y2
mov bl, [esi+12] ; Y6
; 2
mov ah, [esi+6] ; Y3
mov bh, [esi+14] ; Y7
; 3
shl eax, 16
lea edi, [edi+8]
; 4
shl ebx, 16
mov al, [esi] ; Y0
; 5
mov ah, [esi+2] ; Y1
mov bh, [esi+10] ; Y5
; 6
shr eax, 1
mov bl, [esi+8] ; Y4
; 7
shr ebx, 1
and eax, 07F7F7F7FH
; 8
mov [edi-8], eax
and ebx, 07F7F7F7FH
; 9
mov [edi-8+4], ebx
lea esi, [esi+16]
; 10
cmp esi, ecx
jl L6
L8:
// pnext += iBackTwoLines
add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj
add edi, [esp + YPITCH_ADJ]
// if (0 == (k&1))
mov eax, [esp + LOOP_K]
test eax, 1
jnz L9
// UPlane += uvpitch_adj
add edx, [esp + UVPITCH_ADJ]
// VPlane += uvpitch_adj
add ebp, [esp + UVPITCH_ADJ]
L9:
// k++; repeat while k < mark
mov eax, [esp + LOOP_K]
inc eax
mov [esp + LOOP_K], eax
cmp eax, [esp + MARK]
jl L5
// if (stretch)
mov eax, [esp + STRETCH]
test eax, eax
jz L10
// Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average.
// The caller's UPlane/VPlane argument slots are reused as the save area.
mov [esp + UPLANE], edx
mov [esp + VPLANE], ebp
// plast = pnext - (lpbiInput->biWidth << 1)
// assign (plast, edx)
mov edx, esi
mov eax, [esp + LPBI_INPUT]
mov eax, (LPBITMAPINFOHEADER)[eax].biWidth
shl eax, 1
sub edx, eax
// pbn = pnext
// assign (pbn, ebp)
mov ebp, esi
// for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8)
mov ecx, [esp + FRAME_WIDTH]
shl ecx, 1
add ecx, ebp
// *(YPlane+0) = ((*(plast+0) >> 1) + (*(pbn+0) >> 1)) >> 1
// *(YPlane+1) = ((*(plast+2) >> 1) + (*(pbn+2) >> 1)) >> 1
// *(YPlane+2) = ((*(plast+4) >> 1) + (*(pbn+4) >> 1)) >> 1
// *(YPlane+3) = ((*(plast+6) >> 1) + (*(pbn+6) >> 1)) >> 1
// The loop is software-pipelined: the first bytes of each group of four are
// loaded here for the first pass and at the bottom of the loop for the next
// pass. The loads issued on the final pass therefore read a few bytes beyond
// the span being converted, and their values are discarded.
mov al, [edx+4]
mov bl, [ebp+4]
mov bh, [ebp+6]
shl ebx, 16
L11:
; 1
mov ah, [edx+6]
mov bl, [ebp]
; 2
shl eax, 16
mov bh, [ebp+2]
; 3
mov al, [edx]
lea edi, [edi+4]
; 4
mov ah, [edx+2]
lea edx, [edx+8]
; 5
and eax, 0xFEFEFEFE
lea ebp, [ebp+8]
; 6
shr eax, 1
and ebx, 0xFEFEFEFE
; 7
shr ebx, 1
nop
; 8
add eax, ebx
mov bl, [ebp+4]
; 9
shr eax, 1
mov bh, [ebp+6]
; 10
shl ebx, 16
and eax, 0x7F7F7F7F
; 11
mov [edi-4], eax
mov al, [edx+4]
; 12
cmp ebp, ecx
jl L11
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// Recover ptrs to UPlane and VPlane
mov edx, [esp + UPLANE]
mov ebp, [esp + VPLANE]
L10:
// j++; repeat while j < LumaIters
mov eax, [esp + LOOP_J]
inc eax
mov [esp + LOOP_J], eax
cmp eax, [esp + LUMA_ITERS]
jl L4
// Tear down the frame and restore the callee-save registers.
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
/*************************************************************
 * Name: UYVY_to_YUV12_Flip
 * Description: Convert a packed UYVY (U Y0 V Y1) image into
 *   separate Y, U and V planes, dropping source rows and
 *   columns when the input is larger than the output frame.
 *
 *   lpbiInput   - input header; biWidth and biHeight are read.
 *   pImage      - packed UYVY pixels, 2 bytes per pixel, rows
 *                 stored back to back (biWidth * 2 bytes each).
 *   YPlane      - luma destination, 'pitch' bytes per row.
 *   UPlane      - U destination, 'pitch' bytes per row,
 *                 FrameWidth/2 samples written per row.
 *   VPlane      - V destination, same geometry as UPlane.
 *   FrameWidth  - output width in pixels, must be a multiple of 4.
 *   FrameHeight - output height in rows, must be a multiple of 4.
 *   pitch       - byte distance between output rows.
 *
 *   Every sample written is the source byte shifted right by
 *   one bit. Rows are written in source order; despite the
 *   name, no vertical flip is performed by this routine.
 *
 * Returns: true on success; false, with nothing written, when
 *   the input is smaller than the output frame in either
 *   direction or the frame size is not a multiple of 4.
 *
 * Down-sizing: a source row (column) whose index is a multiple
 *   of the skip interval ceil(input / (input - output)) is
 *   dropped. At most (input - output) indices are multiples of
 *   that interval, so at least 'output' rows (columns) survive
 *   and the source index never leaves the image.
 *
 * Chroma: the U/V sample for chroma position (r, c) is read
 *   from the source location that supplies luma sample
 *   (2r, 2c). Chroma used to be gathered in a second pass that
 *   accepted only even source rows/columns; when the skip
 *   interval was even (for example 352x288 -> 176x144) every
 *   even row was also a dropped row and that pass walked past
 *   the end of the input image. Tying chroma to the luma
 *   mapping keeps every read inside the image and leaves the
 *   output unchanged when no scaling is needed.
 ************************************************************/
bool UYVY_to_YUV12_Flip(
	LPBITMAPINFOHEADER lpbiInput,
	U8 * pImage,
	U8 * YPlane,
	U8 * UPlane,
	U8 * VPlane,
	UN FrameWidth,
	UN FrameHeight,
	const int pitch)
{
	BYTE *pRowStartY, *pRowStartSrc, *pRowStartU, *pRowStartV;
	int offset;
	int nRowsToSkip = 0, nColsToSkip = 0;
	int nRowSkipDelta = 0xffffff, nColSkipDelta = 0xffffff;
	int nSrcRowIndex, nDstRowIndex, nSrcColIndex, nDstColIndex;
	int COLUMNSTOSKIP = 0;
	bool bChromaRow;

	if ((FrameWidth != (DWORD)(lpbiInput->biWidth)) || (FrameHeight != (DWORD)(lpbiInput->biHeight)))
	{
		nColsToSkip = COLUMNSTOSKIP = lpbiInput->biWidth - FrameWidth;
		nRowsToSkip = lpbiInput->biHeight - FrameHeight;

		// Up-scaling is not supported.
		if ((nColsToSkip < 0) || (nRowsToSkip < 0))
		{
			return false;
		}

		// nXXXSkipDelta dictate how often we "skip" a row or col
		if (nRowsToSkip)
		{
			nRowSkipDelta = (lpbiInput->biHeight + (nRowsToSkip - 1)) / nRowsToSkip;
		}
		if (nColsToSkip)
		{
			nColSkipDelta = (lpbiInput->biWidth + (nColsToSkip - 1)) / nColsToSkip;
		}
	}

	// quick check to make sure we're processing CIF, QCIF, or SQCIF
	if ((FrameWidth % 4) || (FrameHeight % 4))
	{
		return false;
	}

	nSrcRowIndex = 0;
	nDstRowIndex = 0;

	while ((DWORD)nDstRowIndex < FrameHeight)
	{
		// do we need to skip this row ?
		if ((nRowsToSkip > 0) && ((nSrcRowIndex % nRowSkipDelta) == 0))
		{
			nRowsToSkip--;
			nSrcRowIndex++;
			continue;
		}

		pRowStartSrc = pImage + (lpbiInput->biWidth * nSrcRowIndex * 2);
		pRowStartY = YPlane + (pitch * nDstRowIndex);

		// Chroma is vertically subsampled: only even output rows carry it.
		bChromaRow = (0 == (nDstRowIndex & 1));
		pRowStartU = UPlane + (pitch * (nDstRowIndex >> 1));
		pRowStartV = VPlane + (pitch * (nDstRowIndex >> 1));

		nSrcColIndex = 0;
		nDstColIndex = 0;
		nColsToSkip = COLUMNSTOSKIP;	// reset column skip count

		while ((DWORD)nDstColIndex < FrameWidth)
		{
			// do we need to skip this column ?
			if ((nColsToSkip > 0) && ((nSrcColIndex % nColSkipDelta) == 0))
			{
				nColsToSkip--;
				nSrcColIndex++;
				continue;
			}

			// Luma sits at the odd byte of every 2-byte pixel.
			pRowStartY[nDstColIndex] = pRowStartSrc[(nSrcColIndex * 2) + 1] >> 1;

			if (bChromaRow && (0 == (nDstColIndex & 1)))
			{
				// U and V are shared by a pixel pair; round the column down
				// to the even pixel that starts the pair (bytes U Y V Y).
				offset = (nSrcColIndex & ~1) * 2;
				pRowStartU[nDstColIndex >> 1] = pRowStartSrc[offset] >> 1;
				pRowStartV[nDstColIndex >> 1] = pRowStartSrc[offset + 2] >> 1;
			}

			nSrcColIndex++;
			nDstColIndex++;
		}

		nSrcRowIndex++;
		nDstRowIndex++;
	}

	// and we are done!
	return true;
}
// IA_H26X_UYVYtoYUV12
//
// x86 inline-assembly converter from packed UYVY (U Y0 V Y1) input to planar
// YUV12. It follows IA_H26X_YUY2toYUV12 instruction for instruction; only the
// byte offsets differ (luma at odd offsets, U at 0/4/8/12, V at 2/6/10/14).
// The routine is 'naked' and addresses parameters and locals through the
// esp-relative macros (LOCALSIZE, LPBI_INPUT, PEOL, ...) that are #defined
// inside IA_H26X_YUY2toYUV12 and are still in effect here.
//
// lpbiInput   - header of the input image; biWidth and biHeight are read.
// BGR24Image  - despite the name, the packed UYVY input (called lpInput in the
//               stack layout and addressed through LP_INPUT).
// YPlane      - destination for luma.
// UPlane      - destination for U.
// VPlane      - destination for V.
// FrameWidth  - output width in pixels.
// FrameHeight - output height in lines.
// pitch       - distance in bytes between consecutive output lines.
//
// Every sample written is the source byte shifted right by one bit. The call
// to this routine in colorCnvtFrame is commented out; UYVY_to_YUV12_Flip is
// called there instead.
__declspec(naked)
_STATIC void IA_H26X_UYVYtoYUV12(
LPBITMAPINFOHEADER lpbiInput,
U8 * BGR24Image,
U8 * YPlane,
U8 * UPlane,
U8 * VPlane,
UN FrameWidth,
UN FrameHeight,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// (offsets are relative to esp after the four pushes and the
// "sub esp, LOCALSIZE" below)
// | pitch | + 96
// | FrameHeight | + 92
// | FrameWidth | + 88
// | VPlane | + 84
// | UPlane | + 80
// | YPlane | + 76
// | lpInput | + 72
// | lpbiInput | + 68
// ----------------------------
// | return addr | + 64
// | saved ebp | + 60
// | saved ebx | + 56
// | saved esi | + 52
// | saved edi | + 48
// | pyprev | + 44
// | pyspace | + 40
// | pynext | + 36
// | peol | + 32
// | j | + 28
// | k | + 24
// | iBackTwoLines | + 20
// | stretch | + 16
// | mark | + 12
// | LumaIters | + 8
// | ypitch_adj | + 4
// | uvpitch_adj | + 0
//
// The pyprev, pyspace and pynext slots are not referenced by the code below.
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// assign (ebx, lpbiInput)
mov ebx, [esp + LPBI_INPUT]
// ypitch_adj = pitch - FrameWidth
// assign (ecx, FrameWidth)
// assign (edx, pitch)
mov ecx, [esp + FRAME_WIDTH]
mov edx, [esp + PITCH_PARM]
mov eax, edx
sub eax, ecx
mov [esp + YPITCH_ADJ], eax
// uvpitch_adj = pitch - (FrameWidth >> 1)
// kill (edx, pitch)
mov ebp, ecx
shr ebp, 1
sub edx, ebp
mov [esp + UVPITCH_ADJ], edx
// for (i = FrameHeight; i > 0; i -= 48) LumaIters += 4
// assign (edx, LumaIters)
// NOTE(review): the loop below exits on "jnz" (counter exactly zero), not on
// "i > 0" as the C statement above says. The two agree only when FrameHeight
// is a non-zero multiple of 48; any other value steps past zero and keeps
// looping.
xor edx, edx
mov eax, [esp + FRAME_HEIGHT]
L1:
lea edx, [edx + 4]
sub eax, 48
jnz L1
// width_adj = lpbiInput->biWidth - FrameWidth;
// assign (esi, width_adj)
mov esi, (LPBITMAPINFOHEADER)[ebx].biWidth
sub esi, [esp + FRAME_WIDTH]
// aspect = (width_adj ? LumaIters : 0)
// assign (edi, aspect)
// kill (edx, LumaIters)
mov [esp + LUMA_ITERS], edx
xor edi, edi
test esi, esi
jz L2
mov edi, edx
// height_adj = (lpbiInput->biHeight - (FrameHeight - aspect)) >> 1
// assign (edx, height_adj)
L2:
mov edx, (LPBITMAPINFOHEADER)[ebx].biHeight
sub edx, [esp + FRAME_HEIGHT]
add edx, edi
shr edx, 1
// stretch = (height_adj ? 1 : 0)
xor eax, eax
test edx, edx
jz L3
inc eax
L3:
mov [esp + STRETCH], eax
// mark = 12 - stretch
mov ebp, 12
sub ebp, eax
mov [esp + MARK], ebp
// iBackTwoLines = -((lpbiInput->biWidth + FrameWidth) << 1)
mov ebp, (LPBITMAPINFOHEADER)[ebx].biWidth
add ebp, [esp + FRAME_WIDTH]
shl ebp, 1
neg ebp
mov [esp + BACK_TWO_LINES], ebp
// pnext = lpInput +
// ((lpbiInput->biWidth << 1) *
// ((FrameHeight - aspect - 1) + height_adj)) +
// width_adj
// kill (ebx, lpbiInput)
// kill (ecx, FrameWidth)
// kill (edx, height_adj)
// kill (esi, width_adj)
// kill (edi, aspect)
// assign (esi, pnext)
// The one-operand imul leaves its result in edx:eax, so edx (height_adj) is
// destroyed here; it has already been added into ebx.
mov eax, (LPBITMAPINFOHEADER)[ebx].biWidth
shl eax, 1
mov ebx, [esp + FRAME_HEIGHT]
sub ebx, edi
dec ebx
add ebx, edx
imul ebx
add esi, eax
add esi, [esp + LP_INPUT]
// assign (edi, YPlane)
// assign (edx, UPlane)
// assign (ebp, VPlane)
mov edi, [esp + YPLANE]
mov edx, [esp + UPLANE]
mov ebp, [esp + VPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
L4:
// for (k = 0; k < mark; k++)
xor eax, eax
mov [esp + LOOP_K], eax
L5:
// for ( peol = pnext + (FrameWidth << 1); pnext < peol; pnext += 16, YPlane += 8)
// peol is kept both in ecx and in the PEOL stack slot: the even-line loop (L7)
// uses ecx for V bytes and reloads it from PEOL, the odd-line loop (L6) leaves
// ecx untouched.
mov ecx, [esp + FRAME_WIDTH]
shl ecx, 1
add ecx, esi
mov [esp + PEOL], ecx
// if (0 == (k & 1)) {
mov eax, [esp + LOOP_K]
test eax, 1
jnz L6
// *(YPlane+0) = *(pnext+ 1) >> 1; *(YPlane+1) = *(pnext+ 3) >> 1
// *(YPlane+2) = *(pnext+ 5) >> 1; *(YPlane+3) = *(pnext+ 7) >> 1
// *(YPlane+4) = *(pnext+ 9) >> 1; *(YPlane+5) = *(pnext+11) >> 1
// *(YPlane+6) = *(pnext+13) >> 1; *(YPlane+7) = *(pnext+15) >> 1
// *(UPlane+0) = *(pnext+ 0) >> 1; *(UPlane+1) = *(pnext+ 4) >> 1
// *(UPlane+2) = *(pnext+ 8) >> 1; *(UPlane+3) = *(pnext+12) >> 1
// *(VPlane+0) = *(pnext+ 2) >> 1; *(VPlane+1) = *(pnext+ 6) >> 1
// *(VPlane+2) = *(pnext+10) >> 1; *(VPlane+3) = *(pnext+14) >> 1
// or graphically
// *************************************************************************************************
// Values * U 0 * Y 0 * V 0 * Y 1 * U 1 * Y 2 * V 1 * Y 3 * U 2 * Y 4 * V 2 * Y 5 * U 3 * Y 6 * V 3 * Y 7 *
// *************************************************************************************************
// Y Offsets 1 3 5 7 9 11 13 15
// U Offsets 0 4 8 12
// V Offsets 2 6 10 14
// Register usage:
// eax - accumulate Y values
// ebx - accumulate U values
// ecx - accumulate V values
// esi - ptr to interlaced (UYVY: U Y V Y) input
// edi - ptr for writing Y values
// edx - ptr for writing U values
// ebp - ptr for writing V values
//
// Four bytes are packed into each register and halved with a single 32-bit
// "shr reg, 1"; the following "and reg, 07F7F7F7FH" clears the bit that
// shifted in from the neighbouring byte.
L7:
; 1
mov al, [esi+5] ; Y2
mov bl, [esi+8] ; U2
; 2
mov ah, [esi+7] ; Y3
mov bh, [esi+12] ; U3
; 3
shl eax, 16
mov cl, [esi+10] ; V2
; 4
shl ebx, 16
mov ch, [esi+14] ; V3
; 5
shl ecx, 16
mov al, [esi+1] ; Y0
; 6
mov bh, [esi+4] ; U1
mov ah, [esi+3] ; Y1
; 7
shr eax, 1
mov bl, [esi] ; U0
; 8
shr ebx, 1
mov ch, [esi+6] ; V1
; 9
and eax, 07F7F7F7FH
mov cl, [esi+2] ; V0
; 10
shr ecx, 1
and ebx, 07F7F7F7FH
; 11
mov [edi], eax
and ecx, 07F7F7F7FH
; 12
mov al, [esi+13] ; Y6
mov [edx], ebx
; 13
mov ah, [esi+15] ; Y7
mov [ebp], ecx
; 14
shl eax, 16
mov ecx, [esp + PEOL]
; 15
mov al, [esi+9] ; Y4
lea edi, [edi+8]
; 16
mov ah, [esi+11] ; Y5
lea edx, [edx+4]
; 17
shr eax, 1
lea ebp, [ebp+4]
; 18
and eax, 07F7F7F7FH
lea esi, [esi+16]
; 19
mov [edi-4], eax
cmp esi, ecx
; 20
jl L7
jmp L8
// } else {
// *(YPlane+0) = *(pnext+ 1) >> 1; *(YPlane+1) = *(pnext+ 3) >> 1
// *(YPlane+2) = *(pnext+ 5) >> 1; *(YPlane+3) = *(pnext+ 7) >> 1
// *(YPlane+4) = *(pnext+ 9) >> 1; *(YPlane+5) = *(pnext+11) >> 1
// *(YPlane+6) = *(pnext+13) >> 1; *(YPlane+7) = *(pnext+15) >> 1
// }
// Register usage:
// eax, ebx - accumulate Y values
// ecx - peol
// esi - ptr to interlaced (UYVY: U Y V Y) input
// edi - ptr for writing Y values
L6:
; 1
mov al, [esi+5] ; Y2
mov bl, [esi+13] ; Y6
; 2
mov ah, [esi+7] ; Y3
mov bh, [esi+15] ; Y7
; 3
shl eax, 16
lea edi, [edi+8]
; 4
shl ebx, 16
mov al, [esi+1] ; Y0
; 5
mov ah, [esi+3] ; Y1
mov bh, [esi+11] ; Y5
; 6
shr eax, 1
mov bl, [esi+9] ; Y4
; 7
shr ebx, 1
and eax, 07F7F7F7FH
; 8
mov [edi-8], eax
and ebx, 07F7F7F7FH
; 9
mov [edi-8+4], ebx
lea esi, [esi+16]
; 10
cmp esi, ecx
jl L6
L8:
// pnext += iBackTwoLines
add esi, [esp + BACK_TWO_LINES]
// YPlane += ypitch_adj
add edi, [esp + YPITCH_ADJ]
// if (0 == (k&1))
mov eax, [esp + LOOP_K]
test eax, 1
jnz L9
// UPlane += uvpitch_adj
add edx, [esp + UVPITCH_ADJ]
// VPlane += uvpitch_adj
add ebp, [esp + UVPITCH_ADJ]
L9:
// k++; repeat while k < mark
mov eax, [esp + LOOP_K]
inc eax
mov [esp + LOOP_K], eax
cmp eax, [esp + MARK]
jl L5
// if (stretch)
mov eax, [esp + STRETCH]
test eax, eax
jz L10
// Save ptrs to UPlane and VPlane, use edx and ebp to do the stretch average.
// The caller's UPlane/VPlane argument slots are reused as the save area.
mov [esp + UPLANE], edx
mov [esp + VPLANE], ebp
// plast = pnext - (lpbiInput->biWidth << 1)
// assign (plast, edx)
mov edx, esi
mov eax, [esp + LPBI_INPUT]
mov eax, (LPBITMAPINFOHEADER)[eax].biWidth
shl eax, 1
sub edx, eax
// pbn = pnext
// assign (pbn, ebp)
mov ebp, esi
// for ( peol = pbn + (FrameWidth << 1); pbn < peol; YPlane += 4, plast += 8, pbn += 8)
mov ecx, [esp + FRAME_WIDTH]
shl ecx, 1
add ecx, ebp
// *(YPlane+0) = ((*(plast+1) >> 1) + (*(pbn+1) >> 1)) >> 1
// *(YPlane+1) = ((*(plast+3) >> 1) + (*(pbn+3) >> 1)) >> 1
// *(YPlane+2) = ((*(plast+5) >> 1) + (*(pbn+5) >> 1)) >> 1
// *(YPlane+3) = ((*(plast+7) >> 1) + (*(pbn+7) >> 1)) >> 1
// The loop is software-pipelined: the first bytes of each group of four are
// loaded here for the first pass and at the bottom of the loop for the next
// pass. The loads issued on the final pass therefore read a few bytes beyond
// the span being converted, and their values are discarded.
mov al, [edx+5]
mov bl, [ebp+5]
mov bh, [ebp+7]
shl ebx, 16
L11:
; 1
mov ah, [edx+7]
mov bl, [ebp+1]
; 2
shl eax, 16
mov bh, [ebp+3]
; 3
mov al, [edx+1]
lea edi, [edi+4]
; 4
mov ah, [edx+3]
lea edx, [edx+8]
; 5
and eax, 0xFEFEFEFE
lea ebp, [ebp+8]
; 6
shr eax, 1
and ebx, 0xFEFEFEFE
; 7
shr ebx, 1
nop
; 8
add eax, ebx
mov bl, [ebp+5]
; 9
shr eax, 1
mov bh, [ebp+7]
; 10
shl ebx, 16
and eax, 0x7F7F7F7F
; 11
mov [edi-4], eax
mov al, [edx+5]
; 12
cmp ebp, ecx
jl L11
// YPlane += ypitch_adj;
add edi, [esp + YPITCH_ADJ]
// Recover ptrs to UPlane and VPlane
mov edx, [esp + UPLANE]
mov ebp, [esp + VPLANE]
L10:
// j++; repeat while j < LumaIters
mov eax, [esp + LOOP_J]
inc eax
mov [esp + LOOP_J], eax
cmp eax, [esp + LUMA_ITERS]
jl L4
// Tear down the frame and restore the callee-save registers.
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
#undef LOCALSIZE
#undef PITCH_PARM
#undef FRAME_HEIGHT
#undef FRAME_WIDTH
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef LPBI_INPUT
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef PEOL
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef STRETCH
#undef MARK
#undef LUMA_ITERS
#undef YPITCH_ADJ
#undef UVPITCH_ADJ
/*************************************************************
 * Name: colorCnvtFrame
 * Description: Color convert and copy input frame.
 *
 *   Looks at the input format described by lpicComp->lpbiInput
 *   (biCompression, and biBitCount for BI_RGB) and hands the
 *   frame at lpicComp->lpInput to the matching converter, which
 *   fills YPlane, UPlane and VPlane at EC->FrameWidth x
 *   EC->FrameHeight with a row pitch of PITCH.
 *
 *   EC         - encoder catalog; FrameWidth and FrameHeight are read.
 *   lpCompInst - not referenced by this routine.
 *   lpicComp   - compression request supplying lpbiInput and lpInput.
 *   YPlane     - destination luma plane.
 *   UPlane     - destination U plane.
 *   VPlane     - destination V plane.
 *
 *   Handled inputs: BI_RGB at 24, 16 (555), 8 and 4 bits per
 *   pixel, YVU9, YUV12/IYUV, YUY2 and UYVY. Anything else is
 *   reported through DBOUT and the planes are left untouched.
 *   When _CODEC_STATS is defined and pEncoderStats is set, the
 *   time spent in the converter is stored in
 *   pEncoderStats->color_convertor_time.
 *
 *   Each "#if 0 ... #else ... #endif" pair selects between the
 *   C reference converter and the assembly (IA_) converter; the
 *   assembly versions are the ones compiled.
 ************************************************************/
void colorCnvtFrame(
	T_H263EncoderCatalog * EC,
	LPCODINST lpCompInst,
	ICCOMPRESS * lpicComp,
	U8 * YPlane,
	U8 * UPlane,
	U8 * VPlane
)
{
	U8 *RGBCursor = (U8 *) lpicComp->lpInput;
	LPBITMAPINFOHEADER lpbiInput = lpicComp->lpbiInput;

	/* The Connectix Quick Cam requires RGB to YUV12 conversion.
	 * The B/W camera generates palette versions (8 and 4 bit).
	 * The color camera generates RGB24 for million colors and
	 * RGB16555 for thousands colors.
	 */
	if (BI_RGB == lpbiInput->biCompression)
	{
		if (24 == lpbiInput->biBitCount) {
#if 0
			// Disabled debugging aid: for a 128x96 input, run the C converter
			// into scratch planes and compare against the encoder planes. R, G
			// and B are only loaded so that a breakpoint can inspect the source
			// pixel behind a mismatch.
			if ((128 == lpbiInput->biWidth) && (96 == lpbiInput->biHeight)) {
				U8 YTest[12288];
				U8 UTest[6144];
				U8 VTest[6144];
				int i, j, k;
				U8 R,G,B;
				C_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YTest, UTest, VTest,
					EC->FrameWidth, EC->FrameHeight, 128);
				for (i = 0; i < 96; i++) {
					for (j = 0; j < 128; j++) {
						k = (i*128)+j;
						if (1 < abs(YPlane[(i*384)+j]-YTest[(i*128)+j])) {
							B = RGBCursor[(((95-i)*128)+j)*3];
							G = RGBCursor[(((95-i)*128)+j)*3+1];
							R = RGBCursor[(((95-i)*128)+j)*3+2];
						}
						if ((0 == (i%2)) && (0 == (j%2))) {
							k = ((i>>1)*128)+(j>>1);
							if (1 < abs(UPlane[((i>>1)*384)+(j>>1)]-UTest[((i>>1)*128)+(j>>1)])) {
								B = RGBCursor[(((95-i)*128)+j)*3];
								G = RGBCursor[(((95-i)*128)+j)*3+1];
								R = RGBCursor[(((95-i)*128)+j)*3+2];
							}
							// Compare by difference, like Y and U; the previous
							// "abs(a != b)" could never exceed 1.
							if (1 < abs(VPlane[((i>>1)*384)+(j>>1)]-VTest[((i>>1)*128)+(j>>1)])) {
								B = RGBCursor[(((95-i)*128)+j)*3];
								G = RGBCursor[(((95-i)*128)+j)*3+1];
								R = RGBCursor[(((95-i)*128)+j)*3+2];
							}
						}
					}
				}
			}
#endif
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time = PENTIUM_TIMER();
			}
#endif
#if 0
			C_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, PITCH);
#else
			IA_H26X_BGR24toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time =
					PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
			}
#endif
		}
		else if (16 == lpbiInput->biBitCount)
		{
			// To use a common routine for all possible combinations of RGB16,
			// a bitfield number is passed. This number identifies the proper bit shift
			// and masking values to extract the color information
			// from the 16-bit pixel words.
			//
			// number    shift          mask
			//           B, G, R
			// ------    -----------    ----------------
			//  555      2, 3, 8        0x7C, 0x7C, 0x7C
			//  664      3, 3, 9        0x78, 0x7E, 0x7E
			//  565      2, 4, 9        0x7C, 0x7E, 0x7C
			//  655      2, 3, 9        0x7C, 0x7C, 0x7E
			//
			// Only 555 falls under BI_RGB. The others are specified using the
			// BI_BITFIELDS compression specification. For BI_BITFIELDS, call
			// Build16bitModeID to get the actual bitfield number. This routine requires the
			// three array elements in the bmiColors field of a BITMAPINFO object.
			//
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time = PENTIUM_TIMER();
			}
#endif
#if 0
			C_H26X_BGR16toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, 555, PITCH);
#else
			IA_H26X_BGR16555toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time =
					PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
			}
#endif
		}
		else if (8 == lpbiInput->biBitCount)
		{
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time = PENTIUM_TIMER();
			}
#endif
#if 0
			C_H26X_CLUTtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, 8, PITCH);
#else
			IA_H26X_CLUT8toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time =
					PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
			}
#endif
		}
		else if (4 == lpbiInput->biBitCount)
		{
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time = PENTIUM_TIMER();
			}
#endif
#if 0
			C_H26X_CLUTtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, 4, PITCH);
#else
			IA_H26X_CLUT4toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
				EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
			if (pEncoderStats) {
				pEncoderStats->color_convertor_time =
					PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
			}
#endif
		}
		else
		{
			DBOUT("ERROR: Unexpected input format detected.");
		}
	}
	else if (FOURCC_YVU9 == lpbiInput->biCompression)
	{
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time = PENTIUM_TIMER();
		}
#endif
#if 0
		C_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
#else
		IA_H26X_YVU9toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time =
				PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
		}
#endif
	}
	else if ((FOURCC_YUV12 == lpbiInput->biCompression) || (FOURCC_IYUV == lpbiInput->biCompression))
	{
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time = PENTIUM_TIMER();
		}
#endif
#if 0
		C_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
#else
		IA_H26X_YUV12toEncYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time =
				PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
		}
#endif
	}
	else if (FOURCC_YUY2 == lpbiInput->biCompression)
	{
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time = PENTIUM_TIMER();
		}
#endif
#if 0
		C_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
#else
		IA_H26X_YUY2toYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
#endif
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time =
				PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
		}
#endif
	}
	else if (FOURCC_UYVY == lpbiInput->biCompression)
	{
		bool bConverted;
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time = PENTIUM_TIMER();
		}
#endif
		// UYVY_to_YUV12_Flip refuses frame geometries it cannot handle and
		// then writes nothing; keep its result so that case is not silent.
		bConverted = UYVY_to_YUV12_Flip(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
			EC->FrameWidth, EC->FrameHeight, PITCH);
		// IA_H26X_UYVYtoYUV12(lpbiInput, RGBCursor, YPlane, UPlane, VPlane,
		// EC->FrameWidth, EC->FrameHeight, PITCH);
#if defined(_CODEC_STATS)
		if (pEncoderStats) {
			pEncoderStats->color_convertor_time =
				PENTIUM_TIMER() - pEncoderStats->color_convertor_time;
		}
#endif
		// Reported after the timer is stopped so the log call is not timed.
		if (!bConverted)
		{
			DBOUT("ERROR: UYVY input could not be converted to YUV12.");
		}
	}
	else
	{
		DBOUT("ERROR: Unexpected input format detected.");
	}
}