windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/d3dec.cpp

/* *************************************************************************
**    INTEL Corporation Proprietary Information
**
**    This listing is supplied under the terms of a license
**    agreement with INTEL Corporation and may not be copied
**    nor disclosed except in accordance with the terms of
**    that agreement.
**
**    Copyright (c) 1995 Intel Corporation.
**    Copyright (c) 1996 Intel Corporation.
**    All Rights Reserved.
**
** *************************************************************************
*/
;// $Author:   JMCVEIGH  $
;// $Date:   11 Dec 1996 14:59:36  $
;// $Archive:   S:\h26x\src\dec\d3dec.cpv  $
;// $Header:   S:\h26x\src\dec\d3dec.cpv   1.119   11 Dec 1996 14:59:36   JMCVEIGH  $
;// $Log:   S:\h26x\src\dec\d3dec.cpv  $
// 
//    Rev 1.119   11 Dec 1996 14:59:36   JMCVEIGH
// 
// Moved deblocking filter within the loop and fixed bug for YUV12
// input and arbitrary frame sizes (must use actual dimensions for
// YUV12, not padded sizes).
// 
//    Rev 1.118   09 Dec 1996 18:02:06   JMCVEIGH
// Added support for arbitrary frame sizes.
// 
//    Rev 1.117   09 Dec 1996 09:35:14   MDUDA
// Put new version of block edge filter under H263P.
// 
//    Rev 1.116   27 Nov 1996 15:24:34   BECHOLS
// Added check for NULL ptr around EMMS at end of decompress.
// 
//    Rev 1.115   26 Nov 1996 09:05:22   KLILLEVO
// changed allocation of dtab to array
// 
//    Rev 1.114   25 Nov 1996 15:23:40   KLILLEVO
// changed filter coefficients and table size for deblocking filter
// 
//    Rev 1.113   25 Nov 1996 14:11:14   KLILLEVO
// updated de-blocking filter to latest version of annex J
// 
//    Rev 1.112   19 Nov 1996 15:05:32   MDUDA
// For YUV12 I420 output color conversion, copy at least the V plane
// to prevent assembler code from reading beyond end of buffer.
// 
//    Rev 1.111   07 Nov 1996 08:31:04   CZHU
// Fixed bugs in Mode C recovery.
// 
//    Rev 1.110   06 Nov 1996 16:37:00   CZHU
// Moved initialization for BlockAction earlier
// 
//    Rev 1.109   06 Nov 1996 15:47:10   CZHU
// 
// Added mode C support, replacing zero size r1.108
// 
//    Rev 1.107   31 Oct 1996 10:50:44   KLILLEVO
// changed one debug message
// 
//    Rev 1.106   31 Oct 1996 10:17:56   KLILLEVO
// changed the last DBOUTs to DbgLog
// 
//    Rev 1.105   25 Oct 1996 15:20:30   KLILLEVO
// changed debug-message for Block Edge Filter initialization 
// in GetDecoderOptions() to be more informatice
// 
//    Rev 1.104   25 Oct 1996 15:01:56   KLILLEVO
// null frame warning should have level 4, not 2
// 
//    Rev 1.103   25 Oct 1996 09:13:40   KLILLEVO
// changed an error message about null frame received after non-PB frame
// to trace message and level 2.
// 
//    Rev 1.102   20 Oct 1996 18:10:46   AGUPTA2
// Changed DBOUT into DbgLog.  ASSERT is not changed to DbgAssert.
// 
// 
//    Rev 1.101   16 Oct 1996 17:17:52   MDUDA
// Added initialization for DC->bReadSrcFormat to fix a capture bug.
// 
//    Rev 1.100   11 Oct 1996 16:08:30   MDUDA
// Added initial _CODEC_STATS stuff.
// 
//    Rev 1.99   26 Sep 1996 10:35:14   KLILLEVO
// need to ExplandPlane for bUnrestrictedMotionVectors in addition to
// bAdvancedPrediction
// 
//    Rev 1.98   26 Sep 1996 09:42:18   BECHOLS
// 
// Added Snapshot Event for synchronization and code to copy the Snapshot
// just prior to color conversion.
// 
//    Rev 1.97   25 Sep 1996 08:05:10   KLILLEVO
// initial extended motion vectors support 
// does not work for AP yet
// 
//    Rev 1.96   20 Sep 1996 09:36:04   MDUDA
// Fixed problem with video effects on YUV12 input images.
// Need to copy frame in this case.
// 
//    Rev 1.95   19 Sep 1996 19:40:40   MDUDA
// Fixed problem with calling AdjustPels - performed frame copy
// and set pFrame to correct location.
// 
//    Rev 1.94   16 Sep 1996 16:44:40   CZHU
// Fixed buffer overflow problem to support RTP MTU down to 128
// 
//    Rev 1.93   11 Sep 1996 15:12:26   CZHU
// Tuned off deblocking filter by default.
// 
//    Rev 1.92   10 Sep 1996 16:10:20   KLILLEVO
// added custom message to turn block edge filter on or off
// 
//    Rev 1.91   10 Sep 1996 14:15:24   BNICKERS
// Select Pentium Pro color convertors, when running on that processor.
// 
//    Rev 1.90   10 Sep 1996 10:31:04   KLILLEVO
// changed all GlobalAlloc/GlobalLock calls to HeapAlloc
// 
//    Rev 1.89   06 Sep 1996 14:21:38   BECHOLS
// 
// Removed code that was wrapped by RTP_HEADER, and removed the wrapping too.
// 
//    Rev 1.88   30 Aug 1996 08:37:58   KLILLEVO
// added C version of block edge filter, and changed the bias in 
// ClampTbl[] from 128 to CLAMP_BIAS (defined to 128)
// The C version of the block edge filter takes up way too much CPU time
// relative to the rest of the decode time (4 ms for QCIF and 16 ms
// for CIF on a P120, so this needs to coded in assembly)
// 
//    Rev 1.87   29 Aug 1996 09:29:08   CZHU
// 
// Fixed another bug in recovering lost packets followed by MODE M packet.
// 
//    Rev 1.86   27 Aug 1996 16:17:00   CZHU
// Commented out previous code to turn on MMX with RTP
// 
//    Rev 1.85   23 Jul 1996 11:20:56   CZHU
// Fixed two bugs related to packet loss recovery, one for the last packet los
// in current frame, the other in mode B packets.
// Also added motion vector adjustment for lost MBs
// 
//    Rev 1.84   18 Jul 1996 09:23:12   KLILLEVO
// implemented YUV12 color convertor (pitch changer) in assembly
// and added it as a normal color convertor function, via the
// ColorConvertorCatalog() call.
// 
//    Rev 1.83   11 Jul 1996 15:12:40   AGUPTA2
// Changed assertion failures into errors when decoder goes past end of 
// the bitstream.
// 
//    Rev 1.82   01 Jul 1996 10:04:12   RHAZRA
// Force shaping flag to false for YUY2 color conversion
// .
// 
//    Rev 1.81   25 Jun 1996 14:27:20   BECHOLS
// Set ini file variables for use with RTP stuff.
// 
//    Rev 1.80   19 Jun 1996 14:30:12   RHAZRA
// 
// Added code to deal with pitch and output buffer offset & pitch
// setting for YUY2 output format.
// 
//    Rev 1.79   14 Jun 1996 17:27:44   AGUPTA2
// Updated the color convertor table.
// 
//    Rev 1.77   30 May 1996 17:04:54   RHAZRA
// Added SQCIF support.
// 
//    Rev 1.76   30 May 1996 15:16:32   KLILLEVO
// added YUV12 output
// 
//    Rev 1.75   30 May 1996 12:45:12   KLILLEVO
// fixed debug warning message in PB-frames mode
// 
//    Rev 1.74   30 May 1996 11:26:38   AGUPTA2
// Added support for MMX color convertors.
// 
//    Rev 1.73   29 May 1996 14:11:14   RHAZRA
// Changes made to use MMxVersion set in ccpuvsn.cpp.
// 
//    Rev 1.72   24 May 1996 10:04:20   KLILLEVO
// does not need to assert out if a null frame is received when
// the previous frame was not a PB. This will often happen
// with the new MMX PB switch
// 
//    Rev 1.71   03 May 1996 13:08:28   CZHU
// 
// Added checking of packet fault after picture header decoding, and 
// change pass1 loop control to recover from packe loss. Checking packet
// fault after MB header decoding.
// 
//    Rev 1.70   12 Apr 1996 14:16:40   RHAZRA
// Added paranthesis to make ifdef SUPPORT_SQCIF work properly
// 
//    Rev 1.69   12 Apr 1996 13:32:22   RHAZRA
// 
// Added SQCIF support with #ifdef SUPPORT_SQCIF.
// 
//    Rev 1.68   10 Apr 1996 16:28:20   RHAZRA
// Added a check to make sure that the input bitstream buffer does
// not exceed the H263 spec mandated size. If it does, the decoder
// now returns ICERR_ERROR.
// 
//    Rev 1.67   04 Apr 1996 13:32:02   RHAZRA
// Changed bitstream buffer allocation as per H.263 spec
// 
//    Rev 1.66   03 Apr 1996 09:06:06   RMCKENZX
// Moved "emms" to end of decoder.
// 
//    Rev 1.65   26 Mar 1996 16:43:38   AGUPTA2
// Corrected opcode for emms.
// 
//    Rev 1.64   22 Mar 1996 17:49:48   AGUPTA2
// MMX support.  Added emms around pass1 and pass2 calls.
// 
//    Rev 1.63   18 Mar 1996 09:58:48   bnickers
// Make color convertors non-destructive.
// 
//    Rev 1.62   12 Mar 1996 20:15:04   RHAZRA
// Fixed still-mode. Use framecopy() in 320x240 mode to copy display frame
// to post frame.
// 
//    Rev 1.61   08 Mar 1996 16:46:12   AGUPTA2
// Added pragma code_seg.
// Created three new routines: IAPass1ProcessFrame(), IAPass2ProcessFrame(),
// and H263InitializeGOBBlockActionStream().  H263InitializeGOB.. rtn. is
// called once for each block after decoding the GOB header; this is good for
// the data cache.  H263InitializeBlockActionStream() is not needed now.
// ExpandPlane() is called only when needed; it is called just before its
// results are needed : before Pass2 call (improves DCache util.).  Decoder
// does not copy current frame to previous frame after decoding; it just swaps
// the pointers.  Made changes to call the new non-destructive color convertor;
// this avoids a frame copy if mirroring is not needed.  I DON"T THINK ADJUST
// PELS FUNCTIONALITY WORKS.
// 
// 
// 
//    Rev 1.59   23 Feb 1996 09:46:52   KLILLEVO
// fixed decoding of Unrestricted Motion Vector mode
// 
//    Rev 1.58   05 Feb 1996 13:35:46   BNICKERS
// Fix RGB16 color flash problem, by allowing different RGB16 formats at oce.
// 
//    Rev 1.57   17 Jan 1996 18:55:10   RMCKENZX
// more clean up from pb null frame bug
// 
//    Rev 1.56   17 Jan 1996 17:56:04   sing
// moved memcopy past the null P frame hack to avoid GPF 
// 
//    Rev 1.55   12 Jan 1996 14:59:42   TRGARDOS
// Added aspect ration correction logic and code to force
// aspect ration correction on based on INI file settings.
// 
//    Rev 1.54   11 Jan 1996 14:05:10   RMCKENZX
// Made changes to support stills.  In initialization set a local
// flag (as DC hasn't been created yet).  In frame handling, restore
// the CIF size and use the new 320x240 Offset To Line Zero figure.
// 
//    Rev 1.53   09 Jan 1996 10:44:38   RMCKENZX
// More revisions to support frame mirroring.  Added
// absolute value to references to destination width.
// 
//    Rev 1.52   08 Jan 1996 17:45:12   unknown
// Check destination pointer before using it
// 
//    Rev 1.51   08 Jan 1996 12:18:20   RMCKENZX
// Added logic to implement frame-mirroring and 
// 320x240 still frames.
// 
//    Rev 1.50   06 Jan 1996 18:39:46   RMCKENZX
// Updated copyright
// 
//    Rev 1.49   06 Jan 1996 18:34:28   RMCKENZX
// Made changes to support still frame at 320x240 resolution
// 
//    Rev 1.48   03 Jan 1996 16:52:40   TRGARDOS
// Added code to set a boolean, bMirror, when destination 
// frame width is the negative of the source frame width.
// Added if statement so that FrameMirror is called instead
// of FrameCopy when bMirror is set. This only works for
// H.263 bit streams. A new function has to be written for
// YUV12 data.
// 
//    Rev 1.47   18 Dec 1995 12:44:28   RMCKENZX
// added copyright notice
// 
//    Rev 1.46   15 Dec 1995 13:51:56   RHAZRA
// 
// Added code to force fpBlockAction->u8BlkType = BT_EMPTY in
// block action stream initialization
// 
//    Rev 1.45   13 Dec 1995 11:00:42   RHAZRA
// No change.
// 
//    Rev 1.44   11 Dec 1995 11:31:22   RHAZRA
// 12-10-95 changes: added AP stuff
// 
//    Rev 1.43   09 Dec 1995 17:26:36   RMCKENZX
// Re-architected the decoder, splitting into a 2-pass
// approach.  See comments in the code.
// 
//    Rev 1.41   09 Nov 1995 14:09:18   AGUPTA2
// Changes for PB-frame (call new ExpandYPlane, ExpandUVPlane rtns.)
// 
//    Rev 1.40   30 Oct 1995 14:08:00   TRGARDOS
// Second attempt - turn off aspect ration correction.
// 
//    Rev 1.39   30 Oct 1995 13:25:14   TRGARDOS
// Turned off aspect ration correction in color convertor.
// 
//    Rev 1.38   27 Oct 1995 16:21:56   CZHU
// Added support to return P frame in the PB pair if the bitstream is
// encoder with special null frame following previous PB frame
// 
//    Rev 1.37   26 Oct 1995 11:25:16   BNICKERS
// Fix quasi color convertor for encoder's decoder;  bugs introduced when
// adding YUV12 color convertors.
// 
//    Rev 1.36   25 Oct 1995 18:09:02   BNICKERS
// 
// Switch to YUV12 color convertors.  Clean up archival stuff.
// 
//    Rev 1.35   13 Oct 1995 16:06:16   CZHU
// First version that supports PB frames. Display B or P frames under
// VfW for now. 
// 
//    Rev 1.34   08 Oct 1995 13:45:56   CZHU
// 
// Added debug session to output reconstructed pels in YUV12 to a file
// 
//    Rev 1.33   27 Sep 1995 16:24:00   TRGARDOS
// 
// Added debug print statements.
// 
//    Rev 1.32   26 Sep 1995 15:32:12   CZHU
// Added expand y, u, v planes.
// 
//    Rev 1.31   26 Sep 1995 10:53:26   CZHU
// 
// Call ExpandPlane to expand each plane before half pel MC.
// 
//    Rev 1.30   25 Sep 1995 11:07:56   CZHU
// Added debug message
// 
//    Rev 1.29   21 Sep 1995 12:04:26   DBRUCKS
// fix assert
// 
//    Rev 1.28   20 Sep 1995 14:47:26   CZHU
// Added iNumberOfMBsPerGOB in decoder catalog
// 
//    Rev 1.27   19 Sep 1995 16:04:10   DBRUCKS
// changed to yuv12forenc
// 
//    Rev 1.26   19 Sep 1995 11:13:16   DBRUCKS
// clarify the code that orders the YYYYCbCr data (YYYYUV) data into 
// YYYYVU in the decoder's internal memory.  The variable names were 
// incorrect in one place.  The reordering is necessary to simplify
// later conversion to YVU9.
// 
//    Rev 1.25   19 Sep 1995 10:36:46   CZHU
// Added comments to the codes added for YUV12 decoder
// 
//    Rev 1.24   18 Sep 1995 08:41:54   CZHU
// 
// Added support for YUV12
// 
//    Rev 1.23   12 Sep 1995 11:13:00   CZHU
// 
// Copy the decoded YUV12 from Current frame to Previous frame
// to prepare for P frames
// 
//    Rev 1.22   11 Sep 1995 16:42:36   CZHU
// P frames
// 
//    Rev 1.21   11 Sep 1995 14:33:10   CZHU
// 
// Refresh MV info in BlockAction stream, needed for P frames
// 
//    Rev 1.20   08 Sep 1995 11:49:52   CZHU
// Added support for P frames and  more debug info
// 
//    Rev 1.19   07 Sep 1995 10:48:10   DBRUCKS
// added OUTPUT_MBDATA_ADDRESS option
// 
//    Rev 1.18   05 Sep 1995 17:22:12   DBRUCKS
// u & v are offset by 8 from Y in YVU12ForEnc
// 
//    Rev 1.17   01 Sep 1995 17:13:52   DBRUCKS
// add adjustpels
// 
//    Rev 1.16   01 Sep 1995 09:49:34   DBRUCKS
// checkin partial ajdust pels changes
// 
//    Rev 1.15   29 Aug 1995 16:50:40   DBRUCKS
// add support for YVU9 playback
// 
//    Rev 1.14   28 Aug 1995 17:45:58   DBRUCKS
// add yvu12forenc
// 
//    Rev 1.13   28 Aug 1995 10:15:14   DBRUCKS
// update to 5 July Spec and 8/25 Errata
// 
//    Rev 1.12   24 Aug 1995 08:51:30   CZHU
// Turned off apsect ratio correction. 
// 
//    Rev 1.11   23 Aug 1995 12:25:10   DBRUCKS
// Turn on the color converters
// 
//    Rev 1.10   14 Aug 1995 16:40:34   DBRUCKS
// initialize block action stream
// 
//    Rev 1.9   11 Aug 1995 17:47:58   DBRUCKS
// cleanup
// 
//    Rev 1.8   11 Aug 1995 17:30:00   DBRUCKS
// copy source to bitstream
// 
//    Rev 1.7   11 Aug 1995 16:12:14   DBRUCKS
// add ptr check to MB data and add #ifndef early exit
// 
//    Rev 1.6   11 Aug 1995 15:10:18   DBRUCKS
// get ready to integrate with block level code and hook up macro block level code
// 
//    Rev 1.5   03 Aug 1995 14:57:56   DBRUCKS
// Add ASSERT macro
// 
//    Rev 1.4   02 Aug 1995 15:31:34   DBRUCKS
// added GOB header parsing
// 
//    Rev 1.3   01 Aug 1995 12:27:38   DBRUCKS
// add PSC parsing
// 
//    Rev 1.2   31 Jul 1995 16:28:00   DBRUCKS
// move loacl BITS defs to D3DEC.CPP
// 
//    Rev 1.1   31 Jul 1995 15:32:22   CZHU
// Moved global tables to d3tables.h
// 
//    Rev 1.0   31 Jul 1995 13:00:04   DBRUCKS
// Initial revision.
// 
//    Rev 1.3   28 Jul 1995 13:57:36   CZHU
// Started to add picture level decoding of fixed length codes.
// 
//    Rev 1.2   24 Jul 1995 14:57:52   CZHU
// Added global tables for VLD decoding. Also added instance initialization
// and termination. Several data structures are updated for H.263.
// 
//    Rev 1.1   17 Jul 1995 14:46:20   CZHU
// 
// 
//    Rev 1.0   17 Jul 1995 14:14:40   CZHU
// Initial revision.
////////////////////////////////////////////////////////////////////////////// 

#include "precomp.h"

#ifdef TRACK_ALLOCATIONS
char gsz1[32];
#endif

extern BYTE PalTable[236*4];

#if defined(H263P)
extern void EdgeFilter(unsigned char *lum, 
                       unsigned char *Cb, 
                       unsigned char *Cr, 
                       int width, int height, int pitch
                      );
extern void InitEdgeFilterTab();

/* map of coded and not-coded blocks */
char coded_map[18+1][22+1]; 
/* QP map */
char QP_map[18][22];
#else
#ifdef NEW_BEF // { NEW_BEF
// C version of block edge filter functions
// takes about 3 ms for QCIF and 12 ms for CIF on a Pentium 120. 
static void HorizEdgeFilter(unsigned char *rec, 
                            int width, int height, int pitch, int chr);
static void VertEdgeFilter(unsigned char *rec, 
                           int width, int height, int pitch, int chr);
static void EdgeFilter(unsigned char *lum, 
                       unsigned char *Cb, 
                       unsigned char *Cr, 
                       int width, int height, int pitch
                      );
static void InitEdgeFilterTab();
static void FreeEdgeFilterTab();
/* map of coded and not-coded blocks */
static char coded_map[18+1][22+1]; 
/* QP map */
static char QP_map[18][22];
/* table for de-blocking filter */
/* currently requires 11232 bytes */ 
signed char dtab[352*32];
#else // }{ NEW_BEF
// C version of block edge filter functions
// takes about 4 ms for QCIF and 16 ms for CIF. This is a large percentage
// of the decoding time, so we need to implement these in assembly before
// the next big release
void EdgeFilter(unsigned char *lum, unsigned char *Cb, unsigned char *Cr, 
                int pels, int lines, int pitch, int QP);
void HorizEdgeFilter(unsigned char *rec, int width, int height, int pitch, int QP, 
                     int chr, int *deltatab);
void VertEdgeFilter(unsigned char *rec, int width, int height, int pitch, int QP, 
                    int chr, int *deltatab);
/* stores information about coded and not-coded blocks */
static char coded_map[44][36]; // memory for this should probably be allocated somewhere else
#endif // } NEW_BEF
#endif
#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
/* Decoder Timing Data - per frame
*/
#define DEC_TIMING_INFO_FRAME_COUNT 105
#pragma message ("Current log decode timing computations handle 105 frames max")
void OutputDecodeTimingStatistics(char * szFileName, DEC_TIMING_INFO * pDecTimingInfo, U32 uStatFrameCount);
void OutputDecTimingDetail(FILE * pFile, DEC_TIMING_INFO * pDecTimingInfo);
#endif // } LOG_DECODE_TIMINGS_ON

extern "C" {
  void ExpandPlane(U32, U32, U32, U32);
}

static I32 iNumberOfGOBsBySourceFormat[8] = {
     0, /* FORBIDDEN */
     6, /* SQCIF */
     9, /* QCIF */
    18, /* CIF */
     0, /* 4CIF - Not supported */
     0, /* 16CIF - Not supported */
#ifdef H263P
	 0, /* Custom */
	 0  /* Extended PTYPE */
#else
     0, /* Reserved */
     0  /* Reserved */
#endif
};

static I32 iNumberOfMBsInAGOBBySourceFormat[8] = {
     0, /* FORBIDDEN */
     8, /* SQCIF */
    11, /* QCIF */
    22, /* CIF */
     0, /* 4CIF - Not supported */
     0, /* 16CIF - Not supported */
#ifdef H263P
	 0, /* Custom */
	 0  /* Extended PTYPE */
#else
     0, /* Reserved */
     0  /* Reserved */
#endif
};

//#pragma warning(disable:4101)
//#pragma warning(disable:4102)
static LRESULT IAPass1ProcessFrame(
    T_H263DecoderCatalog *DC,
    T_BlkAction          *fpBlockAction,
    T_MBInfo             *fpMBInfo,
    BITSTREAM_STATE      *fpbsState,
    U8                   *fpu8MaxPtr,
    U32                  *pN,
    T_IQ_INDEX           *pRUN_INVERSE_Q,
    const I32             iNumberOfGOBs,
    const I32             iNumberOfMBs,
    const I32             iGOB_start,
    const I32             iMB_start);

static void H263InitializeGOBBlockActionStream(
    T_H263DecoderCatalog *DC,
    const I32             iGOBno,
    const T_BlkAction FAR *fpStartGOBBlockActionStream
);

static void IAPass2ProcessFrame(
    T_H263DecoderCatalog *DC,
    T_BlkAction          *fpBlockAction,
    T_MBInfo             *fpMBInfo,
    U32                  *pN,
    T_IQ_INDEX           *pRUN_INVERSE_Q,
    const I32             iNumberOfGOBs,
    const I32             iNumberOfMBs
);

static long DibXY(ICDECOMPRESSEX FAR *lpicDecEx, LPINT lpiPitch, UINT yScale);

static void GetDecoderOptions(T_H263DecoderCatalog *);

static void ZeroFill(HPBYTE hpbY, HPBYTE hpbU, HPBYTE hpbV, int iPitch, U32 uWidth, U32 uHeight);

#define REUSE_DECODE    1
#define DEFAULT_BUFFER_SIZE  32768L

#if REUSE_DECODE
struct {             // Communicate Encoder's decode to display decode.
    U8 FAR * Address;                    // Addr at which encoded frame is placed.
    DECINSTINFO BIGG * PDecoderInstInfo; // Encoder's decoder instance.
    unsigned int  FrameNumber;           // Frame number last encoded, mod 128.
} CompandedFrame;
#endif


/**********************************************************************
 *  H263InitDeocderGlobal
 **********************************************************************/
LRESULT H263InitDecoderGlobal(void)
{

    return ICERR_OK;
}


/***********************************************************************
 *  Description:
 *    Initialize the MB action stream for GOB 'iGOBno'.
 *  Parameters:
 *    DC:
 *    iGOBno: GOB no counting from one;i.e. the first GOB in the frame is 1.
 *    fpStartGOBBlockActionStream: Pointer to start of the block action stream 
 *      for iGOBno.
 *  Note:
 *    This routine needs to change for picture sizes larger than CIF
 ***********************************************************************/
#pragma code_seg("IACODE1")
static void H263InitializeGOBBlockActionStream(
    T_H263DecoderCatalog *DC,
    const I32             iGOBno,
    T_BlkAction FAR      *fpStartGOBBlockActionStream
)
{
    const U32 uFrameHeight = DC->uFrameHeight;
    const U32 uFrameWidth = DC->uFrameWidth;
    const U32 uCurBlock = (U32) ((U8 FAR *)DC + DC->CurrFrame.X32_YPlane); 
    const U32 uRefBlock = (U32) ((U8 FAR *)DC + DC->PrevFrame.X32_YPlane);
    const U32 uBBlock = (U32) ((U8 FAR *)DC + DC->PBFrame.X32_YPlane);
    U32       uYOffset;
    U32       uUOffset;
    U32       uVOffset;
    U32       uYUpdate;
    U32       uUVUpdate;
    U32       uBlkNumber;
    T_BlkAction *fpBlockAction = fpStartGOBBlockActionStream;

    // assume that the width and height are multiples of 16
    ASSERT((uFrameHeight & 0xF) == 0);
    ASSERT((uFrameWidth & 0xF) == 0);

    // calculate distance to the next row.
    uYUpdate = (16 * PITCH)*(iGOBno - 1);
    uUVUpdate = (8 * PITCH)*(iGOBno - 1);

    // skip the padding used for unconstrained motion vectors
    uYOffset = Y_START + uYUpdate;
    uVOffset = DC->uSz_YPlane + UV_START + uUVUpdate;
    uUOffset = uVOffset + (PITCH >> 1);
    
    // Start with the first block of the GOB
    uBlkNumber = (iGOBno -1)*((uFrameWidth>>4)*6);

    // Initialize the array
    for (U32 xpos = 0 ; xpos < uFrameWidth ; xpos += 16) {
        U8 loadcacheline;
        // Four Y Blocks
        //     Y0 Y1
        //     Y2 Y3
        loadcacheline = fpBlockAction->u8BlkType;
        
        fpBlockAction->u8BlkType = BT_EMPTY;
        fpBlockAction->pCurBlock = uCurBlock + uYOffset;
        fpBlockAction->pRefBlock = uRefBlock + uYOffset;
        fpBlockAction->pBBlock = uBBlock + uYOffset;
        fpBlockAction->uBlkNumber = uBlkNumber++;
        fpBlockAction->i8MVx2=0;
        fpBlockAction->i8MVy2=0;
        uYOffset += 8;
        fpBlockAction++;
        
        fpBlockAction->u8BlkType = BT_EMPTY;
        fpBlockAction->pCurBlock = uCurBlock + uYOffset;
        fpBlockAction->pRefBlock = uRefBlock + uYOffset;
        fpBlockAction->pBBlock = uBBlock + uYOffset;
        fpBlockAction->uBlkNumber = uBlkNumber++;
        fpBlockAction->i8MVx2=0;
        fpBlockAction->i8MVy2=0;
        uYOffset = uYOffset - 8 + (8 * PITCH);
        fpBlockAction++;
        
        loadcacheline = fpBlockAction->u8BlkType;
        
        fpBlockAction->u8BlkType = BT_EMPTY;
        fpBlockAction->pCurBlock = uCurBlock + uYOffset;
        fpBlockAction->pRefBlock = uRefBlock + uYOffset;
        fpBlockAction->pBBlock = uBBlock + uYOffset;
        fpBlockAction->uBlkNumber = uBlkNumber++;
        fpBlockAction->i8MVx2=0;
        fpBlockAction->i8MVy2=0;
        uYOffset += 8;
        fpBlockAction++;
        
        fpBlockAction->u8BlkType = BT_EMPTY;
        fpBlockAction->pCurBlock = uCurBlock + uYOffset;
        fpBlockAction->pRefBlock = uRefBlock + uYOffset;
        fpBlockAction->pBBlock = uBBlock + uYOffset;
        fpBlockAction->uBlkNumber = uBlkNumber++;
        fpBlockAction->i8MVx2=0;
        fpBlockAction->i8MVy2=0;
        uYOffset = uYOffset + 8 - (8 * PITCH);
        fpBlockAction++;
        
        // Notice: although the blocks are read in YYYYUV order we store the 
        //         data in memory in Y V U order. This is accomplished because 
        //         block 5 (U) is written to the right of block 6 (V). 
        //         One Cb (U) Block
        loadcacheline = fpBlockAction->u8BlkType;
        
        fpBlockAction->u8BlkType = BT_EMPTY;
        fpBlockAction->pCurBlock = uCurBlock + uUOffset;
        fpBlockAction->pRefBlock = uRefBlock + uUOffset;
        fpBlockAction->pBBlock = uBBlock + uUOffset;
        fpBlockAction->uBlkNumber = uBlkNumber++;
        fpBlockAction->i8MVx2=0;
        fpBlockAction->i8MVy2=0;
        uUOffset += 8;
        fpBlockAction++;
        
        // One Cr (V) Block
        fpBlockAction->u8BlkType = BT_EMPTY;
        fpBlockAction->pCurBlock = uCurBlock + uVOffset;
        fpBlockAction->pRefBlock = uRefBlock + uVOffset;
        fpBlockAction->pBBlock = uBBlock + uVOffset;
        fpBlockAction->uBlkNumber = uBlkNumber++;
        fpBlockAction->i8MVx2=0;
        fpBlockAction->i8MVy2=0;
        uVOffset += 8;
        fpBlockAction++;
        
    }
} // end H263InitializeGOBBlockActionStream() 
#pragma code_seg()


/**********************************************************************
 *  H263InitDecoderInstance 
 *    This function allocates and initializes the per-instance tables used by 
 *    the H263 decoder. Note that in 16-bit Windows, the non-instance-specific
 *    global tables are copied to the per-instance data segment, so that they 
 *    can be used without segment override prefixes.
 ***********************************************************************/
LRESULT H263InitDecoderInstance(
    LPDECINST lpInst, 
    int       CodecID)
{ 
    U32 u32YActiveHeight, u32YActiveWidth;
    U32 u32UVActiveHeight, u32UVActiveWidth;
    U32 u32YPlane, u32VUPlanes ,u32YVUPlanes,u32SizeBlkActionStream;
    U32 uSizeBitStreamBuffer;
    U32 u32SizeT_IQ_INDEXBuffer, u32SizepNBuffer, u32SizeMBInfoStream;    // NEW
    U32 lOffset=0;
    U32 u32TotalSize;
    LRESULT iReturn= ICERR_OK;
    LPVOID pDecoderInstance;
    U32 * pInitLimit;
    U32 * pInitPtr;
    I32 i32xres, i32yres;

#ifdef H263P
	I32 i32xresActual, i32yresActual;	// i32xres and i32yres are padded to multiples of 16
#endif

    BOOL bIs320x240;
    T_H263DecoderCatalog * DC;
    U8                   * P32Inst;

	FX_ENTRY("H263InitDecoderInstance");

    if(IsBadWritePtr((LPVOID)lpInst, sizeof(DECINSTINFO)))
    {
		ERRORMESSAGE(("%s: Bad input parameter!\r\n", _fx_));
        iReturn = ICERR_BADPARAM;
        goto done;
    }

    lpInst->Initialized = FALSE;
    
#ifdef NO_BEF // { NO_BEF
	// default block edge filter
	lpInst->bUseBlockEdgeFilter = 0;
#else // }{ NO_BEF
	// default block edge filter
	lpInst->bUseBlockEdgeFilter = 1;
#endif // } NO_BEF

#if defined(FORCE_8BIT_OUTPUT) && defined(USE_WIN95_PAL) // { #if defined(FORCE_8BIT_OUTPUT) && defined(USE_WIN95_PAL)
	lpInst->UseActivePalette = TRUE;
	lpInst->InitActivePalette = TRUE;
	CopyMemory((PVOID)&lpInst->ActivePalette[10], (CONST VOID *)PalTable, (DWORD)sizeof(PalTable));
#endif // } #if defined(FORCE_8BIT_OUTPUT) && defined(USE_WIN95_PAL)

    // Peel off special cases here
    i32xres = lpInst->xres;
    i32yres = lpInst->yres;
    
    // use positive frame size{s}
    // (may be negative to signal frame mirroring or inverted video)
    if (i32xres < 0) i32xres = -i32xres;
    if (i32yres < 0) i32yres = -i32yres;

#ifdef H263P
	// Need to use the padded dimensions for decoding since H.263+ supports
	// custom picture formats, which are padded to multiples of 16 for encoding
	// and decoding. The actual dimensions are used for display only
	i32xresActual = i32xres;
	i32yresActual = i32yres;
	i32xres = (i32xresActual + 0xf) & ~0xf;
	i32yres = (i32yresActual + 0xf) & ~0xf;
#endif

    // Next check for 320x240 still
    if ( (CodecID == H263_CODEC) && (i32xres == 320) && (i32yres == 240) ) {
        i32xres = 352;
        i32yres = 288;
        bIs320x240 = TRUE;
    } else {
        bIs320x240 = FALSE;
    } 


#ifdef H263P
	// Add lower bounds and multiples of 4
	if ((CodecID == H263_CODEC && 
		(i32yresActual > 288 || i32yresActual < 4 || 
		 i32xresActual > 352 || i32xresActual < 4 ||
		 (i32yres & ~0x3) != i32yres || (i32xres & ~0x3) != i32xres)) ||
#else
    if ((CodecID ==  H263_CODEC && (i32yres > 288 || i32xres > 352)) ||
#endif
        (CodecID == YUV12_CODEC && (i32yres > 480 || i32xres > 640)) )
    {
		ERRORMESSAGE(("%s: Bad input image size!\r\n", _fx_));
        iReturn = ICERR_BADSIZE;
        goto done;
    }

    if (CodecID == YUV12_CODEC) 
    {
        /* The active height and width must be padded to a multiple of 8
         * since the adjustpels routine relies on it.
         */
        u32YActiveHeight  = ((i32yres + 0x7) & (~ 0x7));
        u32YActiveWidth   = ((i32xres + 0x7) & (~ 0x7));
        u32UVActiveHeight = ((i32yres + 0xF) & (~ 0xF)) >> 1;
        u32UVActiveWidth  = ((i32xres + 0xF) & (~ 0xF)) >> 1;

        u32YPlane         = u32YActiveWidth  * u32YActiveHeight;
        u32VUPlanes       = u32UVActiveWidth * u32UVActiveHeight * 2;
        u32YVUPlanes      = u32YPlane + u32VUPlanes;

        u32TotalSize = 512L + 0x1FL;   /* Just enough space for Decoder Catalog. */

    }
    else
    {
        ASSERT(CodecID == H263_CODEC);
        
        u32YActiveHeight  = i32yres + UMV_EXPAND_Y + UMV_EXPAND_Y ;
        u32YActiveWidth   = i32xres + UMV_EXPAND_Y + UMV_EXPAND_Y ;
        u32UVActiveHeight = u32YActiveHeight/2;
        u32UVActiveWidth  = u32YActiveWidth /2;
       
        u32YPlane         = PITCH * u32YActiveHeight;
        u32VUPlanes       = PITCH * u32UVActiveHeight;
        u32YVUPlanes      = u32YPlane + u32VUPlanes;

        // calculate the block action stream size.  The Y portion has one block 
        // for every 8x8 region.  The U and V portion has one block for every 
        // 16x16 region. We also want to make sure that the size is aligned to 
        // a cache line.
        u32SizeBlkActionStream = (i32xres >> 3) * (i32yres >> 3);
        u32SizeBlkActionStream += ((i32xres >> 4) * (i32yres >> 4)) * 2;
        u32SizeBlkActionStream *= sizeof (T_BlkAction);
        u32SizeBlkActionStream = (u32SizeBlkActionStream + 31) & ~0x1F;
        
        // calculate sizes of NEW data structures     
        u32SizeT_IQ_INDEXBuffer = (i32xres)*(i32yres*3)*sizeof(T_IQ_INDEX);
        u32SizepNBuffer = (i32xres>>4)*(i32yres>>4)*sizeof(U32)*12;
        u32SizeMBInfoStream = (i32xres>>4)*(i32yres>>4)*sizeof(T_MBInfo);

        // calculate the bitstream buffer size.  We copy the input data to a 
        // buffer in our space because we read ahead up to 4 bytes beyond the 
        // end of the input data.  The input data size changes for each frame.  
        // So the following is a very safe upper bound estimate.    I am using 
        // the same formula as in CompressGetSize().
        
        uSizeBitStreamBuffer = i32yres * i32xres;
        // RH:  allocate bit-stream buffer according to the max size
        //      specified in the spec.
		/*
        if ( 
            ((i32xres == 176) && (i32yres == 144)) 
            ||
            ((i32xres == 128) && (i32yres == 96))             
           )
           uSizeBitStreamBuffer = 8 * 1024;
        else 
        {
           if ( (i32xres == 352) && (i32yres == 288) )
              uSizeBitStreamBuffer = 32 * 1024;
           else    
           { // Should never happen
               DBOUT("ERROR :: H263InitDecoderInstance :: ICERR_BADSIZE");
               iReturn = ICERR_BADSIZE;
               goto done;
           } 
        }
        */
        u32TotalSize = INSTANCE_DATA_FIXED_SIZE +
                       u32SizeBlkActionStream +
                       u32YVUPlanes +            // current frame
                       u32YVUPlanes +            // prev frame
                       u32YVUPlanes +            // B frame
                       uSizeBitStreamBuffer +    // input data
                       MB_MC_BUFFER_SIZE +
                       u32SizeT_IQ_INDEXBuffer + // NEW
                       u32SizepNBuffer         + // NEW
                       u32SizeMBInfoStream     + // PB-NEW
#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
                       (DEC_TIMING_INFO_FRAME_COUNT+4) * sizeof (DEC_TIMING_INFO)     + // Timing infos
#endif // } LOG_DECODE_TIMINGS_ON
                       0x1F;
    }

    // allocate the memory for the instance
	lpInst->pDecoderInst = HeapAlloc(GetProcessHeap(), 0, u32TotalSize);
    if (lpInst->pDecoderInst == NULL)
    {
		ERRORMESSAGE(("%s: Can't allocate %ld bytes!\r\n", _fx_, u32TotalSize));
        iReturn = ICERR_MEMORY;
        goto  done;
    }

#ifdef TRACK_ALLOCATIONS
	// Track memory allocation
	wsprintf(gsz1, "D3DEC: %7ld Ln %5ld\0", u32TotalSize, __LINE__);
	AddName((unsigned int)lpInst->pDecoderInst, gsz1);
#endif

	pDecoderInstance = lpInst->pDecoderInst;

    //build the decoder catalog 
    P32Inst = (U8 *) pDecoderInstance;
    P32Inst = (U8 *) ((((U32) P32Inst) + 31) & ~0x1F);
 
    //  The catalog of per-instance data is at the start of the per-instance data.
    DC = (T_H263DecoderCatalog *) P32Inst;

    DC->DecoderType       = CodecID;
    DC->uFrameHeight      = i32yres;
    DC->uFrameWidth       = i32xres;

#ifdef H263P
	DC->uActualFrameHeight = i32yresActual;
	DC->uActualFrameWidth  = i32xresActual;

    if (CodecID == YUV12_CODEC) {
		// YUV12 data is not padded out to multiples of 16 as H.263+ frames are
		// Therefore, only use the actual frame dimensions!
		DC->uFrameHeight = DC->uActualFrameHeight;
		DC->uFrameWidth = DC->uActualFrameWidth;
	}
#endif

    DC->uYActiveHeight    = u32YActiveHeight;
    DC->uYActiveWidth     = u32YActiveWidth;
    DC->uUVActiveHeight   = u32UVActiveHeight;
    DC->uUVActiveWidth    = u32UVActiveWidth;
    DC->uSz_YPlane        = u32YPlane;
    DC->uSz_VUPlanes      = u32VUPlanes;
    DC->uSz_YVUPlanes     = u32YVUPlanes;
    DC->BrightnessSetting = H26X_DEFAULT_BRIGHTNESS;
    DC->ContrastSetting   = H26X_DEFAULT_CONTRAST;
    DC->SaturationSetting = H26X_DEFAULT_SATURATION;
    DC->iAPColorConvPrev  = 0;
    DC->pAPInstPrev       = NULL; // assume no previous AP instance.
    DC->p16InstPostProcess = NULL;
    DC->_p16InstPostProcess = (void *)NULL;
    DC->uIs320x240 = bIs320x240;
    DC->bReadSrcFormat = FALSE;

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
    DC->uStatFrameCount = 0;
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

    /* Get the Options
     */
    GetDecoderOptions(DC);

    if (CodecID == H263_CODEC)
    {
        // Notice: Decoder memory is stored in YVU order.  This simplifies 
        //         working with the color converters which use YVU12.  
        // LONG TERM: We may want to change this someday because the encoder 
        //            stores data in YUV order.  Or perhaps the encoder should 
        //            change?

        lOffset =  INSTANCE_DATA_FIXED_SIZE;
        DC->Ticker = 127;

        //instance dependent table here
        ASSERT((lOffset & 0x3) == 0);                   //  DWORD alignment
        DC->X16_BlkActionStream = lOffset;
        lOffset += u32SizeBlkActionStream;

        ASSERT((lOffset & 0x7) == 0);                   //  QWORD alignment
        DC->CurrFrame.X32_YPlane = lOffset;
        lOffset += DC->uSz_YPlane;

        ASSERT((lOffset & 0x7) == 0);                   //  QWORD alignment
        DC->CurrFrame.X32_VPlane = lOffset;
        DC->CurrFrame.X32_UPlane = DC->CurrFrame.X32_VPlane + PITCH / 2;
        ASSERT((DC->CurrFrame.X32_UPlane & 0x7) == 0);  // QWORD alignment
        lOffset += DC->uSz_VUPlanes;

        //no padding is needed 
        ASSERT((lOffset & 0x7) == 0);                   //  QWORD alignment
        DC->PrevFrame.X32_YPlane = lOffset;
        lOffset += DC->uSz_YPlane;

        ASSERT((lOffset & 0x7) == 0);                   //  QWORD alignment
        DC->PrevFrame.X32_VPlane = lOffset;
        DC->PrevFrame.X32_UPlane = DC->PrevFrame.X32_VPlane + PITCH / 2;
        ASSERT((DC->PrevFrame.X32_UPlane & 0x7) == 0);  // QWORD alignment
        lOffset += DC->uSz_VUPlanes;

        // B Frame
        ASSERT((lOffset & 0x7) == 0);                   //  QWORD alignment
        DC->PBFrame.X32_YPlane = lOffset;
        lOffset += DC->uSz_YPlane;
                   
        ASSERT((lOffset & 0x7) == 0);                   //  QWORD alignment
        DC->PBFrame.X32_VPlane = lOffset;
        DC->PBFrame.X32_UPlane = DC->PBFrame.X32_VPlane + PITCH / 2;
        ASSERT((DC->PBFrame.X32_UPlane & 0x7) == 0);    // QWORD alignment
        lOffset += DC->uSz_VUPlanes;

        // Bitstream
        ASSERT((lOffset & 0x3) == 0);                   //  DWORD alignment
        DC->X32_BitStream = lOffset;
        lOffset += uSizeBitStreamBuffer;
        DC->uSizeBitStreamBuffer = uSizeBitStreamBuffer;

        DC->uMBBuffer = lOffset;
        // MMX IDCT writes its output to (DC->uMBBuffer + BLOCK_BUFFER_OFFSET)
        // and so it must be aligned at QWORD
        ASSERT((( (U32)DC + DC->uMBBuffer + BLOCK_BUFFER_OFFSET) & 0x7) == 0);
        lOffset += MB_MC_BUFFER_SIZE;

        ASSERT((lOffset & 0x3) == 0);                   //  DWORD alignment
        DC->X32_InverseQuant = lOffset; 
        lOffset += u32SizeT_IQ_INDEXBuffer; 

        ASSERT((lOffset & 0x3) == 0);                   //  DWORD alignment
        DC->X32_pN = lOffset; 
        lOffset += u32SizepNBuffer; 

        ASSERT((lOffset & 0x3) == 0);                   //  DWORD alignment
        DC->X32_uMBInfoStream = lOffset; 
        lOffset += u32SizeMBInfoStream; 

#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
		// Decode Timing Info
		DC->X32_DecTimingInfo = lOffset;
		lOffset += (DEC_TIMING_INFO_FRAME_COUNT+4) * sizeof (DEC_TIMING_INFO);
#endif // } LOG_DECODE_TIMINGS_ON

        // init the data
        
        ASSERT((U32)lOffset <= u32TotalSize);
        pInitLimit = (U32  *) (P32Inst + lOffset);
        pInitPtr = (U32  *) (P32Inst + DC->CurrFrame.X32_YPlane);
        for (;pInitPtr < pInitLimit;pInitPtr++)    *pInitPtr =0;

		// Fill the Y,U,V Previous Frame space with black, this way
		// even if we lose an I frame, the background will remain black
		ZeroFill((HPBYTE)P32Inst + DC->PrevFrame.X32_YPlane + Y_START,
				(HPBYTE)P32Inst + DC->PrevFrame.X32_UPlane + UV_START,
				(HPBYTE)P32Inst + DC->PrevFrame.X32_VPlane + UV_START,           
				PITCH,
				DC->uFrameWidth,
				DC->uFrameHeight);

        // H263InitializeBlockActionStream(DC);

    } // H263

#ifdef NEW_BEF // { NEW_BEF
	// Initialize de-blocking filter
	{
		int i,j;

		for (j = 0; j < 19; j++) {
			for (i = 0; i < 23; i++) {
				coded_map[j][i] = 0;
			}
		}
		InitEdgeFilterTab();
	}	 
#endif // } NEW_BEF

    lpInst->Initialized = TRUE;
    iReturn = ICERR_OK;

done:
    return iReturn;
}

/***********************************************************************
 *  ZeroFill
 *    Fill the YVU data area with black.
 ***********************************************************************/
static void	ZeroFill(HPBYTE hpbY, HPBYTE hpbU, HPBYTE hpbV, int iPitch, U32 uWidth, U32 uHeight)
{
    U32 w,h;
    int y,u,v;
    U32 uNext;
    HPBYTE pY, pU, pV;

    y = 32;
    uNext = iPitch - uWidth;
    for (h = 0 ; h < uHeight ; h++) {
        pY = hpbY;
        for (w = 0; w < uWidth ; w++) {
            *hpbY++ = (U8)16;
        }
        hpbY += uNext;
    }
    uWidth = uWidth / 2;
    uHeight = uHeight / 2;
    uNext = iPitch - uWidth;
    for (h = 0 ; h < uHeight ; h++) {
        pV = hpbV;
        pU = hpbU;
        for (w = 0; w < uWidth ; w++) {
            *hpbV++ = (U8)128;
            *hpbU++ = (U8)128;
        }
        hpbV += uNext;
        hpbU += uNext;
    }
}

/***********************************************************************
 *  TestFill
 *    Fill the YVU data area with a test pattern.
 ***********************************************************************/
#if 0
static void
TestFill(
    HPBYTE hpbY,
    HPBYTE hpbU,
    HPBYTE hpbV,
    int    iPitch,
    U32    uWidth,
    U32    uHeight)
{
    U32 w,h;
    int y,u,v;
    U32 uNext;
    HPBYTE pY, pU, pV;

    y = 32;
    uNext = iPitch - uWidth;
    for (h = 0 ; h < uHeight ; h++) {
        pY = hpbY;
        for (w = 0; w < uWidth ; w++) {
            *hpbY++ = (U8) (y + (w & ~0xF));
        }
        hpbY += uNext;
    }
    uWidth = uWidth / 2;
    uHeight = uHeight / 2;
    u = 0x4e * 2;
    v = 44;
    uNext = iPitch - uWidth;
    for (h = 0 ; h < uHeight ; h++) {
        pV = hpbV;
        pU = hpbU;
        for (w = 0; w < uWidth ; w++) {
            *hpbV++ = (U8) v;
            *hpbU++ = (U8) u;
        }
        hpbV += uNext;
        hpbU += uNext;
    }
} /* end TestFill */
static void
TestFillUV(
    HPBYTE hpbU,
    HPBYTE hpbV,
    int iPitch,
    U32 uWidth,
    U32 uHeight)
{
    U32 w,h;
    int u,v;
    U32 uNext;
    HPBYTE pU, pV;

    uWidth = uWidth / 2;
    uHeight = uHeight / 2;
    u = 128;
    v = 128;
    uNext = iPitch - uWidth;
    for (h = 0 ; h < uHeight ; h++) {
        pV = hpbV;
        pU = hpbU;
        for (w = 0; w < uWidth ; w++) {
            *hpbV++ = (U8) v;
            *hpbU++ = (U8) u;
        }
        hpbV += uNext;
        hpbU += uNext;
    }
} // end TestFill
#endif


/*********************************************************************
 *  H263Decompress
 *    This function drives the decompress and display of one frame
 *********************************************************************/
LRESULT H263Decompress(
    LPDECINST            lpInst, 
    ICDECOMPRESSEX FAR * lpicDecEx, 
#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
    BOOL                 bIsDCI,
	BOOL				 bRealDecompress)
#else // }{ #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
    BOOL                 bIsDCI)
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

{
    LRESULT                iReturn = ICERR_ERROR;
    U8 FAR               * fpSrc;
    U8 FAR               * P32Inst;
    U8 FAR               * fpu8MaxPtr;
    LPVOID                 pDecoderInstance = NULL;
    T_H263DecoderCatalog * DC = NULL;
    I32                    iNumberOfGOBs, iNumberOfMBs, iBlockNumber = 0;
    T_BlkAction FAR      * fpBlockAction;
    LONG                   lOutput;
    int                    intPitch; 
    U32                    uNewOffsetToLine0, uNewFrameHeight;
    BOOL                   bShapingFlag, bMirror;
    U32                    uYPitch, uUVPitch;

    T_IQ_INDEX           * pRUN_INVERSE_Q;  
    U32                  * pN;                     
    T_MBInfo FAR         * fpMBInfo;      
    
    U32                    uSaveHeight, uSaveWidth, utemp, uYPlane, uUPlane;
	I32                    uVPlane;
    U8                   * pFrame;

    U32                   uWork;                 //  variables for reading bits
    U32                   uBitsReady; 
    BITSTREAM_STATE       bsState;
    BITSTREAM_STATE FAR * fpbsState = &bsState;
    I32                   gob_start = 1, mb_start = 1, b_skip;
	I8                    p8MVs[4]={0,0,0,0};
#ifdef H263P
	BOOL bTmpPostProcessBEF;
#endif

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
	U32 uStartLow;
	U32 uStartHigh;
	U32 uElapsed;
	U32 uBefore;
	U32	uDecodeTime = 0;
	U32 uBEFTime = 0;
	int bTimingThisFrame = 0;
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
	U32 uDecIDCTCoeffs = 0;
	U32 uHeaders = 0;
	U32 uMemcpy = 0;
	U32 uFrameCopy = 0;
	U32 uOutputCC = 0;
	U32 uIDCTandMC = 0;
#endif // } DETAILED_DECODE_TIMINGS_ON
#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
	DEC_TIMING_INFO * pDecTimingInfo = NULL;
#endif // } LOG_DECODE_TIMINGS_ON

	FX_ENTRY("H263Decompress");

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
	if (bRealDecompress)
	{
		TIMER_START(bTimingThisFrame,uStartLow,uStartHigh);
	}
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

    // check the input pointers
    if (IsBadWritePtr((LPVOID)lpInst, sizeof(DECINSTINFO))||
        IsBadReadPtr((LPVOID)lpicDecEx, sizeof(ICDECOMPRESSEX)))
    {
		ERRORMESSAGE(("%s: Bad input parameter!\r\n", _fx_));
        iReturn = ICERR_BADPARAM;
        goto done;
    }
    
    // Check for a bad length
    if (lpicDecEx->lpbiSrc->biSizeImage == 0) {
		ERRORMESSAGE(("%s: Bad image size!\r\n", _fx_));
        iReturn = ICERR_BADIMAGESIZE;    
        goto done;
    }

    // set local pointer to global memory
    pDecoderInstance = lpInst->pDecoderInst;

    // Set the frame mirroring flag
    bMirror = FALSE;
    if (lpicDecEx->lpbiDst != 0)
    {
        if(lpicDecEx->lpbiSrc->biWidth * lpicDecEx->lpbiDst->biWidth < 0)
            bMirror = TRUE;
    }

    // Build the decoder catalog pointer 
    P32Inst = (U8 FAR *) pDecoderInstance;
    P32Inst = (U8 FAR *) ((((U32) P32Inst) + 31) & ~0x1F);
    DC = (T_H263DecoderCatalog FAR *) P32Inst;

    if (DC->DecoderType == H263_CODEC)
    {

#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
		if (bRealDecompress)
		{
			if ((DC->uStatFrameCount <= DEC_TIMING_INFO_FRAME_COUNT) && (DC->ColorConvertor != YUV12ForEnc))
			{
				if (DC->X32_DecTimingInfo > 0)
					DC->pDecTimingInfo = (DEC_TIMING_INFO FAR *)( ((U8 FAR *)P32Inst) + DC->X32_DecTimingInfo );
				DC->uStartLow = uStartLow;
				DC->uStartHigh = uStartHigh;
			}
			else
			{	
				DC->pDecTimingInfo = (DEC_TIMING_INFO FAR *) NULL;
			}
			DC->bTimingThisFrame = bTimingThisFrame;
		}
#endif // } LOG_DECODE_TIMINGS_ON

		// Check if h263test.ini has been used to override custom message
		// for block edge filter. If BlockEdgeFilter is not specified in
		// the [Decode] section of h263test.ini, DC->bUseBlockEdgeFilter 
		// will be set to 2, and the value specified in a custom message
		// will be chosen.
		if (DC->bUseBlockEdgeFilter == 2) {	 
			DC->bUseBlockEdgeFilter = lpInst->bUseBlockEdgeFilter;
		}


        // First check to see if we are just going to return the P frame
        // which we have already decoded.
        
        /*********************************************************************
         *
         *  Hack for the special "Null" P frames for Windows
         *
         *********************************************************************/
        if (lpicDecEx->lpbiSrc->biSizeImage != 8)
        {

            /* Is there room to copy the bitstream data? */
            // OLD: ASSERT(lpicDecEx->lpbiSrc->biSizeImage <= DC->uSizeBitStreamBuffer);
            // RH:  Make sure that the bitstream can be fit in our allocated buffer. If
            // not, return an error.
            
            if ( lpicDecEx->lpbiSrc->biSizeImage > DC->uSizeBitStreamBuffer) {
				ERRORMESSAGE(("%s: Internal buffer (%ld bytes) too small for input data (%ld bytes)!\r\n", _fx_, DC->uSizeBitStreamBuffer, lpicDecEx->lpbiSrc->biSizeImage));
				if (!H263RTP_VerifyBsInfoStream(DC,
					                           (U8 *) lpicDecEx->lpSrc,
					                            lpicDecEx->lpbiSrc->biSizeImage)) 
				{
					ERRORMESSAGE(("%s: Input buffer too big without RTP extention!\r\n", _fx_));
					iReturn = ICERR_ERROR;
                    goto done;
				}
				else
				 lpicDecEx->lpbiSrc->biSizeImage= DC->uSizeBitStreamBuffer;
            }

            // Copy the source data to the bitstream region.
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

            fpSrc = (U8 FAR *)(P32Inst + DC->X32_BitStream);
            memcpy((char FAR *)fpSrc, (const char FAR *) lpicDecEx->lpSrc, 
                   lpicDecEx->lpbiSrc->biSizeImage);  

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uMemcpy)
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

            // Initialize the bit stream reader 
            GET_BITS_INIT(uWork, uBitsReady);

//#ifdef LOSS_RECOVERY
            DC->Sz_BitStream = lpicDecEx->lpbiSrc->biSizeImage;
            // H263RTP_VerifyBsInfoStream(DC,fpSrc,DC->Sz_BitStream);
            //RtpForcePacketLoss(fpSrc,lpicDecEx->lpbiSrc->biSizeImage,0);
//#endif    
            //  Initialize pointers to data structures which carry info 
            //  between passes
            pRUN_INVERSE_Q = (T_IQ_INDEX *)(P32Inst + DC->X32_InverseQuant);
            pN             = (U32 *)(P32Inst + DC->X32_pN);
            fpMBInfo       = (T_MBInfo FAR *) (P32Inst + DC->X32_uMBInfoStream);

            // Initialize block action stream  pointer
            iBlockNumber = 0;
            fpBlockAction = (T_BlkAction FAR *)(P32Inst + DC->X16_BlkActionStream);

            // Decode the Picture Header
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

            iReturn = H263DecodePictureHeader(DC, fpSrc, uBitsReady, uWork, 
                                              fpbsState);

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uHeaders)
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

            if (iReturn == PACKET_FAULT) 
            {
				ERRORMESSAGE(("%s: PSC lost!\r\n", _fx_));
                iReturn = RtpGetPicHeaderFromBsExt(DC);
                if (iReturn != ICERR_OK)
                    goto done;

                iReturn = RtpH263FindNextPacket(DC, fpbsState, &pN, 
                              &DC->uPQuant, (int *)&mb_start, (int *)&gob_start,p8MVs);
                if (iReturn == NEXT_MODE_A) 
                {    
                    //trick it for now, do not change without consulting Chad
                    gob_start++;
					mb_start++;  
                    ERRORMESSAGE(("%s: Next packet following lost PSC is in MODE A\r\n", _fx_));
                }
                else if ((iReturn == NEXT_MODE_B) || (iReturn == NEXT_MODE_C))
                {
					int k;
  					if (iReturn == NEXT_MODE_B) 
					{
						k=1;
						ERRORMESSAGE(("%s: Next packet in MODE B\r\n", _fx_));
					}
					else
					{
						ERRORMESSAGE(("%s: Next packet in MODE C\r\n", _fx_));
						k=2;
					}

#ifdef H263P
					// The number of MB's is merely (width / 16)
					iNumberOfMBs = DC->uFrameWidth >> 4;
#else
                    iNumberOfMBs = iNumberOfMBsInAGOBBySourceFormat[DC->uSrcFormat];
#endif

                    b_skip = (gob_start* iNumberOfMBs + mb_start)*6*k;
                    for ( k=0; k < b_skip; k++)  *pN++=0;
                    fpBlockAction += b_skip;
                    iBlockNumber  += b_skip;
                    fpMBInfo  += b_skip/6;
                    mb_start++;
                    gob_start++;
					/*for (k=0;k<6;k++)
					{
						fpBlockAction[k].i8MVx2 = p8MVs[0];
						fpBlockAction[k].i8MVy2 = p8MVs[1];
					} */

                }
                else 
                {
                    iReturn = ICERR_UNSUPPORTED;
                    goto done;
                }
            }
            else
            //old code before merging
            if (iReturn != ICERR_OK)
            {
				ERRORMESSAGE(("%s: Error reading the picture header!\r\n", _fx_));
                goto done;
            }
    
            // Set a limit for testing for bitstream over-run
            fpu8MaxPtr = fpSrc;
            fpu8MaxPtr += (lpicDecEx->lpbiSrc->biSizeImage - 1);  
            
            // Initialize some constants
#if defined(H263P) || defined(USE_BILINEAR_MSH26X)
			if (DC->uFrameHeight < 500)
				// Each GOB consists of 16 lines
				iNumberOfGOBs = DC->uFrameHeight >> 4;
			else if (DC->uFrameHeight < 996)
				// Each GOB consists of 32 lines
				iNumberOfGOBs = DC->uFrameHeight >> 5;
			else
				// Each GOB consists of 64 lines
				iNumberOfGOBs = DC->uFrameHeight >> 6;

			iNumberOfMBs = DC->uFrameWidth >> 4;
#else
            iNumberOfGOBs = iNumberOfGOBsBySourceFormat[DC->uSrcFormat];
            iNumberOfMBs = iNumberOfMBsInAGOBBySourceFormat[DC->uSrcFormat];
#endif
            DC->iNumberOfMBsPerGOB = iNumberOfMBs;
            
            /* 
             * Check dimensions:
             *  In H263 a GOB is a single row of MB, and a MB is 16x16 
             */
            ASSERT(((U32)iNumberOfGOBs * 16) == DC->uFrameHeight);
            ASSERT(((U32)iNumberOfMBs * 16) == DC->uFrameWidth);
            
            /*****************************************************************
              FIRST PASS - bitream parsing and IDCT prep work
              ***************************************************************/
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

#ifdef USE_MMX // { USE_MMX
            if (DC->bMMXDecoder)
            {
                __asm {
                    _emit 0x0f 
                    _emit 0x77  //  emms
                }
            }
#endif // } USE_MMX
            iReturn = IAPass1ProcessFrame(DC, 
                                          fpBlockAction, 
                                          fpMBInfo,
                                          fpbsState,
                                          fpu8MaxPtr,
                                          pN,
                                          pRUN_INVERSE_Q,
                                          iNumberOfGOBs,
                                          iNumberOfMBs,
                                          gob_start, 
                                          mb_start);

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
                // decode and inverse quantize the transform coefficients
				TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uDecIDCTCoeffs)
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

            if (iReturn != ICERR_OK) {
				ERRORMESSAGE(("%s: Error during first pass - bitream parsing and IDCT prep work!\r\n", _fx_));
                goto done;
            }
            
            /*****************************************************************
              SECOND PASS - IDCT and motion compensation (MC)
              ***************************************************************/
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

            if (DC->bAdvancedPrediction || DC->bUnrestrictedMotionVectors)
            {
                //  Change parameter profile once Bob is finished making
                //  changes to ExpandPlane routine : AG
                ExpandPlane((U32) (P32Inst + DC->PrevFrame.X32_YPlane + Y_START),
                            (U32) (DC->uFrameWidth),
                            (U32) (DC->uFrameHeight), 
                            16); // TODO 16  number of pels to expand by
                
                ExpandPlane((U32) (P32Inst + DC->PrevFrame.X32_VPlane + UV_START),
                            (U32) (DC->uFrameWidth>>1), 
                            (U32) (DC->uFrameHeight>>1), 
                            8); // TODO 8
                
                ExpandPlane((U32) (P32Inst + DC->PrevFrame.X32_UPlane + UV_START),
                            (U32) (DC->uFrameWidth>>1), 
                            (U32) (DC->uFrameHeight>>1), 
                            8);  // TODO 8
            }

            fpBlockAction  = (T_BlkAction FAR *) (P32Inst + DC->X16_BlkActionStream);
            pRUN_INVERSE_Q = (T_IQ_INDEX *)(P32Inst + DC->X32_InverseQuant);  
            pN             = (U32 *)(P32Inst + DC->X32_pN);                               
            fpMBInfo       = (T_MBInfo FAR *)(P32Inst + DC->X32_uMBInfoStream);

            IAPass2ProcessFrame(DC,
                                fpBlockAction,
                                fpMBInfo,
                                pN,
                                pRUN_INVERSE_Q,
                                iNumberOfGOBs,
                                iNumberOfMBs);

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uIDCTandMC)
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

#ifdef H263P
            if (DC->bDeblockingFilter) {
				// In the loop deblocking filter.
				// Annex J, document LBC-96-358
				// If the filtering is performed inside the loop, we
				// do not also perform a post-process block edge filter.

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
				if (bRealDecompress)
				{
					TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
				}
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

				bTmpPostProcessBEF = DC->bUseBlockEdgeFilter;
				DC->bUseBlockEdgeFilter = FALSE;

				EdgeFilter((U8 *)DC + DC->CurrFrame.X32_YPlane + Y_START,
                           (U8 *)DC + DC->CurrFrame.X32_VPlane + UV_START,
                           (U8 *)DC + DC->CurrFrame.X32_UPlane + UV_START,
                           DC->uFrameWidth,
                           DC->uFrameHeight,
                           PITCH);

	            if (DC->bPBFrame) 
				{
					// Filtering of B frames is not a manner of standardization.
					// We do it since we assume that it will yield improved
					// picture quality.
					// TODO, verify this assumption.
					EdgeFilter((U8 *)DC + DC->PBFrame.X32_YPlane + Y_START,
							   (U8 *)DC + DC->PBFrame.X32_VPlane + UV_START,
							   (U8 *)DC + DC->PBFrame.X32_UPlane + UV_START,
							   DC->uFrameWidth,
							   DC->uFrameHeight,
							   PITCH);
				}

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
				if (bRealDecompress)
				{
					TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uBEFTime)
				}
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

			} // if (DC->bDeblockingFilter)
#endif // H263P

            //copy to the reference frame to prepare for the next frame
            // Decide which frame to display
            if (DC->bPBFrame)
            {    // Set pointers to return B frame for PB pair
                DC->DispFrame.X32_YPlane = DC->PBFrame.X32_YPlane;
                DC->DispFrame.X32_VPlane = DC->PBFrame.X32_VPlane;
                DC->DispFrame.X32_UPlane = DC->PBFrame.X32_UPlane;
            }
            else 
            { // Set pointers to return future P of PB pair
                DC->DispFrame.X32_YPlane = DC->CurrFrame.X32_YPlane;
                DC->DispFrame.X32_VPlane = DC->CurrFrame.X32_VPlane;
                DC->DispFrame.X32_UPlane = DC->CurrFrame.X32_UPlane;
            }
            
            utemp                    = DC->CurrFrame.X32_YPlane;
            DC->CurrFrame.X32_YPlane = DC->PrevFrame.X32_YPlane;
            DC->PrevFrame.X32_YPlane = utemp;
            
            utemp                    = DC->CurrFrame.X32_VPlane ;
            DC->CurrFrame.X32_VPlane = DC->PrevFrame.X32_VPlane;
            DC->PrevFrame.X32_VPlane = utemp;
            
            utemp                    = DC->CurrFrame.X32_UPlane ;
            DC->CurrFrame.X32_UPlane = DC->PrevFrame.X32_UPlane;
            DC->PrevFrame.X32_UPlane = utemp;
        }
        /*********************************************************************
         *
         *  Hack for the special "Null" P frames for Windows
         *
         *********************************************************************/
        else  //  lpicDecEx->lpbiSrc->biSizeImage == 8
        { // Set pointers to return P frame for PB pair
#ifdef _DEBUG
            if (!DC->bPBFrame)
			{
                ERRORMESSAGE(("%s: Null frame received even though previous was not PB\r\n", _fx_));
            }
#endif
            DC->DispFrame.X32_YPlane = DC->PrevFrame.X32_YPlane;
            DC->DispFrame.X32_VPlane = DC->PrevFrame.X32_VPlane;
            DC->DispFrame.X32_UPlane = DC->PrevFrame.X32_UPlane;
        }
    }  // end of H263_CODEC
    else
    {    // why is this here???  Is it really needed for YUV12 display? 
        DC->DispFrame.X32_YPlane = DC->PrevFrame.X32_YPlane;
        DC->DispFrame.X32_VPlane = DC->PrevFrame.X32_VPlane;
        DC->DispFrame.X32_UPlane = DC->PrevFrame.X32_UPlane;
    }
    
    // Return if there is no need to update screen yet.
    if(lpicDecEx->dwFlags & ICDECOMPRESS_HURRYUP) {
        iReturn = ICERR_DONTDRAW;
        goto done;
    }

    if (DC->ColorConvertor == YUV12ForEnc) 
    {
        /* NOTICE: This color converter reverses the order of the data in 
         *         memory.  The decoder uses YVU order and the encoder uses 
         *         YUV order.
         */
        //  TODO can this be DispFrame ????  Trying to get rid of 
        //  references to PrevFrame and CurrFrame after this point  
        H26x_YUV12ForEnc ((HPBYTE)P32Inst,
                          DC->PrevFrame.X32_YPlane + Y_START,
                          DC->PrevFrame.X32_VPlane + UV_START,
                          DC->PrevFrame.X32_UPlane + UV_START,
                          DC->uFrameWidth,
                          DC->uFrameHeight,
                          PITCH,
                          (HPBYTE)lpicDecEx->lpDst,
                          (DWORD)Y_START,
                          (DWORD)(MAX_HEIGHT + 2L*UMV_EXPAND_Y) * PITCH + 8 + UV_START + PITCH / 2,
                          (DWORD)(MAX_HEIGHT + 2L*UMV_EXPAND_Y) * PITCH + 8 + UV_START);
        iReturn = ICERR_OK;
        goto done;
    }

#if 0
    // Fill the Y,U,V Current Frame space with a test pattern
    TestFill((HPBYTE)P32Inst + DC->DispFrame.X32_YPlane + Y_START,
             (HPBYTE)P32Inst + DC->DispFrame.X32_UPlane + UV_START,
             (HPBYTE)P32Inst + DC->DispFrame.X32_VPlane + UV_START,           
                 PITCH,
             DC->uFrameWidth,
             DC->uFrameHeight);
#endif

#if MAKE_GRAY
    // Fill the U,V Current Frame space with a test pattern
    TestFillUV((HPBYTE)P32Inst + DC->DispFrame.X32_UPlane + UV_START,
               (HPBYTE)P32Inst + DC->DispFrame.X32_VPlane + UV_START,           
                   PITCH,
               DC->uFrameWidth,
               DC->uFrameHeight);
#endif

    /* Special case the YVU12 for the encoder because it should not include 
     * BEF, Shaping or aspect ratio correction...
     */

    // Copy Planes to Post Processing area, and block edge filter.
    if (DC->DecoderType == H263_CODEC)
    {
        //  3/5/96: Steve asserted that mirroring is not needed for the remote 
        //  stream (i.e. H263_CODEC)  -a.g.
        //  But I will leave this code in.
        uYPitch  = PITCH;
        uUVPitch = PITCH;

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
		if (bRealDecompress)
		{
			TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
		}
#endif // } DETAILED_DECODE_TIMINGS_ON

        if(bMirror) 
        {
            // copy with mirroring
            pFrame  = (U8 *)DC->p16InstPostProcess;
            uYPlane = DC->PostFrame.X32_YPlane;
            uUPlane = DC->PostFrame.X32_UPlane;
            uVPlane = DC->PostFrame.X32_VPlane;

            FrameMirror((U8 *)DC + DC->DispFrame.X32_YPlane + Y_START,
                ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_YPlane,
#ifdef H263P
				DC->uActualFrameHeight,
				DC->uActualFrameWidth,
#else
                DC->uFrameHeight,
                DC->uFrameWidth,
#endif
                PITCH);
            FrameMirror((U8 *)DC + DC->DispFrame.X32_UPlane + UV_START,
                ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_UPlane,
#ifdef H263P
				DC->uActualFrameHeight/2,
				DC->uActualFrameWidth/2,
#else
                DC->uFrameHeight/2,
                DC->uFrameWidth/2,
#endif
                PITCH);
            FrameMirror((U8 *)DC + DC->DispFrame.X32_VPlane + UV_START,
                ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_VPlane,
#ifdef H263P
				DC->uActualFrameHeight/2,
				DC->uActualFrameWidth/2,
#else
                DC->uFrameHeight/2,
                DC->uFrameWidth/2,
#endif
                PITCH);

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			if (bRealDecompress)
			{
				TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uFrameCopy)
			}
#endif // } DETAILED_DECODE_TIMINGS_ON

        }
        else 
        { // no mirroring

            // check for 320x240 still
            if (DC->uIs320x240) {
                // save frame size, set 320 x 240 size, then copy as normal
                uSaveWidth = DC->uFrameWidth;
                uSaveHeight = DC->uFrameHeight;
                DC->uFrameWidth = 320;
                DC->uFrameHeight = 240;

                FrameCopy (((HPBYTE) P32Inst) + DC->DispFrame.X32_YPlane + Y_START,
                    ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_YPlane,
                   DC->uFrameHeight,
                   DC->uFrameWidth,
                   PITCH);
                FrameCopy (((HPBYTE) P32Inst) + DC->DispFrame.X32_UPlane + UV_START,
                   ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_UPlane,
                   DC->uFrameHeight/2,
                   DC->uFrameWidth/2,
                   PITCH);
                FrameCopy (((HPBYTE) P32Inst) + DC->DispFrame.X32_VPlane + UV_START,
                   ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_VPlane,
                   DC->uFrameHeight/2,
                   DC->uFrameWidth/2,
                   PITCH);

                pFrame  = (U8 *)DC->p16InstPostProcess;
                uYPlane = DC->PostFrame.X32_YPlane;
                uUPlane = DC->PostFrame.X32_UPlane;
                uVPlane = DC->PostFrame.X32_VPlane;

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
				if (bRealDecompress)
				{
					TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uFrameCopy)
				}
#endif // } DETAILED_DECODE_TIMINGS_ON

            }
            else
            {
				// Added checks for adjusting video effects. Since pFrame must be
				// set to DC->p16InstPostProcess to call AdjustPels, the FrameCopy
				// must be done.
				if (!(DC->bUseBlockEdgeFilter || DC->bAdjustLuma || DC->bAdjustChroma)) 
				{
					//  New color convertors do not destroy Y plane input and so
					//  we do not have to do a frame copy
	            	pFrame  = (U8 *)DC;
	            	uYPlane = DC->DispFrame.X32_YPlane + Y_START;
	            	uUPlane = DC->DispFrame.X32_UPlane + UV_START;
	            	uVPlane = DC->DispFrame.X32_VPlane + UV_START;

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
					if (bRealDecompress)
					{
						TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uFrameCopy)
					}
#endif // } DETAILED_DECODE_TIMINGS_ON

				}
				else
				{
					// The block edge filtered frame can not be used as a reference
					// and we need to make a copy of the frame before doing the
					// block edge filtering.
					// This is also true for adjusting pels.
			    	FrameCopy (((HPBYTE) P32Inst) + DC->DispFrame.X32_YPlane + Y_START,
				           ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_YPlane,
						   DC->uFrameHeight,
						   DC->uFrameWidth,
						   PITCH);
		            FrameCopy (((HPBYTE) P32Inst) + DC->DispFrame.X32_UPlane + UV_START,
				           ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_UPlane,
			               DC->uFrameHeight/2,
						   DC->uFrameWidth/2,
						   PITCH);
			    	FrameCopy (((HPBYTE) P32Inst) + DC->DispFrame.X32_VPlane + UV_START,
				           ((HPBYTE) DC->p16InstPostProcess) + DC->PostFrame.X32_VPlane,
			               DC->uFrameHeight/2,
						   DC->uFrameWidth/2,
						   PITCH);
					pFrame  = (U8 *)DC->p16InstPostProcess;
	            	uYPlane = DC->PostFrame.X32_YPlane;
	            	uUPlane = DC->PostFrame.X32_UPlane;
	            	uVPlane = DC->PostFrame.X32_VPlane;

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
					if (bRealDecompress)
					{
						TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uFrameCopy)
					}
#endif // } DETAILED_DECODE_TIMINGS_ON

					if (DC->bUseBlockEdgeFilter) {
						// C version of block edge filter
						// should this be added to the mirrored case?
						// it should not be added to the b320x240 case
						// since we want that to be as sharp as possible
#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
						if (bRealDecompress)
						{
							TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
						}
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

						EdgeFilter((unsigned char *)(pFrame + uYPlane),
								   (unsigned char *)(pFrame + uUPlane),
								   (unsigned char *)(pFrame + uVPlane),
#ifndef NEW_BEF // { NEW_BEF
								   DC->uPQuant,
#endif // } NEW_BEF
								   DC->uFrameWidth,
								   DC->uFrameHeight,
								   PITCH);

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
						if (bRealDecompress)
						{
							TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uBEFTime)
						}
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
					}
				}
			}
		} // end no mirroring case
#ifdef H263P
		if (DC->bDeblockingFilter) {
			// Restore post-process (i.e., outside of loop) block edge filter flag
			DC->bUseBlockEdgeFilter = bTmpPostProcessBEF;
		}
#endif
    }
    else   // YUV12
    {
        const U32 uHeight = DC->uFrameHeight;
        const U32 uWidth  = DC->uFrameWidth;
        const U32 uYPlaneSize = uHeight*uWidth;

        uYPitch  = uWidth;
        uUVPitch = uWidth >> 1;

        if(bMirror) // mirroring and YUV12
        {
            HPBYTE pSource, pDestination;

            pFrame  = DC->p16InstPostProcess;
            uYPlane = DC->PostFrame.X32_YPlane;
            uUPlane = uYPlane + uYPlaneSize;
            uVPlane = uUPlane + (uYPlaneSize>>2);

            pSource = (HPBYTE)lpicDecEx->lpSrc;
            pDestination = (HPBYTE)(DC->p16InstPostProcess + (DWORD)DC->PostFrame.X32_YPlane);
            FrameMirror (pSource, pDestination, uHeight, uWidth, uWidth);

            pSource      += uYPlaneSize;
            pDestination += uYPlaneSize;
            FrameMirror (pSource, pDestination, uHeight>>1, uWidth>>1, uWidth>>1);

            pSource += (uYPlaneSize>>2);
            pDestination += (uYPlaneSize>>2);
            FrameMirror (pSource, pDestination, uHeight>>1, uWidth>>1, uWidth>>1);
        }
        else // no mirroring
        {
            HPBYTE pSource, pDestination;
            if (DC->bAdjustLuma || DC->bAdjustChroma) {

				pFrame  = DC->p16InstPostProcess;
				uYPlane = DC->PostFrame.X32_YPlane;
				uUPlane = uYPlane + uYPlaneSize;
				uVPlane = uUPlane + (uYPlaneSize>>2);

				pSource = (HPBYTE)lpicDecEx->lpSrc;
				pDestination = (HPBYTE)(DC->p16InstPostProcess + (DWORD)DC->PostFrame.X32_YPlane);
				FrameCopy (pSource, pDestination, uHeight, uWidth, uWidth);

				pSource      += uYPlaneSize;
				pDestination += uYPlaneSize;
				FrameCopy (pSource, pDestination, uHeight>>1, uWidth>>1, uWidth>>1);

				pSource += (uYPlaneSize>>2);
				pDestination += (uYPlaneSize>>2);
				FrameCopy (pSource, pDestination, uHeight>>1, uWidth>>1, uWidth>>1);
			} else {
				// Copy the V plane from the source buffer into DC because the
				// input buffer may end at the end of a section. The assembler versions
				// of the color convertors are optimized to read ahead, in which case
				// a GPF occurs if the buffer is at the end of a section.
				pFrame  = (HPBYTE)lpicDecEx->lpSrc;
				uYPlane = 0;
				uUPlane = uYPlane + uYPlaneSize;
				uVPlane = uUPlane + (uYPlaneSize>>2);

                pSource = (HPBYTE)lpicDecEx->lpSrc + uYPlane + uYPlaneSize + (uYPlaneSize >> 2);
                pDestination = (HPBYTE)DC->p16InstPostProcess + DC->PostFrame.X32_YPlane +
					uYPlaneSize + (uYPlaneSize >> 2);
                FrameCopy (pSource, pDestination, uHeight>>1, uWidth>>1, uWidth>>1);
				uVPlane += (pDestination - pSource);
			}
        }
         
    }  //  else YUV12

    // Check if we are to do aspect ration correction on this frame.
    if (DC->bForceOnAspectRatioCorrection || lpInst->bCorrectAspectRatio) {
        bShapingFlag = 1;
        uNewFrameHeight = (DC->uFrameHeight * 11 / 12);
    } else {
        bShapingFlag = 0;
        uNewFrameHeight = DC->uFrameHeight;
    }

    // Do the PEL color adjustments if necessary.
    if(DC->bAdjustLuma) 
    {
        // width is rounded up to a multiple of 8
        AdjustPels(pFrame,
                   uYPlane,
                   DC->uFrameWidth,
                   uYPitch,
                   DC->uFrameHeight,
                   (U32) DC->X16_LumaAdjustment);
    }
    if(DC->bAdjustChroma) 
    {
        // width = Y-Width / 4 and then rounded up to a multiple of 8
        AdjustPels(pFrame,
                   uUPlane,
                   (DC->uFrameWidth >> 1),
                   uUVPitch,
                   (DC->uFrameHeight >> 1),
                  (U32) DC->X16_ChromaAdjustment);
        AdjustPels(pFrame,
                   uVPlane,
                   (DC->uFrameWidth >> 1),
                   uUVPitch,
                   (DC->uFrameHeight >> 1),
                   (U32) DC->X16_ChromaAdjustment);
    }

    // Determine parameters need for color conversion.
    if(lpicDecEx->lpbiDst->biCompression == FOURCC_YUY2)  /* output pitch, offset */
    {
		intPitch = (lpicDecEx->lpbiDst->biBitCount >> 3) * abs ((int)(lpicDecEx->lpbiDst->biWidth));
		lOutput = 0;                                       /* for YUY2 format */
		uNewOffsetToLine0 = DC->CCOffsetToLine0;
		bShapingFlag=FALSE;
    }
    else if ((lpicDecEx->lpbiDst->biCompression == FOURCC_YUV12) || (lpicDecEx->lpbiDst->biCompression == FOURCC_IYUV))  /* output pitch, offset */
    {
		intPitch = 0xdeadbeef;  // should not be used
		lOutput = 0;                                       /* for YUV format */
		uNewOffsetToLine0 = DC->CCOffsetToLine0;
		bShapingFlag=FALSE;
    }
    else  // not YUY2
    {
        // this call also sets intPitch
        lOutput = DibXY(lpicDecEx, &intPitch, lpInst->YScale);

        if (DC->uIs320x240)
            uNewOffsetToLine0 = DC->CCOffset320x240;
        else
            uNewOffsetToLine0 = DC->CCOffsetToLine0;

        if (!bIsDCI)
        {
             uNewOffsetToLine0 += 
                ( (U32)DC->uFrameHeight - (U32)uNewFrameHeight ) * (U32)intPitch;

            if(lpInst->YScale == 2)
                 uNewOffsetToLine0 += 
                    ( (U32)DC->uFrameHeight - (U32)uNewFrameHeight ) * (U32)intPitch;

        }  // end if (!bIsDCI)

    } // end if (YUY2) ... else ...

    // Call the H26x color convertors 
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
	TIMER_BEFORE(bTimingThisFrame,uStartLow,uStartHigh,uBefore);
#endif // } DETAILED_DECODE_TIMINGS_ON

#ifdef USE_MMX // { USE_MMX
    ColorConvertorCatalog[DC->ColorConvertor].ColorConvertor[DC->bMMXDecoder ? MMX_CC : PENTIUM_CC](
#else // }{ USE_MMX
    ColorConvertorCatalog[DC->ColorConvertor].ColorConvertor[PENTIUM_CC](
#endif // } USE_MMX
        (LPSTR) pFrame+uYPlane,                  // Y plane
        (LPSTR) pFrame+uVPlane,                  // V plane
        (LPSTR) pFrame+uUPlane,                  // U plane
#ifdef H263P
		// The actual frame dimensions are needed for the color conversion
		(UN) DC->uActualFrameWidth,
		(UN) DC->uActualFrameHeight,
#else
        (UN) DC->uFrameWidth,
        (UN) DC->uFrameHeight,
#endif
        (UN) uYPitch,
        (UN) uUVPitch,
        (UN) (bShapingFlag ? 12 : 9999),         // Aspect Adjustment Counter
        (LPSTR) lpicDecEx->lpDst,                // Color Converted Frame
        (U32) lOutput,                           // DCI offset
        (U32) uNewOffsetToLine0,                 // Color converter offset to line 0
        (int) intPitch,                          // Color converter pitch
        DC->ColorConvertor);

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
		TIMER_AFTER_P5(bTimingThisFrame,uStartLow,uStartHigh,uBefore,uElapsed,uOutputCC);
#endif // } DETAILED_DECODE_TIMINGS_ON

    // check for 320x240 still
    if (DC->uIs320x240) {
        // restore frame size for next frame
        DC->uFrameWidth = uSaveWidth;
        DC->uFrameHeight = uSaveHeight;
    }

    iReturn = ICERR_OK;

done:
#ifdef USE_MMX // { USE_MMX
	if(NULL != DC)
	{
		if (DC->bMMXDecoder)
		{
			__asm {
				_emit 0x0f 
				_emit 0x77  //  emms
			}
		}
	}
#endif // } USE_MMX

#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
	if (bRealDecompress)
	{
		TIMER_STOP(bTimingThisFrame,uStartLow,uStartHigh,uDecodeTime);
		if (bTimingThisFrame)
		{
			// Update the decompression timings counter
			#pragma message ("Current decode timing computations assume P5/90Mhz")
			UPDATE_COUNTER(g_pctrDecompressionTimePerFrame, (uDecodeTime + 45000UL) / 90000UL);
			UPDATE_COUNTER(g_pctrBEFTimePerFrame, (uBEFTime + 45000UL) / 90000UL);

			DEBUGMSG(ZONE_DECODE_DETAILS, ("%s: Decompression time: %ld\r\n", _fx_, (uDecodeTime + 45000UL) / 90000UL));
			DEBUGMSG(ZONE_DECODE_DETAILS, ("%s: Block Edge Filtering time: %ld\r\n", _fx_, (uBEFTime + 45000UL) / 90000UL));
		}
	}
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)

#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
	if (bRealDecompress)
	{
		if (bTimingThisFrame)
		{
			pDecTimingInfo = DC->pDecTimingInfo + DC->uStatFrameCount;
			pDecTimingInfo->uDecodeFrame = uDecodeTime;
			pDecTimingInfo->uBEF = uBEFTime;
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			pDecTimingInfo->uHeaders = uHeaders;
			pDecTimingInfo->uMemcpy = uMemcpy;
			pDecTimingInfo->uFrameCopy = uFrameCopy;
			pDecTimingInfo->uIDCTandMC = uIDCTandMC;
			pDecTimingInfo->uOutputCC = uOutputCC;
			pDecTimingInfo->uDecIDCTCoeffs = uDecIDCTCoeffs;
#endif // } DETAILED_DECODE_TIMINGS_ON
			DC->uStatFrameCount++;
		}
	}
#endif // } LOG_DECODE_TIMINGS_ON

    return iReturn;
}


/************************************************************************
 *  H263TermDecoderInstance
 *    This function frees the space allocated for an instance of the H263 
 *    decoder.
 ************************************************************************/
LRESULT H263TermDecoderInstance(
#if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON) // { #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
    LPDECINST lpInst,
	BOOL bRealDecompress)
#else // }{ #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
    LPDECINST lpInst)
#endif // } #if defined(DECODE_TIMINGS_ON) || defined(DETAILED_DECODE_TIMINGS_ON)
{ 
    LRESULT iReturn=ICERR_OK;
    T_H263DecoderCatalog * DC;

	FX_ENTRY("H263TermDecoderInstance");
    
    if(IsBadWritePtr((LPVOID)lpInst, sizeof(DECINSTINFO)))
    {
		ERRORMESSAGE(("%s: Bad input parameter!\r\n", _fx_));
        iReturn = ICERR_BADPARAM;
    }
    if(lpInst->Initialized == FALSE)
    {
		ERRORMESSAGE(("%s: Uninitialized instance!\r\n", _fx_));
        return(ICERR_OK);
    }
    
    lpInst->Initialized = FALSE;
    
    DC = (T_H263DecoderCatalog *) ((((U32) lpInst->pDecoderInst) + 31) & ~0x1F);
    
    if (DC->_p16InstPostProcess != NULL)
    {
		HeapFree(GetProcessHeap(), 0, DC->_p16InstPostProcess);
#ifdef TRACK_ALLOCATIONS
		// Track memory allocation
		RemoveName((unsigned int)DC->_p16InstPostProcess);
#endif
		// PhilF: Also freed in H263TerminateDecoderInstance! For now set to NULL to avoid second HeapFree.
		// Investigate reason for 2nd call later...
		DC->_p16InstPostProcess = NULL;
    }  
    
#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
	if (bRealDecompress && DC->X32_DecTimingInfo)
	{
		DC->pDecTimingInfo = (DEC_TIMING_INFO FAR *)( ((U8 FAR *)DC) + DC->X32_DecTimingInfo );
		OutputDecodeTimingStatistics("c:\\decode.txt", DC->pDecTimingInfo, DC->uStatFrameCount);
	}
#endif // } LOG_DECODE_TIMINGS_ON

    HeapFree(GetProcessHeap(), 0, lpInst->pDecoderInst);
#ifdef TRACK_ALLOCATIONS
	// Track memory allocation
	RemoveName((unsigned int)lpInst->pDecoderInst);
#endif

    return iReturn;
}

/***********************************************************************
 *  Description:
 *    This routine parses the bit-stream and initializes two major streams:
 *      1) pN: no of coefficients in each of the block (biased by 65 for INTRA)
 *      2) pRun_INVERSE_Q: de-quantized coefficient stream for the frame;
 *           MMX stream is scaled because we use scaled IDCT.
 *    Other information (e.g. MVs) is kept in decoder catalog, block action 
 *    stream, and MB infor stream.
 *  Parameters:
 *    DC:            Decoder catalog ptr
 *    fpBlockAction: block action stream ptr
 *    fpMBInfo:      Macroblock info ptr
 *    fpbsState:     bit-stream state pointer
 *    fpu8MaxPtr:    sentinel value to check for bit-stream overruns
 *    pN:            stream of no. of coeffs (biased by block type) for each block
 *    pRun_INVERSE_Q:stream of de-quantized (and scaled if using MMX) coefficients
 *    iNumberOfGOBs: no. of GOBs in the frame
 *    iNumberOfMBs:  no. of MBs in a GOB in the frame
 *    iGOB_start:    
 *    iMB_start:     
 *  Note:
 ***********************************************************************/
#pragma code_seg("IACODE1")
static LRESULT IAPass1ProcessFrame(
    T_H263DecoderCatalog *DC,
    T_BlkAction          *fpBlockAction,
    T_MBInfo             *fpMBInfo,
    BITSTREAM_STATE      *fpbsState,
    U8                   *fpu8MaxPtr,
    U32                  *pN,
    T_IQ_INDEX           *pRUN_INVERSE_Q,
    const I32             iNumberOfGOBs,
    const I32             iNumberOfMBs,
    const I32             iGOB_start,
    const I32             iMB_start)
{
    I32 g, m, gg, mm, iReturn, iBlockNumber = 0 ;
#if 1  
    I32 mb_start = iMB_start;
    I32 old_g, old_m, b_skip;
    U32 *pNnew;
	I8  p8MVs[4]={0,0,0,0};

	FX_ENTRY("IAPass1ProcessFrame");

    //  In case of H.263, iGOB_start will be 1; H.263RTP may have value
    //  larger than 1

    for (g = 1; g < iGOB_start; g++, fpBlockAction += iNumberOfMBs*6)
        H263InitializeGOBBlockActionStream(DC, g, fpBlockAction);        

    for (g = iGOB_start; g <= iNumberOfGOBs; g++) 
    {
        iReturn = H263DecodeGOBHeader(DC, fpbsState, g);
        if (iReturn != ICERR_OK) 
        {
			ERRORMESSAGE(("%s: Error reading GOB header!\r\n", _fx_));
            goto error;
        }

        if (g != 1) g = DC->uGroupNumber + 1;
         
        fpBlockAction = (T_BlkAction FAR *)((U8 *)DC + DC->X16_BlkActionStream);
		fpBlockAction += (g - 1)* iNumberOfMBs*6;

        H263InitializeGOBBlockActionStream(DC, g, fpBlockAction);        
        //  re-sync uBlockNum fpBlockAction, fpMBInfo at this point
        iBlockNumber  = (g - 1)* iNumberOfMBs*6+(mb_start-1)*6;
        fpBlockAction = (T_BlkAction FAR *)((U8 *)DC + DC->X16_BlkActionStream);
        fpMBInfo      = (T_MBInfo FAR *) ((U8 *)DC + DC->X32_uMBInfoStream);    
        fpBlockAction += iBlockNumber;
        fpMBInfo      += iBlockNumber/6;
        if (DC->bPBFrame)
		 pNnew         = (U32 *)((U8 *)DC + DC->X32_pN) + iBlockNumber*2;
        else
		 pNnew         = (U32 *)((U8 *)DC + DC->X32_pN) + iBlockNumber;

        while (pN < pNnew ) *pN++ = 0;
        
        // For each MB do ...
        for (m = mb_start; m <= iNumberOfMBs; m++, iBlockNumber += 6, fpBlockAction += 6, fpMBInfo++) 
        {
            if (mb_start != 1) mb_start = 1;     //use it only once     ?
            
            iReturn = H263DecodeMBHeader(DC, fpbsState, &pN, fpMBInfo);   // NEW - added pN

            if (iReturn == PACKET_FAULT)
            {
				ERRORMESSAGE(("%s: H263DecodeMBHeader() failed!\r\n", _fx_));

                old_g = g;
                old_m = m;
                //Find the next good packet and find GOB and MB lost
                iReturn = RtpH263FindNextPacket(DC, fpbsState, &pN, 
					                            &DC->uPQuant,(int *)&m, (int *)&g,
												p8MVs);
                if (iReturn == NEXT_MODE_A) 
                {
					ERRORMESSAGE(("%s: Next packet in MODE A\r\n", _fx_));
					MVAdjustment(fpBlockAction, iBlockNumber, old_g-1, old_m-1, g, m,iNumberOfMBs); //Chad,7/22/96
                    break;
                }
                else if ((iReturn == NEXT_MODE_B) ||(iReturn == NEXT_MODE_C) ) 
                {//lost multiple of MBs, could belong to more than one GOB
  					if (iReturn == NEXT_MODE_B) 
					{
					ERRORMESSAGE(("%s: Next packet in MODE B\r\n", _fx_));
					  b_skip = ((g - old_g+1)* iNumberOfMBs + m - old_m + 1)*6;
                      for (int k = 0; k < b_skip; k++)  *pN++ = 0;
					}
					else
					{
					ERRORMESSAGE(("%s: Next packet in MODE C\r\n", _fx_));
					  b_skip = ((g - old_g+1)* iNumberOfMBs + m - old_m + 1)*6*2;
                      for (int k = 0; k < b_skip; k++)  *pN++ = 0;
					  b_skip = b_skip /2;
                    }
					
                    for (int k=0;k< b_skip /6;k++)
					{
						fpMBInfo->i8MVDBx2=0;
						fpMBInfo->i8MVDBy2=0;
						fpMBInfo->i8MBType =0;
						fpMBInfo++;
                    }
					fpMBInfo--;
                    b_skip -= 6;     //this is a tricky one since the parameter 
                                     //below will be adjust again later
                                     //Chad, 8/28/96
                    fpBlockAction += b_skip;
                    iBlockNumber  += b_skip;
					g++;    //because g start with 1 instead of 0 as specified by H.263
					for (k=0;k<6;k++)
					{
						fpBlockAction[k].i8MVx2 = p8MVs[0];
						fpBlockAction[k].i8MVy2 = p8MVs[1];
					}

                }
                else //Added by Chad.
                if (iReturn == NEXT_MODE_LAST)
                { 
                    int ii, jj, kk;   //last packet found
                                    //set all the rest of MB and GOB to NOT CODED.
					ERRORMESSAGE(("%s: Last packet lost\r\n", _fx_));
                    for ( ii = m;ii <= iNumberOfMBs; ii++) 
                        for (kk = 0; kk < 6; kk++) 
                            *pN++ = 0;
                    for ( jj = g; jj <= iNumberOfGOBs; jj++)
                        for (ii = 0; ii <= iNumberOfMBs; ii++)
                            for (kk = 0; kk<6; kk++) 
                                *pN++ = 0;
                    m = iNumberOfMBs;
                    g = iNumberOfMBs;
                }
			    DC->bCoded = FALSE;
			}
            else if (iReturn != ICERR_OK) 
            {
				ERRORMESSAGE(("%s: Error reading MB header!\r\n", _fx_));
                goto error;
            }
            
#ifdef NEW_BEF // { NEW_BEF
            gg = (g - 1);
            mm = (m - 1);
#else // }{ NEW_BEF
			gg = (g-1)<<1;
			mm = (m-1)<<1;
#endif // } NEW_BEF
            if (DC->bCoded) 
            {
				// coded_map is used by the block edge filter to indicate
				// which blocks are coded, and which are not coded.
#ifdef NEW_BEF // { NEW_BEF
                coded_map[gg+1][mm+1]   = 1;
				QP_map[gg][mm] = (char)DC->uGQuant;
#else // }{ NEW_BEF
				coded_map[gg]  [mm]   = 1;
				coded_map[gg+1][mm]   = 1;
				coded_map[gg]  [mm+1] = 1;
				coded_map[gg+1][mm+1] = 1;
#endif // } NEW_BEF

                // decode and inverse quantize the transform coefficients
                iReturn = H263DecodeIDCTCoeffs(DC, 
                                               fpBlockAction, 
                                               iBlockNumber, 
                                               fpbsState, 
                                               fpu8MaxPtr,
                                               &pN,
                                               &pRUN_INVERSE_Q);
                
                if (iReturn != ICERR_OK) {
					ERRORMESSAGE(("%s: Error parsing MB data!\r\n", _fx_));
                    goto error;
                }
            }  //  end if DC->bCoded
			else
			{
#ifdef NEW_BEF // { NEW_BEF
                coded_map[gg+1][mm+1]   = 0;
#else // }{ NEW_BEF
				coded_map[gg]  [mm]   = 0;
				coded_map[gg+1][mm]   = 0;
				coded_map[gg]  [mm+1] = 0;
				coded_map[gg+1][mm+1] = 0;
#endif // } NEW_BEF
			}

        } // end for each MB
        
        /* allow the pointer to address up to four beyond the end - reading
         * by DWORD using postincrement.
         */
        if (fpbsState->fpu8 > fpu8MaxPtr+4)
            goto error;
        //  The test matrix includes the debug version of the driver.  The 
        //  following assertion creates a problem when testing with VideoPhone
        //  and so please do not check-in a version with the assertion 
        //  uncommented.
        // ASSERT(fpbsState->fpu8 <= fpu8MaxPtr+4);
        
    } // End for each GOB
    DC->iVerifiedBsExt=FALSE;

#else
//old code  
    for (g = 1; g <= iNumberOfGOBs; g++) 
    {
        iReturn = H263DecodeGOBHeader(DC, fpbsState, g);
        if (iReturn != ICERR_OK) {
			ERRORMESSAGE(("%s: Error reading GOB header!\r\n", _fx_));
            goto error;
        }
        H263InitializeGOBBlockActionStream(DC, g, fpBlockAction);        
        
        /* For each MB do ...
         */
        for (m = 1; m <= iNumberOfMBs; 
             m++, iBlockNumber+=6, fpBlockAction += 6, fpMBInfo++) 
        {
            iReturn = H263DecodeMBHeader(DC, fpbsState, &pN, fpMBInfo);
            
            if (iReturn != ICERR_OK) {
				ERRORMESSAGE(("%s: Error reading MB header!\r\n", _fx_));
                goto error;
            }
            
            if (DC->bCoded) {
                // decode and inverse quantize the transform coefficients
                iReturn = H263DecodeIDCTCoeffs(DC, 
                                               fpBlockAction, 
                                               iBlockNumber, 
                                               fpbsState, 
                                               fpu8MaxPtr,
                                               &pN,
                                               &pRUN_INVERSE_Q);
                if (iReturn != ICERR_OK) 
                {
					ERRORMESSAGE(("%s: Error parsing MB data!\r\n", _fx_));
                    goto error;
                }
            }  //  end if DC->bCoded
        } // end for each MB
        
        /* allow the pointer to address up to four beyond the end - reading
         * by DWORD using postincrement.
         */
        ASSERT(fpbsState->fpu8 <= fpu8MaxPtr+4);
        
    } // End for each GOB
#endif

    return ICERR_OK;

error:
    return ICERR_ERROR;
}
#pragma code_seg()


/***********************************************************************
 *  Description:
 *    This routines does IDCT and motion compensation.
 *  Parameters:
 *    DC:            Decoder catalog ptr
 *    fpBlockAction: block action stream ptr
 *    fpMBInfo:      Macroblock info ptr
 *    pN:            stream of no. of coeffs (biased by block type) for each block
 *    pRun_INVERSE_Q:stream of de-quantized (and scaled if using MMX) coefficients
 *    iNumberOfGOBs: no. of GOBs in the frame
 *    iNumberOfMBs:  no. of MBs in a GOB in the frame
 *  Note:
 ***********************************************************************/
#pragma code_seg("IACODE2")
static void IAPass2ProcessFrame(
    T_H263DecoderCatalog *DC,
    T_BlkAction          *fpBlockAction,
    T_MBInfo             *fpMBInfo,
    U32                  *pN,
    T_IQ_INDEX           *pRUN_INVERSE_Q,
    const I32             iNumberOfGOBs,
    const I32             iNumberOfMBs
)
{
    I32 g, m, b, uBlockNumber = 0, iEdgeFlag=0;
    U32 pRef[6];

    // for each GOB do
    for (g = 1 ; g <= iNumberOfGOBs; g++) 
    {
        // for each MB do
        for (m = 1; m <= iNumberOfMBs; m++, fpBlockAction+=6, fpMBInfo++) 
        {
            //  Motion Vectors need to be clipped if they point outside the 
            //  16 pels wide edge
            if (DC->bUnrestrictedMotionVectors)   
            {
                iEdgeFlag = 0;
                if (m == 1)
                    iEdgeFlag |= LEFT_EDGE;
                if (m == DC->iNumberOfMBsPerGOB)
                    iEdgeFlag |= RIGHT_EDGE;
                if (g == 1)
                    iEdgeFlag |= TOP_EDGE;
                if (g == iNumberOfGOBsBySourceFormat[DC->uSrcFormat])
                    iEdgeFlag |= BOTTOM_EDGE;
            }
            // for each block do
            for (b = 0; b < 6; b++) 
            {     // AP-NEW
                // do inverse transform & motion compensation for the block
                H263IDCTandMC(DC, fpBlockAction, b, m, g, pN, pRUN_INVERSE_Q, 
                              fpMBInfo, iEdgeFlag); // AP-NEW
                // Adjust pointers for next block     
                if ( *pN >= 65 )
                    pRUN_INVERSE_Q += *pN - 65;
                else
                    pRUN_INVERSE_Q += *pN;
                pN++;
            }  // end for each block
            
            // if this is a PB Frame
            if (DC->bPBFrame) 
            {
                // Compute the B Frame motion vectors
                H263BBlockPrediction(DC, fpBlockAction, pRef, fpMBInfo, 
                                     iEdgeFlag);  // AP-NEW
                // For each B block
                for (b = 0; b < 6; b++) 
                {
                    //  perform inverse transform & bi-directional motion 
                    //  compensation
                    H263BFrameIDCTandBiMC(DC, fpBlockAction, b, pN, 
                                          pRUN_INVERSE_Q, pRef);
                    // Adjust pointers for next block     
                    pRUN_INVERSE_Q += *pN;
                    pN++;
                }  // end for each B block
            }  // end if PB Frame
        }  // end for each MB
    }  // End for each GOB
}
#pragma code_seg()


/****************************************************************************
 *  DibXY
 *    This function is used to map color converted output to the screen.
 *    note: this function came from the H261 code base.
 ****************************************************************************/
static long DibXY(ICDECOMPRESSEX FAR *lpicDecEx, LPINT lpiPitch, UINT yScale)
{
    int                 iPitch;             /* width of DIB                */
    long                lOffset = 0;
    LPBITMAPINFOHEADER  lpbi = lpicDecEx->lpbiDst;

    iPitch = ( ( (abs((int)lpbi->biWidth) * (int)lpbi->biBitCount) >> 3) + 3) & ~3;

    if(lpicDecEx->xDst > 0)                 /* go to proper X position     */
        lOffset += ((long)lpicDecEx->xDst * (long)lpbi->biBitCount) >> 3;

    if(lpbi->biHeight * lpicDecEx->dxSrc < 0) { /* DIB is bottom to top    */
        lOffset +=  (long) abs((int)lpbi->biWidth) * 
                    (long) abs((int)lpbi->biHeight) *
                    ((long) lpbi->biBitCount >> 3) - 
                    (long) iPitch;

    /************************************************************************
     *  This next line is used to subtract the amount that Brian added
     *  to CCOffsetToLine0 in COLOR.C during initialization.  This is 
     *  needed because, for DCI, the pitch he used is incorrect. 
     ***********************************************************************/

        lOffset -=    ((long) yScale * (long) lpicDecEx->dySrc - 1) *     
                    (long) lpicDecEx->dxDst * ((long) lpbi->biBitCount >> 3);  

        iPitch *= -1;
    }

    if(lpicDecEx->yDst > 0)                 /* go to proper Y position     */
        lOffset += ((long)lpicDecEx->yDst * (long)iPitch);

    if(lpicDecEx->dxSrc > 0) {
        lOffset += ((long)lpicDecEx->dyDst * (long)iPitch) - (long)iPitch;
        iPitch *= -1;
    }

    if( (lpicDecEx->dxDst == 0) && (lpicDecEx->dyDst == 0) )
        *lpiPitch = -iPitch;
    else
        *lpiPitch = iPitch;
  
    return(lOffset);
}


/************************************************************************
 *  GetDecoderOptions:
 *    Get the options, saving them in the catalog
 ***********************************************************************/
static void GetDecoderOptions(
    T_H263DecoderCatalog * DC)
{
    /* Default Options
     */
#ifdef NO_BEF // { NO_BEF
    DC->bUseBlockEdgeFilter = 0;
#else // }{ NO_BEF
    DC->bUseBlockEdgeFilter = 1;
#endif // } NO_BEF
    DC->bForceOnAspectRatioCorrection = 0;
#ifdef USE_MMX // { USE_MMX
    DC->bMMXDecoder = MMxVersion;
#endif // } USE_MMX

	FX_ENTRY("GetDecoderOptions");

    /* Can only use force aspect ratio correction on if SQCIF, QCIF, or CIF
     */
    if (DC->bForceOnAspectRatioCorrection)
    {
        if (! ( ((DC->uFrameWidth == 128) && (DC->uFrameHeight ==  96)) ||
                ((DC->uFrameWidth == 176) && (DC->uFrameHeight == 144)) ||
                ((DC->uFrameWidth == 352) && (DC->uFrameHeight == 288)) ) )
        {
			ERRORMESSAGE(("%s: Aspect ratio correction can not be forced on unless the dimensions are SQCIF, QCIF, or CIF!\r\n", _fx_));
            DC->bForceOnAspectRatioCorrection = 0;
        }
    }

    /* Display the options
     */
    if (DC->bUseBlockEdgeFilter)
    {
		DEBUGMSG (ZONE_INIT, ("%s: Decoder option (BlockEdgeFilter) is ON\r\n", _fx_));
    }
    else
    {
		DEBUGMSG (ZONE_INIT, ("%s: Decoder option (BlockEdgeFilter) is OFF\r\n", _fx_));
    }
    if (DC->bForceOnAspectRatioCorrection)
    {
		DEBUGMSG (ZONE_INIT, ("%s: Decoder option (ForceOnAspectRatioCorrection) is ON\r\n", _fx_));
    }
    else
    {
		DEBUGMSG (ZONE_INIT, ("%s: Decoder option (ForceOnAspectRatioCorrection) is OFF\r\n", _fx_));
    }
#ifdef USE_MMX // { USE_MMX
    if (DC->bMMXDecoder)
    {
		DEBUGMSG (ZONE_INIT, ("%s: Decoder option (MMXDecoder) is ON\r\n", _fx_));
    }
    else
    {
		DEBUGMSG (ZONE_INIT, ("%s: Decoder option (MMXDecoder) is OFF\r\n", _fx_));
    }
#else // }{ USE_MMX
	DEBUGMSG (ZONE_INIT, ("%s: Decoder option (MMXDecoder) is OFF\r\n", _fx_));
#endif // } USE_MMX
} /* end GetDecoderOptions() */


#if !defined(H263P)
#ifdef NEW_BEF // { NEW_BEF
/**********************************************************************
 *
 *      Name:           EdgeFilter
 *      Description:    performs deblocking filtering on
 *                      reconstructed frames
 *      
 *      Input:          pointers to reconstructed frame and difference 
 *                      image
 *      Returns:       
 *      Side effects:
 *
 *      Date: 951129    Author: Gisle.Bjontegaard@fou.telenor.no
 *                              Karl.Lillevold@nta.no
 *      Modified for annex J in H.263+: 961120   Karl O. Lillevold
 *
 ***********************************************************************/
static void EdgeFilter(unsigned char *lum, 
                       unsigned char *Cb, 
                       unsigned char *Cr, 
                       int width, int height, int pitch
                      )
{

    /* Luma */
    HorizEdgeFilter(lum, width, height, pitch, 0);
    VertEdgeFilter (lum, width, height, pitch, 0);

    /* Chroma */
    HorizEdgeFilter(Cb, width>>1, height>>1, pitch, 1);
    VertEdgeFilter (Cb, width>>1, height>>1, pitch, 1);
    HorizEdgeFilter(Cr, width>>1, height>>1, pitch, 1);
    VertEdgeFilter (Cr, width>>1, height>>1, pitch, 1);

    return;
}

/***********************************************************************/
static void HorizEdgeFilter(unsigned char *rec, 
                            int width, int height, int pitch, int chr)
{
  int i,j,k;    
  int delta;
  int mbc, mbr, do_filter;
  unsigned char *r_2, *r_1, *r, *r1;
  signed char *deltatab;

  /* horizontal edges */
  r = rec + 8*pitch;
  r_2 = r - 2*pitch;
  r_1 = r - pitch;
  r1 = r + pitch;

  for (j = 8; j < height; j += 8) {
    for (i = 0; i < width; i += 8) {

      if (!chr) {
        mbr = (j >> 4); 
        mbc = (i >> 4);
      }
      else {
        mbr = (j >> 3); 
        mbc = (i >> 3);
      }

      deltatab = dtab + 176 + 351 * (QP_map[mbr][mbc] - 1);

      do_filter = coded_map[mbr+1][mbc+1] || coded_map[mbr][mbc+1];

      if (do_filter) {
        for (k = i; k < i+8; k++) {
          delta = (int)deltatab[ (( (int)(*(r_2 + k) * 3) -
                                    (int)(*(r_1 + k) * 8) +
                                    (int)(*(r   + k) * 8) -
                                    (int)(*(r1  + k) * 3)) >>4)];
                        
          *(r + k) = ClampTbl[ (int)(*(r + k)) - delta + CLAMP_BIAS];
          *(r_1 + k) = ClampTbl[ (int)(*(r_1 + k)) + delta + CLAMP_BIAS];

        }
      }
    }
    r   += (pitch<<3);
    r1  += (pitch<<3);
    r_1 += (pitch<<3);
    r_2 += (pitch<<3);
  }
  return;
}

/***********************************************************************/
static void VertEdgeFilter(unsigned char *rec, 
                           int width, int height, int pitch, int chr)
{
  int i,j,k;
  int delta;
  int mbc, mbr;
  int do_filter;
  signed char *deltatab;
  unsigned char *r;

  /* vertical edges */
  for (i = 8; i < width; i += 8) 
  {
    r = rec;
    for (j = 0; j < height; j +=8) 
    {
      if (!chr) {
        mbr = (j >> 4); 
        mbc = (i >> 4);
      }
      else {
        mbr = (j >> 3); 
        mbc = (i >> 3);
      }
        
      deltatab = dtab + 176 + 351 * (QP_map[mbr][mbc] - 1);

      do_filter = coded_map[mbr+1][mbc+1] || coded_map[mbr+1][mbc];

      if (do_filter) {
        for (k = 0; k < 8; k++) {
          delta = (int)deltatab[(( (int)(*(r + i-2 ) * 3) - 
                                   (int)(*(r + i-1 ) * 8) + 
                                   (int)(*(r + i   ) * 8) - 
                                   (int)(*(r + i+1 ) * 3)  ) >>4)];

          *(r + i   ) = ClampTbl[ (int)(*(r + i  )) - delta + CLAMP_BIAS];
          *(r + i-1 ) = ClampTbl[ (int)(*(r + i-1)) + delta + CLAMP_BIAS]; 
          r   += pitch;
        }
      }
      else {
        r += (pitch<<3);
      }
    }
  }
  return;
}

#define sign(a)        ((a) < 0 ? -1 : 1)

static void InitEdgeFilterTab()   
{
  int i,QP;
  
  for (QP = 1; QP <= 31; QP++) {
    for (i = -176; i <= 175; i++) {
      dtab[i+176 +(QP-1)*351] = sign(i) * (max(0,abs(i)-max(0,2*abs(i) - QP)));
    }
  }
}

#else // }{ NEW_BEF

/**********************************************************************
 *
 *      Name:           EdgeFilter
 *      Description:    performs in the loop edge-filtering on
 *                      reconstructed frames
 *      
 *      Input:          pointers to reconstructed frame and difference 
 *                      image
 *      Returns:       
 *      Side effects:
 *
 *      Date: 951129    Author: Gisle.Bjontegaard@fou.telenor.no
 *                              Karl.Lillevold@nta.no
 *
 ***********************************************************************/
void EdgeFilter(unsigned char *lum, unsigned char *Cb, unsigned char *Cr, int QP, int pels, int lines, int pitch)
{

  int dtab[512];
  int *deltatab;
  int i;

  deltatab = &dtab[0] + 256;

  for (i=-256; i < 0; i++)
    deltatab[i] = min(0,i-min(0,((i + (QP>>1))<<1)));   
  for (i=0; i < 256; i++)
    deltatab[i] = max(0,i-max(0,((i - (QP>>1))<<1)));

  /* Luma */
  HorizEdgeFilter(lum, pels, lines, pitch, QP, 0, deltatab);
  VertEdgeFilter (lum, pels, lines, pitch, QP, 0, deltatab);

  /* Chroma */
  HorizEdgeFilter(Cb,  pels>>1, lines>>1, pitch, QP, 1, deltatab);
  VertEdgeFilter (Cb,  pels>>1, lines>>1, pitch, QP, 1, deltatab);
  HorizEdgeFilter(Cr,  pels>>1, lines>>1, pitch, QP, 1, deltatab);
  VertEdgeFilter (Cr,  pels>>1, lines>>1, pitch, QP, 1, deltatab);

  /* that's it */
  return;
}

/***********************************************************************/
void HorizEdgeFilter(unsigned char *rec, int width, int height, int pitch, int QP, 
                     int chr, int *deltatab)
{
  int i,j,k;
  int delta;
  int mbc, mbr, do_filter;
  int coded1, coded2;
  unsigned char *r_2, *r_1, *r, *r1;


  /* horizontal edges */
  r = rec + 8*pitch;
  r_2 = r - 2*pitch;
  r_1 = r - pitch;
  r1 = r + pitch;

  if (!chr) {
    for (j = 8; j < height; j += 8) {
      for (i = 0; i < width; i += 8) {

        mbr = (j >> 3); 
        mbc = (i >> 3);

          do_filter = coded_map[mbr][mbc] | coded_map[mbr-1][mbc];

        if (do_filter) {
          for (k = i; k < i+8; k++) {
              delta = deltatab[ (( (int)(*(r_2 + k)) +
                                   (int)(*(r_1 + k) * (-3)) +
                                   (int)(*(r   + k) * ( 3)) -
                                   (int)(*(r1  + k) )) >>3)];

              *(r + k) = ClampTbl[ (int)(*(r + k)) - delta + CLAMP_BIAS];
              *(r_1 + k) = ClampTbl[ (int)(*(r_1 + k)) + delta + CLAMP_BIAS];

          }
        }
      }
      r   += (pitch<<3);
      r1  += (pitch<<3);
      r_1 += (pitch<<3);
      r_2 += (pitch<<3);
    }
  }
  else { /* chr */
    for (j = 8; j < height; j += 8) {
      for (i = 0; i < width; i += 8) {

        mbr = (j >> 3); 
        mbc = (i >> 3);

          coded1 = 
            coded_map[2*mbr][2*mbc] |
            coded_map[2*mbr][2*mbc+1] |
            coded_map[2*mbr+1][2*mbc] |
            coded_map[2*mbr+1][2*mbc+1];
          coded2 = 
            coded_map[2*(mbr-1)][2*mbc] |
            coded_map[2*(mbr-1)][2*mbc+1] |
            coded_map[2*(mbr-1)+1][2*mbc] |
            coded_map[2*(mbr-1)+1][2*mbc+1];
          do_filter = coded1 | coded2;

        if (do_filter) {
          for (k = i; k < i+8; k++) {
              delta = deltatab[ (( (int)(*(r_2 + k)) +
                                   (int)(*(r_1 + k) * (-3)) +
                                   (int)(*(r   + k) * ( 3)) -
                                   (int)(*(r1  + k) )) >>3)];

              *(r + k) = ClampTbl[ (int)(*(r + k)) - delta + CLAMP_BIAS];
              *(r_1 + k) = ClampTbl[ (int)(*(r_1 + k)) + delta + CLAMP_BIAS];

          }
        }
      }
      r   += (pitch<<3);
      r1  += (pitch<<3);
      r_1 += (pitch<<3);
      r_2 += (pitch<<3);
    }
  }
  return;
}

/***********************************************************************/
void VertEdgeFilter(unsigned char *rec, int width, int height, int pitch, int QP, 
                    int chr, int *deltatab)
{
  int i,j,k;
  int delta;
  int mbc, mbr;
  int do_filter, coded1, coded2;
  unsigned char *r;
  extern const U8 ClampTbl[CLAMP_BIAS+256+CLAMP_BIAS];

  /* vertical edges */
  for (i = 8; i < width; i += 8) {
    r = rec;
    for (j = 0; j < height; j +=8) {
      mbr = (j >> 3); 
      mbc = (i >> 3);

      if (!chr) {
        do_filter = coded_map[mbr][mbc] | coded_map[mbr][mbc-1];
      }
      else {
        coded1 = 
          coded_map[2*mbr][2*mbc] |
          coded_map[2*mbr][2*mbc+1] |
          coded_map[2*mbr+1][2*mbc] |
          coded_map[2*mbr+1][2*mbc+1];
        coded2 = 
          coded_map[2*mbr][2*(mbc-1)] |
          coded_map[2*mbr][2*(mbc-1)+1] |
          coded_map[2*mbr+1][2*(mbc-1)] |
          coded_map[2*mbr+1][2*(mbc-1)+1];
        do_filter = coded1 | coded2;
      }
      if (do_filter) {
        for (k = 0; k < 8; k++) {

          delta = deltatab[(( (int)(*(r + i-2 )       ) + 
                              (int)(*(r + i-1 ) * (-3)) + 
                              (int)(*(r + i   ) * ( 3)) - 
                              (int)(*(r + i+1 ) )  ) >>3)];


          *(r + i   ) = ClampTbl[ (int)(*(r + i  )) - delta + CLAMP_BIAS];
          *(r + i-1 ) = ClampTbl[ (int)(*(r + i-1)) + delta + CLAMP_BIAS]; 
          r   += pitch;
        }
      }
      else {
        r += (pitch<<3);
      }
    }
  }
  return;
}
#endif // } NEW_BEF
#endif

#ifdef LOG_DECODE_TIMINGS_ON // { LOG_DECODE_TIMINGS_ON
void OutputDecodeTimingStatistics( char * szFileName, DEC_TIMING_INFO * pDecTimingInfo, U32 uStatFrameCount)
{
	FILE * pFile;
	DEC_TIMING_INFO * pTempDecTimingInfo;
	DEC_TIMING_INFO dtiTemp;
	int i;
	int iCount;

	FX_ENTRY("OutputDecodeTimingStatistics")

	pFile = fopen(szFileName, "a");
	if (pFile == NULL)
	{
		ERRORMESSAGE("%s: Error opening decode stat file\r\n", _fx_));
		goto done;
	}

	/* Output the detail information
	*/
	fprintf(pFile,"\nDetail Timing Information\n");
	// for ( i = 0, pTempDecTimingInfo = pDecTimingInfo ; i < uStatFrameCount ; i++, pTempDecTimingInfo++ )
	for ( i = 0, pTempDecTimingInfo = pDecTimingInfo ; i < DEC_TIMING_INFO_FRAME_COUNT ; i++, pTempDecTimingInfo++ )
	{
		if (pTempDecTimingInfo->uDecodeFrame != 0)
		{
			fprintf(pFile, "Frame %d Detail Timing Information\n", i);
			OutputDecTimingDetail(pFile, pTempDecTimingInfo);
		}
	}

	/* Compute the total information
	 */
	memset(&dtiTemp, 0, sizeof(DEC_TIMING_INFO));
	iCount = 0;

	// for ( i = 0, pTempDecTimingInfo = pDecTimingInfo ; i < uStatFrameCount ; i++, pTempDecTimingInfo++ )
	for ( i = 0, pTempDecTimingInfo = pDecTimingInfo ; i < DEC_TIMING_INFO_FRAME_COUNT ; i++, pTempDecTimingInfo++ )
	{
		if (pTempDecTimingInfo->uDecodeFrame != 0)
		{
			iCount++;

			dtiTemp.uDecodeFrame  += pTempDecTimingInfo->uDecodeFrame;
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
			dtiTemp.uHeaders	  += pTempDecTimingInfo->uHeaders;
			dtiTemp.uMemcpy       += pTempDecTimingInfo->uMemcpy;
			dtiTemp.uFrameCopy    += pTempDecTimingInfo->uFrameCopy;
			dtiTemp.uOutputCC     += pTempDecTimingInfo->uOutputCC;
			dtiTemp.uIDCTandMC    += pTempDecTimingInfo->uIDCTandMC;
			dtiTemp.uDecIDCTCoeffs+= pTempDecTimingInfo->uDecIDCTCoeffs;
#endif // } DETAILED_DECODE_TIMINGS_ON
			dtiTemp.uBEF          += pTempDecTimingInfo->uBEF;
		}
	}

	if (iCount > 0) 
	{
		/* Output the total information
		*/
		fprintf(pFile,"Total for %d frames\n", iCount);
		OutputDecTimingDetail(pFile, &dtiTemp);

		/* Compute the average
		*/
		dtiTemp.uDecodeFrame  = (dtiTemp.uDecodeFrame + (iCount / 2)) / iCount;
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
		dtiTemp.uHeaders	  = (dtiTemp.uHeaders + (iCount / 2)) / iCount;
		dtiTemp.uMemcpy       = (dtiTemp.uMemcpy + (iCount / 2)) / iCount;
		dtiTemp.uFrameCopy    = (dtiTemp.uFrameCopy + (iCount / 2)) / iCount;
		dtiTemp.uOutputCC     = (dtiTemp.uOutputCC + (iCount / 2)) / iCount;
		dtiTemp.uIDCTandMC    = (dtiTemp.uIDCTandMC+ (iCount / 2)) / iCount;
		dtiTemp.uDecIDCTCoeffs= (dtiTemp.uDecIDCTCoeffs+ (iCount / 2)) / iCount;
#endif // } DETAILED_DECODE_TIMINGS_ON
		dtiTemp.uBEF          = (dtiTemp.uBEF + (iCount / 2)) / iCount;

		/* Output the average information
		*/
		fprintf(pFile,"Average over %d frames\n", iCount);
		OutputDecTimingDetail(pFile, &dtiTemp);
	}

	fclose(pFile);
done:

    return;
}

void OutputDecTimingDetail(FILE * pFile, DEC_TIMING_INFO * pDecTimingInfo)
{
	U32 uOther;
	U32 uRoundUp;
	U32 uDivisor;

	fprintf(pFile, "\tDecode Frame =      %10d (%d milliseconds at 90Mhz)\n", pDecTimingInfo->uDecodeFrame,
			(pDecTimingInfo->uDecodeFrame + 45000) / 90000);
	uOther = pDecTimingInfo->uDecodeFrame;
	
	/* This is needed because of the integer truncation.
	 */
	uDivisor = pDecTimingInfo->uDecodeFrame / 100; // to yield a percent
	uRoundUp = uDivisor / 2;
	
#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
	fprintf(pFile, "\tmemcpy =            %10d (%2d%%)\n", pDecTimingInfo->uMemcpy, 
			(pDecTimingInfo->uMemcpy + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uMemcpy;
								   
	fprintf(pFile, "\tHeaders =           %10d (%2d%%)\n", pDecTimingInfo->uHeaders, 
			(pDecTimingInfo->uHeaders + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uHeaders;
								   
	fprintf(pFile, "\tFrameCopy =         %10d (%2d%%)\n", pDecTimingInfo->uFrameCopy, 
			(pDecTimingInfo->uFrameCopy + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uFrameCopy;

	fprintf(pFile, "\tDecode DCT Coeffs = %10d (%2d%%)\n", pDecTimingInfo->uDecIDCTCoeffs, 
			(pDecTimingInfo->uDecIDCTCoeffs + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uDecIDCTCoeffs;

	fprintf(pFile, "\tIDCT and MC       = %10d (%2d%%)\n", pDecTimingInfo->uIDCTandMC, 
			(pDecTimingInfo->uIDCTandMC + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uIDCTandMC;
#endif // } DETAILED_DECODE_TIMINGS_ON

	fprintf(pFile, "\tBlock Edge Filter = %10d (%2d%%)\n", pDecTimingInfo->uBEF, 
			(pDecTimingInfo->uBEF + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uBEF;

#ifdef DETAILED_DECODE_TIMINGS_ON // { DETAILED_DECODE_TIMINGS_ON
	fprintf(pFile, "\tOutput CC =         %10d (%2d%%)\n", pDecTimingInfo->uOutputCC, 
			(pDecTimingInfo->uOutputCC + uRoundUp) / uDivisor);
	uOther -= pDecTimingInfo->uOutputCC;
#endif // } DETAILED_DECODE_TIMINGS_ON

	fprintf(pFile, "\tOther =             %10d (%2d%%)\n", uOther, 
			(uOther + uRoundUp) / uDivisor);

}
#endif // } LOG_DECODE_TIMINGS_ON