windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/i386/cxm12161.asm

;-------------------------------------------------------------------------
;    INTEL Corporation Proprietary Information
;
;    This listing is supplied under the terms of a license
;    agreement with INTEL Corporation and may not be copied
;    nor disclosed except in accordance with the terms of
;    that agreement.
;
;    Copyright (c) 1996 Intel Corporation.
;    All Rights Reserved.
;
;-------------------------------------------------------------------------

;-------------------------------------------------------------------------
;//
;// $Header:   S:\h26x\src\dec\cx512161.asv
;//
;// $Log:   S:\h26x\src\dec\cxm12161.asv  $
;// 
;//    Rev 1.9   24 May 1996 11:12:10   AGUPTA2
;// 
;// Modified version of final drop from IDC.  Fixed alignment, global var,
;// referencing beyond stack pointer problems.  Cosmetic changes to adhere
;// to a common coding convention in all MMX color convertor files.
;//
;//    Rev 1.8   17 Apr 1996 09:51:08   ISRAELH
;// Added AspectRatio adjustment, emms.
;//
;//    Rev 1.7   11 Apr 1996 09:51:08   RMCKENZX
;// Changed return to pop the stack.
;//
;//    Rev 1.6   09 Apr 1996 10:00:44   RMCKENZX
;//
;// Changed calling sequence to __stdcall.
;//
;//    Rev 1.5   05 Apr 1996 10:40:20   RMCKENZX
;// Hacked in Aspect Ratio correction.  This is accomplished
;// by simply overwriting the next even line after the aspect
;// count has been matched or exceeded.
;//
;//    Rev 1.4   29 Mar 1996 07:52:56   RMCKENZX
;// re-fixed bug in 655 setup.
;//
;//    Rev 1.3   28 Mar 1996 14:35:38   RMCKENZX
;// Cleaned up code, added comments, revised calling sequence,
;// moved global variables onto stack.
;//
;//    Rev 1.2   21 Mar 1996 08:10:06   RMCKENZX
;// Fixed 655 case -- initialized GLeftShift at 5.
;//
;//    Rev 1.1   20 Mar 1996 11:18:52   RMCKENZX
;// March 96 version.
;
;     Rev 1.3   19 Feb 1996 11:49:42   israelh
;  bug fix.
;  new algorithm for RGB16 bit pack.
;
;     Rev 1.3   18 Feb 1996 20:58:44   israelh
;  better algorithm and bug fix
;
;     Rev 1.2   29 Jan 1996 19:53:50   mikeh
;
;  added Ifdef timing
;
;     Rev 1.1   29 Jan 1996 16:29:16   mikeh
;  removed $LOG stuff
;
;     Rev 1.0   29 Jan 1996 11:49:44   israelh
;  Initial revision.
;//
;// MMX 1.3 14 Jan 1996 IsraelH
;// Implementing runtime RGB bit allocation according to BValLo[0]:
;// It contains the ColorConvertor value from d1color.cpp module.
;// Compiler flag RTIME16 for using runtime allocation.
;//
;// MMX 1.2 10 Jan 1996 IsraelH
;// Implementing RGB16x565 (5-R 6-G 5-B) as default
;// Compiler flag MODE555 for RGB16555 (5-R 5-G 5-B)
;//
;// MMX 1.1 09 Jan 1996 IsraelH
;// Implementing RGB16x555 (5-R 5-G 5-B)
;// Commenting out RGB16664 (6-R 6-G 4-B)
;// Adding performance measurements in runtime
;//
;// MMX 1.0 25 Dec 1995 IsraelH
;// Port to MMX(TM) without using tables
;
;-------------------------------------------------------------------------
;
; +---------- Color convertor.
; |+--------- For both H261 and H263.
; ||+-------- MMx Version.
; |||++------ Convert from YUV12.
; |||||++---- Convert to RGB16.
; |||||||+--- Zoom by one, i.e. non-zoom.
; ||||||||
; cxm12161 -- This function performs YUV12-to-RGB16 color conversion for H26x.
;             It handles any format in which there are three fields, the low
;             order field being B and fully contained in the low order byte, the
;             second field being G and being somewhere in bits 4 through 11,
;             and the high order field being R and fully contained in the high
;             order byte.
;
;             The YUV12 input is planar, 8 bits per pel.  The Y plane may have
;             a pitch of up to 768.  It may have a width less than or equal
;             to the pitch.  It must be DWORD aligned, and preferably QWORD
;             aligned.  Pitch and Width must be a multiple of four.  For best
;             performance, Pitch should not be 4 more than a multiple of 32.
;             Height may be any amount, but must be a multiple of two.  The U
;             and V planes may have a different pitch than the Y plane, subject
;             to the same limitations.
;
;------------------------------------------------------------------------------
; Assembler set-up (MASM).
;------------------------------------------------------------------------------
; Keep identifiers case sensitive.
OPTION CASEMAP:NONE
; No assembler-generated prologue: the procedure below saves its registers and
; builds its local frame by hand.
OPTION PROLOGUE:None
; Every "ret" inside a PROC is expanded through this macro.  The macro is not
; defined in this file -- presumably it comes from memmodel.inc and pops the
; arguments (see the __stdcall notes in the revision history); TODO confirm.
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro

; Enable the Pentium instruction set.
.586
; Listing output is suppressed while the include files are read.
.xlist
; NOTE(review): iammx.inc presumably supplies the MMX mnemonics used below
; (e.g. movdt), and memmodel.inc the DIST / LANG symbols used on the PROC
; line -- neither file is visible here; confirm.
include iammx.inc
include memmodel.inc
.list

; Declare the code and data segments once with their full attributes
; (paragraph aligned, 32-bit, public).  They are re-opened by name below.
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

MMXDATA1 SEGMENT
ALIGN 8
;------------------------------------------------------------------------------
; Constants used as 64-bit MMX memory operands by MMX_YUV12ToRGB16.
; Every entry is two DWORDs (8 bytes), starting on an 8-byte boundary.
;
; The conversion coefficients are integers; the procedure removes the scaling
; with right shifts described in its own comments as "6 + (8 - field width)".
; The values in the trailing comments are alternates roughly four times
; larger (apparently the same coefficients at a larger scale -- not used).
;------------------------------------------------------------------------------
; 0080h in each word: subtracted from the unpacked U and V samples.
Minusg              DWORD   00800080h, 00800080h
; 10h in each byte: subtracted (unsigned, saturating) from the Y samples.
Yadd                DWORD   10101010h, 10101010h
; 66h (102) in each word: multiplies V to form the R chroma term.
VtR                 DWORD   00660066h, 00660066h ; alternate: 01990199h,01990199h
; 34h (52) in each word.  Not referenced by the procedure in this file
; (the G chroma term uses UVtG instead).
VtG                 DWORD   00340034h, 00340034h ; alternate: 00d000d0h,00d000d0h
; 19h (25) in each word.  Not referenced by the procedure in this file.
UtG                 DWORD   00190019h, 00190019h ; alternate: 00640064h,00640064h
; 81h (129) in each word: multiplies U to form the B chroma term.
UtB                 DWORD   00810081h, 00810081h ; alternate: 02050205h,02050205h
; 4ah (74) in each word: multiplies Y-16 to form the luma term.
Ymul                DWORD   004a004ah, 004a004ah ; alternate: 012a012ah,012a012ah
; Word pairs for pmaddwd on interleaved (U,V) words: the low word 19h pairs
; with U, the high word 34h pairs with V, giving 25*U + 52*V per pixel pair.
UVtG                DWORD   00340019h, 00340019h ; alternate: 00d00064h,00d00064h
; Word pairs 0199h / 0205h (the larger-scale V-to-R and U-to-B values).
; Not referenced by the procedure in this file.
VtRUtB              DWORD   01990205h, 01990205h
; Clamp constants.  The procedure does paddusb followed by psubusb with the
; same constant c, which leaves min(x, 0FFh - c) in every byte:
;   0F0h -> limit 0Fh (4-bit field)
;   0E0h -> limit 1Fh (5-bit field)
;   0C0h -> limit 3Fh (6-bit field)
fourbitu            DWORD  0f0f0f0f0h, 0f0f0f0f0h
fivebitu            DWORD  0e0e0e0e0h, 0e0e0e0e0h
sixbitu             DWORD  0c0c0c0c0h, 0c0c0c0c0h
MMXDATA1 ENDS

MMXCODE1 SEGMENT
;------------------------------------------------------------------------------
; MMX_YUV12ToRGB16 -- convert a planar YUV12 frame to 16-bit RGB (no zoom).
;
; Arguments (DWORDs on the stack, in this order):
;   AYPlane               address of the Y plane
;   AVPlane               address of the V plane
;   AUPlane               address of the U plane
;   AFrameWidth           width in pixels (= bytes per Y line that are used)
;   AFrameHeight          number of lines; two lines are converted per pass
;   AYPitch               byte distance between Y lines
;   AVPitch               byte distance between chroma lines; the same value
;                         advances both the U and the V cursor
;   AAspectAdjustmentCnt  aspect-correction period.  The value 1 makes the
;                         procedure return without converting anything.
;                         Otherwise a counter starts at this value and is
;                         reduced by 2 for every line pair; when it reaches 0
;                         or borrows, the odd line is stored on top of the
;                         even line and the output advances by one line only,
;                         so one line is dropped; the counter is then
;                         increased by this value again.
;   AColorConvertedFrame, ADCIOffset, ACCOffsetToLine0
;                         added together to give the address of the first
;                         output line
;   ACCOPitch             byte distance between output lines
;   ACCType               only the low byte is examined: 9 = 555, 14 = 664,
;                         18 = 565, 22 = 655; any other value is treated as
;                         555
;
; Registers: esi, edi, ebp and ebx are saved and restored.  eax, ecx, edx and
;   mm0-mm7 are overwritten.
;
; Stack: 256 bytes of locals are reserved and esp is then rounded down to a
;   32-byte boundary; the pre-adjustment esp is kept in StashESP.  Locals are
;   addressed through esp, arguments through ebp (until ebp is reused as the
;   Y cursor).
;
; Return: "ret" is expanded by the epilogue macro selected with OPTION
;   EPILOGUE at the top of the file.  No result is placed in eax.
;
; Layout note: instructions are written as indented pairs; a lone ";" stands
;   where the second instruction of a pair is absent (presumably reflecting
;   Pentium U/V pipe pairing -- do not reorder without re-checking).
;
; NOTE(review): no emms is executed before returning, although revision 1.8
;   in the file history mentions adding one.  Confirm that callers execute
;   emms before any x87 floating-point code.
; NOTE(review): the inner loop converts 8 pixels per pass and runs while the
;   (negative) chroma index is below zero, so a width that is 4 more than a
;   multiple of 8 makes the last pass read and write 4 pixels beyond
;   FrameWidth.  The file header only requires a multiple of four -- confirm
;   what callers guarantee.
;------------------------------------------------------------------------------
MMX_YUV12ToRGB16 PROC DIST LANG PUBLIC,
  AYPlane:              DWORD,
  AVPlane:              DWORD,
  AUPlane:              DWORD,
  AFrameWidth:          DWORD,
  AFrameHeight:         DWORD,
  AYPitch:              DWORD,
  AVPitch:              DWORD,
  AAspectAdjustmentCnt: DWORD,
  AColorConvertedFrame: DWORD,
  ADCIOffset:           DWORD,
  ACCOffsetToLine0:     DWORD,
  ACCOPitch:            DWORD,
  ACCType:              DWORD

; Bytes reserved for locals (before the 32-byte alignment of esp).
LocalFrameSize           = 256
; Four registers (esi, edi, ebp, ebx) are pushed before ebp is set up.
RegisterStorageSize      = 16
; ebp + 16 addresses the return address; arguments start 4 bytes above it.
argument_base            EQU ebp + RegisterStorageSize
local_base               EQU esp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Arguments (ebp-relative; unusable once ebp is reloaded as the Y cursor)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
YPlane                   EQU argument_base +  4
VPlane                   EQU argument_base +  8
UPlane                   EQU argument_base + 12
FrameWidth               EQU argument_base + 16
FrameHeight              EQU argument_base + 20
YPitch                   EQU argument_base + 24
ChromaPitch              EQU argument_base + 28
AspectAdjustmentCount    EQU argument_base + 32
ColorConvertedFrame      EQU argument_base + 36
DCIOffset                EQU argument_base + 40
CCOffsetToLine0          EQU argument_base + 44
CCOPitch                 EQU argument_base + 48
CCType                   EQU argument_base + 52
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Locals (on local stack frame)
;   (local_base is aligned to a 32-byte boundary in the prologue)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copy of FrameWidth at first; replaced by -FrameWidth/2 before the loops.
localFrameWidth          EQU local_base + 0
localYPitch              EQU local_base + 4
localChromaPitch         EQU local_base + 8
localAspectAdjustmentCount EQU local_base + 12
localCCOPitch            EQU local_base + 16
; Address of the first output line.
CCOCursor                EQU local_base + 20
; CCOPitch - 2*FrameWidth: from the end of an output line to the next line.
CCOSkipDistance          EQU local_base + 24
; YPlane + FrameHeight*YPitch.
YLimit                   EQU local_base + 28
; UPlane - VPlane.
DistanceFromVToU         EQU local_base + 32
; Running aspect-correction counter.
currAspectCount          EQU local_base + 36
; Y line addresses plus FrameWidth (the loop indexes them negatively).
YCursorEven              EQU local_base + 40
YCursorOdd               EQU local_base + 44
; Offset from the even to the odd output line: CCOPitch, or 0 when the odd
; line is to overwrite the even line.
tmpCCOPitch              EQU local_base + 48
; esp as it was after the four register pushes.
StashESP                 EQU local_base + 52
; space for two DWORD locals
; Scratch area for six quadwords kept between the even and the odd line:
;   +0  V-128 (4 words)        +8  G chroma term (4 words)
;   +16 low  B chroma term     +24 low  R chroma term
;   +32 high R chroma term     +40 high B chroma term
temp_mmx                 EQU local_base + 64  ; note it is 64 bytes
; Shift counts are stored as quadwords (high DWORD zero) because they are
; used as 64-bit MMX shift-count operands.
RLeftShift               EQU local_base +128
GLeftShift               EQU local_base +136
RRightShift              EQU local_base +144
GRightShift              EQU local_base +152
BRightShift              EQU local_base +160
; Clamp constants per field (copies of fourbitu / fivebitu / sixbitu).
RUpperLimit              EQU local_base +168
GUpperLimit              EQU local_base +176
BUpperLimit              EQU local_base +184

; Switches used by RGB color convertors to determine the exact conversion type.
RGB16555 =  9
RGB16664 = 14
RGB16565 = 18
RGB16655 = 22

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;  Prologue: save registers, reserve and align the local frame
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  push       esi
   push      edi
  push       ebp
   push      ebx
  mov        ebp, esp                        ; ebp: base for argument access
   sub       esp, LocalFrameSize
  and        esp, -32                        ; round esp down to a 32-byte boundary
   mov       [StashESP], ebp                 ; remembered for the epilogue
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;  Save some parameters on local stack frame
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  mov        ebx, [FrameWidth]
   ;
  mov        [localFrameWidth], ebx
   mov       ebx, [YPitch]
  mov        [localYPitch], ebx
   mov       ebx, [ChromaPitch]
  mov        [localChromaPitch], ebx
   mov       ebx, [AspectAdjustmentCount]
  mov        [localAspectAdjustmentCount], ebx
   mov       ebx, [CCOPitch]
  mov        [localCCOPitch], ebx
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;  Set-up rest of the local stack frame
  ;  Select shift counts and clamp limits from the low byte of CCType.
  ;  An unrecognised value falls through into the 555 set-up.
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   mov       al, [CCType]
  cmp        al, RGB16555
   je        RGB555
  cmp        al, RGB16664
   je        RGB664
  cmp        al, RGB16565
   je        RGB565
  cmp        al, RGB16655
   je        RGB655
; 5-5-5: R at bits 14..10, G at bits 9..5, B at bits 4..0.
RGB555:
  xor        eax, eax                        ; eax = 0: high DWORD of every shift count
   mov       ebx, 2   ; 10-8 for byte shift
  mov        [RLeftShift], ebx
   mov       [RLeftShift+4], eax
  mov        ebx, 5
   mov       [GLeftShift+4], eax
  mov        [GLeftShift], ebx
   mov       ebx, 9                          ; all three fields are scaled down by 9
  mov        [RRightShift], ebx
   mov       [RRightShift+4], eax
  mov        [GRightShift], ebx
   mov       [GRightShift+4], eax
  mov        [BRightShift], ebx
   mov       [BRightShift+4], eax
  movq       mm0, fivebitu                   ; all three fields clamp to 1Fh
   ;
  movq       [RUpperLimit], mm0
   ;
  movq       [GUpperLimit], mm0
   ;
  movq       [BUpperLimit], mm0
   jmp       RGBEND

; 6-6-4: R and G scaled down by 8 and clamped to 3Fh, B by 10 and clamped to 0Fh.
RGB664:
  xor        eax, eax                        ; eax = 0: high DWORD of every shift count
   mov       ebx, 2   ; 8-6
  mov        [RLeftShift], ebx
   mov       [RLeftShift+4], eax
  mov        ebx, 4
   mov       [GLeftShift+4], eax
  mov        [GLeftShift], ebx
   mov       ebx, 8
  mov        [RRightShift], ebx
   mov       [RRightShift+4], eax
  mov        [GRightShift], ebx
   mov       [GRightShift+4], eax
  mov        ebx, 10
   mov       [BRightShift+4], eax
  mov        [BRightShift], ebx
   ;
  movq       mm0, sixbitu
   ;
  movq       [RUpperLimit], mm0
   ;
  movq       [GUpperLimit], mm0
   ;
  movq       mm0, fourbitu
   ;
  movq       [BUpperLimit], mm0
   jmp       RGBEND

; 5-6-5: R and B scaled down by 9 and clamped to 1Fh, G by 8 and clamped to 3Fh.
RGB565:
  xor        eax, eax                        ; eax = 0: high DWORD of every shift count
   mov       ebx, 3   ; 8-5
  mov        [RLeftShift], ebx
   mov       [RLeftShift+4], eax
  mov        ebx, 5
   mov       [GLeftShift+4], eax
  mov        [GLeftShift], ebx
   mov       ebx, 9
  mov        [RRightShift+4], eax
   mov       [RRightShift], ebx
  mov        [BRightShift], ebx
   mov       [BRightShift+4], eax
  mov        ebx, 8
   mov       [GRightShift+4], eax
  mov        [GRightShift], ebx
   ;
  movq       mm0, fivebitu
   ;
  movq       [RUpperLimit], mm0
   ;
  movq       [BUpperLimit], mm0
   ;
  movq       mm0, sixbitu
   ;
  movq       [GUpperLimit], mm0
   jmp       RGBEND

; 6-5-5: R scaled down by 8 and clamped to 3Fh, G and B by 9 and clamped to 1Fh.
RGB655:
  xor        eax, eax                        ; eax = 0: high DWORD of every shift count
   mov       ebx, 2   ; 8-6
  mov        [RLeftShift], ebx
   mov       [RLeftShift+4], eax
  mov        ebx, 5
   mov       [GLeftShift+4], eax
  mov        [GLeftShift], ebx
   mov       ebx, 8
  mov        [RRightShift], ebx
   mov       [RRightShift+4], eax
  mov        ebx, 9
   mov       [GRightShift+4], eax
  mov        [GRightShift], ebx
   mov       [BRightShift], ebx
  mov        [BRightShift+4], eax
   ;
  movq       mm0, sixbitu
   ;
  movq       [RUpperLimit], mm0
   ;
  movq       mm0, fivebitu
   ;
  movq       [GUpperLimit], mm0
   ;
  movq       [BUpperLimit], mm0
   jmp       RGBEND

; Compute the remaining locals while the arguments are still reachable.
RGBEND:
  mov        ebx, [VPlane]
   mov       ecx, [UPlane]
  sub        ecx, ebx                        ; UPlane - VPlane
   mov       eax, [ColorConvertedFrame]
  mov        [DistanceFromVToU], ecx
   mov       edx, [DCIOffset]
  add        eax, edx
   mov       edx, [CCOffsetToLine0]
  add        eax, edx                        ; address of the first output line
   mov       edx, [FrameHeight]
  mov        [CCOCursor], eax
   mov       ecx, [YPitch]
  imul       edx, ecx                        ; FrameHeight*YPitch
   ;
  mov        ebx, [FrameWidth]
   mov       eax, [CCOPitch]
  sub        eax, ebx                        ; CCOPitch-FrameWidth
   mov       esi, [YPlane]                   ; Fetch cursor over luma plane.
  sub        eax, ebx                        ; CCOPitch-2*FrameWidth
   mov       [CCOSkipDistance], eax          ; CCOPitch-2*FrameWidth
  add        edx, esi                        ; YPlane+Size_of_Y_array
   ;
  mov        [YLimit], edx
   mov       edx, [AspectAdjustmentCount]
  cmp        edx,1                           ; a count of 1 converts nothing
   je        finish
  mov        esi, [VPlane]
   mov       [currAspectCount], edx
  mov        [localAspectAdjustmentCount], edx
   xor       eax, eax
  mov        edi, [CCOCursor]                ; edi: output cursor
   mov       edx, [DistanceFromVToU]
  mov        ebp, [YPlane]                   ; ebp stops being the argument base here
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;  cannot access parameters beyond this point
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   mov       ebx, [localFrameWidth]
  mov        eax, [localYPitch]
   add       ebp, ebx
  mov        [YCursorEven], ebp              ; YPlane + FrameWidth
   add       ebp, eax
  sar        ebx, 1                          ; FrameWidth/2
   mov       [YCursorOdd], ebp               ; YPlane + FrameWidth + YPitch
  add        esi, ebx                        ; VPlane + FrameWidth/2
   ;
  add        edx, esi                        ; UPlane + FrameWidth/2
   neg       ebx
  mov       [localFrameWidth], ebx           ; -FrameWidth/2

;  Register Usage (in the loops below):
;    esi   end of the current V line (VPlane line + FrameWidth/2)
;    edx   end of the current U line (UPlane line + FrameWidth/2)
;    ebx   chroma index, runs from -FrameWidth/2 up towards 0 in steps of 4;
;          [esi+ebx] / [edx+ebx] address chroma, [ebp+2*ebx] addresses luma
;    ebp   YCursorEven, then YCursorOdd
;    edi   output cursor (even line); the odd line is at edi + tmpCCOPitch
;    eax, ecx  scratch
;    esp   base of the local frame
;
;------------------------------------------------------------------------------
; Start of a line pair: update the aspect counter and choose where the odd
; line is stored.
PrepareChromaLine:
  mov        ebp, [currAspectCount]
   mov       ebx, [localFrameWidth]          ; restart the chroma index
  sub        ebp, 2                          ; two lines are consumed per pass
   mov       eax, [localCCOPitch]
  mov        [tmpCCOPitch], eax              ; normal case: odd line one pitch below
   ja        continue                        ; flags still from the sub (mov leaves them alone)
  xor        eax, eax
   add       ebp, [localAspectAdjustmentCount] ; counter ran out: restart it
  mov        [tmpCCOPitch], eax              ; 0: the odd line overwrites the even line

continue:
  mov       [currAspectCount], ebp

; One pass converts 8 pixels of the even line and 8 pixels of the odd line
; from 8 Y + 8 Y samples and 4 U + 4 V samples.
do_next_8x2_block:
  mov        ebp, [YCursorEven]
; here is even line
  movdt      mm1, [edx+ebx]                  ; 4 u values
   pxor      mm0, mm0                        ; mm0=0
  movdt      mm2, [esi+ebx]                  ; 4 v values
   punpcklbw mm1, mm0                        ; 4 u values widened to words
  psubw      mm1, Minusg                     ; 4 words of u-128
   punpcklbw mm2, mm0                        ; 4 v values widened to words
  psubw      mm2, Minusg                     ; 4 words of v-128
   movq      mm3, mm1                        ; copy of u-128
  movq       mm5, mm1                        ; second copy of u-128 (for B)
   punpcklwd mm1, mm2                        ; interleave the 2 low (u, v) pairs
  pmaddwd    mm1, UVtG                       ; 25*u + 52*v for the 2 low pairs
   punpckhwd mm3, mm2                        ; interleave the 2 high (u, v) pairs
  pmaddwd    mm3, UVtG                       ; 25*u + 52*v for the 2 high pairs
   ;
  movq       [temp_mmx], mm2                 ; save v-128
   ;
  movq       mm6, [ebp+2*ebx]                ; mm6 has 8 y pixels
   ;
  psubusb    mm6, Yadd                       ; mm6 has 8 y-16 pixels
   packssdw  mm1, mm3                        ; packed the results to signed words
  movq       mm7, mm6                        ; save the 8 y-16 pixels
   punpcklbw mm6, mm0                        ; mm6 has 4 low y-16 as words
  pmullw     mm6, Ymul                       ; low 4 luma terms
   punpckhbw mm7, mm0                        ; mm7 has 4 high y-16 as words
  pmullw     mm7, Ymul                       ; high 4 luma terms
   movq      mm4, mm1
  movq       [temp_mmx+8], mm1               ; save 4 chroma G values
   punpcklwd mm1, mm1                        ; chroma G replicate low 2
  movq       mm0, mm6                        ; low  y
   punpckhwd mm4, mm4                        ; chroma G replicate high 2
  movq       mm3, mm7                        ; high y
   psubw     mm6, mm1                        ;  4 low G
  psraw      mm6, [GRightShift]              ; low G scaled down to the field width
   psubw     mm7, mm4                        ; 4 high G values in signed 16 bit
  movq       mm2, mm5
   punpcklwd mm5, mm5                        ; replicate the 2 low u pixels
  pmullw     mm5, UtB                        ; low chroma B terms
   punpckhwd mm2, mm2                        ; replicate the 2 high u pixels
  psraw      mm7, [GRightShift]              ; high G scaled down to the field width
   pmullw    mm2, UtB                        ; high chroma B terms
  packuswb   mm6, mm7                        ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
   ;
  movq       [temp_mmx+16], mm5              ; low chroma B
   paddw     mm5, mm0                        ; 4 low B values in signed 16 bit
  movq       [temp_mmx+40], mm2              ; high chroma B
   paddw     mm2, mm3                        ; 4 high B values in signed 16 bit
  psraw      mm5, [BRightShift]              ; low B scaled down to the field width
   ;
  psraw      mm2, [BRightShift]              ; high B scaled down to the field width
   ;
  packuswb   mm5, mm2                        ; mm5: B7 B6 B5 B4 B3 B2 B1 B0
   ;
  movq       mm2, [temp_mmx]                 ; 4 words of v-128
   movq      mm1, mm5                        ; save B
  movq       mm7, mm2
   punpcklwd mm2, mm2                        ; replicate the 2 low v pixels
  pmullw     mm2, VtR                        ; low chroma R terms
   punpckhwd mm7, mm7                        ; replicate the 2 high v pixels
  pmullw     mm7, VtR                        ; high chroma R terms
   ;
  paddusb    mm1, [BUpperLimit]              ; mm1: B clamp, step 1 (saturating add)
   ;
  movq       [temp_mmx+24], mm2              ; low chroma R
   ;
  paddw      mm2, mm0                        ; 4 low R values in signed 16 bit
   ;
  psraw      mm2, [RRightShift]              ; low R scaled down to the field width
   pxor      mm4, mm4                        ; mm4=0 for 8->16 conversion
  movq       [temp_mmx+32], mm7              ; high chroma R
   paddw     mm7, mm3                        ; 4 high R values in signed 16 bit
  psraw      mm7, [RRightShift]              ; high R scaled down to the field width
   ;
  psubusb    mm1, [BUpperLimit]              ; B clamp, step 2: B = min(B, field max)
   packuswb  mm2, mm7                        ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
  paddusb    mm6, [GUpperLimit]              ; G clamp, step 1
   ;
  psubusb    mm6, [GUpperLimit]              ; G clamp, step 2
   ;
  paddusb    mm2, [RUpperLimit]              ; R clamp, step 1
   ;
  psubusb    mm2, [RUpperLimit]              ; R clamp, step 2
   ;

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; here we are packing from RGB24 to RGB16
  ; input:
  ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
  ; mm1: B7 B6 B5 B4 B3 B2 B1 B0
  ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
  ; assuming 8 original pixels in 0-H representation on mm6, mm1, mm2
  ; when  H=2**xBITS-1 (x is for R G B)
  ; output:
  ;        mm1- result: 4 low RGB16
  ;        mm7- result: 4 high RGB16
  ; using: mm0- zero register
  ;        mm3- temporary results
  ; algorithm:
  ;   for (i=0; i<8; i++) {
  ;     RGB[i]=256*(R[i]<<RLeftShift)+(G[i]<<GLeftShift)+B[i];
  ;   }
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  psllq      mm2, [RLeftShift]  ; position R in the most significant part of the byte
   movq      mm7, mm1                        ; mm7: Save B

  ; note: no need for shift to place B on the least significant part of the byte
  ;   R in left position, B in the right position so they can be combined

  punpcklbw  mm1, mm2                        ; mm1: 4 low 16 bit RB
   pxor      mm0, mm0                        ; mm0: 0
  punpckhbw  mm7, mm2                        ; mm7: 4 high 16 bit RB
   movq      mm3, mm6                        ; mm3: G
  punpcklbw  mm6, mm0                        ; mm6: low 4 G 16 bit
   ;
  psllw      mm6, [GLeftShift]               ; shift low G into its field position
   ;
  punpckhbw  mm3, mm0                        ; mm3: high 4 G 16 bit
   por       mm1, mm6                        ; mm1: low RBG16
  psllw      mm3, [GLeftShift]               ; shift high G into its field position
   ;
  por        mm7, mm3                        ; mm7: high RBG16
   ;
  mov        ebp, [YCursorOdd]               ; moved to here to save cycles before odd line
   ;
  movq       [edi], mm1                      ; even line, pixels 0..3 (marked "aligned" by the author)
   ;
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;- start odd line
  ;  The chroma terms saved in temp_mmx are reused; only luma is new.
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  movq       mm1, [ebp+2*ebx]                ; mm1 has 8 y pixels
   pxor      mm2, mm2
  psubusb    mm1, Yadd                       ; mm1 has 8 pixels y-16
   ;
  movq       mm5, mm1
   punpcklbw mm1, mm2                        ; 4 low y-16 pixels as words
  pmullw     mm1, Ymul                       ; low 4 luminance contribution
   punpckhbw mm5, mm2                        ; 4 high y-16
  pmullw     mm5, Ymul                       ; high 4 luminance contribution
   ;
  movq       [edi+8], mm7                    ; even line, pixels 4..7 (store deferred to here)
   movq      mm0, mm1
  paddw      mm0, [temp_mmx+24]              ; low 4 R
   movq      mm6, mm5
  psraw      mm0, [RRightShift]              ; low R scaled down to the field width
   ;
  paddw      mm5, [temp_mmx+32]              ; high 4 R
   movq      mm2, mm1
  psraw      mm5, [RRightShift]              ; high R scaled down to the field width
   ;
  paddw      mm2, [temp_mmx+16]              ; low 4 B
   packuswb  mm0, mm5                        ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
  psraw      mm2, [BRightShift]              ; low B scaled down to the field width
   movq      mm5, mm6
  paddw      mm6, [temp_mmx+40]              ; high 4 B
   ;
  psraw      mm6, [BRightShift]              ; high B scaled down to the field width
   ;
  movq       mm3, [temp_mmx+8]               ; 4 chroma G values
   ;
  packuswb   mm2, mm6                        ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
   movq      mm4, mm3
  punpcklwd  mm3, mm3                        ; replicate low 2
   ;
  punpckhwd  mm4, mm4                        ; replicate high 2
   psubw     mm1, mm3                        ;  4 low G
  psraw      mm1, [GRightShift]              ; low G scaled down to the field width
   psubw     mm5, mm4                        ;  4 high G values in signed 16 bit
  psraw      mm5, [GRightShift]              ; high G scaled down to the field width
   ;
  paddusb    mm2, [BUpperLimit]              ; mm2: B clamp, step 1 (saturating add)
   packuswb  mm1, mm5                        ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
  psubusb    mm2, [BUpperLimit]              ; B clamp, step 2
   ;
  paddusb    mm1, [GUpperLimit]              ; G clamp, step 1
   ;
  psubusb    mm1, [GUpperLimit]              ; G clamp, step 2
   ;
  paddusb    mm0, [RUpperLimit]              ; R clamp, step 1
   ;
  mov        eax, [tmpCCOPitch]              ; offset of the odd output line (0 = overwrite even)
   ;
  psubusb    mm0, [RUpperLimit]              ; R clamp, step 2
   ;
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; here we are packing from RGB24 to RGB16
  ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
  ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
  ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
  ; output:
  ;        mm2- result: 4 low RGB16
  ;        mm7- result: 4 high RGB16
  ; using: mm4- zero register
  ;        mm3- temporary results
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  psllq      mm0, [RLeftShift]               ; position R in the most significant part of the byte
   movq      mm7, mm2                        ; mm7: Save B

  ; note: no need for shift to place B on the least significant part of the byte
  ;   R in left position, B in the right position so they can be combined

  punpcklbw  mm2, mm0                        ; mm2: 4 low 16 bit RB
   pxor      mm4, mm4                        ; mm4: 0
  movq       mm3, mm1                        ; mm3: G
   punpckhbw mm7, mm0                        ; mm7: 4 high 16 bit RB
  punpcklbw  mm1, mm4                        ; mm1: low 4 G 16 bit
   ;
  punpckhbw  mm3, mm4                        ; mm3: high 4 G 16 bit
   ;
  psllw      mm1, [GLeftShift]               ; shift low G into its field position
   por       mm2, mm1                        ; mm2: low RBG16
  psllw      mm3, [GLeftShift]               ; shift high G into its field position
   ;
  por        mm7, mm3                        ; mm7: high RBG16
   ;
  movq       [edi+eax], mm2                  ; odd line, pixels 0..3
   ;
  movq       [edi+eax+8], mm7                ; odd line, pixels 4..7
   ;
  add        edi, 16                         ; 8 output pixels of 16 bits each
   add       ebx, 4                          ; 4 chroma samples = 8 luma samples consumed
  jl         do_next_8x2_block               ; repeat while the chroma index is still negative
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;  Update:
  ;    edi: output RGB plane pointer for odd and even line
  ;    ebp: Y Plane address
  ;    esi: V Plane address
  ;    edx: U Plane address
  ;    YcursorEven: Even Y line address
  ;    YCursorOdd:  Odd Y line address
  ;  Note:  eax, ebx, ecx can be used as scratch registers
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  mov        ecx, [CCOSkipDistance]
   mov       eax, [localYPitch]
  add        edi, ecx                        ; go to begin of next even line
   mov       ecx, [tmpCCOPitch]
  add        edi, ecx                        ; skip odd line (adds 0 when it overwrote the even line)
   mov       ecx, [localChromaPitch]
  add        esi, ecx                        ; next V line
   add       ebp, eax                        ; ebp was YCursorOdd: now the next even line
  mov        [YCursorEven], ebp              ; save even line address
   mov       ecx, [localChromaPitch]
  add        edx, ecx                        ; next U line
   add       ebp, eax                        ; odd line address
  mov        [YCursorOdd], ebp               ; save odd line address
   mov       eax, [YLimit]                   ; Done with last line?
  cmp        ebp, eax
   jbe       PrepareChromaLine               ; unsigned: continue while the odd cursor <= YLimit
; The commented-out lines below appear to be an earlier, macro-based form of
; the update sequence above; they are kept for reference only.
;  ADDedi     CCOSkipDistance        ; go to begin of next line
;  ADDedi     tmpCCOPitch           ; skip odd line (if it is needed)
;  Leax       YPitch
;  Lebp       YCursorOdd
;  add        ebp, eax       ; skip one line
;  Sebp       YCursorEven
;
;  add        ebp, eax       ; skip one line
;  Sebp       tmpYCursorOdd
;  ADDesi     ChromaPitch
;  ADDedx     ChromaPitch
;  Leax       YLimit                  ; Done with last line?
;  cmp        ebp, eax
;  jbe        PrepareChromaLine

; Epilogue: discard the aligned local frame and restore the saved registers
; in reverse push order.
finish:
  mov        esp, [StashESP]
   ;
  pop        ebx
   pop       ebp
  pop        edi
   pop       esi
  ret                                        ; expanded by the OPTION EPILOGUE macro

MMX_YUV12ToRGB16 ENDP

MMXCODE1 ENDS

END