windows-server-2003/enduser/netmeeting/av/codecs/intel/h263/i386/cxm1281.asm

;-------------------------------------------------------------------------
;    INTEL Corporation Proprietary Information
;
;    This listing is supplied under the terms of a license
;    agreement with INTEL Corporation and may not be copied
;    nor disclosed except in accordance with the terms of
;    that agreement.
;
;    Copyright (c) 1996 Intel Corporation.
;    All Rights Reserved.
;
;-------------------------------------------------------------------------

;-------------------------------------------------------------------------
;// $Header:   S:\h26x\src\dec\cx51281.asv
;//
;// $Log:   S:\h26x\src\dec\cxm1281.asv  $
;// 
;//    Rev 1.7   25 Jul 1996 13:47:58   AGUPTA2
;// Fixed blockiness problem; dither matrices were not created properly.
;// 
;//    Rev 1.6   14 Jun 1996 16:28:24   AGUPTA2
;// Cosmetic changes to adhere to common coding convention.
;// 
;//    Rev 1.5   13 May 1996 11:01:34   AGUPTA2
;// Final drop from IDC.
;//
;//    Rev 1.1   20 Mar 1996 11:19:24   RMCKENZX
;// March 96 version.
;//
;//    Rev 1.2   01 Feb 1996 10:45:58   vladip
;// Reduced number of locals, DataSegment changed to PARA
;//
;//    Rev 1.1   29 Jan 1996 18:53:40   vladip
;//
;// IFDEF TIMING is added
;//
;//    Rev 1.0   29 Jan 1996 17:28:06   vladip
;// Initial mmx verision.
;//
;-------------------------------------------------------------------------
;
; +---------- Color convertor.
; |+--------- For both H261 and H263.
; ||+-------- MMx Version.
; |||++------ Convert from YUV12.
; |||||+----- Convert to CLUT8.
; ||||||+---- Zoom by one, i.e. non-zoom.
; |||||||
; cxm1281  -- This function performs YUV12 to CLUT8 color conversion for H26x.
;             It dithers among 9 chroma points and 26 luma points, mapping the
;             8 bit luma pels into the 26 luma points by clamping the ends and
;             stepping the luma by 8.
;
;                Color convertor is not destructive.
; Requirement:
;                U and V plane SHOULD be followed by 4 bytes (for read only)
;                Y plane SHOULD be followed by 8 bytes (for read only)

OPTION CASEMAP:NONE
OPTION PROLOGUE:None
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro

.586
.xlist
include iammx.inc
include memmodel.inc
.list

MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS

MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS

MMXDATA1 SEGMENT
ALIGN 8

PUBLIC Y0_low
PUBLIC Y1_low
PUBLIC U_low_value
PUBLIC V_low_value
PUBLIC U2_V0high_bound
PUBLIC U2_V0low_bound
PUBLIC V2_U0high_bound
PUBLIC V2_U0low_bound
PUBLIC return_from_Y_high
PUBLIC saturate_to_Y_high
PUBLIC clean_MSB_mask
PUBLIC convert_to_sign

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  U,V,Y impacts are calculated as follows:
;              0    U < 64h
;    U impact  1ah  64h <= U < 84h
;              24h  U >= 84h
;
;              0    V < 64h
;    V impact  4eh  64h <= V < 84h
;              9ch  V >= 84h
;
;              0    Y < 1bh
;    Y impact  Y/8  1bh <= Y < ebh
;              19h  Y >= ebh
;  and the dither pattern is added to the input Y,U,V values and is a
;  4X4 matrix as defined below:
;    U
;      10h  8   18h  0
;      18h  0   10h  8
;      8    10h 0    18h
;      0    18h 8    10h
;    V 
;      8    10h 0    18h
;      0    18h 8    10h
;      10h  8   18h  0
;      18h  0   10h  8
;    Y
;      4    2   6    0
;      6    0   4    2
;      2    4   0    6
;      0    6   2    4
;  Note the following equalities in dither matrices which will explain funny
;  data declarations below:
;    U0=V2
;    U1=V3
;    U2=V0
;    U3=V1
;  More gory details can be found in the color convertor document written 
;  by IDC.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
V2_U0low_bound   DWORD  0f3ebfbe3h, 0f3ebfbe3h   ; 746c7c64746c7c64 - 8080808080808080
U2_V0low_bound   DWORD  0ebf3e3fbh, 0ebf3e3fbh,  ; 6c74647c6c74647c - 8080808080808080
                        0f3ebfbe3h, 0f3ebfbe3h   ; 746c7c64746c7c64 - 8080808080808080

U3_V1low_bound   DWORD  0e3fbebf3h, 0e3fbebf3h   ; 647c6c74647c6c74 - 8080808080808080
V3_U1low_bound   DWORD  0fbe3f3ebh, 0fbe3f3ebh,  ; 7c64746c7c64746c - 8080808080808080
                        0e3fbebf3h, 0e3fbebf3h   ; 647c6c74647c6c74 - 8080808080808080

V2_U0high_bound  DWORD  0130b1b03h, 0130b1b03h   ; 948c9c84948c9c84 - 8080808080808080
U2_V0high_bound  DWORD  00b13031bh, 00b13031bh,  ; 8c94849c8c94849c - 8080808080808080
                        0130b1b03h, 0130b1b03h   ; 948c9c84948c9c84 - 8080808080808080

U3_V1high_bound  DWORD  0031b0b13h, 0031b0b13h   ; 849c8c94849c8c94 - 8080808080808080
V3_U1high_bound  DWORD  01b03130bh, 01b03130bh,  ; 9c84948c9c84948c - 8080808080808080
                        0031b0b13h, 0031b0b13h   ; 849c8c94849c8c94 - 8080808080808080


U_low_value      DWORD  01a1a1a1ah, 01a1a1a1ah
V_low_value      DWORD  04e4e4e4eh, 04e4e4e4eh
convert_to_sign  DWORD  080808080h, 080808080h


; Y0_low,Y1_low are arrays
Y0_low           DWORD  01719151bh, 01719151bh,  ; 1b1b1b1b1b1b1b1b - 0402060004020600 ; for line%4=0
                        019171b15h, 019171b15h   ; 1b1b1b1b1b1b1b1b - 0204000602040006 ; for line%4=2

Y1_low           DWORD  0151b1719h, 0151b1719h,  ; 1b1b1b1b1b1b1b1b - 0600040206000402 ; for line%4=1
                        01b151917h, 01b151917h   ; 1b1b1b1b1b1b1b1b - 0006020400060204 ; for line%4=3

clean_MSB_mask      DWORD  01f1f1f1fh, 01f1f1f1fh
saturate_to_Y_high  DWORD  0e6e6e6e6h, 0e6e6e6e6h   ; ffh-19h
return_from_Y_high  DWORD  0dcdcdcdch, 0dcdcdcdch   ; ffh-19h-ah (return back and ADD ah);

MMXDATA1 ENDS

MMXCODE1 SEGMENT
MMX_YUV12ToCLUT8 PROC DIST LANG PUBLIC,
  AYPlane:              DWORD,
  AVPlane:              DWORD,
  AUPlane:              DWORD,
  AFrameWidth:          DWORD,
  AFrameHeight:         DWORD,
  AYPitch:              DWORD,
  AVPitch:              DWORD,
  AAspectAdjustmentCnt: DWORD,
  AColorConvertedFrame: DWORD,
  ADCIOffset:           DWORD,
  ACCOffsetToLine0:     DWORD,
  ACCOPitch:            DWORD,
  ACCType:              DWORD
LocalFrameSize           =   108
RegisterStorageSize      =   16
argument_base            EQU ebp + RegisterStorageSize
local_base               EQU esp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Arguments:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
YPlane                   EQU   argument_base +  4
VPlane                   EQU   argument_base +  8
UPlane                   EQU   argument_base + 12
FrameWidth               EQU   argument_base + 16
FrameHeight              EQU   argument_base + 20
YPitch                   EQU   argument_base + 24
ChromaPitch              EQU   argument_base + 28
AspectAdjustmentCount    EQU   argument_base + 32
ColorConvertedFrame      EQU   argument_base + 36
DCIOffset                EQU   argument_base + 40
CCOffsetToLine0          EQU   argument_base + 44
CCOPitch                 EQU   argument_base + 48
CCType                   EQU   argument_base + 52
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Locals (on local stack frame)
;   (local_base is aligned at cache-line boundary in the prologue)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
tmpV2_U0low_bound        EQU   local_base + 0       ; qword
tmpU2_V0low_bound        EQU   local_base + 8       ; qword
tmpU3_V1low_bound        EQU   local_base + 16      ; qword
tmpV3_U1low_bound        EQU   local_base + 24      ; qword
tmpV2_U0high_bound       EQU   local_base + 32      ; qword
tmpU2_V0high_bound       EQU   local_base + 40      ; qword
tmpU3_V1high_bound       EQU   local_base + 48      ; qword
tmpV3_U1high_bound       EQU   local_base + 56      ; qword
tmpY0_low                EQU   local_base + 64      ; qword
tmpY1_low                EQU   local_base + 72      ; qword
tmpBlockParity           EQU   local_base + 80
YLimit                   EQU   local_base + 84
AspectCount              EQU   local_base + 88
tmpYCursorEven           EQU   local_base + 92
tmpYCursorOdd            EQU   local_base + 96
tmpCCOPitch              EQU   local_base + 100
StashESP                 EQU   local_base + 104

U_low                    EQU   mm6
V_low                    EQU   mm7
U_high                   EQU   U_low
V_high                   EQU   V_low
		
  push       esi
   push      edi
  push       ebp
   push      ebx
  mov        ebp, esp
   sub       esp, LocalFrameSize
  and        esp, -32                        ; align at cache line boundary
   mov       [StashESP], ebp

  mov        ecx, [YPitch]
   mov       edx, [FrameHeight]
  mov        ebx, [FrameWidth]
   ;
  imul       edx, ecx
   ;
  mov        eax, [YPlane]
   add       edx, eax                        ; edx is relative to YPlane
  add        eax, ebx                        ; Points to end of Y even line
   ;
  mov        [tmpYCursorEven], eax
   add       eax, ecx                        ; add YPitch
  mov        [tmpYCursorOdd], eax
   lea       edx, [edx+2*ebx]                ; final value of Y-odd-pointer
  mov        [YLimit], edx
   mov       esi, [VPlane]
  mov        edx, [UPlane]
   mov       eax, [ColorConvertedFrame]
  add        eax, [DCIOffset]
   ;
  add        eax, [CCOffsetToLine0]
   sar       ebx, 1
  add        esi, ebx
   add       edx, ebx
  lea        edi, [eax+2*ebx]                ; CCOCursor
   mov       ecx, [AspectAdjustmentCount]
  mov        [AspectCount], ecx
   test      ecx, ecx                        ; if AspectCount=0 we should not drop any lines
  jnz        non_zero_AspectCount
   dec       ecx
non_zero_AspectCount:
  mov        [AspectCount], ecx
   cmp       ecx, 1
  jbe        finish
   ;
  neg        ebx
   ;
  mov        [FrameWidth], ebx
   ;
  movq       mm6, U_low_value                ; store some frequently used values in registers
   ;
  movq       mm7, V_low_value
   xor       eax, eax
  mov        [tmpBlockParity], eax

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;  Register Usage:
  ;
  ;  esi -- points to the end of V Line
  ;  edx -- points to the end of U Line.
  ;  edi -- points to the end of even line of output.
  ;  ebp -- points to the end of odd  line of output.
  ;
  ;  ecx -- points to the end of even/odd Y Line
  ;  eax -- 8*(line&2) == 0,  on line%4=0,1
  ;                    == 8,  on line%4=2,3
  ;         in the loop, eax points to the end of even Y line
  ;  ebx -- Number of points, we havn't done yet. (multiplyed by -0.5)
  ;
  ;
  ; Noise matrix is of size 4x4 , so we have different noise values in even 
  ; pair of lines, and in odd pair of lines. But in our loop we are doing 2 
  ; lines. So here we are prepairing constants for next two lines.  This code
  ; is done each time we are starting to convert next pair of lines.
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
PrepareNext2Lines:
  mov        eax, [tmpBlockParity]
   ;
  ;constants for odd line
  movq       mm0, V3_U1low_bound[eax]
   ;
  movq       mm1, V3_U1high_bound[eax]
   ;
  movq       mm2, U3_V1low_bound[eax]
   ;
  movq       mm3, U3_V1high_bound[eax]
   ;
  movq       [tmpV3_U1low_bound], mm0
   ;
  movq       [tmpV3_U1high_bound], mm1
   ;
  movq       [tmpU3_V1low_bound], mm2
   ;
  movq       [tmpU3_V1high_bound], mm3
   ;
  ;
  ;constants for even line
  ;
  movq       mm0, V2_U0low_bound[eax]
   ;
  movq       mm1, V2_U0high_bound[eax]
   ;
  movq       mm2, U2_V0low_bound[eax]
   ;
  movq       mm3, U2_V0high_bound[eax]
   ;
  movq       [tmpV2_U0low_bound], mm0
   ;
  movq       [tmpV2_U0high_bound], mm1
   ;
  movq       [tmpU2_V0low_bound], mm2
   ;
  movq       [tmpU2_V0high_bound], mm3
   ;
  ;
  ; Constants for Y values
  ;
  movq       mm4, Y0_low[eax]
   ;
  movq       mm5, Y1_low[eax]
   ;
  xor        eax, 8
   mov       [tmpBlockParity], eax
  movq       [tmpY0_low], mm4
   ;
  movq       [tmpY1_low], mm5
   ;
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; if AspectCount<2 we should skip a line. In this case we are still doing two
  ; lines, but output pointers are the same, so we just overwriting line 
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  mov        eax, [CCOPitch]
   mov       ebx, [AspectCount]
  xor        ecx, ecx
   sub       ebx, 2
  mov        [tmpCCOPitch], eax
   ja        continue
  mov        eax, [AspectAdjustmentCount]
   mov       [tmpCCOPitch], ecx              ; 0
  lea        ebx, [ebx+eax]                  ; calculate new AspectCount
   jnz       continue                        ; skiping even line
  ;
  ;skip_odd_line
  ;
  mov       eax, [tmpYCursorEven]
   ;
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; set odd constants to be equal to even_constants
  ; Odd line will be performed as even
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  movq       [tmpV3_U1low_bound], mm0
   ;
  movq       [tmpV3_U1high_bound], mm1
   ;
  movq       [tmpU3_V1low_bound], mm2
   ;
  movq       [tmpU3_V1high_bound], mm3
   ;
  movq       [tmpY1_low], mm4
   ;
  mov        [tmpYCursorOdd], eax
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; when we got here, we already did all preparations.
  ; we are entering a main loop which is starts at do_next_2x8_block label
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
continue:
  mov        [AspectCount], ebx
   mov       ebx, [FrameWidth]
  mov        ebp, edi
   ;
  add        ebp, [tmpCCOPitch]              ; ebp points to the end of odd line
   mov       eax, [tmpYCursorEven]
  mov        ecx, [tmpYCursorOdd]
   ;
  movdt      mm0, [edx+ebx]                  ; 0:0:0:0|u3:u2:u1:u0 unsigned
   ;
  movdt      mm2, [esi+ebx]                  ; 0:0:0:0|v3:v2:v1:v0 unsigned
   punpcklbw mm0, mm0                        ; u3:u3:u2:u2|u1:u1:u0:u0 unsigned
  psubb      mm0, convert_to_sign            ; u3:u3:u2:u2|u1:u1:u0:u0 signed
   punpcklbw mm2, mm2                        ; v3:v3:v2:v2|v1:v1:v0:v0 unsigned
  movq       mm4, [eax+2*ebx]                ; y7|..|y0
   ;
  movq       mm1, mm0                        ; u3:u3:u2:u2|u1:u1:u0:u0
   ;
do_next_2x8_block:
  psubb      mm2, convert_to_sign            ; v3:v3:v2:v2|v1:v1:v0:v0 signed
   movq      mm5, mm1                        ; u3:u3:u2:u2|u1:u1:u0:u0
  pcmpgtb    mm0, [tmpV2_U0low_bound]
   movq      mm3, mm2
  pcmpgtb    mm1, [tmpV2_U0high_bound]
   pand      mm0, U_low
  psubusb    mm4, [tmpY0_low]
   pand      mm1, U_high
  pcmpgtb    mm2, [tmpU2_V0low_bound]
   psrlq     mm4, 3
  pand       mm4, clean_MSB_mask
   pand      mm2, V_low
  paddusb    mm4, saturate_to_Y_high
   paddb     mm0, mm1                        ; U03:U03:U02:U02|U01:U01:U00:U00
  psubusb    mm4, return_from_Y_high
   movq      mm1, mm5
  pcmpgtb    mm5, [tmpV3_U1low_bound]
   paddd     mm0, mm2
  pcmpgtb    mm1, [tmpV3_U1high_bound]
   pand      mm5, U_low
  paddd      mm0, mm4
   movq      mm2, mm3
  pcmpgtb    mm3, [tmpU2_V0high_bound]
   pand      mm1, U_high
  movq       mm4, [ecx+2*ebx]                ; read next 8 Y points from odd line
   paddb     mm5, mm1                        ; u impact on odd line
  psubusb    mm4, [tmpY1_low]
   movq      mm1, mm2
  pcmpgtb    mm2, [tmpU3_V1low_bound]
   psrlq     mm4, 3
  pand       mm4, clean_MSB_mask
   pand      mm2, V_low
  paddusb    mm4, saturate_to_Y_high
   paddd     mm5, mm2
  psubusb    mm4, return_from_Y_high
   pand      mm3, V_high
  pcmpgtb    mm1, [tmpU3_V1high_bound]
   paddb     mm3, mm0
  movdt      mm0, [edx+ebx+4]                ; read next 4 U points
   pand      mm1, V_high
  movdt      mm2, [esi+ebx+4]                ; read next 4 V points
   paddd     mm5, mm4
  movq       mm4, [eax+2*ebx+8]              ; read next 8 Y points from even line
   paddb     mm5, mm1
  psubb      mm0, convert_to_sign
   punpcklbw mm2, mm2                        ; v3:v3:v2:v2|v1:v1:v0:v0
  movq       [edi+2*ebx], mm3                ; write even line
   punpcklbw mm0, mm0                        ; u3:u3:u2:u2|u1:u1:u0:u0
  movq       [ebp+2*ebx], mm5                ; write odd line
   movq      mm1, mm0                        ; u3:u3:u2:u2|u1:u1:u0:u0
  add        ebx, 4
   jl        do_next_2x8_block
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; update pointes to input and output buffers, to point to the next lines
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  mov        ebp, [StashESP]
   mov       eax, [tmpYCursorEven]
  mov        ecx, [YPitch]
   add       edi, [CCOPitch]                 ; go to the end of next line
  add        edi, [tmpCCOPitch]              ; skip odd line
   lea       eax, [eax+2*ecx]
  mov        [tmpYCursorEven], eax
   add       eax, [YPitch]
  mov        [tmpYCursorOdd], eax
   add       esi, [ChromaPitch]
  mov        ecx, [YLimit]                   ; Done with last line?
   add       edx, [ChromaPitch]
  cmp        eax, ecx
   jb        PrepareNext2Lines

finish:
  mov        esp, [StashESP]
   ;
  pop        ebx
   pop       ebp
  pop        edi
   pop       esi
  ret

MMX_YUV12ToCLUT8 ENDP

MMXCODE1 ENDS

END