Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

529 lines
19 KiB

  1. ;-------------------------------------------------------------------------
  2. ; INTEL Corporation Proprietary Information
  3. ;
  4. ; This listing is supplied under the terms of a license
  5. ; agreement with INTEL Corporation and may not be copied
  6. ; nor disclosed except in accordance with the terms of
  7. ; that agreement.
  8. ;
  9. ; Copyright (c) 1996 Intel Corporation.
  10. ; All Rights Reserved.
  11. ;
  12. ;-------------------------------------------------------------------------
  13. ;-------------------------------------------------------------------------
  14. ;//
  15. ;// $Header: S:\h26x\src\dec\cxm12322.asv
  16. ;//
  17. ;// $Log: S:\h26x\src\dec\cxm12322.asv $
  18. ;//
  19. ;// Rev 1.6 01 Apr 1997 12:53:20 BNICKERS
  20. ;// Fix bugs # 153 and 156 -- wrong color when U is small; right edge flickering.
  21. ;//
  22. ;// Rev 1.5 11 Mar 1997 13:50:24 JMCVEIGH
  23. ;// Same ARC bug fix (#94) as was done in cxm12162.asm. Without
  24. ;// this, zoom by 2 and ARC causes black lines in output (every 12th).
  25. ;//
  26. ;// Rev 1.4 06 Sep 1996 16:08:16 BNICKERS
  27. ;// Re-written to filter new points.
  28. ;//
  29. ;-------------------------------------------------------------------------
  30. ;
  31. ; +---------- Color convertor.
  32. ; |+--------- For both H261 and H263.
  33. ; ||+-------- Version for Intel Microprocessors with MMX Technology
  34. ; |||++------ Convert from YUV12.
  35. ; |||||++---- Convert to RGB32.
  36. ; |||||||+--- Zoom by two.
  37. ; ||||||||
  38. ; cxm12322 -- This function performs zoom-by-2 YUV12-to-RGB32 color conversion
  39. ; for H26x. It is tuned for best performance on Intel
  40. ; Microprocessors with MMX Technology. This version adds new rows
  41. ; and columns by averaging them with the originals to either side.
  42. ;
  43. ; The YUV12 input is planar, 8 bits per pel. The Y plane may have
  44. ; a pitch of up to 768. It may have a width less than or equal
  45. ; to the pitch. It must be QWORD aligned. Pitch and Width must
  46. ; be a multiple of eight. Height may be any amount, but must be
  47. ; a multiple of two. The U and V planes may have a different
  48. ; pitch than the Y plane, subject to the same limitations.
  49. ;
  50. ; The color convertor is non-destructive; the input Y, U, and V
  51. ; planes will not be clobbered.
  52. OPTION PROLOGUE:None
  53. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
  54. include ccinst.inc
  55. .xlist
  56. include iammx.inc
  57. include memmodel.inc
  58. .list
  ; Lookup tables used by MMX_YUV12ToRGB32ZoomBy2.
  ;
  ; Luma0020004000200000 -- 256 QWORD entries, indexed by a Y byte (the code
  ;   addresses it as [Y*8]).  Words from low to high hold 0, 32x, 64x, 32x
  ;   weightings of the scaled luma value, ready to be shifted and summed into
  ;   the horizontal interpolation filter.
  ; UContribToBandG      -- 256 DWORD entries, indexed by a U byte ([U*4]).
  ;   Low word is U's contribution to G, high word its contribution to B.
  ; VContribToRandG      -- 256 DWORD entries, indexed by a V byte ([V*4]).
  ;   Low word is V's contribution to G, high word its contribution to R.
  ; CFF00FF00FF00FF00    -- mask that keeps the high byte of each word.
  59. MMXCCDATA SEGMENT PAGE
  60. ALIGN 16
  61. Luma0020004000200000 LABEL DWORD
  ; Y = 0..15: all-zero entries.
  62. REPEAT 16
  63. DD 0, 0
  64. ENDM
  ; Y = 16..234: CNT = Y-16 runs 0..218.
  65. CNT = 0
  66. REPEAT 219
  67. DW 0
  68. DW (CNT*04A7FH)/00200H
  69. DW (CNT*04A7FH)/00100H
  70. DW (CNT*04A7FH)/00200H
  71. CNT = CNT + 1
  72. ENDM
  ; Y = 235..255: constant top-of-range entries.
  73. REPEAT 21
  74. DW 00000H
  75. DW 01FFFH
  76. DW 03FFFH
  77. DW 01FFFH
  78. ENDM
  79. UContribToBandG LABEL DWORD
  ; CNT = U-128.  For CNT = -128 and -127 the B term CNT*0408BH/40H would not
  ; fit in a signed word, so it is pinned to 08000H; the G term keeps the
  ; regular formula.
  80. DW -(-128*0C83H)/00040H
  81. DW 08000H
  82. DW -(-127*0C83H)/00040H
  83. DW 08000H
  84. CNT = -126
  85. REPEAT 253
  86. DW -(CNT*00C83H)/00040H
  87. DW (CNT*0408BH)/00040H
  88. CNT = CNT + 1
  89. ENDM
  ; CNT = 127: B term pinned to 07FFFH (formula value exceeds a signed word).
  ; The G term must follow the same -(CNT*0C83H)/40H formula as every other
  ; entry; it previously lacked the leading minus, which flipped the sign of
  ; the green contribution for U = 255.
  90. DW -(127*0C83H)/00040H
  91. DW 07FFFH
  92. VContribToRandG LABEL DWORD
  ; CNT = V-128 runs -128..127; both terms fit in a signed word throughout.
  93. CNT = -128
  94. REPEAT 256
  95. DW -(CNT*01A04H)/00040H
  96. DW (CNT*03312H)/00040H
  97. CNT = CNT + 1
  98. ENDM
  99. CFF00FF00FF00FF00 DD 0FF00FF00H, 0FF00FF00H ; Used to extract the G bits.
  100. MMXCCDATA ENDS
  101. .CODE
  102. ASSUME ds : FLAT
  103. ASSUME es : FLAT
  104. ASSUME fs : FLAT
  105. ASSUME gs : FLAT
  106. ASSUME ss : FLAT
  107. ; void FAR ASM_CALLTYPE YUV12ToRGB32ZoomBy2 (U8 * YPlane,
  108. ; U8 * VPlane,
  109. ; U8 * UPlane,
  110. ; UN FrameWidth,
  111. ; UN FrameHeight,
  112. ; UN YPitch,
  113. ; UN VPitch,
  114. ; UN AspectAdjustmentCount,
  115. ; U8 * ColorConvertedFrame,
  116. ; U32 DCIOffset,
  117. ; U32 CCOffsetToLine0,
  118. ; IN CCOPitch,
  119. ; IN CCType)
  120. ;
  121. ; CCOffsetToLine0 is relative to ColorConvertedFrame.
  122. ;
  123. ; due to the need for the ebp reg, these parameter declarations aren't used,
  124. ; they are here so the assembler knows how many bytes to relieve from the stack
  ; MMX_YUV12ToRGB32ZoomBy2 -- entry point, frame layout and local setup.
  ; The named proc parameters only tell the assembler how many argument bytes
  ; to relieve on return; the code reads arguments through the *_arg offsets
  ; below, relative to the value of esp captured right after the four pushes.
  125. PUBLIC MMX_YUV12ToRGB32ZoomBy2
  126. MMX_YUV12ToRGB32ZoomBy2 proc DIST LANG AYPlane: DWORD,
  127. AVPlane: DWORD,
  128. AUPlane: DWORD,
  129. AFrameWidth: DWORD,
  130. AFrameHeight: DWORD,
  131. AYPitch: DWORD,
  132. AVPitch: DWORD,
  133. AAspectAdjustmentCnt: DWORD,
  134. AColorConvertedFrame: DWORD,
  135. ADCIOffset: DWORD,
  136. ACCOffsetToLine0: DWORD,
  137. ACCOPitch: DWORD,
  138. ACCType: DWORD
  139. MAXWIDTH = 768 ; Widest line the filtered-luma scratch area is sized for.
  140. LocalFrameSize = MAXWIDTH*20+64 ; 64 bytes of scalars + 40 bytes per pair of input pels.
  141. RegisterStorageSize = 16 ; Four DWORD registers are pushed on entry.
  142. ; Arguments:
  ; Offsets from the post-push esp.  The extra 4 presumably skips the return
  ; address -- confirm against the DIST definition in memmodel.inc.
  143. YPlane_arg = RegisterStorageSize + 4
  144. VPlane_arg = RegisterStorageSize + 8
  145. UPlane_arg = RegisterStorageSize + 12
  146. FrameWidth_arg = RegisterStorageSize + 16
  147. FrameHeight = RegisterStorageSize + 20
  148. YPitch_arg = RegisterStorageSize + 24
  149. ChromaPitch_arg = RegisterStorageSize + 28
  150. AspectAdjustmentCount_arg = RegisterStorageSize + 32
  151. ColorConvertedFrame = RegisterStorageSize + 36
  152. DCIOffset = RegisterStorageSize + 40
  153. CCOffsetToLine0 = RegisterStorageSize + 44
  154. CCOPitch_arg = RegisterStorageSize + 48
  155. CCType = RegisterStorageSize + 52
  156. EndOfArgList = RegisterStorageSize + 56
  157. ; Locals (on local stack frame)
  158. CCOCursor EQU [esp+ 0] ; Address of the next output line.
  159. CCOPitch EQU [esp+ 4] ; Added to CCOCursor for each line written.
  160. YCursor EQU [esp+ 8] ; Next pair of Y input lines.
  161. YLimit EQU [esp+ 12] ; YPlane + YPitch*FrameHeight.
  162. YPitch EQU [esp+ 16]
  163. UCursor EQU [esp+ 20] ; Current line of U.
  164. DistanceFromUToV EQU [esp+ 24] ; VPlane - UPlane; added to a U address to reach V.
  165. ChromaPitch EQU [esp+ 28]
  166. AspectCount EQU [esp+ 32] ; Running countdown; the line is skipped when it hits zero.
  167. AspectAdjustmentCount EQU [esp+ 36] ; Reload value for AspectCount.
  168. StartIndexOfYLine EQU [esp+ 40] ; (768-FrameWidth)*20; index ends at MAXWIDTH*20.
  169. StashESP EQU [esp+ 44] ; Post-push esp, restored at Done.
  ; The five QWORD slots below are interleaved: each pair of input pels owns
  ; one 40-byte group, and the line index advances by 40.
  170. FiltLine0 EQU [esp+ 64] ; Must be 32 byte aligned.
  171. FiltLine1 EQU [esp+ 72]
  172. FiltLine2 EQU [esp+ 80]
  173. FiltLine3 EQU [esp+ 88]
  174. HFiltLinePrev EQU [esp+ 96]
  175. push esi ; Save the four registers that are popped at Done.
  176. push edi
  177. push ebp
  178. push ebx
  179. mov edi,esp ; edi = post-push esp; arguments are read through it.
  180. and esp,0FFFFF000H ; Round esp down to a 4096-byte boundary.
  ; Lower esp in 4096-byte steps, reading the new top each time (the loaded
  ; value is discarded).  Presumably a stack probe so each page is touched
  ; in order -- confirm.
  181. sub esp,4096
  182. mov eax,[esp]
  183. sub esp,4096
  184. mov eax,[esp]
  185. sub esp,4096
  186. mov eax,[esp]
  187. sub esp,LocalFrameSize-12288 ; Remainder of the frame after three 4096-byte steps.
  188. mov eax,[esp]
  189. mov eax,768 ; Same value as MAXWIDTH.
  190. sub eax,[edi+FrameWidth_arg]
  191. imul eax,20
  192. mov StartIndexOfYLine,eax ; (768-FrameWidth)*20.
  193. mov eax,[edi+YPlane_arg]
  194. mov YCursor,eax
  195. mov ebx,[edi+YPitch_arg]
  196. mov YPitch,ebx
  197. mov ecx,[edi+FrameHeight]
  198. imul ebx,ecx
  199. add eax,ebx
  200. mov YLimit,eax ; YPlane + YPitch*FrameHeight.
  201. mov eax,[edi+UPlane_arg]
  202. mov ebx,[edi+VPlane_arg]
  203. mov UCursor,eax
  204. sub ebx,eax
  205. mov DistanceFromUToV,ebx ; VPlane - UPlane.
  206. mov eax,[edi+ColorConvertedFrame]
  207. add eax,[edi+DCIOffset]
  208. add eax,[edi+CCOffsetToLine0]
  209. mov CCOCursor,eax ; ColorConvertedFrame + DCIOffset + CCOffsetToLine0.
  210. mov eax,[edi+ChromaPitch_arg]
  211. mov ChromaPitch,eax
  212. mov eax,[edi+CCOPitch_arg]
  213. mov CCOPitch,eax
  214. mov eax,[edi+AspectAdjustmentCount_arg]
  215. mov AspectAdjustmentCount,eax
  216. mov AspectCount,eax ; Countdown starts at the full adjustment count.
  217. mov StashESP,edi ; Keep the post-push esp for the epilogue.
  ; Priming pass over the first two lines of Y.  Each loop iteration consumes
  ; two pels from each line and writes one 40-byte group: HFiltLinePrev (the
  ; horizontally filtered second line), FiltLine2 (2*L0) and FiltLine3 (L0+L1).
  ; FiltLine0/FiltLine1 are not written here; output then starts at FiltLine2.
  218. mov esi,YCursor
  219. mov ebp,YPitch
  220. mov edi,StartIndexOfYLine
  221. xor eax,eax ; Upper bits of eax/ebx must be zero: al/bl index the luma table.
  222. lea edx,[esi+ebp*2]
  223. xor ebx,ebx
  224. mov YCursor,edx ; Saved cursor moves on by two Y lines.
  225. mov bl,[esi+ebp*1] ; Get Y10 (a of line L3; for left edge).
  226. mov al,[esi] ; Get Y00 (A of line L2; for left edge).
  227. movq mm1,Luma0020004000200000[ebx*8] ; L1:< 32a 64a 32a 0 >
  228. mov bl,[esi+ebp*1+2] ; Get c.
  229. movq mm0,Luma0020004000200000[eax*8] ; L0:< 32A 64A 32A 0 >
  230. mov al,[esi+2] ; Get C.
  231. ; esi -- Cursor over input line of Y.
  232. ; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20.
  233. ; ebp -- Pitch from one line of Y to the next.
  234. ; al, bl -- Y pels
  235. ; mm0 -- For line 0, contribution of pel to left of two pels under cursor now.
  236. ; mm1 -- For line 1, contribution of pel to left of two pels under cursor now.
  237. ; mm2-mm6 -- Scratch.
  238. Next2PelsOfFirst2LumaLines:
  239. movq mm3,Luma0020004000200000[ebx*8] ; L1:< 32c 64c 32c 0 >
  240. psrlq mm1,32 ; L1:< 0 0 32a 64a >
  241. movq mm2,Luma0020004000200000[eax*8] ; L0:< 32C 64C 32C 0 >
  242. punpckldq mm1,mm3 ; L1:< 32c 0 32a 64a >
  243. xor ebx,ebx
  244. xor eax,eax
  245. mov bl,[esi+ebp*1+1] ; Get b.
  246. psrlq mm0,32 ; L0:< 0 0 32A 64A >
  247. mov al,[esi+1] ; Get B.
  248. add edi,40 ; Inc filtered luma temp stg idx.
  249. paddw mm1,Luma0020004000200000[ebx*8] ; L1:< 32b+32c 64b 32a+32b 64a >
  250. punpckldq mm0,mm2 ; L0:< 32C 0 32A 64A >
  251. paddw mm0,Luma0020004000200000[eax*8] ; L0:< 32B+32C 64B 32A+32B 64A >
  252. movq HFiltLinePrev[edi-40],mm1 ; Save L1 as next iters LPrev.
  253. paddw mm1,mm0 ; L0+L1
  254. paddw mm0,mm0 ; 2L0
  255. add esi,2 ; Increment input index.
  256. movq FiltLine3[edi-40],mm1 ; Save filtered line L0+L1.
  257. movq mm1,mm3 ; Next iters a.
  258. movq FiltLine2[edi-40],mm0 ; Save filtered line 2L0.
  259. movq mm0,mm2 ; Next iters A.
  260. mov bl,[esi+ebp*1+2] ; Get c.
  261. cmp edi,MAXWIDTH*20-40 ; Done yet. (The mov below leaves these flags intact.)
  262. mov al,[esi+2] ; Get C.
  263. jl Next2PelsOfFirst2LumaLines
  ; Right edge: at most one pel pair is left.  The pel at +1 is loaded in
  ; place of the pel at +2, so the final interpolated pel repeats the last
  ; real pel rather than using one beyond it.
  264. xor ebx,ebx
  265. xor ecx,ecx ; NOTE(review): ecx is not read before DoOutputLine reloads it; possibly eax was meant -- confirm.
  266. mov bl,[esi+ebp*1+1] ; Get c (the pel at +1, standing in for the missing right neighbour).
  267. cmp edi,MAXWIDTH*20 ; Done yet.
  268. mov al,[esi+1] ; Get C (likewise the pel at +1).
  269. jl Next2PelsOfFirst2LumaLines
  ; Set up the registers DoOutputLine expects, starting with FiltLine2.
  270. mov ebp,DistanceFromUToV
  271. lea eax,FiltLine2
  272. mov esi,UCursor
  273. mov edx,StartIndexOfYLine
  274. jmp DoOutputLine
  ; Reached by the jae in Next4OutputLines, with flags still holding the
  ; result of "cmp esi,ecx" (Y cursor against YLimit); the mov and lea below
  ; do not alter flags.  Cursor strictly above the limit means everything has
  ; been output.  Cursor equal to the limit means the last two output lines
  ; are still due; both are built as 2*LP from the saved previous line.
  275. Last2OutputLines:
  276. mov ebp,DistanceFromUToV
  277. lea esi,[edi+40] ; NOTE(review): esi is reloaded from UCursor below before any use -- value appears unused.
  278. ja Done
  279. ; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20.
  280. ; mm0-mm6 -- Scratch.
  281. movq mm0,HFiltLinePrev[edi] ; Fetch horizontally filtered line LP.
  282. paddw mm0,mm0 ; 2LP
  283. Next2PelsOfLast2LumaLines:
  284. movq FiltLine3[edi],mm0 ; Save horz and vert filt line 2LP.
  285. movq FiltLine2[edi],mm0 ; Save horz and vert filt line 2LP.
  286. movq mm0,HFiltLinePrev[edi+40]; Fetch horizontally filtered line LP (for the next iteration; unused after the last).
  287. add edi,40
  288. paddw mm0,mm0 ; 2LP
  289. cmp edi,MAXWIDTH*20 ; Done yet.
  290. jne Next2PelsOfLast2LumaLines
  ; Output starts at FiltLine2, so only lines 2 and 3 are emitted.
  291. lea eax,FiltLine2
  292. mov edx,StartIndexOfYLine
  293. mov esi,UCursor
  294. jmp DoOutputLine
  ; Steady-state pass: filters the next two lines of Y (L0, L1) against the
  ; saved previous line (LP) and fills all four filtered lines:
  ;   FiltLine0 = 2LP, FiltLine1 = LP+L0, FiltLine2 = 2L0, FiltLine3 = L0+L1.
  ; L1 then replaces LP in HFiltLinePrev.
  295. Next4OutputLines:
  296. mov esi,YCursor
  297. mov ebp,YPitch
  298. mov edi,StartIndexOfYLine
  299. mov ecx,YLimit
  300. lea edx,[esi+ebp*2]
  301. xor eax,eax
  302. mov YCursor,edx ; Saved cursor moves on by two Y lines.
  303. xor ebx,ebx
  304. mov al,[esi] ; Get Y00 (A of line L2; for left edge).
  305. cmp esi,ecx ; Cursor against YLimit; flags are consumed by the jae below and by the ja in Last2OutputLines.
  306. mov bl,[esi+ebp*1] ; Get Y10 (a of line L3; for left edge).
  307. jae Last2OutputLines
  308. movq mm1,Luma0020004000200000[ebx*8] ; L1:< 32a 64a 32a 0 >
  309. mov bl,[esi+ebp*1+2] ; Get c.
  310. movq mm0,Luma0020004000200000[eax*8] ; L0:< 32A 64A 32A 0 >
  311. mov al,[esi+2] ; Get C.
  312. ; esi -- Cursor over input line of Y.
  313. ; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20.
  314. ; ebp -- Pitch from one line of Y to the next.
  315. ; al, bl -- Y pels
  316. ; mm0 -- For line 0, contribution of pel to left of two pels under cursor now.
  317. ; mm1 -- For line 1, contribution of pel to left of two pels under cursor now.
  318. ; mm2-mm6 -- Scratch.
  319. Next2PelsOf2LumaLines:
  320. movq mm3,Luma0020004000200000[ebx*8] ; L1:< 32c 64c 32c 0 >
  321. psrlq mm1,32 ; L1:< 0 0 32a 64a >
  322. movq mm2,Luma0020004000200000[eax*8] ; L0:< 32C 64C 32C 0 >
  323. punpckldq mm1,mm3 ; L1:< 32c 0 32a 64a >
  324. movq mm4,HFiltLinePrev[edi] ; LP
  325. psrlq mm0,32 ; L0:< 0 0 32A 64A >
  326. xor ebx,ebx
  327. xor eax,eax
  328. mov bl,[esi+ebp*1+1] ; Get b.
  329. movq mm5,mm4 ; LP
  330. mov al,[esi+1] ; Get B.
  331. add esi,2 ; Increment input index.
  332. paddw mm1,Luma0020004000200000[ebx*8] ; L1:< 32b+32c 64b 32a+32b 64a >
  333. punpckldq mm0,mm2 ; L0:< 32C 0 32A 64A >
  334. paddw mm0,Luma0020004000200000[eax*8] ; L0:< 32B+32C 64B 32A+32B 64A >
  335. paddw mm5,mm5 ; 2LP
  336. movq HFiltLinePrev[edi],mm1 ; Save L1 as next iters LPrev.
  337. paddw mm4,mm0 ; LP+L0
  338. movq FiltLine0[edi],mm5 ; Save 2LP
  339. paddw mm1,mm0 ; L0+L1
  340. movq FiltLine1[edi],mm4 ; Save LP+L0
  341. paddw mm0,mm0 ; 2L0
  342. movq FiltLine3[edi],mm1 ; Save L0+L1
  343. movq mm1,mm3 ; Next iters a.
  344. movq FiltLine2[edi],mm0 ; Save 2L0
  345. movq mm0,mm2 ; Next iters A.
  346. add edi,40 ; Inc filtered luma temp stg idx.
  347. mov bl,[esi+ebp*1+2] ; Get c.
  348. cmp edi,MAXWIDTH*20-40 ; Done yet. (The mov below leaves these flags intact.)
  349. mov al,[esi+2] ; Get C.
  350. jl Next2PelsOf2LumaLines
  ; Right edge: at most one pel pair is left.  The pel at +1 is loaded in
  ; place of the pel at +2, so the final interpolated pel repeats the last
  ; real pel rather than using one beyond it.
  351. xor ebx,ebx
  352. xor ecx,ecx ; NOTE(review): ecx is not read before DoOutputLine reloads it; possibly eax was meant -- confirm.
  353. mov bl,[esi+ebp*1+1] ; Get c (the pel at +1, standing in for the missing right neighbour).
  354. cmp edi,MAXWIDTH*20 ; Done yet.
  355. mov al,[esi+1] ; Get C (likewise the pel at +1).
  356. jl Next2PelsOf2LumaLines
  ; Set up the registers DoOutputLine expects, starting with FiltLine0 so
  ; that all four filtered lines are emitted.
  357. mov ebp,DistanceFromUToV
  358. mov esi,UCursor
  359. lea eax,FiltLine0
  360. mov edx,StartIndexOfYLine
  ; Emit one output line from one filtered line of Y plus the current chroma
  ; line.  On entry eax = address of the filtered line's first slot
  ; (FiltLine0..3), edx = StartIndexOfYLine, esi = U cursor, ebp = distance
  ; from U to V.  Each iteration consumes one U/V pair and one QWORD of four
  ; filtered luma words, and writes four 32-bit pels (16 bytes).
  361. DoOutputLine:
  362. mov edi,CCOCursor
  363. mov ecx,AspectCount
  364. dec ecx ; If count is non-zero, we keep the line.
  365. mov ebx,CCOPitch
  366. mov AspectCount,ecx ; mov leaves the flags from dec intact for the je.
  367. je SkipOutputLine
  368. add ebx,edi
  369. xor ecx,ecx
  370. mov cl,[esi] ; Fetch first U.
  371. add eax,MAXWIDTH*20 ; Bias the base up and the index down (below) so the index ends at zero.
  372. mov CCOCursor,ebx ; Next output line = this one + CCOPitch.
  373. pcmpeqw mm6,mm6 ; All ones.
  374. movdt mm0,UContribToBandG[ecx*4] ; < 0 0 Bu Gu >
  375. psllw mm6,15 ; Four words of -32768
  376. mov cl,[esi+ebp*1] ; Fetch first V.
  377. sub edx,MAXWIDTH*20 ; Index now runs from a negative value up to zero.
  378. jmp StartDoOutputLine
  379. ; ebp -- Distance from U to V
  380. ; esi -- Cursor over U
  381. ; edi -- Cursor over output
  382. ; edx -- Index over Y storage area
  383. ; eax -- Base address of Y line
  384. ; mm6 -- Four words of -32768, to clamp at floor.
  385. DoNext4OutputPels:
  386. movq [edi-8],mm3 ; Save 2 output pels.
  387. movq mm0,mm4 ; U contribution fetched during the previous iteration.
  388. StartDoOutputLine:
  389. movdt mm2,VContribToRandG[ecx*4] ; < 0 0 Rv Gv >
  390. punpcklwd mm0,mm0 ; < Bu Bu Gu Gu >
  391. movq mm1,mm0 ; < junk junk Gu Gu >
  392. punpcklwd mm2,mm2 ; < Rv Rv Gv Gv >
  393. paddw mm1,mm2 ; < junk junk Guv Guv >
  394. punpckhdq mm0,mm0 ; < Bu Bu Bu Bu >
  395. paddsw mm0,[eax+edx] ; < B B B B > with ceiling clamped.
  396. punpckldq mm1,mm1 ; < Guv Guv Guv Guv >
  397. paddsw mm1,[eax+edx] ; < G G G G > with ceiling clamped.
  398. punpckhdq mm2,mm2 ; < Rv Rv Rv Rv >
  399. paddsw mm2,[eax+edx] ; < R R R R > with ceiling clamped.
  ; Floor clamp: a saturating add of -32768 pins every negative word at
  ; -32768; subtracting -32768 again maps those to 0 and restores the rest.
  400. paddsw mm0,mm6 ; B with floor clamped.
  401. psubsw mm0,mm6 ; B back in range.
  402. paddsw mm1,mm6 ; G with floor clamped.
  403. psubsw mm1,mm6 ; G back in range.
  404. paddsw mm2,mm6 ; R with floor clamped.
  405. add edi,16 ; Advance output cursor.
  406. xor ecx,ecx
  407. psubsw mm2,mm6 ; R back in range.
  408. psrlw mm0,7 ; Position B bits.
  409. mov cl,[esi+1] ; Fetch next U.
  410. paddw mm1,mm1 ; Position G bits.
  411. pand mm1,CFF00FF00FF00FF00 ; Extract G bits.
  412. psrlw mm2,7 ; Position R bits.
  413. movdt mm4,UContribToBandG[ecx*4] ; < 0 0 Bu Gu > next iter.
  414. por mm1,mm0 ; <G3 B3 G2 B2 G1 B1 G0 B0>
  415. movq mm3,mm1
  416. punpcklwd mm1,mm2 ; < 0 R1 G1 B1 0 R0 G0 B0>
  417. inc esi ; Advance input cursor
  418. add edx,40 ; Increment Y index. Sets ZF at end of line; nothing below alters flags before the jne.
  419. movq [edi-16],mm1 ; Store 2 output pels.
  420. punpckhwd mm3,mm2 ; < 0 R3 G3 B3 0 R2 G2 B2>
  421. mov cl,[esi+ebp*1] ; Fetch next V.
  422. jne DoNext4OutputPels
  423. movq [edi-8],mm3 ; Save 2 output pels.
  ; Step to the next of the four filtered lines and decide what follows.
  ; On entry eax = address of the line just handled plus MAXWIDTH*20.  The
  ; four line slots sit 8 bytes apart from a 32-byte-aligned FiltLine0, so
  ; bits 3 and 4 of the new address identify the upcoming line:
  ;   bit 3 set            -> line 1 or 3: same chroma line, go output it.
  ;   bit 3 clear, 4 set   -> line 2: move to the next chroma line first.
  ;   both clear           -> all four done: go filter the next two Y lines.
  424. PrepareForNextOutputLine:
  425. mov edx,StartIndexOfYLine
  426. add eax,8-MAXWIDTH*20 ; Advance to next filtered line of Y.
  427. mov esi,UCursor
  428. test al,8 ; Jump if just did line 0 or 2.
  429. mov ebx,ChromaPitch ; mov leaves the flags from test intact.
  430. jne DoOutputLine
  431. add esi,ebx ; Advance to next chroma line.
  432. test al,16 ; Jump if about to do line 2.
  433. mov UCursor,esi
  434. jne DoOutputLine
  435. sub esi,ebx ; Done with 4 lines. Restore UCursor.
  436. mov UCursor,esi
  437. jmp Next4OutputLines
  ; Aspect-ratio adjustment: the countdown reached zero, so this line is not
  ; written and the output cursor stays put.  Reload the countdown and apply
  ; the same MAXWIDTH*20 bias to eax that DoOutputLine would have applied, so
  ; the shared line-advance code above sees a consistent address.
  438. SkipOutputLine:
  439. mov ecx,AspectAdjustmentCount
  440. add eax,MAXWIDTH*20
  441. mov AspectCount,ecx
  442. jmp PrepareForNextOutputLine
  ; Epilogue: drop the local frame by restoring the esp saved in StashESP,
  ; then pop the four registers in the reverse of the entry push order.
  ; NOTE(review): no emms is issued here; presumably the caller or the rturn
  ; macro (defined outside this file) deals with MMX state -- confirm.
  443. Done:
  444. mov esp,StashESP
  445. pop ebx
  446. pop ebp
  447. pop edi
  448. pop esi
  449. rturn
  450. MMX_YUV12ToRGB32ZoomBy2 endp
  451. END