Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

573 lines
24 KiB

  1. ;-------------------------------------------------------------------------
  2. ; INTEL Corporation Proprietary Information
  3. ;
  4. ; This listing is supplied under the terms of a license
  5. ; agreement with INTEL Corporation and may not be copied
  6. ; nor disclosed except in accordance with the terms of
  7. ; that agreement.
  8. ;
  9. ; Copyright (c) 1996 Intel Corporation.
  10. ; All Rights Reserved.
  11. ;
  12. ;-------------------------------------------------------------------------
  13. ;//
  14. ;// $Header: S:\h26x\src\dec\cx512241.asv
  15. ;//
  16. ;// $Log: S:\h26x\src\dec\cxm12241.asv $
  17. ;//
  18. ;// Rev 1.7 28 May 1996 17:57:10 AGUPTA2
  19. ;// Cosmetic changes to adhere to common coding convention in all MMX
  20. ;// color convertors plus bug fixes.
  21. ;//
  22. ;//
  23. ;// Rev 1.2 26 Mar 1996 11:15:30 RMCKENZX
  24. ;//
  25. ;// Changed calling sequence to MMX_..., changed parameters to
  26. ;// new type (eliminated YUV base, etc.). put data in MMXDATA1 segment
  27. ;// and code in MMXCODE1 segment. cleaned and commented code.
  28. ;//
  29. ;// Rev 1.1 20 Mar 1996 11:19:20 RMCKENZX
  30. ;// March 96 version.
  31. ;
  32. ; Rev 1.3 18 Feb 1996 20:57:18 israelh
  33. ; new mmx version
  34. ;
  35. ; Rev 1.2 29 Jan 1996 19:53:52 mikeh
  36. ;
  37. ; added Ifdef timing
  38. ;
  39. ; Rev 1.1 29 Jan 1996 16:29:16 mikeh
  40. ; removed $LOG stuff
  41. ;
  42. ; Rev 1.0 29 Jan 1996 11:49:48 israelh
  43. ; Initial revision.
  44. ;//
  45. ;//
  46. ;// MMX 1.2 26 Jan 1996 IsraelH
  47. ;// Optimized code.
  48. ;// Adding runtime performance measurements
  49. ;//
  50. ;// MMX 1.1 23 Dec 1995 IsraelH
  51. ;// Using direct calculations with 10.6 precision.
  52. ;// Using 8x2 loop to use the same U,V contributions for both of the lines.
  53. ;//
  54. ;// MMX 1.0 16 Dec 1995 IsraelH
  55. ;// Port to MMX(TM) without using look up tables
  56. ;//
  57. ;-------------------------------------------------------------------------
  58. ;
  59. ; +---------- Color convertor.
  60. ; |+--------- For both H261 and H263.
  61. ; ||+-------- MMx Version.
  62. ; |||++------ Convert from YUV12.
  63. ; |||||++---- Convert to RGB24.
  64. ; |||||||+--- Zoom by one, i.e. non-zoom.
  65. ; ||||||||
  66. ; cxm12241 -- This function performs YUV12-to-RGB24 color conversion for H26x.
  67. ; It handles the format in which the low order byte is B, the
  68. ; second byte is G, and the high order byte is R.
  69. ;
  70. ; The YUV12 input is planar, 8 bits per pel. The Y plane may have
  71. ; a pitch of up to 768. It may have a width less than or equal
  72. ; to the pitch. It must be DWORD aligned, and preferably QWORD
  73. ; aligned. Pitch and Width must be a multiple of 8. The U
  74. ; and V planes may have a different pitch than the Y plane, subject
  75. ; to the same limitations.
  76. ;
  77. OPTION CASEMAP:NONE ; identifiers are case-sensitive
  78. OPTION PROLOGUE:None ; no assembler-generated prologue: the procedure pushes registers and builds its frame by hand
  79. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro ; RET inside a PROC expands through this macro; it is not defined in this file (presumably in memmodel.inc -- confirm)
  80. .586 ; enable the Pentium instruction set
  81. .xlist ; suppress listing output while the include files are read
  82. include iammx.inc
  83. include memmodel.inc
  84. .list ; resume listing output
  85. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE' ; declare the code segment: paragraph aligned, 32-bit, public
  86. MMXCODE1 ENDS
  87. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA' ; declare the data segment: paragraph aligned, 32-bit, public
  88. MMXDATA1 ENDS
  89. MMXDATA1 SEGMENT ; read-only QWORD constants used by MMX_YUV12ToRGB24
  90. ALIGN 8 ; every constant below is one QWORD (two DWORDs); keep them 8-byte aligned
  91. ;constants for direct RGB calculation: the multipliers are 4 words each in 10.6 fixed point (the code shifts products right by 6)
  92. ;PUBLIC Minusg, VtR, VtG, UtG, UtB, Ymul, Yadd, UVtG, lowrgb, lowrgbn, highp,
  93. ; highpn, highwn, mzero
  94. Minusg DWORD 00800080h, 00800080h ; 4 words of 128: subtracted from the U and V words to remove the chroma bias
  95. VtR DWORD 00660066h, 00660066h ; 4 words of 102 (102/64 = 1.594): V contribution to R
  96. VtG DWORD 00340034h, 00340034h ; 4 words of 52 (52/64 = 0.813): V contribution to G; not referenced by the code in this file (UVtG is used instead)
  97. UtG DWORD 00190019h, 00190019h ; 4 words of 25 (25/64 = 0.391): U contribution to G; not referenced by the code in this file (UVtG is used instead)
  98. UtB DWORD 00810081h, 00810081h ; 4 words of 129 (129/64 = 2.016): U contribution to B
  99. Ymul DWORD 004a004ah, 004a004ah ; 4 words of 74 (74/64 = 1.156): luma scale applied to (Y - 16)
  100. Yadd DWORD 10101010h, 10101010h ; 8 bytes of 16: subtracted from Y with unsigned saturation
  101. UVtG DWORD 00340019h, 00340019h ; word pairs (low = 25 for U, high = 52 for V): PMADDWD operand giving 25*U + 52*V per interleaved U,V pair
  102. lowrgb DWORD 00ffffffh, 00000000h ; mask: keep bytes 0-2 (first B,G,R triple)
  103. lowrgbn DWORD 0ff000000h, 0ffffffffh ; mask: keep bytes 3-7 (complement of lowrgb)
  104. highp DWORD 00000000h, 0ff000000h ; mask: keep byte 7 only
  105. highpn DWORD 0ffffffffh, 00ffffffh ; mask: keep bytes 0-6 (complement of highp)
  106. highwn DWORD 0ffffffffh, 0000ffffh ; mask: keep bytes 0-5 (clear the top word)
  107. mzero DWORD 00000000h, 00000000h ; all zero: unpack source used to zero-extend bytes to words
  108. MMXDATA1 ENDS
  109. MMXCODE1 SEGMENT ; holds MMX_YUV12ToRGB24: converts planar Y/V/U input to packed 3-byte B,G,R output, two Y lines (one chroma line) per outer pass, 8 pels per inner pass
  110. MMX_YUV12ToRGB24 PROC DIST LANG PUBLIC,
  111. AYPlane: DWORD,
  112. AVPlane: DWORD,
  113. AUPlane: DWORD,
  114. AFrameWidth: DWORD,
  115. AFrameHeight: DWORD,
  116. AYPitch: DWORD,
  117. AVPitch: DWORD,
  118. AAspectAdjustmentCnt: DWORD,
  119. AColorConvertedFrame: DWORD,
  120. ADCIOffset: DWORD,
  121. ACCOffsetToLine0: DWORD,
  122. ACCOPitch: DWORD,
  123. ACCType: DWORD
  124. LocalFrameSize = 128 ; bytes reserved below the saved registers for locals and MMX scratch
  125. RegisterStorageSize = 16 ; four registers (esi, edi, ebp, ebx) are pushed in the prologue
  126. argument_base EQU ebp + RegisterStorageSize ; ebp is set to esp right after the four pushes
  127. local_base EQU esp
  128. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  129. ; Arguments: (addressed through ebp; the extra +4 presumably skips a 4-byte return address -- confirm against DIST)
  130. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  131. YPlane EQU argument_base + 4
  132. VPlane EQU argument_base + 8
  133. UPlane EQU argument_base + 12
  134. FrameWidth EQU argument_base + 16
  135. FrameHeight EQU argument_base + 20
  136. YPitch EQU argument_base + 24
  137. ChromaPitch EQU argument_base + 28 ; AVPitch: this one pitch advances both the V and the U cursor
  138. AspectAdjustmentCount EQU argument_base + 32
  139. ColorConvertedFrame EQU argument_base + 36
  140. DCIOffset EQU argument_base + 40
  141. CCOffsetToLine0 EQU argument_base + 44
  142. CCOPitch EQU argument_base + 48
  143. CCType EQU argument_base + 52 ; never read by this procedure
  144. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  145. ; Locals (on local stack frame)
  146. ; (local_base is aligned at cache-line boundary in the prologue)
  147. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  148. localFrameWidth EQU local_base + 0 ; FrameWidth at first; overwritten with -FrameWidth/2 before the loops
  149. localYPitch EQU local_base + 4
  150. localChromaPitch EQU local_base + 8
  151. localAspectAdjustmentCount EQU local_base + 12
  152. localCCOPitch EQU local_base + 16
  153. CCOCursor EQU local_base + 20 ; ColorConvertedFrame + DCIOffset + CCOffsetToLine0
  154. CCOSkipDistance EQU local_base + 24 ; CCOPitch - 3*FrameWidth: from end of one output line to start of the next
  155. YLimit EQU local_base + 28 ; YPlane + FrameHeight*YPitch
  156. DistanceFromVToU EQU local_base + 32 ; UPlane - VPlane
  157. currAspectCount EQU local_base + 36 ; running counter, decremented by 2 on every two-line pass
  158. YCursorEven EQU local_base + 40 ; end-of-line address (line start + FrameWidth) of the current even Y line
  159. YCursorOdd EQU local_base + 44 ; same for the odd Y line below it
  160. tmpCCOPitch EQU local_base + 48 ; output offset of the odd line for this pass: CCOPitch, or 0 when a line is dropped
  161. StashESP EQU local_base + 52 ; value of esp before the frame was allocated and aligned
  162. ; space for two DWORD locals
  163. temp_mmx EQU local_base + 64 ; note it is 64 bytes, align at QWORD. Slots: +0 V words, +8 chroma G, +16/+40 B from U (low/high), +24/+32 R from V (low/high), +48 U words
  164. push esi
  165. push edi
  166. push ebp
  167. push ebx
  168. mov ebp, esp ; ebp = frame anchor for argument access and for restoring esp
  169. sub esp, LocalFrameSize
  170. and esp, -32 ; align at cache line boundary
  171. mov [StashESP], ebp ; remember the unaligned esp; the epilogue reloads it
  172. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  173. ; Save some parameters on local stack frame
  174. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  175. mov ebx, [FrameWidth]
  176. ;
  177. mov [localFrameWidth], ebx
  178. mov ebx, [YPitch]
  179. mov [localYPitch], ebx
  180. mov ebx, [ChromaPitch]
  181. mov [localChromaPitch], ebx
  182. mov ebx, [AspectAdjustmentCount]
  183. mov [localAspectAdjustmentCount], ebx
  184. mov ebx, [CCOPitch]
  185. mov [localCCOPitch], ebx
  186. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  187. ; Set-up rest of the local stack frame
  188. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  189. mov ebx, [VPlane]
  190. mov ecx, [UPlane]
  191. mov eax, [ColorConvertedFrame]
  192. sub ecx, ebx
  193. mov edx, [DCIOffset]
  194. mov [DistanceFromVToU], ecx ; UPlane - VPlane
  195. mov ecx, [CCOffsetToLine0]
  196. add eax, edx ; ColorConvertedFrame+DCIOffset
  197. mov edx, [FrameHeight]
  198. add eax, ecx ; ColorConvertedFrame+DCIOffset+CCOffsetToLine0
  199. mov ecx, [localYPitch]
  200. mov [CCOCursor],eax ; ColorConvertedFrame+DCIOffset+CCOffsetToLine0
  201. mov ebx, [localFrameWidth]
  202. mov eax, [CCOPitch]
  203. ;
  204. imul edx, ecx ; FrameHeight*YPitch
  205. ;
  206. sub eax, ebx ; CCOPitch-FrameWidth
  207. mov esi, [YPlane] ; Fetch cursor over luma plane.
  208. sub eax, ebx ; CCOPitch-2*FrameWidth
  209. add edx, esi ; YPlane+Size_of_Y_array
  210. sub eax, ebx ; CCOPitch-3*FrameWidth
  211. mov [YLimit], edx ; YPlane+Size_of_Y_array
  212. mov [CCOSkipDistance], eax ; CCOPitch-3*FrameWidth
  213. mov edx, [localAspectAdjustmentCount]
  214. mov esi, [VPlane] ; esi = V cursor (biased below)
  215. cmp edx,1
  216. je finish ; AspectAdjustmentCount == 1: return without converting anything
  217. mov [currAspectCount], edx
  218. mov eax, [localYPitch]
  219. mov edi, [CCOCursor] ; edi = output cursor
  220. mov edx, [DistanceFromVToU]
  221. mov ebp, [YPlane] ; last argument read: ebp stops addressing the arguments from here on
  222. mov ebx, [localFrameWidth]
  223. add ebp,ebx ; YPlane + FrameWidth
  224. ;
  225. mov [YCursorEven], ebp
  226. add ebp,eax ; + YPitch: same position on the next (odd) line
  227. mov [YCursorOdd], ebp
  228. ;
  229. sar ebx,1 ; FrameWidth/2 = chroma samples per line
  230. ;
  231. add esi,ebx ; esi = VPlane + FrameWidth/2
  232. ;
  233. add edx,esi ; edx = (UPlane - VPlane) + esi = UPlane + FrameWidth/2
  234. neg ebx
  235. mov [localFrameWidth], ebx ; -FrameWidth/2
  236. ;
  237. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  238. ;
  239. ; The following loops do two lines of Y (one line of UV).
  240. ; The inner loop (do_next_8x2_block) does 8 pels on the even line and
  241. ; the 8 pels immediately below them (sharing the same chroma) on the
  242. ; odd line.
  243. ;
  244. ; Core Register Usage:
  245. ; eax output pitch (for odd line writes)
  246. ; ebx cursor within the line, in chroma samples. Starts at -Width/2, steps by 4, runs up to 0
  247. ; ecx -- unused --
  248. ; edx U plane base address
  249. ; ebp Y plane base address
  250. ; esi V plane base address
  251. ; edi output RGB plane pointer
  252. ;
  253. ; The YUV plane base addresses are previously biased by +Width (Y) and +Width/2 (U, V) and are
  254. ; used in conjunction with the negative index in ebx (Y is addressed as [ebp+2*ebx]).
  255. ;
  256. ; CAUTION: Parameters should not be referenced beyond this point.
  257. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  258. PrepareChromaLine:
  259. mov ebp, [currAspectCount]
  260. mov ebx, [localFrameWidth] ; restart the line index at -FrameWidth/2
  261. sub ebp, 2 ; two lines are consumed per pass; the flags set here feed the JA below
  262. mov eax, [localCCOPitch]
  263. mov [tmpCCOPitch], eax ; default: odd line goes one output pitch below the even line
  264. ja continue ; counter still above zero (MOV leaves the SUB flags intact)
  265. xor eax, eax
  266. add ebp, [localAspectAdjustmentCount] ; counter reached/passed zero: reload it
  267. mov [tmpCCOPitch], eax ; pitch 0: the odd line is stored over the even line, so this pass yields one output line
  268. continue:
  269. mov [currAspectCount], ebp
  270. do_next_8x2_block:
  271. mov ebp, [YCursorEven]
  272. ;
  273. movdt mm1, [edx+ebx] ; mm1 = xxxxxxxx U76 U54 U32 U10
  274. pxor mm0, mm0 ; mm0 = 0
  275. movdt mm2, [esi+ebx] ; mm2 = xxxxxxxx V76 V54 V32 V10
  276. punpcklbw mm1, mm0 ; mm1 = .U76 .U54 .U32 .U10
  277. psubw mm1, Minusg ; unbias U (sub 128)
  278. punpcklbw mm2, mm0 ; mm2 = .V76 .V54 .V32 .V10
  279. psubw mm2, Minusg ; unbias V (sub 128)
  280. movq mm3, mm1 ; mm3 = .U76 .U54 .U32 .U10
  281. ; *** delay cycle for store ***
  282. movq [temp_mmx+48], mm1 ; stash .U76 .U54 .U32 .U10
  283. punpcklwd mm1, mm2 ; mm1 = .V32 .U32 .V10 .U10
  284. pmaddwd mm1, UVtG ; mm1 = .....G32 .....G10 (from chroma)
  285. punpckhwd mm3, mm2 ; mm3 = .V76 .U76 .V54 .U54
  286. pmaddwd mm3, UVtG ; mm3 = .....G76 .....G54 (from chroma)
  287. ;
  288. movq [temp_mmx], mm2 ; stash .V76 .V54 .V32 .V10
  289. ;
  290. movq mm6, [ebp+2*ebx] ; mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
  291. ;
  292. psubusb mm6, Yadd ; unbias Y (sub 16) & clip at 0
  293. packssdw mm1, mm3 ; mm1 = .G76 .G54 .G32 .G10 (from chroma)
  294. movq mm7, mm6 ; mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
  295. punpcklbw mm6, mm0 ; mm6 = ..Y3 ..Y2 ..Y1 ..Y0
  296. pmullw mm6, Ymul ; mm6 = ..G3 ..G2 ..G1 ..G0 (from luma)
  297. punpckhbw mm7, mm0 ; mm7 = ..Y7 ..Y6 ..Y5 ..Y4
  298. pmullw mm7, Ymul ; mm7 = ..G7 ..G6 ..G5 ..G4 (from luma)
  299. movq mm4, mm1 ; mm4 = .G76 .G54 .G32 .G10 (from chroma)
  300. movq [temp_mmx+8], mm1 ; stash .G76 .G54 .G32 .G10 (from chroma)
  301. punpcklwd mm1, mm1 ; mm1 = .G32 .G32 .G10 .G10 (from chroma)
  302. punpckhwd mm4, mm4 ; mm4 = .G76 .G76 .G54 .G54 (from chroma)
  303. movq mm0, mm6 ; mm0 = RGB3 RGB2 RGB1 RGB0 (from luma)
  304. movq mm3, mm7 ; mm3 = RGB7 RGB6 RGB5 RGB4 (from luma)
  305. psubw mm6, mm1 ; mm6 = ..G3 ..G2 ..G1 ..G0 (scaled total)
  306. movq mm1, [temp_mmx+48] ; mm1 = .U76 .U54 .U32 .U10
  307. psubw mm7, mm4 ; mm7 = ..G7 ..G6 ..G5 ..G4 (scaled total)
  308. psraw mm6, 6 ; mm6 = ..G3 ..G2 ..G1 ..G0 (total)
  309. movq mm2, mm1 ; mm2 = .U76 .U54 .U32 .U10
  310. punpcklwd mm1, mm1 ; mm1 = .U32 .U32 .U10 .U10
  311. ;
  312. pmullw mm1, UtB ; mm1 = .B32 .B32 .B10 .B10 (from U)
  313. punpckhwd mm2, mm2 ; mm2 = .U76 .U76 .U54 .U54
  314. pmullw mm2, UtB ; mm2 = .B76 .B76 .B54 .B54 (from U)
  315. psraw mm7, 6 ; mm7 = ..G7 ..G6 ..G5 ..G4 (total)
  316. packuswb mm6, mm7 ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
  317. ; ; -------- green done --------
  318. movq [temp_mmx+16], mm1 ; stash .B32 .B32 .B10 .B10 (from U)
  319. ;
  320. movq [temp_mmx+40], mm2 ; stash .B76 .B76 .B54 .B54 (from U)
  321. paddw mm1, mm0 ; mm1 = ..B3 ..B2 ..B1 ..B0 (scaled total)
  322. paddw mm2, mm3 ; mm2 = ..B7 ..B6 ..B5 ..B4 (scaled total)
  323. psraw mm1, 6 ; mm1 = ..B3 ..B2 ..B1 ..B0 (total)
  324. psraw mm2, 6 ; mm2 = ..B7 ..B6 ..B5 ..B4 (total)
  325. ;
  326. packuswb mm1, mm2 ; mm1: B7 B6 B5 B4 B3 B2 B1 B0
  327. ; ; -------- blue done --------
  328. movq mm2, [temp_mmx] ; mm2 = .V76 .V54 .V32 .V10
  329. ;
  330. movq mm7, mm2 ; mm7 = .V76 .V54 .V32 .V10
  331. punpcklwd mm2, mm2 ; mm2 = .V32 .V32 .V10 .V10
  332. pmullw mm2, VtR ; mm2 = .R32 .R32 .R10 .R10 (from V)
  333. punpckhwd mm7, mm7 ; mm7 = .V76 .V76 .V54 .V54
  334. pmullw mm7, VtR ; mm7 = .R76 .R76 .R54 .R54 (from V)
  335. ;
  336. ; *** delay for multiply ***
  337. movq [temp_mmx+24], mm2 ; stash .R32 .R32 .R10 .R10 (from V)
  338. paddw mm2, mm0 ; mm2 = ..R3 ..R2 ..R1 ..R0 (total scaled)
  339. psraw mm2, 6 ; mm2 = ..R3 ..R2 ..R1 ..R0 (total)
  340. ;
  341. movq [temp_mmx+32], mm7 ; stash .R76 .R76 .R54 .R54 (from V)
  342. paddw mm7, mm3 ; mm7 = ..R7 ..R6 ..R5 ..R4 (total scaled)
  343. psraw mm7, 6 ; mm7 = ..R7 ..R6 ..R5 ..R4 (total)
  344. movq mm5, mm1 ; mm5 = B7 B6 B5 B4 B3 B2 B1 B0
  345. packuswb mm2, mm7 ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
  346. ; ; -------- red done --------
  347. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  348. ; shuffle up the results:
  349. ; red = mm2
  350. ; green = mm6
  351. ; blue = mm1
  352. ; into red-green-blue order and store
  353. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  354. punpcklbw mm5, mm6 ; mm5: G3 B3 G2 B2 G1 B1 G0 B0
  355. movq mm4, mm2 ; mm4 = R7 R6 R5 R4 R3 R2 R1 R0
  356. punpcklbw mm4, mm4 ; mm4: R3 R3 R2 R2 R1 R1 R0 R0
  357. movq mm3, mm5 ; mm3 = G3 B3 G2 B2 G1 B1 G0 B0
  358. punpcklwd mm5, mm4 ; mm5: R1 R1 G1 B1 R0 R0 G0 B0
  359. ;
  360. movq mm0, mm5 ; mm0 = R1 R1 G1 B1 R0 R0 G0 B0
  361. ;
  362. pand mm5, lowrgb ; mm5: 0 0 0 0 0 R0 G0 B0
  363. ;
  364. pand mm0, lowrgbn ; mm0: R1 R1 G1 B1 R0 0 0 0
  365. ;
  366. psrlq mm0, 8 ; mm0: 0 R1 R1 G1 B1 R0 0 0
  367. ;
  368. por mm0, mm5 ; mm0: x x R1 G1 B1 R0 G0 B0
  369. ;
  370. pand mm0, highwn ; mm0: 0 0 R1 G1 B1 R0 G0 B0
  371. movq mm5, mm3 ; mm5 = G3 B3 G2 B2 G1 B1 G0 B0
  372. punpckhwd mm5, mm4 ; mm5: R3 R3 G3 B3 R2 R2 G2 B2
  373. ;
  374. movq mm4, mm5 ; mm4 = R3 R3 G3 B3 R2 R2 G2 B2
  375. ;
  376. psllq mm4, 48 ; mm4: G2 B2 0 0 0 0 0 0
  377. ;
  378. por mm0, mm4 ; mm0: G2 B2 R1 G1 B1 R0 G0 B0
  379. psrlq mm5, 24 ; mm5: 0 0 0 R3 R3 G3 B3 R2
  380. punpckhbw mm1, mm6 ; mm1: G7 B7 G6 B6 G5 B5 G4 B4
  381. ;
  382. punpckhbw mm2, mm2 ; mm2: R7 R7 R6 R6 R5 R5 R4 R4
  383. ;
  384. movq [edi], mm0 ; !! aligned
  385. movq mm7, mm1 ; mm7: G7 B7 G6 B6 G5 B5 G4 B4
  386. punpcklwd mm1, mm2 ; mm1: R5 R5 G5 B5 R4 R4 G4 B4
  387. ;
  388. movq mm6, mm1 ; mm6: R5 R5 G5 B5 R4 R4 G4 B4
  389. punpckldq mm5, mm1 ; mm5: R4 R4 G4 B4 R3 G3 B3 R2
  390. pand mm5, highpn ; mm5: 0 R4 G4 B4 R3 G3 B3 R2
  391. psllq mm6, 24 ; mm6: B5 R4 R4 G4 B4 0 0 0
  392. pand mm6, highp ; mm6: B5 0 0 0 0 0 0 0
  393. psrlq mm1, 40 ; mm1: 0 0 0 0 0 R5 R5 G5
  394. mov ebp, [YCursorOdd] ; moved to here to save cycles before odd line
  395. por mm5, mm6 ; mm5: B5 R4 G4 B4 R3 G3 B3 R2
  396. punpckhwd mm7, mm2 ; mm7: R7 R7 G7 B7 R6 R6 G6 B6
  397. ;
  398. punpcklwd mm1, mm7 ; mm1: x x x x G6 B6 R5 G5
  399. ;
  400. movq [edi+8], mm5 ; !! aligned
  401. ;
  402. movdf [edi+16], mm1 ; !!!! aligned
  403. ;
  404. ;
  405. ; start odd line
  406. ;
  407. movq mm1, [ebp+2*ebx] ; mm1 has 8 y pixels
  408. psrlq mm7, 24 ; mm7: 0 0 0 R7 R7 G7 B7 R6 -- still even-line data; placed here to save cycles
  409. movdf [edi+20], mm7 ; !!!! aligned -- last 4 bytes (R7 G7 B7 R6) of the even line's 24
  410. ;
  411. psubusb mm1, Yadd ; mm1 has 8 pixels y-16
  412. ;
  413. movq mm5, mm1
  414. ;
  415. punpcklbw mm1, mzero ; get 4 low y-16 unsign pixels word
  416. ;
  417. punpckhbw mm5, mzero ; 4 high y-16
  418. ;
  419. pmullw mm1, Ymul ; low 4 luminance contribution
  420. ;
  421. pmullw mm5, Ymul ; high 4 luminance contribution
  422. movq mm0, mm1
  423. paddw mm0, [temp_mmx+24] ; low 4 R
  424. movq mm6, mm5
  425. paddw mm5, [temp_mmx+32] ; high 4 R
  426. psraw mm0, 6
  427. psraw mm5, 6
  428. ;
  429. movq mm2, mm1
  430. packuswb mm0, mm5 ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
  431. ; -------- red done --------
  432. paddw mm2, [temp_mmx+16] ; low 4 B
  433. movq mm5, mm6
  434. paddw mm5, [temp_mmx+40] ; high 4 B
  435. psraw mm2, 6
  436. psraw mm5, 6
  437. ;
  438. packuswb mm2, mm5 ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
  439. ; ; -------- blue done --------
  440. movq mm3, [temp_mmx+8] ; chroma G for all 8 pels: .G76 .G54 .G32 .G10
  441. ;
  442. movq mm4, mm3
  443. punpcklwd mm3, mm3 ; replicate low 2
  444. punpckhwd mm4, mm4 ; replicate high 2
  445. psubw mm1, mm3 ; 4 low G
  446. psubw mm6, mm4 ; 4 high G values in signed 16 bit
  447. psraw mm1, 6 ; low G
  448. psraw mm6, 6 ; high G
  449. ;
  450. packuswb mm1, mm6 ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
  451. ; ; -------- green done --------
  452. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  453. ; shuffle up the results:
  454. ; red = mm0
  455. ; green = mm1
  456. ; blue = mm2
  457. ; into red-green-blue order and store
  458. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  459. movq mm3, mm2 ; B
  460. ;
  461. punpcklbw mm3, mm1 ; mm3: G3 B3 G2 B2 G1 B1 G0 B0
  462. movq mm4, mm0 ; R
  463. punpcklbw mm4, mm4 ; mm4: R3 R3 R2 R2 R1 R1 R0 R0
  464. movq mm5, mm3 ; BG
  465. mov eax, [tmpCCOPitch] ; eax = offset of the odd output line from edi (0 when the line is dropped)
  466. punpcklwd mm3, mm4 ; mm3: R1 R1 G1 B1 R0 R0 G0 B0
  467. movq mm6, mm3 ; save mm3
  468. ;
  469. pand mm6, lowrgb ; mm6: 0 0 0 0 0 R0 G0 B0
  470. ;
  471. pand mm3, lowrgbn ; mm3: R1 R1 G1 B1 R0 0 0 0
  472. ;
  473. psrlq mm3, 8 ; mm3: 0 R1 R1 G1 B1 R0 0 0
  474. ;
  475. por mm3, mm6 ; mm3: x x R1 G1 B1 R0 G0 B0
  476. ;
  477. pand mm3, highwn ; mm3: 0 0 R1 G1 B1 R0 G0 B0
  478. movq mm6, mm5 ; BG
  479. punpckhwd mm6, mm4 ; mm6: R3 R3 G3 B3 R2 R2 G2 B2
  480. ;
  481. movq mm4, mm6
  482. ;
  483. psllq mm4, 48 ; mm4: G2 B2 0 0 0 0 0 0
  484. ;
  485. por mm3, mm4 ; mm3: G2 B2 R1 G1 B1 R0 G0 B0
  486. ;
  487. movq [edi+eax], mm3
  488. psrlq mm6, 24 ; mm6: 0 0 0 R3 R3 G3 B3 R2
  489. punpckhbw mm2, mm1 ; mm2: G7 B7 G6 B6 G5 B5 G4 B4
  490. punpckhbw mm0, mm0 ; mm0: R7 R7 R6 R6 R5 R5 R4 R4
  491. movq mm7, mm2 ; mm7: G7 B7 G6 B6 G5 B5 G4 B4
  492. punpcklwd mm7, mm0 ; mm7: x R5 G5 B5 x R4 G4 B4
  493. ;
  494. punpckldq mm6, mm7 ; mm6: R4 R4 G4 B4 R3 G3 B3 R2
  495. movq mm4, mm7
  496. psllq mm4, 24 ; mm4: B5 R4 R4 G4 B4 0 0 0
  497. ;
  498. pand mm6, highpn ; mm6: 0 R4 G4 B4 R3 G3 B3 R2
  499. psrlq mm7, 40 ; mm7: 0 0 0 0 0 R5 R5 G5
  500. pand mm4, highp ; mm4: B5 0 0 0 0 0 0 0
  501. punpckhwd mm2, mm0 ; mm2: R7 R7 G7 B7 R6 R6 G6 B6
  502. por mm6, mm4 ; mm6: B5 R4 G4 B4 R3 G3 B3 R2
  503. punpcklwd mm7, mm2 ; mm7 x x x x G6 B6 R5 G5
  504. psrlq mm2, 24 ; mm2: 0 0 0 R7 R7 G7 B7 R6
  505. ;
  506. punpckldq mm7, mm2 ; mm7: R7 G7 B7 R6 G6 B6 R5 G5
  507. ;
  508. movq [edi+eax+8], mm6 ; aligned
  509. ;
  510. movq [edi+eax+16], mm7
  511. add edi, 24 ; 8 pels * 3 bytes were written on each line
  512. add ebx, 4 ; 4 chroma samples (= 8 Y pels, addressed as 2*ebx) consumed
  513. jl do_next_8x2_block ; index is negative until the end of the line is reached
  514. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  515. ; Update:
  516. ; edi: output RGB plane pointer for odd and even line
  517. ; ebp: Y Plane address
  518. ; esi: V Plane address
  519. ; edx: U Plane address
  520. ; YCursorEven: Even Y line address
  521. ; YCursorOdd: Odd Y line address
  522. ; Note: eax, ebx, ecx can be used as scratch registers
  523. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  524. mov ecx, [CCOSkipDistance]
  525. mov eax, [localYPitch]
  526. add edi, ecx ; go to begin of next even line
  527. mov ecx, [tmpCCOPitch]
  528. add edi, ecx ; skip odd line
  529. mov ecx, [localChromaPitch]
  530. add esi, ecx ; next V line
  531. add ebp, eax ; ebp held YCursorOdd: + YPitch gives the next even line
  532. mov [YCursorEven], ebp ; save even line address
  533. mov ecx, [localChromaPitch]
  534. add edx, ecx ; next U line (same pitch as V)
  535. add ebp, eax ; odd line address
  536. mov [YCursorOdd], ebp ; save odd line address
  537. mov eax, [YLimit] ; Done with last line?
  538. cmp ebp, eax
  539. jbe PrepareChromaLine ; unsigned compare: continue while the next odd-line cursor is not past YLimit
  540. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  541. ; end do 2 lines loop
  542. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  543. finish:
  544. mov esp, [StashESP] ; drop the aligned local frame: esp = value saved right after the pushes
  545. ;
  546. pop ebx
  547. pop ebp
  548. pop edi
  549. pop esi
  550. ret ; NOTE(review): no EMMS is executed anywhere in this procedure -- presumably the caller issues it; confirm
  551. MMX_YUV12ToRGB24 ENDP
  552. MMXCODE1 ENDS
  553. END