Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

704 lines
26 KiB

  1. ;-------------------------------------------------------------------------
  2. ; INTEL Corporation Proprietary Information
  3. ;
  4. ; This listing is supplied under the terms of a license
  5. ; agreement with INTEL Corporation and may not be copied
  6. ; nor disclosed except in accordance with the terms of
  7. ; that agreement.
  8. ;
  9. ; Copyright (c) 1996 Intel Corporation.
  10. ; All Rights Reserved.
  11. ;
  12. ;-------------------------------------------------------------------------
  13. ;-------------------------------------------------------------------------
  14. ;//
  15. ;// $Header: S:\h26x\src\dec\cx512161.asv
  16. ;//
  17. ;// $Log: S:\h26x\src\dec\cxm12161.asv $
  18. ;//
  19. ;// Rev 1.9 24 May 1996 11:12:10 AGUPTA2
  20. ;//
  21. ;// Modified version of final drop from IDC. Fixed alignment, global var,
  22. ;// referencing beyond stack pointer problems. Cosmetic changes to adhere
  23. ;// to a common coding convention in all MMX color convertor files.
  24. ;//
  25. ;// Rev 1.8 17 Apr 1996 09:51:08 ISRAELH
  26. ;// Added AspectRatio adjustment, emms.
  27. ;//
  28. ;// Rev 1.7 11 Apr 1996 09:51:08 RMCKENZX
  29. ;// Changed return to pop the stack.
  30. ;//
  31. ;// Rev 1.6 09 Apr 1996 10:00:44 RMCKENZX
  32. ;//
  33. ;// Changed calling sequence to __stdcall.
  34. ;//
  35. ;// Rev 1.5 05 Apr 1996 10:40:20 RMCKENZX
  36. ;// Hacked in Aspect Ratio correction. This is accomplished
  37. ;// by simply overwriting the next even line after the aspect
  38. ;// count has been matched or exceeded.
  39. ;//
  40. ;// Rev 1.4 29 Mar 1996 07:52:56 RMCKENZX
  41. ;// re-fixed bug in 655 setup.
  42. ;//
  43. ;// Rev 1.3 28 Mar 1996 14:35:38 RMCKENZX
  44. ;// Cleaned up code, added comments, revised calling sequence,
  45. ;// moved global variables onto stack.
  46. ;//
  47. ;// Rev 1.2 21 Mar 1996 08:10:06 RMCKENZX
  48. ;// Fixed 655 case -- initialized GLeftShift at 5.
  49. ;//
  50. ;// Rev 1.1 20 Mar 1996 11:18:52 RMCKENZX
  51. ;// March 96 version.
  52. ;
  53. ; Rev 1.3 19 Feb 1996 11:49:42 israelh
  54. ; bug fix.
  55. ; new algorithm for RGB16 bit pack.
  56. ;
  57. ; Rev 1.3 18 Feb 1996 20:58:44 israelh
  58. ; better algorithm and bug fix
  59. ;
  60. ; Rev 1.2 29 Jan 1996 19:53:50 mikeh
  61. ;
  62. ; added Ifdef timing
  63. ;
  64. ; Rev 1.1 29 Jan 1996 16:29:16 mikeh
  65. ; removed $LOG stuff
  66. ;
  67. ; Rev 1.0 29 Jan 1996 11:49:44 israelh
  68. ; Initial revision.
  69. ;//
  70. ;// MMX 1.3 14 Jan 1996 IsraelH
  71. ;// Implementing runtime RGB bit allocation according to BValLo[0]:
  72. ;// It contains the ColorConvertor value from d1color.cpp module.
  73. ;// Compiler flag RTIME16 for using runtime allocation.
  74. ;//
  75. ;// MMX 1.2 10 Jan 1996 IsraelH
  76. ;// Implementing RGB16x565 (5-R 6-G 5-B) as default
  77. ;// Compiler flag MODE555 for RGB16555 (5-R 5-G 5-B)
  78. ;//
  79. ;// MMX 1.1 09 Jan 1996 IsraelH
  80. ;// Implementing RGB16x555 (5-R 5-G 5-B)
  81. ;// Commenting out RGB16664 (6-R 6-G 4-B)
  82. ;// Adding performance measurements in runtime
  83. ;//
  84. ;// MMX 1.0 25 Dec 1995 IsraelH
  85. ;// Port to MMX(TM) without using tables
  86. ;
  87. ;-------------------------------------------------------------------------
  88. ;
  89. ; +---------- Color convertor.
  90. ; |+--------- For both H261 and H263.
  91. ; ||+-------- MMx Version.
  92. ; |||++------ Convert from YUV12.
  93. ; |||||++---- Convert to RGB16.
  94. ; |||||||+--- Zoom by one, i.e. non-zoom.
  95. ; ||||||||
  96. ; cxm12161 -- This function performs YUV12-to-RGB16 color conversion for H26x.
  97. ; It handles any format in which there are three fields, the low
  98. ; order field being B and fully contained in the low order byte, the
  99. ; second field being G and being somewhere in bits 4 through 11,
  100. ; and the high order field being R and fully contained in the high
  101. ; order byte.
  102. ;
  103. ; The YUV12 input is planar, 8 bits per pel. The Y plane may have
  104. ; a pitch of up to 768. It may have a width less than or equal
  105. ; to the pitch. It must be DWORD aligned, and preferably QWORD
  106. ; aligned. Pitch and Width must be a multiple of four. For best
  107. ; performance, Pitch should not be 4 more than a multiple of 32.
  108. ; Height may be any amount, but must be a multiple of two. The U
  109. ; and V planes may have a different pitch than the Y plane, subject
  110. ; to the same limitations.
  111. ;
  ; Assembler set-up: case-sensitive symbols, no automatically generated
  ; prologue (the procedure below builds its own frame), and a named macro
  ; that the assembler expands at "ret".
  ; NOTE(review): ReturnAndRelieveEpilogueMacro is not defined in this file;
  ; presumably it comes from memmodel.inc -- confirm.
  112. OPTION CASEMAP:NONE
  113. OPTION PROLOGUE:None
  114. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
  115. .586
  ; Includes are kept out of the listing (.xlist/.list).
  ; NOTE(review): ".586" alone does not enable MMX mnemonics, so movq, movdt,
  ; pmaddwd etc. are presumably supplied by iammx.inc -- confirm.
  116. .xlist
  117. include iammx.inc
  118. include memmodel.inc
  119. .list
  ; Declare the code and data segments once with their full attributes; they
  ; are re-opened by name below.
  120. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
  121. MMXCODE1 ENDS
  122. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
  123. MMXDATA1 ENDS
  ; Read-only 64-bit constants used as MMX memory operands. Each one is two
  ; DWORDs, i.e. four words or eight bytes, and the table is 8-byte aligned.
  ; The hex values in the trailing comments are roughly four times the active
  ; ones -- presumably an earlier, higher-precision scaling (TODO confirm).
  124. MMXDATA1 SEGMENT
  125. ALIGN 8
  ; 0080h in every word: subtracted (psubw) from the zero-extended U and V.
  126. Minusg DWORD 00800080h, 00800080h
  ; 10h in every byte: subtracted from Y with unsigned saturation (psubusb).
  127. Yadd DWORD 10101010h, 10101010h
  ; 0066h in every word: multiplies the replicated (V-128) words (red term).
  128. VtR DWORD 00660066h, 00660066h ;01990199h,01990199h
  ; VtG and UtG are not referenced by the procedure in this file; UVtG below
  ; carries the same two factors in interleaved form.
  129. VtG DWORD 00340034h, 00340034h ;00d000d0h,00d000d0h
  130. UtG DWORD 00190019h, 00190019h ;00640064h,00640064h
  ; 0081h in every word: multiplies the replicated (U-128) words (blue term).
  131. UtB DWORD 00810081h, 00810081h ;02050205h,02050205h
  ; 004Ah in every word: multiplies the (Y-16) words.
  132. Ymul DWORD 004a004ah, 004a004ah ;012a012ah,012a012ah
  ; Word pairs (low 0019h, high 0034h): pmaddwd operand applied to interleaved
  ; (u, v) words, giving 0019h*u + 0034h*v per pixel pair (green term).
  133. UVtG DWORD 00340019h, 00340019h ;00d00064h,00d00064h
  ; Not referenced by the procedure in this file.
  134. VtRUtB DWORD 01990205h, 01990205h
  ; Saturation limits. The loop does paddusb with one of these followed by
  ; psubusb with the same value, which clamps each byte to 0FFh minus the
  ; limit: 0Fh (four bits), 1Fh (five bits) or 3Fh (six bits).
  135. fourbitu DWORD 0f0f0f0f0h, 0f0f0f0f0h
  136. fivebitu DWORD 0e0e0e0e0h, 0e0e0e0e0h
  137. sixbitu DWORD 0c0c0c0c0h, 0c0c0c0c0h
  138. MMXDATA1 ENDS
  139. MMXCODE1 SEGMENT
  ;-------------------------------------------------------------------------
  ; MMX_YUV12ToRGB16
  ;
  ; Entry point, frame layout and prologue of the YUV12 -> RGB16 convertor.
  ;
  ; Arguments (thirteen DWORDs, read through ebp once the prologue has run):
  ;   AYPlane               address of the Y plane
  ;   AVPlane, AUPlane      addresses of the V and U planes
  ;   AFrameWidth           width; used as a byte count within a Y line
  ;   AFrameHeight          multiplied by AYPitch to find the end of the Y data
  ;   AYPitch               distance between consecutive Y lines
  ;   AVPitch               distance between chroma lines (the same value is
  ;                         applied to both the U and the V cursor)
  ;   AAspectAdjustmentCnt  aspect-ratio counter reload value; a value of 1
  ;                         makes the procedure return without converting
  ;   AColorConvertedFrame, ADCIOffset, ACCOffsetToLine0
  ;                         summed to form the first output address
  ;   ACCOPitch             distance between consecutive output lines
  ;   ACCType               low byte selects the RGB16 bit layout
  ;
  ; esi, edi, ebp and ebx are pushed here and popped at "finish".
  ; NOTE(review): DIST and LANG are not defined in this file; presumably they
  ; are text equates from memmodel.inc -- confirm.
  ;-------------------------------------------------------------------------
  140. MMX_YUV12ToRGB16 PROC DIST LANG PUBLIC,
  141. AYPlane: DWORD,
  142. AVPlane: DWORD,
  143. AUPlane: DWORD,
  144. AFrameWidth: DWORD,
  145. AFrameHeight: DWORD,
  146. AYPitch: DWORD,
  147. AVPitch: DWORD,
  148. AAspectAdjustmentCnt: DWORD,
  149. AColorConvertedFrame: DWORD,
  150. ADCIOffset: DWORD,
  151. ACCOffsetToLine0: DWORD,
  152. ACCOPitch: DWORD,
  153. ACCType: DWORD
  ; LocalFrameSize is the number of bytes taken from the stack for locals.
  ; RegisterStorageSize is the size of the four registers pushed by the
  ; prologue, so [ebp + RegisterStorageSize] is the return address and the
  ; first argument sits 4 bytes above it.
  154. LocalFrameSize = 256
  155. RegisterStorageSize = 16
  156. argument_base EQU ebp + RegisterStorageSize
  157. local_base EQU esp
  158. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  159. ; Arguments (ebp-relative; unusable once ebp is reloaded as a Y cursor)
  160. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  161. YPlane EQU argument_base + 4
  162. VPlane EQU argument_base + 8
  163. UPlane EQU argument_base + 12
  164. FrameWidth EQU argument_base + 16
  165. FrameHeight EQU argument_base + 20
  166. YPitch EQU argument_base + 24
  167. ChromaPitch EQU argument_base + 28
  168. AspectAdjustmentCount EQU argument_base + 32
  169. ColorConvertedFrame EQU argument_base + 36
  170. DCIOffset EQU argument_base + 40
  171. CCOffsetToLine0 EQU argument_base + 44
  172. CCOPitch EQU argument_base + 48
  173. CCType EQU argument_base + 52
  174. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  175. ; Locals (esp-relative, on the local stack frame)
  176. ; (the prologue aligns local_base down to a 32-byte boundary)
  177. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; localFrameWidth first holds FrameWidth and is later overwritten with
  ; -FrameWidth/2, the starting value of the inner-loop index.
  178. localFrameWidth EQU local_base + 0
  179. localYPitch EQU local_base + 4
  180. localChromaPitch EQU local_base + 8
  181. localAspectAdjustmentCount EQU local_base + 12
  182. localCCOPitch EQU local_base + 16
  183. CCOCursor EQU local_base + 20
  184. CCOSkipDistance EQU local_base + 24
  185. YLimit EQU local_base + 28
  186. DistanceFromVToU EQU local_base + 32
  187. currAspectCount EQU local_base + 36
  188. YCursorEven EQU local_base + 40
  189. YCursorOdd EQU local_base + 44
  ; tmpCCOPitch is the output offset of the odd line relative to the even
  ; line for the current line pair: localCCOPitch, or 0 on an aspect drop.
  190. tmpCCOPitch EQU local_base + 48
  ; StashESP keeps the stack pointer as it was right after the four pushes.
  191. StashESP EQU local_base + 52
  192. ; space for two DWORD locals (offsets 56 and 60 are unused)
  ; temp_mmx holds the per-block chroma terms shared by the even and odd
  ; line: +0 v-128, +8 green term, +16/+40 low/high blue term,
  ; +24/+32 low/high red term (48 of the 64 bytes are used).
  193. temp_mmx EQU local_base + 64 ; note it is 64 bytes
  ; 64-bit shift counts and saturation limits, filled in per RGB16 layout.
  194. RLeftShift EQU local_base +128
  195. GLeftShift EQU local_base +136
  196. RRightShift EQU local_base +144
  197. GRightShift EQU local_base +152
  198. BRightShift EQU local_base +160
  199. RUpperLimit EQU local_base +168
  200. GUpperLimit EQU local_base +176
  201. BUpperLimit EQU local_base +184
  202. ; Switches used by RGB color convertors to determine the exact conversion type.
  ; These are compared against the low byte of CCType.
  203. RGB16555 = 9
  204. RGB16664 = 14
  205. RGB16565 = 18
  206. RGB16655 = 22
  ; Prologue: save the four registers this procedure modifies, remember the
  ; resulting stack pointer in ebp, reserve LocalFrameSize bytes and round
  ; esp down to a multiple of 32. The pre-alignment stack pointer is kept in
  ; StashESP so "finish" can undo the alignment.
  207. push esi
  208. push edi
  209. push ebp
  210. push ebx
  211. mov ebp, esp
  212. sub esp, LocalFrameSize
  213. and esp, -32
  214. mov [StashESP], ebp
  215. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  216. ; Copy the parameters that are still needed after ebp is reused
  217. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  218. mov ebx, [FrameWidth]
  219. ;
  220. mov [localFrameWidth], ebx
  221. mov ebx, [YPitch]
  222. mov [localYPitch], ebx
  223. mov ebx, [ChromaPitch]
  224. mov [localChromaPitch], ebx
  225. mov ebx, [AspectAdjustmentCount]
  226. mov [localAspectAdjustmentCount], ebx
  227. mov ebx, [CCOPitch]
  228. mov [localCCOPitch], ebx
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; Select the RGB16 bit layout from the low byte of CCType and fill in the
  ; eight 64-bit layout locals used by the conversion loop:
  ;   RLeftShift              bit shift that moves R up inside the high byte
  ;   GLeftShift              bit shift that moves G into the 16-bit pixel
  ;   R/G/BRightShift         arithmetic right shift applied to each channel
  ;   R/G/BUpperLimit         saturation limit; a value is clamped to
  ;                           0FFh minus the limit byte
  ; Any CCType other than RGB16664, RGB16565 or RGB16655 (RGB16555 included)
  ; selects the 5-5-5 layout.
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; SetRGB16Layout rls, gls, rrs, grs, brs, rlim, glim, blim
  ;   Stores the five shift counts as zero-extended 64-bit values and copies
  ;   the three 64-bit limit constants. Uses eax and mm0 as scratch.
  SetRGB16Layout MACRO rls, gls, rrs, grs, brs, rlim, glim, blim
      xor   eax, eax                        ; high halves are all zero
      mov   [RLeftShift+4], eax
      mov   [GLeftShift+4], eax
      mov   [RRightShift+4], eax
      mov   [GRightShift+4], eax
      mov   [BRightShift+4], eax
      mov   DWORD PTR [RLeftShift], rls
      mov   DWORD PTR [GLeftShift], gls
      mov   DWORD PTR [RRightShift], rrs
      mov   DWORD PTR [GRightShift], grs
      mov   DWORD PTR [BRightShift], brs
      movq  mm0, rlim
      movq  [RUpperLimit], mm0
      movq  mm0, glim
      movq  [GUpperLimit], mm0
      movq  mm0, blim
      movq  [BUpperLimit], mm0
  ENDM

      mov   al, [CCType]
      cmp   al, RGB16664
      je    RGB664
      cmp   al, RGB16565
      je    RGB565
      cmp   al, RGB16655
      je    RGB655
      ; RGB16555 and every unrecognised value continue into RGB555

  RGB555:                                   ; 5 bits R, 5 bits G, 5 bits B
      SetRGB16Layout 2, 5, 9, 9, 9, fivebitu, fivebitu, fivebitu
      jmp   RGBEND

  RGB664:                                   ; 6 bits R, 6 bits G, 4 bits B
      SetRGB16Layout 2, 4, 8, 8, 10, sixbitu, sixbitu, fourbitu
      jmp   RGBEND

  RGB565:                                   ; 5 bits R, 6 bits G, 5 bits B
      SetRGB16Layout 3, 5, 9, 8, 9, fivebitu, sixbitu, fivebitu
      jmp   RGBEND

  RGB655:                                   ; 6 bits R, 5 bits G, 5 bits B
      SetRGB16Layout 2, 5, 8, 9, 9, sixbitu, fivebitu, fivebitu
      jmp   RGBEND
  RGBEND:
      ; DistanceFromVToU = UPlane - VPlane, so the U cursor can be derived
      ; from the V cursor.
      mov   ecx, [UPlane]
      sub   ecx, [VPlane]
      mov   [DistanceFromVToU], ecx

      ; CCOCursor = ColorConvertedFrame + DCIOffset + CCOffsetToLine0
      mov   eax, [ColorConvertedFrame]
      add   eax, [DCIOffset]
      add   eax, [CCOffsetToLine0]
      mov   [CCOCursor], eax

      ; YLimit = YPlane + FrameHeight*YPitch (first byte past the Y data)
      mov   edx, [FrameHeight]
      imul  edx, [YPitch]
      add   edx, [YPlane]
      mov   [YLimit], edx

      ; CCOSkipDistance = CCOPitch - 2*FrameWidth: what is left of an output
      ; line after FrameWidth 16-bit pixels have been written.
      mov   ebx, [FrameWidth]
      mov   eax, [CCOPitch]
      sub   eax, ebx
      sub   eax, ebx
      mov   [CCOSkipDistance], eax

      ; An aspect adjustment count of 1 ends the procedure here.
      mov   edx, [AspectAdjustmentCount]
      cmp   edx, 1
      je    finish
      mov   [currAspectCount], edx
      mov   [localAspectAdjustmentCount], edx

      ; Load the plane cursors. ebp is overwritten last.
      mov   esi, [VPlane]
      mov   edi, [CCOCursor]
      mov   edx, [DistanceFromVToU]
      mov   ebp, [YPlane]
      ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      ; ebp no longer addresses the arguments: only the esp-relative
      ; locals may be used from here on.
      ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      mov   ebx, [localFrameWidth]
      mov   eax, [localYPitch]

      ; The Y cursors point at the END of the first even and odd line; the
      ; loop reaches the pixels through a negative index.
      add   ebp, ebx
      mov   [YCursorEven], ebp              ; YPlane + FrameWidth
      add   ebp, eax
      mov   [YCursorOdd], ebp               ; YPlane + FrameWidth + YPitch

      ; The chroma cursors likewise point FrameWidth/2 past the line start.
      sar   ebx, 1                          ; FrameWidth/2
      add   esi, ebx                        ; VPlane + FrameWidth/2
      add   edx, esi                        ; UPlane + FrameWidth/2

      ; localFrameWidth now holds the loop's starting index.
      neg   ebx
      mov   [localFrameWidth], ebx          ; -FrameWidth/2
  ;------------------------------------------------------------------------------
  ; Start of a pair of lines (one even, one odd).
  ;
  ; Register usage in the loop that follows:
  ;   esi  V cursor, FrameWidth/2 past the start of the chroma line
  ;   edx  U cursor, FrameWidth/2 past the start of the chroma line
  ;   edi  output cursor (even line)
  ;   ebx  negative index, starts at -FrameWidth/2 and counts up
  ;   ebp  Y cursor, reloaded from YCursorEven / YCursorOdd
  ;
  ; The aspect counter is reduced by 2 for every pair. While it stays above
  ; zero (unsigned), the odd line goes one output pitch below the even line.
  ; Otherwise the counter is topped up by localAspectAdjustmentCount and the
  ; odd-line offset becomes 0, so the odd line is written on top of the even
  ; line.
  ;------------------------------------------------------------------------------
  PrepareChromaLine:
      mov   ebx, [localFrameWidth]          ; loop index = -FrameWidth/2
      mov   eax, [localCCOPitch]            ; default odd-line offset
      mov   ebp, [currAspectCount]
      sub   ebp, 2
      ja    continue                        ; counter still above zero
      add   ebp, [localAspectAdjustmentCount]
      xor   eax, eax                        ; odd line overwrites even line
  continue:
      mov   [tmpCCOPitch], eax
      mov   [currAspectCount], ebp
  ;------------------------------------------------------------------------------
  ; Inner loop: converts a block of 8 pixels on the even line and the 8 pixels
  ; below it on the odd line, sharing one set of 4 U and 4 V samples.
  ;
  ; Per block:
  ;   1. Build the chroma terms (green, blue, red) from U and V and park them
  ;      in temp_mmx for the odd line.
  ;   2. Even line: Y-16 times Ymul, add or subtract the chroma terms, shift
  ;      right by the per-channel count, pack to bytes, clamp to the
  ;      per-channel limit, merge into eight 16-bit pixels at [edi].
  ;   3. Odd line: the same using the parked chroma terms, stored at
  ;      [edi+tmpCCOPitch].
  ; ebx is the negative index: it addresses chroma as [cursor+ebx] and luma as
  ; [cursor+2*ebx], advances by 4 per block, and the loop ends once it is no
  ; longer negative.
  ; The instruction order is hand-interleaved; lines holding only ";" are the
  ; original author's spacing.
  ;------------------------------------------------------------------------------
  409. do_next_8x2_block:
  410. mov ebp, [YCursorEven]
  411. ; ---- even line: chroma terms ----
  412. movdt mm1, [edx+ebx] ; 4 u bytes
  413. pxor mm0, mm0 ; mm0 = 0
  414. movdt mm2, [esi+ebx] ; 4 v bytes
  415. punpcklbw mm1, mm0 ; zero-extend the 4 u bytes to words
  416. psubw mm1, Minusg ; 4 words of u-128
  417. punpcklbw mm2, mm0 ; zero-extend the 4 v bytes to words
  418. psubw mm2, Minusg ; 4 words of v-128
  419. movq mm3, mm1 ; copy of u-128 for the high uv pairs
  420. movq mm5, mm1 ; copy of u-128 for the blue term
  421. punpcklwd mm1, mm2 ; u0 v0 u1 v1
  422. pmaddwd mm1, UVtG ; two dwords: 0019h*u + 0034h*v
  423. punpckhwd mm3, mm2 ; u2 v2 u3 v3
  424. pmaddwd mm3, UVtG ; two dwords: 0019h*u + 0034h*v
  425. ;
  426. movq [temp_mmx], mm2 ; park v-128
  427. ;
  428. movq mm6, [ebp+2*ebx] ; mm6 = 8 y bytes of the even line
  429. ;
  430. psubusb mm6, Yadd ; y-16, saturating at 0
  431. packssdw mm1, mm3 ; 4 signed words: green chroma term
  432. movq mm7, mm6 ; copy of the 8 y-16 bytes
  433. punpcklbw mm6, mm0 ; low 4 y-16 as words
  434. pmullw mm6, Ymul ; low 4 luma terms
  435. punpckhbw mm7, mm0 ; high 4 y-16 as words
  436. pmullw mm7, Ymul ; high 4 luma terms
  437. movq mm4, mm1
  438. movq [temp_mmx+8], mm1 ; park the 4 green chroma terms
  439. punpcklwd mm1, mm1 ; green term replicated: g0 g0 g1 g1
  440. movq mm0, mm6 ; mm0 = low luma (kept for B and R)
  441. punpckhwd mm4, mm4 ; green term replicated: g2 g2 g3 g3
  442. movq mm3, mm7 ; mm3 = high luma (kept for B and R)
  443. psubw mm6, mm1 ; 4 low G
  444. psraw mm6, [GRightShift] ; low G scaled to the layout's G width
  445. psubw mm7, mm4 ; 4 high G as signed words
  446. movq mm2, mm5
  447. punpcklwd mm5, mm5 ; u0 u0 u1 u1
  448. pmullw mm5, UtB ; low blue chroma term
  449. punpckhwd mm2, mm2 ; u2 u2 u3 u3
  450. psraw mm7, [GRightShift] ; high G scaled to the layout's G width
  451. pmullw mm2, UtB ; high blue chroma term
  452. packuswb mm6, mm7 ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
  453. ;
  454. movq [temp_mmx+16], mm5 ; park low chroma B
  455. paddw mm5, mm0 ; 4 low B as signed words
  456. movq [temp_mmx+40], mm2 ; park high chroma B
  457. paddw mm2, mm3 ; 4 high B as signed words
  458. psraw mm5, [BRightShift] ; low B scaled to the layout's B width
  459. ;
  460. psraw mm2, [BRightShift] ; high B scaled to the layout's B width
  461. ;
  462. packuswb mm5, mm2 ; mm5: B7 B6 B5 B4 B3 B2 B1 B0
  463. ;
  464. movq mm2, [temp_mmx] ; the 4 v-128 words
  465. movq mm1, mm5 ; mm1 = B bytes
  466. movq mm7, mm2
  467. punpcklwd mm2, mm2 ; v0 v0 v1 v1
  468. pmullw mm2, VtR ; low red chroma term
  469. punpckhwd mm7, mm7 ; v2 v2 v3 v3
  470. pmullw mm7, VtR ; high red chroma term
  471. ;
  472. paddusb mm1, [BUpperLimit] ; B clamp, step 1: saturating add of the limit
  473. ;
  474. movq [temp_mmx+24], mm2 ; park low chroma R
  475. ;
  476. paddw mm2, mm0 ; 4 low R as signed words
  477. ;
  478. psraw mm2, [RRightShift] ; low R scaled to the layout's R width
  479. pxor mm4, mm4 ; mm4 = 0 (not read again before it is overwritten; the packing below zeroes mm0 instead)
  480. movq [temp_mmx+32], mm7 ; park high chroma R
  481. paddw mm7, mm3 ; 4 high R as signed words
  482. psraw mm7, [RRightShift] ; high R scaled to the layout's R width
  483. ;
  484. psubusb mm1, [BUpperLimit] ; B clamp, step 2: B <= 0FFh - limit
  485. packuswb mm2, mm7 ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
  486. paddusb mm6, [GUpperLimit] ; G clamp, step 1
  487. ;
  488. psubusb mm6, [GUpperLimit] ; G clamp, step 2
  489. ;
  490. paddusb mm2, [RUpperLimit] ; R clamp, step 1
  491. ;
  492. psubusb mm2, [RUpperLimit] ; R clamp, step 2
  493. ;
  494. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  495. ; here we are packing from RGB24 to RGB16 (even line)
  496. ; input:
  497. ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
  498. ; mm1: B7 B6 B5 B4 B3 B2 B1 B0
  499. ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
  500. ; assuming 8 original pixels in 0-H representation on mm6, mm1, mm2
  501. ; when H=2**xBITS-1 (x is for R G B)
  502. ; output:
  503. ; mm1- result: 4 low RGB16
  504. ; mm7- result: 4 high RGB16
  505. ; using: mm0- zero register
  506. ; mm3- temporary results
  507. ; algorithm (shift counts come from RLeftShift and GLeftShift):
  508. ; for (i=0; i<8; i++) {
  509. ; RGB[i]=256*(R[i]<<RLeftShift)+(G[i]<<GLeftShift)+B[i];
  510. ; }
  511. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  512. psllq mm2, [RLeftShift] ; 64-bit shift moves R to the top of each byte; the layouts set up in this procedure clamp R so that no bits spill into the next byte
  513. movq mm7, mm1 ; mm7: copy of B
  514. ; note: no need for shift to place B on the least significant part of the byte
  515. ; R in left position, B in the right position so they can be combined
  516. punpcklbw mm1, mm2 ; mm1: 4 low words, R in the high byte, B in the low byte
  517. pxor mm0, mm0 ; mm0: 0
  518. punpckhbw mm7, mm2 ; mm7: 4 high words, R in the high byte, B in the low byte
  519. movq mm3, mm6 ; mm3: G
  520. punpcklbw mm6, mm0 ; mm6: low 4 G as words
  521. ;
  522. psllw mm6, [GLeftShift] ; move low G into its field
  523. ;
  524. punpckhbw mm3, mm0 ; mm3: high 4 G as words
  525. por mm1, mm6 ; mm1: low 4 RGB16 pixels
  526. psllw mm3, [GLeftShift] ; move high G into its field
  527. ;
  528. por mm7, mm3 ; mm7: high 4 RGB16 pixels
  529. ;
  530. mov ebp, [YCursorOdd] ; moved to here to save cycles before odd line
  531. ;
  532. movq [edi], mm1 ; even line, pixels 0-3 (!! aligned)
  533. ;
  534. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  535. ;- start odd line (reuses the chroma terms parked in temp_mmx)
  536. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  537. movq mm1, [ebp+2*ebx] ; mm1 = 8 y bytes of the odd line
  538. pxor mm2, mm2
  539. psubusb mm1, Yadd ; y-16, saturating at 0
  540. ;
  541. movq mm5, mm1
  542. punpcklbw mm1, mm2 ; low 4 y-16 as words
  543. pmullw mm1, Ymul ; low 4 luma terms
  544. punpckhbw mm5, mm2 ; high 4 y-16 as words
  545. pmullw mm5, Ymul ; high 4 luma terms
  546. ;
  547. movq [edi+8], mm7 ; even line, pixels 4-7 (!! aligned)
  548. movq mm0, mm1
  549. paddw mm0, [temp_mmx+24] ; low 4 R
  550. movq mm6, mm5
  551. psraw mm0, [RRightShift] ; low R scaled to the layout's R width
  552. ;
  553. paddw mm5, [temp_mmx+32] ; high 4 R
  554. movq mm2, mm1
  555. psraw mm5, [RRightShift] ; high R scaled to the layout's R width
  556. ;
  557. paddw mm2, [temp_mmx+16] ; low 4 B
  558. packuswb mm0, mm5 ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
  559. psraw mm2, [BRightShift] ; low B scaled to the layout's B width
  560. movq mm5, mm6 ; mm5 = high luma again (for G)
  561. paddw mm6, [temp_mmx+40] ; high 4 B
  562. ;
  563. psraw mm6, [BRightShift] ; high B scaled to the layout's B width
  564. ;
  565. movq mm3, [temp_mmx+8] ; the 4 green chroma terms
  566. ;
  567. packuswb mm2, mm6 ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
  568. movq mm4, mm3
  569. punpcklwd mm3, mm3 ; replicate low 2
  570. ;
  571. punpckhwd mm4, mm4 ; replicate high 2
  572. psubw mm1, mm3 ; 4 low G
  573. psraw mm1, [GRightShift] ; low G scaled to the layout's G width
  574. psubw mm5, mm4 ; 4 high G as signed words
  575. psraw mm5, [GRightShift] ; high G scaled to the layout's G width
  576. ;
  577. paddusb mm2, [BUpperLimit] ; B clamp, step 1 (B is in mm2 on this line)
  578. packuswb mm1, mm5 ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
  579. psubusb mm2, [BUpperLimit] ; B clamp, step 2
  580. ;
  581. paddusb mm1, [GUpperLimit] ; G clamp, step 1
  582. ;
  583. psubusb mm1, [GUpperLimit] ; G clamp, step 2
  584. ;
  585. paddusb mm0, [RUpperLimit] ; R clamp, step 1
  586. ;
  587. mov eax, [tmpCCOPitch] ; offset of the odd output line (0 on an aspect drop)
  588. ;
  589. psubusb mm0, [RUpperLimit] ; R clamp, step 2
  590. ;
  591. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  592. ; here we are packing from RGB24 to RGB16 (odd line)
  593. ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
  594. ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
  595. ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
  596. ; output:
  597. ; mm2- result: 4 low RGB16
  598. ; mm7- result: 4 high RGB16
  599. ; using: mm4- zero register
  600. ; mm3- temporary results
  601. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  602. psllq mm0, [RLeftShift] ; move R to the top of each byte
  603. movq mm7, mm2 ; mm7: copy of B
  604. ; note: no need for shift to place B on the least significant part of the byte
  605. ; R in left position, B in the right position so they can be combined
  606. punpcklbw mm2, mm0 ; mm2: 4 low words, R in the high byte, B in the low byte
  607. pxor mm4, mm4 ; mm4: 0
  608. movq mm3, mm1 ; mm3: G
  609. punpckhbw mm7, mm0 ; mm7: 4 high words, R in the high byte, B in the low byte
  610. punpcklbw mm1, mm4 ; mm1: low 4 G as words
  611. ;
  612. punpckhbw mm3, mm4 ; mm3: high 4 G as words
  613. ;
  614. psllw mm1, [GLeftShift] ; move low G into its field
  615. por mm2, mm1 ; mm2: low 4 RGB16 pixels
  616. psllw mm3, [GLeftShift] ; move high G into its field
  617. ;
  618. por mm7, mm3 ; mm7: high 4 RGB16 pixels
  619. ;
  620. movq [edi+eax], mm2 ; odd line, pixels 0-3
  621. ;
  622. movq [edi+eax+8], mm7 ; odd line, pixels 4-7
  623. ;
  624. add edi, 16 ; 8 pixels of 16 bits were written per line
  625. add ebx, 4 ; 4 chroma samples (8 luma samples) consumed
  626. jl do_next_8x2_block ; repeat while the index is still negative
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ; A pair of lines is finished; step every cursor to the next pair.
  ;
  ; On entry:
  ;   edi  output cursor, just past the last pixel written on the even line
  ;   ebp  Y cursor of the odd line (still loaded from YCursorOdd)
  ;   esi  V cursor,  edx  U cursor
  ; On exit:
  ;   edi  start of the next even output line
  ;   esi, edx  advanced by one chroma pitch each
  ;   YCursorEven, YCursorOdd  advanced by two Y pitches each
  ; eax and ecx are scratch.
  ; Loops back while the new odd-line cursor is at or below YLimit.
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
      add   edi, [CCOSkipDistance]          ; to the start of the next output line
      add   edi, [tmpCCOPitch]              ; step over the odd line (adds 0 on an aspect drop)

      mov   ecx, [localChromaPitch]
      add   esi, ecx                        ; next V line
      add   edx, ecx                        ; next U line

      mov   eax, [localYPitch]
      add   ebp, eax                        ; odd line + pitch = next even line
      mov   [YCursorEven], ebp
      add   ebp, eax                        ; next odd line
      mov   [YCursorOdd], ebp

      mov   eax, [YLimit]                   ; done with the last line?
      cmp   ebp, eax
      jbe   PrepareChromaLine
  ;------------------------------------------------------------------------------
  ; finish: common exit, reached when every line pair has been converted or
  ; directly from the set-up code when AspectAdjustmentCount is 1.
  ;
  ; MMX registers have been written on every path that gets here (the layout
  ; set-up loads limits through mm0), so the MMX state is cleared with emms
  ; before returning. Without it, x87 floating-point code run by the caller
  ; afterwards would see a full register stack. emms touches neither the
  ; general registers nor the flags, and is harmless if the caller issues it
  ; again.
  ;
  ; esp is then restored from StashESP, the value the prologue saved right
  ; after its four pushes, which discards the aligned local frame. The
  ; registers are popped in the reverse of the push order (esi, edi, ebp, ebx).
  ;
  ; "ret" is expanded through the macro named by OPTION EPILOGUE.
  ; NOTE(review): presumably that macro also removes the 13 DWORD arguments
  ; from the stack (__stdcall) -- confirm in memmodel.inc.
  ;------------------------------------------------------------------------------
  finish:
      emms                                  ; leave MMX state: safe for x87 code
      mov   esp, [StashESP]                 ; drop the aligned local frame
      pop   ebx
      pop   ebp
      pop   edi
      pop   esi
      ret
  MMX_YUV12ToRGB16 ENDP
  MMXCODE1 ENDS
  END