Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

645 lines
24 KiB

  1. /**************************************************************************\
  2. *
  3. * Copyright (c) 2000 Microsoft Corporation
  4. *
  5. * Module name:
  6. *
  7. * Include file to generate either 5-5-5 or 5-6-5 versions of the
  8. * dither code.
  9. *
  10. * Notes:
  11. *
  12. * When DITHER_BLEND_555 is #defined to 1, then this file will generate
  13. * 5-5-5 versions of the included routines.
  14. *
  15. * When DITHER_BLEND_555 is #defined to 0, then we will generate 5-6-5
  16. * versions.
  17. *
  18. * Revision History:
  19. *
  20. * 03/15/2000 andrewgo
  21. * Created it.
  22. *
  23. \**************************************************************************/
  // Per-format configuration for the two routines in this file.  Any earlier
  // definitions are cleared first, presumably so that the file can be included
  // more than once with different DITHER_BLEND_555 settings
  // (NOTE(review): confirm against the including source file).
  24. #undef DITHER_ARRAY
  25. #undef RED_SHIFT
  26. #undef GREEN_SHIFT
  27. #undef BLUE_SHIFT
  28. #undef DITHERBLEND_FUNC
  29. #undef DITHER_FUNC
  // 5-5-5 variant.  DITHER_ARRAY names the dither table that is indexed by the
  // routines; the *_SHIFT values are the right-shift counts that move a masked
  // 8-bit channel of a 32bpp pixel down to its position in the 16bpp pixel
  // (red from bit 16, green from bit 8, blue from bit 0).
  30. #if DITHER_BLEND_555
  31. #define DITHER_ARRAY Dither555
  32. #define RED_SHIFT 9
  33. #define GREEN_SHIFT 6
  34. #define BLUE_SHIFT 3
  // Names given to the two function bodies generated from this file.
  35. #define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_555_MMX
  36. #define DITHER_FUNC ScanOperation::Dither_sRGB_555_MMX
  // 5-6-5 variant: green keeps one more bit, so red and green shift one less.
  37. #else
  38. #define DITHER_ARRAY Dither565
  39. #define RED_SHIFT 8
  40. #define GREEN_SHIFT 5
  41. #define BLUE_SHIFT 3
  42. #define DITHERBLEND_FUNC ScanOperation::Dither_Blend_sRGB_565_MMX
  43. #define DITHER_FUNC ScanOperation::Dither_sRGB_565_MMX
  44. #endif
  // DITHERBLEND_FUNC expands to ScanOperation::Dither_Blend_sRGB_555_MMX or
  // ScanOperation::Dither_Blend_sRGB_565_MMX depending on DITHER_BLEND_555.
  //
  // Blends 32bpp source pixels over a 16bpp destination scanline.  Fully
  // opaque pixels (alpha 0xff) are only dithered and packed, fully transparent
  // pixels (alpha 0) leave the destination untouched on the single-pixel and
  // opaque/transparent pair paths, and everything else reads the destination,
  // blends, adds the dither offset and packs the result back to 16bpp.
  //
  //   dst         - destination scanline (16bpp pixels)
  //   src         - source scanline parameter
  //   count       - number of pixels; asserted to be non-zero
  //   otherParams - asserted non-null; X, Y and DoingDither are read from it
  //
  // The body is compiled only when _X86_ is defined.
  45. // Do a dithered blend to 16bpp using MMX
  46. VOID FASTCALL
  47. DITHERBLEND_FUNC(
  48. VOID *dst,
  49. const VOID *src,
  50. INT count,
  51. const OtherParams *otherParams
  52. )
  53. {
  54. #if defined(_X86_)
  // NOTE(review): DEFINE_POINTERS and DEFINE_BLEND_POINTER are defined outside
  // this file; they are presumed to declare the typed pointers d and bl that
  // are used below - confirm against their definitions.
  55. DEFINE_POINTERS(ARGB, WORD);
  56. DEFINE_BLEND_POINTER(ARGB);
  57. ASSERT(count != 0);
  58. ASSERT(otherParams);
  // 64-bit constants that the assembly loads into, or applies to, MMX
  // registers.  redBlueMask keeps the top 5 bits of the red and blue bytes of
  // two pixels; flipAlphaBits is XORed with the replicated alpha words to
  // produce (255 - alpha) in the premultiplied blend.
  59. static ULONGLONG redBlueMask = 0x00f800f800f800f8;
  60. static ULONGLONG flipAlphaBits = 0x00ff00ff00ff00ff;
  // greenMask keeps the top 5 (5-5-5) or 6 (5-6-5) bits of green.
  // redBlueMultiplier is the pmaddwd operand: the low word of each dword is 1
  // and the high word is 2^10 (5-5-5) or 2^11 (5-6-5), so one multiply-add
  // moves red into place while leaving blue where it is.
  61. #if DITHER_BLEND_555
  62. static ULONGLONG greenMask = 0x0000f8000000f800;
  63. static ULONGLONG redBlueMultiplier = 0x0400000104000001;
  64. #else
  65. static ULONGLONG greenMask = 0x0000fc000000fc00;
  66. static ULONGLONG redBlueMultiplier = 0x0800000108000001;
  67. #endif
  68. INT x = otherParams->X;
  69. INT y = otherParams->Y;
  // Start of the dither row for scanline y (rows are 8 UINT32 entries apart),
  // or the DitherNone table when dithering is switched off.
  70. UINT32 *dither = (otherParams->DoingDither)
  71. ? &DITHER_ARRAY[8 * (y & 3)]
  72. : &DitherNone[0];
  // Byte offset of the entry for pixel x inside the 4-entry dither period
  // (4 bytes per entry).  The assembly adds it to the dither pointer and
  // advances it, modulo 16, for every pixel consumed by the alignment loop.
  73. UINT32 ditherIncrement = (x & 3) * 4;
  // Source pixels are read through this pointer (loaded into esi below).
  74. const ARGB *blendPixel = bl;
  // Control flow of the assembly below:
  //   alignment_loop  - reloads mm6/mm7 from the dither row at the current
  //                     ditherIncrement, then handles one pixel at a time via
  //                     do_single_pixel until (edi & 6) == 0.
  //   do_main_loop    - while at least four pixels remain and all four source
  //                     alphas are 0xff, dithers and packs four pixels and
  //                     writes them with a single qword store.
  //   do_pair         - handles two pixels.  Pairs made only of fully opaque
  //                     and fully transparent pixels are packed directly and
  //                     the transparent ones are not stored; any other pair
  //                     goes through do_pair_blend.
  //   do_single_pixel - subroutine reached with call, returns with ret;
  //                     handles one opaque, transparent or blended pixel
  //                     using the low dword of mm6 as its dither.
  //   do_last_pixel   - handles an odd trailing pixel and falls into all_done,
  //                     which executes emms.
  75. _asm
  76. {
  77. ; ecx = count
  78. ; esi = source
  79. ; edi = destination
  80. ; mm4 = red and blue mask (0xf800f8)
  81. ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5)
  82. ; mm6 = C1 | C0 dither
  83. ; mm7 = C3 | C2 dither
  84. mov eax, ditherIncrement
  85. mov esi, blendPixel
  86. mov edi, d
  87. mov ecx, count
  88. movq mm4, redBlueMask
  89. movq mm5, greenMask
  90. ; We always want our qword reads from the screen to be aligned.
  91. ; So if the initial pixel is not qword-aligned, we handle up to
  92. ; three pixels up front to make it qword-aligned.
  93. ;
  94. ; (Note that as a consequence of us aligning to the destination,
  95. ; we're often doing unaligned reads on the source. But it's
  96. ; a much bigger performance win to align operations to the screen
  97. ; than to system memory, due to the terrible screen read
  98. ; performance.)
  99. alignment_loop:
  100. add eax, dither
  101. test edi, 6
  102. movq mm6, [eax]
  103. movq mm7, [eax+8]
  104. jz done_start_alignment
  105. call do_single_pixel
  106. ; Adjust our pointers and load our new dither values:
  107. mov eax, ditherIncrement
  108. add eax, 4
  109. and eax, 0x0000000F
  110. mov ditherIncrement, eax
  111. add esi, 4
  112. add edi, 2
  113. dec ecx
  114. jz all_done
  115. jmp alignment_loop
  116. done_start_alignment:
  117. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  118. do_main_loop:
  119. sub ecx, 4 ; pre-decrement by 4
  120. jl do_pair
  121. ; We do chunks of 4 pixels at a time so that we can unroll our
  122. ; dither loop (our dither repeats every 4 pixels).
  123. do_main_loop_2:
  124. mov al, [esi+3]
  125. and al, [esi+7]
  126. and al, [esi+11]
  127. and al, [esi+15]
  128. inc al ; if all alphas were 0xff, this
  129. jnz do_pair ; will wrap to zero
  130. ; The four pixels starting at [esi] are opaque. We only need to
  131. ; dither them and convert to 16bpp. The following codepath will
  132. ; process all four in parallel (two at a time) in order to optimize
  133. ; usage of the execution units and minimize dependencies between
  134. ; consecutive instructions.
  135. ; We start by reading the four pixels into mm0 and mm1, adding
  136. ; the dither component, and then breaking into group 0 (pixels 0
  137. ; and 2) and group 1 (pixels 1 and 3). I will use **0** and **1**
  138. ; in the comments below to show which pixel group the instruction is
  139. ; processing
  140. movq mm0, [esi] ; mm0 = DW1 | DW0
  141. movq mm1, [esi + 8] ; mm1 = DW3 | DW2
  142. paddusb mm0, mm6 ; add dither
  143. movq mm2, mm0
  144. paddusb mm1, mm7 ; add dither
  145. add edi, 8
  146. punpckhdq mm2, mm1 ; **1** mm2 = DW3 | DW1
  147. punpckldq mm0, mm1 ; **0** mm0 = DW2 | DW0
  148. movq mm3, mm2 ; **1**
  149. pand mm2, mm4 ; **1** red and blue
  150. movq mm1, mm0 ; **0**
  151. pand mm0, mm4 ; **0** red and blue
  152. pand mm3, mm5 ; **1** green
  153. psrlw mm0, 3 ; **0** shift red and blue to lowest
  154. ; 5 bits in register
  155. ; Note the use of the pmaddwd to simultaneously shift both the red and
  156. ; blue bits into their appropriate positions. The constant
  157. ; redBlueMultiplier contains four shorts, each of which is equal to
  158. ; 2^i where i is the number of bits that we need to shift that color
  159. ; component by in order to attain the correct position in the 16bpp
  160. ; color. This is possible only because the red and blue
  161. ; components lie on different shorts in the 64bits register (green has
  162. ; been masked earlier), and so we can dedicate an entire 16bit short
  163. ; to red and to blue.
  164. pmaddwd mm2, redBlueMultiplier ; **1**
  165. add esi, 16
  166. pand mm1, mm5 ; **0** green
  167. psrld mm3, GREEN_SHIFT-3 ; **1**
  168. pmaddwd mm0, redBlueMultiplier ; **0**
  169. sub ecx, 4 ; pre-decrement for next iteration
  170. por mm2, mm3 ; **1** combine green with red/blue
  171. ; mm2 = 0 | W3 | 0 | W1
  172. psrld mm1, GREEN_SHIFT ; **0**
  173. psllq mm2, 13 ; **1** mm2 = W3 | 0 | W1 | 0
  174. por mm0, mm1 ; **0** combine green with red/blue
  175. ; mm1 = 0 | W2 | 0 | W0
  176. por mm0, mm2 ; mm2 = W3 | W2 | W1 | W0
  177. movq [edi - 8], mm0
  178. jge do_main_loop_2
  179. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  180. do_pair:
  181. add ecx, 2 ; pre-decrement for this iteration
  182. jl do_last_pixel
  183. ; We're doing only a single pair of pixels, so swap our dither
  184. ; values in preparation for the next iteration:
  185. pxor mm6, mm7
  186. pxor mm7, mm6
  187. pxor mm6, mm7 ; swap mm6 and mm7
  // After the swap mm7 holds the dither for this pair and mm6 the dither for
  // the pixels that follow it.
  //
  // Alpha classification: alpha + 1 (mod 256) is 0 for alpha 0xff and 1 for
  // alpha 0, so each ja below is taken only when that alpha lies strictly
  // between 0 and 0xff, i.e. when a real blend is required.
  188. mov al, [esi+3]
  189. inc al
  190. cmp al, 1
  191. ja do_pair_blend
  192. mov al, [esi+7]
  193. inc al
  194. cmp al, 1
  195. ja do_pair_blend
  // Both alphas are now known to be 0 or 0xff; skip the pair entirely when
  // both are 0.
  196. mov al, [esi+3] ; Do we really want this here?
  197. or al, [esi+7]
  198. jz do_pair_done
  // At least one pixel is opaque: dither and pack both, then store only the
  // pixels whose alpha byte is non-zero.
  199. movq mm0, [esi]
  200. paddusb mm0, mm7 ; add dither
  201. movq mm2, mm0
  202. pand mm0, mm5 ; green
  203. pand mm2, mm4 ; red and blue
  204. psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
  205. movq mm3, mm2
  206. psrld mm3, BLUE_SHIFT ; blue
  207. psrld mm2, RED_SHIFT ; red (9 for 5-5-5)
  208. por mm0, mm3
  209. por mm0, mm2 ; mm0 = X | C1 | X | C0
  210. movq mm1, mm0
  211. psrlq mm1, 32
  212. punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
  // eax: low word = first packed pixel, high word = second packed pixel.
  213. movd eax, mm0
  214. cmp byte ptr [esi+3], 0
  215. je do_pair_done_first_write
  216. mov [edi], ax
  217. do_pair_done_first_write:
  218. cmp byte ptr [esi+7], 0
  219. je do_pair_done_second_write
  220. shr eax, 16
  221. mov [edi+2], ax
  222. do_pair_done_second_write:
  // Same pointer advance and loop re-entry as do_pair_done further down.
  223. add edi, 4
  224. add esi, 8
  225. jmp do_main_loop
  226. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  // do_pair_blend: read both 16bpp destination pixels with one dword load and
  // expand each back into the 32bpp channel layout (each channel in the top
  // bits of its byte, remaining bits zero) so they can be blended with the
  // source pixels.
  227. do_pair_blend:
  228. movd mm1, [edi] ; read destination, X | X | C1 | C0
  229. punpcklwd mm1, mm1 ; C1 | C1 | C0 | C0
  230. psrld mm1, 16 ; 0 | C1 | 0 | C0
  231. ; (trick using single red and
  232. ; blue mask requires high bits
  233. ; to be zero)
  234. movq mm0, mm1
  235. movq mm2, mm1
  236. pslld mm1, BLUE_SHIFT ; blue
  237. pslld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
  238. pslld mm2, RED_SHIFT ; red (9 for 5-5-5)
  239. por mm1, mm2 ; combine red and blue
  240. pand mm1, mm4 ; leave valid red and blue bits
  241. pand mm0, mm5 ; leave valid green bits
  242. por mm1, mm0 ; mm1 = C1 | C0
  243. ; Okay now we've got the destination read and split. Handle the first
  244. ; blend:
  245. movd mm2, [esi]
  246. punpcklbw mm2, mm2
  247. psrlw mm2, 8 ; mm2 = S
  248. movq mm3, mm2
  249. punpckhwd mm3, mm3
  250. punpckhdq mm3, mm3 ; mm3 = alpha
  251. movq mm0, mm1
  252. punpcklbw mm0, mm0
  253. psrlw mm0, 8 ; mm0 = D
  254. #if NO_PREMULTIPLIED_ALPHA
  255. psubw mm2, mm0
  256. pmullw mm2, mm3 ; mm2 = alpha * (S - D)
  257. movq mm3, mm2
  258. psrlw mm3, 8
  259. paddw mm2, mm3 ; approximate x/255 by 257/65536
  260. psrlw mm2, 8 ; mm2 = alpha * (S - D)
  261. paddb mm0, mm2 ; mm0 = C0 = D + alpha * (S - D)
  262. #else
  263. pxor mm3, flipAlphaBits
  264. pmullw mm0, mm3 ; mm2 = (255 - alpha) * D
  265. movq mm3, mm0
  266. psrlw mm0, 8 ; approximate x/255 by 257/65536
  267. paddw mm0, mm3 ; mm2 = (255 - alpha) * D / 255
  268. psrlw mm0, 8 ; don't care about rounding, not enough bits
  269. paddb mm0, mm2 ; mm0 = C0 = S + (1 - alpha) * D
  270. #endif
  271. ; Handle the second blend (change mm0 to mm1):
  // Same sequence as the first blend, but for the second source pixel at
  // [esi+4] and the second destination pixel, which punpckhbw takes from the
  // high dword of mm1.
  272. movd mm2, [esi+4]
  273. punpcklbw mm2, mm2
  274. psrlw mm2, 8 ; mm2 = S
  275. movq mm3, mm2
  276. punpckhwd mm3, mm3
  277. punpckhdq mm3, mm3 ; mm3 = alpha
  278. punpckhbw mm1, mm1
  279. psrlw mm1, 8 ; mm1 = D
  280. #if NO_PREMULTIPLIED_ALPHA
  281. psubw mm2, mm1
  282. pmullw mm2, mm3 ; mm2 = alpha * (S - D)
  283. movq mm3, mm2
  284. psrlw mm3, 8
  285. paddw mm2, mm3 ; approximate x/255 by 257/65536
  286. psrlw mm2, 8 ; mm2 = alpha * (S - D)
  287. paddb mm1, mm2 ; mm1 = C1 = D + alpha * (S - D)
  288. #else
  // NOTE: in this branch the product and the divided result live in mm1; the
  // mm2 named in the two inline comments below still holds the source pixel.
  289. pxor mm3, flipAlphaBits
  290. pmullw mm1, mm3 ; mm2 = (255 - alpha) * D
  291. movq mm3, mm1
  292. psrlw mm1, 8 ; approximate x/255 by 257/65536
  293. paddw mm1, mm3 ; mm2 = (255 - alpha) * D / 255
  294. psrlw mm1, 8 ; don't care about rounding, not enough bits
  295. paddb mm1, mm2 ; mm1 = C1 = S + (1 - alpha) * D
  296. #endif
  297. packuswb mm0, mm1 ; mm0 = C1 | C0
  298. ; Dither and pack everything back up:
  299. paddusb mm0, mm7 ; add dither
  300. movq mm2, mm0
  301. pand mm0, mm5 ; green
  302. pand mm2, mm4 ; red and blue
  303. psrld mm0, GREEN_SHIFT ; green
  304. movq mm3, mm2
  305. psrld mm3, BLUE_SHIFT ; blue
  306. psrld mm2, RED_SHIFT ; red
  307. por mm0, mm3
  308. por mm0, mm2 ; mm0 = X | C1 | X | C0
  309. movq mm1, mm0
  310. psrlq mm1, 32 ; mm1 = 0 | 0 | X | C1
  311. punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
  312. movd [edi], mm0
  313. do_pair_done:
  314. add edi, 4
  315. add esi, 8
  316. jmp do_main_loop
  317. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  318. do_single_pixel:
  319. movd mm0, [esi]
  320. mov al, [esi+3]
  321. inc al
  322. jnz do_single_blend ; if not completely opaque
  323. paddusb mm0, mm6 ; add dither
  324. movq mm2, mm0
  325. pand mm0, mm5 ; green
  326. pand mm2, mm4 ; red and blue
  327. psrld mm0, GREEN_SHIFT
  328. movq mm3, mm2
  329. psrld mm3, BLUE_SHIFT
  330. psrld mm2, RED_SHIFT
  331. por mm0, mm3
  332. por mm0, mm2 ; mm0 = X | C1 | X | C0
  333. movd eax, mm0
  334. mov [edi], ax
  335. do_single_done:
  336. ret
  337. do_single_blend:
  338. dec al
  339. jz do_single_done ; completely transparent pixel
  340. ; alpha is between 0 and 255
  341. movzx eax, word ptr [edi] ; do the destination read
  342. movd mm1, eax ; mm1 = 0 | 0 | 0 | C0
  343. movq mm0, mm1
  344. movq mm2, mm1
  345. pslld mm1, BLUE_SHIFT ; blue
  346. pslld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
  347. pslld mm2, RED_SHIFT ; red (9 for 5-5-5)
  348. por mm1, mm2 ; combine red and blue
  349. pand mm1, mm4 ; leave valid red and blue bits
  350. pand mm0, mm5 ; leave valid green bits
  351. por mm1, mm0 ; mm1 = C1 | C0
  352. ; Okay now we've got the destination read and split. Handle the first blend:
  353. movd mm2, [esi]
  354. punpcklbw mm2, mm2
  355. psrlw mm2, 8 ; mm2 = S
  356. movq mm3, mm2
  357. punpckhwd mm3, mm3
  358. punpckhdq mm3, mm3 ; mm3 = alpha
  359. movq mm0, mm1
  360. punpcklbw mm0, mm0
  361. psrlw mm0, 8 ; mm0 = D
  // At this point every channel has been widened to a 16-bit word:
  // mm2 = source channels, mm3 = source alpha replicated into all four words,
  // mm0 = destination channels.
  362. #if NO_PREMULTIPLIED_ALPHA
  363. psubw mm2, mm0
  364. pmullw mm2, mm3 ; mm2 = alpha * (S - D)
  365. movq mm3, mm2
  366. psrlw mm3, 8
  367. paddw mm2, mm3 ; approximate x/255 by 257/65536
  368. psrlw mm2, 8 ; mm2 = alpha * (S - D)
  369. paddb mm0, mm2 ; mm0 = C0 = D + alpha * (S - D)
  370. #else
  // NOTE: in this branch the product and the divided result live in mm0; the
  // mm2 named in the two inline comments below still holds the source pixel.
  371. pxor mm3, flipAlphaBits
  372. pmullw mm0, mm3 ; mm2 = (255 - alpha) * D
  373. movq mm3, mm0
  374. psrlw mm0, 8 ; approximate x/255 by 257/65536
  375. paddw mm0, mm3 ; mm2 = (255 - alpha) * D / 255
  376. psrlw mm0, 8 ; don't care about rounding, not enough bits
  377. paddb mm0, mm2 ; mm0 = C0 = S + (1 - alpha) * D
  378. #endif
  379. packuswb mm0, mm0 ; mm0 = C1 | C0
  380. ; Dither and pack everything back up:
  381. paddusb mm0, mm6 ; add dither
  382. movq mm2, mm0
  383. pand mm0, mm5 ; green
  384. pand mm2, mm4 ; red and blue
  385. psrld mm0, GREEN_SHIFT
  386. movq mm3, mm2
  387. psrld mm3, BLUE_SHIFT
  388. psrld mm2, RED_SHIFT
  389. por mm0, mm3
  390. por mm0, mm2 ; mm0 = X | C1 | X | C0
  391. movd eax, mm0
  392. mov [edi], ax
  393. ret
  394. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  395. do_last_pixel:
  396. test ecx, 1
  397. jz all_done
  398. call do_single_pixel
  399. all_done:
  400. emms
  401. }
  402. #endif
  403. }
  404. // Dither to 16bpp using MMX
  405. VOID FASTCALL
  406. DITHER_FUNC(
  407. VOID *dst,
  408. const VOID *src,
  409. INT count,
  410. const OtherParams *otherParams
  411. )
  412. {
  413. #if defined(_X86_)
  414. DEFINE_POINTERS(ARGB, WORD);
  415. ASSERT(count != 0);
  416. ASSERT(otherParams);
  417. static ULONGLONG redBlueMask = 0x00f800f800f800f8;
  418. static ULONGLONG flipAlphaBits = 0x00ff00ff00ff00ff;
  419. #if DITHER_BLEND_555
  420. static ULONGLONG greenMask = 0x0000f8000000f800;
  421. #else
  422. static ULONGLONG greenMask = 0x0000fc000000fc00;
  423. #endif
  424. INT x = otherParams->X;
  425. INT y = otherParams->Y;
  426. UINT32 *dither = (otherParams->DoingDither)
  427. ? &DITHER_ARRAY[8 * (y & 3) + (x & 3)]
  428. : &DitherNone[0];
  429. _asm
  430. {
  431. ; ecx = count
  432. ; esi = source
  433. ; edi = destination
  434. ; mm4 = red and blue mask (0xf800f8)
  435. ; mm5 = green mask (0x00fc00) (0x00f800 for 5-5-5)
  436. ; mm6 = C1 | C0 dither
  437. ; mm7 = C3 | C2 dither
  438. mov eax, dither
  439. mov esi, s
  440. mov edi, d
  441. mov ecx, count
  442. movq mm4, redBlueMask
  443. movq mm5, greenMask
  444. movq mm6, [eax]
  445. movq mm7, [eax+8]
  446. sub ecx, 4 ; pre-decrement by 4
  447. jl do_last_3_pixels_or_less
  448. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  449. ; We do chunks of 4 pixels at a time so that we can unroll our
  450. ; dither loop (our dither repeats every 4 pixels).
  451. do_main_loop:
  452. movq mm0, [esi]
  453. paddusb mm0, mm6 ; add dither
  454. movq mm2, mm0
  455. pand mm0, mm5 ; green
  456. pand mm2, mm4 ; red and blue
  457. psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
  458. movq mm3, mm2
  459. psrld mm3, BLUE_SHIFT ; blue
  460. psrld mm2, RED_SHIFT ; red (9 for 5-5-5)
  461. por mm0, mm3
  462. por mm0, mm2 ; mm0 = X | C1 | X | C0
  463. movq mm1, mm0
  464. psrlq mm1, 32 ; mm1 = X | X | X | C1
  465. punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
  466. movd [edi], mm0
  467. movq mm0, [esi+8]
  468. paddusb mm0, mm7 ; add dither
  469. movq mm2, mm0
  470. pand mm0, mm5 ; green
  471. pand mm2, mm4 ; red and blue
  472. psrld mm0, GREEN_SHIFT
  473. movq mm3, mm2
  474. psrld mm3, BLUE_SHIFT
  475. psrld mm2, RED_SHIFT
  476. por mm0, mm3
  477. por mm0, mm2 ; mm0 = X | C1 | X | C0
  478. movq mm1, mm0
  479. psrlq mm1, 32
  480. punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
  481. movd [edi+4], mm0
  482. add edi, 8
  483. add esi, 16
  484. sub ecx, 4 ; pre-decrement for next iteration
  485. jge do_main_loop
  486. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  487. do_last_3_pixels_or_less:
  488. add ecx, 4 ; get back 'real' count
  489. jz all_done
  490. dec ecx ; if exactly 1 pixel left
  491. jz do_last_pixel
  492. ; do 2 pixels
  493. ; we'll decrement ecx again later
  494. movq mm0, [esi]
  495. paddusb mm0, mm6 ; add dither
  496. movq mm2, mm0
  497. pand mm0, mm5 ; green
  498. pand mm2, mm4 ; red and blue
  499. psrld mm0, GREEN_SHIFT ; green (6 for 5-5-5)
  500. movq mm3, mm2
  501. psrld mm3, BLUE_SHIFT ; blue
  502. psrld mm2, RED_SHIFT ; red (9 for 5-5-5)
  503. por mm0, mm3
  504. por mm0, mm2 ; mm0 = X | C1 | X | C0
  505. movq mm1, mm0
  506. psrlq mm1, 32 ; mm1 = X | X | X | C1
  507. punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
  508. movd eax, mm0
  509. mov [edi], eax
  510. dec ecx
  511. jz all_done
  512. add esi, 8
  513. add edi, 4
  514. do_last_pixel:
  515. movd mm0, [esi]
  516. paddusb mm0, mm7 ; add dither
  517. movq mm2, mm0
  518. pand mm0, mm5 ; green
  519. pand mm2, mm4 ; red and blue
  520. psrld mm0, GREEN_SHIFT
  521. movq mm3, mm2
  522. psrld mm3, BLUE_SHIFT
  523. psrld mm2, RED_SHIFT
  524. por mm0, mm3
  525. por mm0, mm2 ; mm0 = X | C1 | X | C0
  526. movq mm1, mm0
  527. psrlq mm1, 32
  528. punpcklwd mm0, mm1 ; mm0 = X | X | C1 | C0
  529. movd eax, mm0
  530. mov [edi], ax
  531. all_done:
  532. emms
  533. }
  534. #endif
  535. }