Leaked source code of Windows Server 2003

/******************************Module*Header*******************************\
* Module Name: alphaimg.cxx
*
* Low level alpha blending routines
*
* Created: 21-Jun-1996
* Author: Mark Enstrom [marke]
*
* Copyright (c) 1996-1999 Microsoft Corporation
\**************************************************************************/

#include "precomp.hxx"

/**************************************************************************\
* Explanation of algorithm for AlphaPerPixelOnly routines
* -------------------------------------------------------
*
* The inner loop of each routine computes:
*
*   Dst = Alpha * Src + (1-Alpha) * Dst
*
* The source pixel is assumed to have been premultiplied
* with the alpha value, which leaves this:
*
*   Dst = Src + (1-Alpha) * Dst
*
* Because the alpha is stored as a byte, we must actually compute
*
*   Dst = Src + (255-SrcAlpha) * Dst / 255
*
* A close approximation to 1/255 is 257/65536; we use this to replace
* the divide with shifts and adds. That is, X/255 becomes:
*
*   ((X<<8) + X) >> 16
*
* or:
*
*   (X + (X>>8)) >> 8
*
* We improve the accuracy of this approximation by adding a rounding
* step after the multiply.
*
* In particular, this gives exact results where SrcAlpha is 0 or 255,
* which is important for versions of the routine that do not special
* case those values (such as mmxAlphaPerPixelOnly).
*
* The resulting algorithm is:
*
*   T1 = Dst * (255 - SrcAlpha) + 128
*   T2 = T1 >> 8
*   T3 = (T1 + T2) >> 8
*   Dst = Src + T3
*
* Finally, the above must be done to each of the 4 components of the pixel.
* Most versions of the routine do 2 components in a single DWORD.
* The algorithm is therefore done twice per pixel, once for each set of
* 2 components, and the two iterations are interleaved.
\**************************************************************************/
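/**************************************************************************\
* Illustrative sketch, not part of the original routine set: the per-channel
* math described above, written out in plain C for a single 8-bit channel.
* The function name is illustrative only.
\**************************************************************************/
#if 0 // reference sketch only, not compiled
static BYTE
BlendChannelOver(
    BYTE Src,       // source channel, premultiplied by SrcAlpha
    BYTE Dst,       // destination channel
    BYTE SrcAlpha   // per-pixel source alpha
    )
{
    // T1 = Dst*(255-SrcAlpha) + 128; then (T1 + (T1>>8)) >> 8 approximates
    // T1/255 and is exact when SrcAlpha is 0 or 255.
    ULONG T1 = (ULONG)Dst * (255 - SrcAlpha) + 128;
    ULONG T3 = (T1 + (T1 >> 8)) >> 8;
    return (BYTE)(Src + T3);
}
#endif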
/**************************************************************************\
* vAlphaPerPixelOnly
*
* Used when the source has per-pixel alpha values and the
* SourceConstantAlpha is 255.
*
*   Dst = Src + (1-SrcAlpha) * Dst
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   none
*
* History:
*
*   1/23/1997 Mark Enstrom [marke]
*
\**************************************************************************/

#if !defined(_X86_)

VOID
vAlphaPerPixelOnly(
    ALPHAPIX *ppixDst,
    ALPHAPIX *ppixSrc,
    LONG      cx,
    BLENDFUNCTION BlendFunction
    )
{
    ALPHAPIX pixSrc;
    ALPHAPIX pixDst;
    BYTE     alpha;

    while (cx--)
    {
        pixSrc = *ppixSrc;
        alpha  = pixSrc.pix.a;

        if (alpha != 0)
        {
            pixDst = *ppixDst;

            if (alpha == 255)
            {
                pixDst = pixSrc;
            }
            else
            {
                //
                // Dst = Src + (1-Alpha) * Dst
                //
                ULONG Multa = 255 - alpha;
                ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8;
                ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff);
                ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
                ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;
                ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
                ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;
                ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
                ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;

                pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB;
            }

            *ppixDst = pixDst;
        }

        ppixSrc++;
        ppixDst++;
    }
}

#endif
/**************************************************************************\
* vAlphaPerPixelAndConst
*
* Used when the source has per-pixel alpha values and the
* SourceConstantAlpha is not 255.
*
*   if SrcAlpha == 255 then
*
*       Dst = Dst + ConstAlpha * (Src - Dst)
*
*   else
*
*       Src = Src * ConstAlpha
*       Dst = Src + (1 - SrcAlpha) * Dst
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
* History:
*
*   3/12/1997 Mark Enstrom [marke]
*
\**************************************************************************/

VOID
vAlphaPerPixelAndConst(
    ALPHAPIX *ppixDst,
    ALPHAPIX *ppixSrc,
    LONG      cx,
    BLENDFUNCTION BlendFunction
    )
{
    ALPHAPIX pixSrc;
    ALPHAPIX pixDst;
    BYTE     ConstAlpha = BlendFunction.SourceConstantAlpha;
    BYTE     alpha;

    while (cx--)
    {
        pixSrc = *ppixSrc;
        alpha  = pixSrc.pix.a;

        if (alpha != 0)
        {
            pixDst = *ppixDst;

            if (alpha == 255)
            {
                //
                // Blend: D = sA * S + (1-sA) * D
                //
                // red and blue
                //
                ULONG uB00rr00bb = pixDst.ul & 0x00ff00ff;
                ULONG uF00rr00bb = pixSrc.ul & 0x00ff00ff;
                ULONG uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
                                   (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
                ULONG uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
                ULONG uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;

                //
                // alpha and green
                //
                ULONG uB00aa00gg = (pixDst.ul >> 8) & 0xff00ff;
                ULONG uF00aa00gg = (pixSrc.ul >> 8) & 0xff00ff;
                ULONG uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) +
                                   (ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080;
                ULONG uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8;
                ULONG uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00;

                pixDst.ul = uD00rr00bb + uDaa00gg00;
            }
            else
            {
                //
                // dissolve
                //
                ULONG ul_B_00AA00GG = (pixSrc.ul & 0xff00ff00) >> 8;
                ULONG ul_B_00RR00BB = (pixSrc.ul & 0x00ff00ff);
                ULONG ul_T_AAAAGGGG = ul_B_00AA00GG * ConstAlpha + 0x00800080;
                ULONG ul_T_RRRRBBBB = ul_B_00RR00BB * ConstAlpha + 0x00800080;
                ULONG ul_T_00AA00GG = (ul_T_AAAAGGGG & 0xFF00FF00) >> 8;
                ULONG ul_T_00RR00BB = (ul_T_RRRRBBBB & 0xFF00FF00) >> 8;
                ULONG ul_C_AA00GG00 = ((ul_T_AAAAGGGG + ul_T_00AA00GG) & 0xFF00FF00);
                ULONG ul_C_00RR00BB = ((ul_T_RRRRBBBB + ul_T_00RR00BB) & 0xFF00FF00) >> 8;

                pixSrc.ul = (ul_C_AA00GG00 | ul_C_00RR00BB);

                //
                // over
                //
                ULONG Multa = 255 - pixSrc.pix.a;
                ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8;
                ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff);
                ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
                ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;
                ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
                ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;
                ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
                ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;

                pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB;
            }

            *ppixDst = pixDst;
        }

        ppixSrc++;
        ppixDst++;
    }
}
/******************************Public*Routine******************************\
* vAlphaConstOnly
*
* Used when the source does not have per-pixel alpha values,
* and SourceConstantAlpha is not 255.
*
*   Dst = Dst + ConstAlpha * (Src - Dst)
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
* History:
*
*   12/2/1996 Mark Enstrom [marke]
*
\**************************************************************************/

#if !defined (_X86_)

VOID
vAlphaConstOnly(
    ALPHAPIX *ppixDst,
    ALPHAPIX *ppixSrc,
    LONG      cx,
    BLENDFUNCTION BlendFunction
    )
{
    PULONG pulSrc    = (PULONG)ppixSrc;
    PULONG pulDst    = (PULONG)ppixDst;
    PULONG pulSrcEnd = pulSrc + cx;
    BYTE   ConstAlpha = BlendFunction.SourceConstantAlpha;

    //
    // Blend: D = sA * S + (1-sA) * D
    //
    while (pulSrc != pulSrcEnd)
    {
        ULONG ulDst = *pulDst;
        ULONG ulSrc = *pulSrc;
        ULONG uB00rr00bb;
        ULONG uF00rr00bb;
        ULONG uMrrrrbbbb;
        ULONG uM00rr00bb;
        ULONG uD00rr00bb;
        ULONG uB00aa00gg;
        ULONG uF00aa00gg;
        ULONG uMaaaagggg;
        ULONG uM00aa00gg;
        ULONG uDaa00gg00;

        //
        // red and blue
        //
        uB00rr00bb = ulDst & 0x00ff00ff;
        uF00rr00bb = ulSrc & 0x00ff00ff;
        uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
                     (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
        uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
        uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;

        //
        // alpha and green
        //
        uB00aa00gg = (ulDst >> 8) & 0xff00ff;
        uF00aa00gg = (ulSrc >> 8) & 0xff00ff;
        uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) +
                     (ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080;
        uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8;
        uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00;

        *pulDst = uD00rr00bb + uDaa00gg00;

        pulSrc++;
        pulDst++;
    }
}
/**************************************************************************\
* vAlphaConstOnly16_555
*
* Optimized version of vAlphaConstOnly used when source and destination
* are both 16_555.
*
*   Dst = Dst + ConstAlpha * (Src - Dst)
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
* History:
*
*   12/2/1996 Mark Enstrom [marke]
*
\**************************************************************************/

VOID
vAlphaConstOnly16_555(
    ALPHAPIX *ppixDst,
    ALPHAPIX *ppixSrc,
    LONG      cx,
    BLENDFUNCTION BlendFunction
    )
{
    PUSHORT pusSrc    = (PUSHORT)ppixSrc;
    PUSHORT pusDst    = (PUSHORT)ppixDst;
    PUSHORT pusSrcEnd = pusSrc + cx;
    BYTE    ConstAlpha = BlendFunction.SourceConstantAlpha;

    //
    // Blend: D = sA * S + (1-sA) * D
    //
    while (pusSrc != pusSrcEnd)
    {
        USHORT usDst = *pusDst;
        USHORT usSrc = *pusSrc;
        ULONG uB00rr00bb;
        ULONG uF00rr00bb;
        ULONG uMrrrrbbbb;
        ULONG uM00rr00bb;
        ULONG uDrrxxbbxx;
        ULONG uB000000gg;
        ULONG uF000000gg;
        ULONG uM0000gggg;
        ULONG uM000000gg;
        ULONG uD0000ggxx;

        //
        // red and blue
        //
        uB00rr00bb = (usDst & 0x7c1f);  // uB 0rrr rr00 000b bbbb
        uF00rr00bb = (usSrc & 0x7c1f);  // uS 0rrr rr00 000b bbbb
        uMrrrrbbbb = ((uB00rr00bb<<5)-uB00rr00bb) +
                     (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00004010;
        uM00rr00bb = (uMrrrrbbbb & 0x000F83E0) >> 5;
        uDrrxxbbxx = ((uMrrrrbbbb+uM00rr00bb) >> 5) & 0x7c1f;

        //
        // green
        //
        uB000000gg = (usDst & 0x3e0) >> 5;
        uF000000gg = (usSrc & 0x3e0) >> 5;
        uM0000gggg = ((uB000000gg <<5)-uB000000gg) +
                     (ConstAlpha * (uF000000gg-uB000000gg)) + 0x00000010;
        uM000000gg = (uM0000gggg & 0x000003E0)>>5;
        uD0000ggxx = (uM0000gggg + uM000000gg) & 0x03E0;

        *pusDst = (USHORT)(uDrrxxbbxx | uD0000ggxx);

        pusSrc++;
        pusDst++;
    }
}
/**************************************************************************\
* vAlphaConstOnly16_565
*
* Optimized version of vAlphaConstOnly used when source and destination
* are both 16_565.
*
*   Dst = Dst + ConstAlpha * (Src - Dst)
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
* History:
*
*   12/2/1996 Mark Enstrom [marke]
*
\**************************************************************************/

VOID
vAlphaConstOnly16_565(
    ALPHAPIX *ppixDst,
    ALPHAPIX *ppixSrc,
    LONG      cx,
    BLENDFUNCTION BlendFunction
    )
{
    PUSHORT pusSrc    = (PUSHORT)ppixSrc;
    PUSHORT pusDst    = (PUSHORT)ppixDst;
    PUSHORT pusSrcEnd = pusSrc + cx;
    BYTE    ConstAlpha = BlendFunction.SourceConstantAlpha;

    //
    // Blend: D = sA * S + (1-sA) * D
    //
    while (pusSrc != pusSrcEnd)
    {
        USHORT usDst = *pusDst;
        USHORT usSrc = *pusSrc;
        ULONG uB00rr00bb;
        ULONG uF00rr00bb;
        ULONG uMrrrrbbbb;
        ULONG uM00rr00bb;
        ULONG uDrrxxbbxx;
        ULONG uB000000gg;
        ULONG uF000000gg;
        ULONG uM0000gggg;
        ULONG uM000000gg;
        ULONG uD0000ggxx;

        //
        // red and blue
        //
        uB00rr00bb = (usDst & 0xf81f);  // uB rrrr r000 000b bbbb
        uF00rr00bb = (usSrc & 0xf81f);  // uS rrrr r000 000b bbbb
        uMrrrrbbbb = ((uB00rr00bb<<5)-uB00rr00bb) +
                     (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00008010;
        uM00rr00bb = (uMrrrrbbbb & 0x001F03E0) >> 5;
        uDrrxxbbxx = ((uMrrrrbbbb+uM00rr00bb) >> 5) & 0xf81f;

        //
        // green
        //
        uB000000gg = (usDst & 0x7e0) >> 5;
        uF000000gg = (usSrc & 0x7e0) >> 5;
        uM0000gggg = ((uB000000gg <<6)-uB000000gg) +
                     (2 * ConstAlpha * (uF000000gg-uB000000gg)) + 0x00000020;
        uM000000gg = (uM0000gggg & 0x00000fc0)>>6;
        uD0000ggxx = ((uM0000gggg + uM000000gg) & 0x0fc0) >> 1;

        *pusDst = (USHORT)(uDrrxxbbxx | uD0000ggxx);

        pusSrc++;
        pusDst++;
    }
}

#endif
/******************************Public*Routine******************************\
* vAlphaConstOnly24
*
* Optimized version of vAlphaConstOnly used when source and destination
* are both 24bpp.
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
* History:
*
*   12/2/1996 Mark Enstrom [marke]
*
\**************************************************************************/

VOID
vAlphaConstOnly24(
    ALPHAPIX *ppixDst,
    ALPHAPIX *ppixSrc,
    LONG      cx,
    BLENDFUNCTION BlendFunction
    )
{
    BYTE  ConstAlpha = BlendFunction.SourceConstantAlpha;
    PBYTE pjSrc    = (PBYTE)ppixSrc;
    PBYTE pjDst    = (PBYTE)ppixDst;
    PBYTE pjSrcEnd = pjSrc + 3*cx;

    while (pjSrc != pjSrcEnd)
    {
        ULONG ulDst = (*pjDst) << 16;
        ULONG ulSrc = (*pjSrc) << 16;
        ULONG uB00rr00bb;
        ULONG uF00rr00bb;
        ULONG uMrrrrbbbb;
        ULONG uM00rr00bb;
        ULONG uD00rr00bb;
        ULONG uB000000gg;
        ULONG uF000000gg;
        ULONG uM0000gggg;
        ULONG uM000000gg;
        ULONG uD000000gg;

        //
        // red and blue
        //
        uB00rr00bb = ulDst | (*(pjDst+1));
        uF00rr00bb = ulSrc | (*(pjSrc+1));
        uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
                     (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
        uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
        uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;

        //
        // green
        //
        uB000000gg = *(pjDst+2);
        uF000000gg = *(pjSrc+2);
        uM0000gggg = ((uB000000gg <<8)-uB000000gg) +
                     (ConstAlpha * (uF000000gg-uB000000gg)) + 0x00000080;
        uM000000gg = (uM0000gggg & 0x0000ff00)>>8;
        uD000000gg = ((uM0000gggg + uM000000gg) & 0x0000ff00) >> 8;

        *pjDst     = (BYTE)(uD00rr00bb >> 16);
        *(pjDst+1) = (BYTE)(uD00rr00bb);
        *(pjDst+2) = (BYTE)(uD000000gg);

        pjSrc+=3;
        pjDst+=3;
    }
}
/******************************Public*Routine******************************\
* AlphaScanLineBlend
*
* Blends source and destination surfaces one scan line at a time.
*
* Allocate a scan line buffer for xlate of src to 32BGRA if needed.
* Allocate a scan line buffer for xlate of dst to 32BGRA if needed.
* Blend scan line using blend function from pAlphaDispatch.
* Write scan line back to dst (if needed).
*
* Arguments:
*
*   pDst - pointer to dst surface
*   pDstRect - Dst output rect
*   DeltaDst - dst scan line delta
*   pSrc - pointer to src surface
*   DeltaSrc - src scan line delta
*   pptlSrc - src offset
*   pxloSrcTo32 - xlateobj from src to 32BGR
*   pxloDstTo32 - xlateobj from dst to 32BGR
*   pxlo32ToDst - xlateobj from 32BGR to dst
*   palDst - destination palette
*   palSrc - source palette
*   pAlphaDispatch - blend data and function pointers
*
* Return Value:
*
*   Status
*
* History:
*
*   10/14/1996 Mark Enstrom [marke]
*
\**************************************************************************/

BOOL
AlphaScanLineBlend(
    PBYTE    pDst,
    PRECTL   pDstRect,
    LONG     DeltaDst,
    PBYTE    pSrc,
    LONG     DeltaSrc,
    PPOINTL  pptlSrc,
    XLATEOBJ *pxloSrcTo32,
    XLATEOBJ *pxloDstTo32,
    XLATEOBJ *pxlo32ToDst,
    XEPALOBJ palDst,
    XEPALOBJ palSrc,
    PALPHA_DISPATCH_FORMAT pAlphaDispatch
    )
{
    //
    // get two scanlines of RGBA data, blend pixels, store
    //
    LONG cx = pDstRect->right - pDstRect->left;
    LONG cy = pDstRect->bottom - pDstRect->top;
    LONG ScanBufferWidth = cx * 4;
    LONG AllocationSize = 0;
    LONG lSrcBytesPerPixel = pAlphaDispatch->ulSrcBitsPerPixel/8;
    LONG lDstBytesPerPixel = pAlphaDispatch->ulDstBitsPerPixel/8;
    PBYTE pjSrcTempScanBuffer = NULL;
    PBYTE pjDstTempScanBuffer = NULL;
    PBYTE pjAlloc = NULL;
    PBYTE pjDstTmp;
    PBYTE pjSrcTmp;
    XEPALOBJ palDstDC = (XEPALOBJ)((XLATE *) pxlo32ToDst)->ppalDstDC;

    // Arithmetic overflow check
    if (ScanBufferWidth < cx)
        return(FALSE);

    //
    // calculate destination starting address
    //
    if (lDstBytesPerPixel)
    {
        pjDstTmp = pDst + lDstBytesPerPixel * pDstRect->left + DeltaDst * pDstRect->top;
    }
    else if (pAlphaDispatch->ulDstBitsPerPixel == 1)
    {
        pjDstTmp = pDst + pDstRect->left/8 + DeltaDst * pDstRect->top;
    }
    else
    {
        pjDstTmp = pDst + pDstRect->left/2 + DeltaDst * pDstRect->top;
    }

    //
    // calculate source starting address
    //
    if (lSrcBytesPerPixel)
    {
        pjSrcTmp = pSrc + lSrcBytesPerPixel * pptlSrc->x + DeltaSrc * pptlSrc->y;
    }
    else if (pAlphaDispatch->ulSrcBitsPerPixel == 1)
    {
        pjSrcTmp = pSrc + pptlSrc->x/8 + DeltaSrc * pptlSrc->y;
    }
    else
    {
        pjSrcTmp = pSrc + pptlSrc->x/2 + DeltaSrc * pptlSrc->y;
    }

    //
    // calculate size of needed scan line buffer
    //
    if (pAlphaDispatch->pfnLoadDstAndConvert != NULL)
    {
        AllocationSize += ScanBufferWidth;
    }

    if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL)
    {
        AllocationSize += ScanBufferWidth;

        // Arithmetic overflow check
        if (AllocationSize < ScanBufferWidth)
            return(FALSE);
    }

    //
    // allocate scan line buffer memory
    //
    if (AllocationSize) {
        pjAlloc = (PBYTE)PALLOCMEM(AllocationSize,'plaG');

        if (pjAlloc == NULL)
        {
            return(FALSE);
        }
    }

    PBYTE pjTemp = pjAlloc;

    if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL)
    {
        pjSrcTempScanBuffer = pjTemp;
        pjTemp += ScanBufferWidth;
    }

    if (pAlphaDispatch->pfnLoadDstAndConvert != NULL)
    {
        pjDstTempScanBuffer = pjTemp;
        pjTemp += ScanBufferWidth;
    }

    //
    // Blend scan lines
    //
    while (cy--)
    {
        PBYTE pjSource = pjSrcTmp;
        PBYTE pjDest   = pjDstTmp;

        //
        // get src scan line if needed
        //
        if (pjSrcTempScanBuffer)
        {
            (*pAlphaDispatch->pfnLoadSrcAndConvert)(
                        (PULONG)pjSrcTempScanBuffer,
                        pjSrcTmp,
                        0,
                        cx,
                        pxloSrcTo32
                        );
            pjSource = pjSrcTempScanBuffer;
        }

        //
        // get dst scan line if needed
        //
        if (pjDstTempScanBuffer)
        {
            (*pAlphaDispatch->pfnLoadDstAndConvert)(
                        (PULONG)pjDstTempScanBuffer,
                        pjDstTmp,
                        0,
                        cx,
                        pxloDstTo32
                        );
            pjDest = pjDstTempScanBuffer;
        }

        //
        // blend
        //
        (*pAlphaDispatch->pfnGeneralBlend)(
                    (PALPHAPIX)pjDest,
                    (PALPHAPIX)pjSource,
                    cx,
                    pAlphaDispatch->BlendFunction);

        //
        // write buffer back if needed
        //
        if (pjDstTempScanBuffer)
        {
            (*pAlphaDispatch->pfnConvertAndStore)(
                        pjDstTmp,
                        (PULONG)pjDstTempScanBuffer,
                        cx,
                        0,
                        pxlo32ToDst,
                        palDst,
                        palDstDC);
        }

        pjDstTmp += DeltaDst;
        pjSrcTmp += DeltaSrc;
    }

    //
    // free scan line buffer memory
    //
    if (AllocationSize) VFREEMEM(pjAlloc);

    return(TRUE);
}
#if defined(_X86_)

//
// MMX assembly code from Intel
//

typedef unsigned __int64 QWORD;

/**************************************************************************\
* mmxAlphaPerPixelOnly
*
* Used when the source has per-pixel alpha values and the
* SourceConstantAlpha is 255.
*
*   Dst = Src + (1-SrcAlpha) * Dst
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   none
*
* History:
*
*   3/12/1997 Mark Enstrom [marke]
*
\**************************************************************************/
/**************************************************************************
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
DO NOT CALL THIS FUNCTION WITH WIDTH == 0

This function operates on 32 bit pixels (BGRA) in a row of a bitmap.
This function performs the following:

    SrcTran = 255 - pixSrc.a
    pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255);
    pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255);
    pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255);
    pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255);

pDst is assumed to be aligned to a DWORD boundary when passed to this function.

Step 1:
    Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
    as a DWORD, then do Step 2.
Step 2:
    QuadAligned
    pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
    pixel is left, do it as a DWORD.
Step 3:
    Load two source pixels, S1 and S2. Get (255 - alpha value) for each source pixel, 255-S1a and 255-S2a.
    Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register.
    Load two destination pixels, D1 and D2. Expand each byte in D1 into four words
    of an MMX register. If at least four pixels can be done, do Step 4. If not, jump over
    FourPixelsPerPass and finish doing two pixels at TwoPixelsLeft, Step 5.
Step 4:
    FourPixelsPerPass
    Expand each byte in D2 into four words of an MMX register. Multiply each byte
    of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate result
    of both pixels. Copy the results of each pixel into an MMX register. Shift each result of
    both pixels by 8. Add the shifted results to the copied results. Shift these results by 8.
    Pack the results into one MMX register. Add the packed results to the source pixels. Store the result
    over the destination pixels. Stay in the FourPixelsPerPass loop until there are fewer than four pixels to do.
Step 5:
    TwoPixelsLeft
    Do the same as Step 4 above, but do not loop.
Step 6:
    OnePixelLeft
    If there is one pixel left (odd number of original pixels), do the last pixel as a DWORD.
**************************************************************************/
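/**************************************************************************
Illustrative sketch, not part of the original routine set: a scalar C
equivalent of the MMX loop below, assuming 32bpp premultiplied BGRA and
the ALPHAPIX layout used elsewhere in this file. The name is illustrative.
**************************************************************************/
#if 0 // reference sketch only, not compiled
VOID
refAlphaPerPixelOnly(ALPHAPIX *pDst, ALPHAPIX *pSrc, LONG Width)
{
    while (Width--)
    {
        ULONG SrcTran = 255 - pSrc->pix.a;
        ULONG T;

        // per channel: Dst = Src + round((255-SrcAlpha)*Dst / 255)
        T = SrcTran * pDst->pix.b + 128;
        pDst->pix.b = (BYTE)(pSrc->pix.b + ((T + (T >> 8)) >> 8));
        T = SrcTran * pDst->pix.g + 128;
        pDst->pix.g = (BYTE)(pSrc->pix.g + ((T + (T >> 8)) >> 8));
        T = SrcTran * pDst->pix.r + 128;
        pDst->pix.r = (BYTE)(pSrc->pix.r + ((T + (T >> 8)) >> 8));
        T = SrcTran * pDst->pix.a + 128;
        pDst->pix.a = (BYTE)(pSrc->pix.a + ((T + (T >> 8)) >> 8));

        pSrc++;
        pDst++;
    }
}
#endif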
VOID
mmxAlphaPerPixelOnly(
    ALPHAPIX *pDst,
    ALPHAPIX *pSrc,
    LONG      Width,
    BLENDFUNCTION BlendFunction)
{
    static QWORD W128      = 0x0080008000800080;
    static QWORD AlphaMask = 0x000000FF000000FF;

    _asm
    {
        mov     esi, pSrc
        mov     edi, pDst
        movq    mm7, W128       // | 0 | 128 | 0 | 128 | 0 | 128 | 0 | 128 |
                                // This register never changes
        pxor    mm6, mm6        // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
                                // This register never changes
        mov     ecx, Width

    // Step 1:
        test    edi, 7          // Test first pixel for QWORD alignment
        jz      QuadAligned     // if unaligned,
        jmp     Do1Pixel        // do first pixel only

    QuadAligned:                // Step 2:
        mov     eax, ecx        // Save the width in eax for later (see OnePixelLeft:)
        shr     ecx, 1          // Want to do 2 pixels (1 quad) at once, so make ecx even
        test    ecx, ecx        // Make sure there is at least 1 quad to do
        jz      OnePixelLeft    // If we take this jump, width was 1 (aligned) or 2 (unaligned)

    // Step 3:
        movq      mm0, [esi]    // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        psrld     mm0, 24       // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
        pxor      mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        movq      mm1, mm0      // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        punpcklwd mm0, mm0      // | 0 | 0 | 255-S1a | 255-S1a |
        movq      mm2, [edi]    // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        punpckhwd mm1, mm1      // | 0 | 0 | 255-S2a | 255-S2a |
        movq      mm3, mm2      // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        punpckldq mm0, mm0      // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
        punpckldq mm1, mm1      // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
        punpcklbw mm2, mm6      // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
        dec       ecx
        jz        TwoPixelsLeft

    FourPixelsPerPass:          // Step 4:
    // Indenting indicates operations on the next set of pixels
    // Within this loop, instructions will pair as shown for the Pentium processor
    // T1 = 255-S1a   T2 = 255-S2a
        punpckhbw mm3, mm6      // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
        pmullw    mm2, mm0      // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
        movq      mm0, [esi+8]  // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        pmullw    mm3, mm1      // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
        psrld     mm0, 24       // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
        add       esi, 8        // pSrc++;
        pxor      mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        paddusw   mm2, mm7      // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        paddusw   mm3, mm7      // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
        movq      mm1, mm0      // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        movq      mm4, mm2      // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        punpcklwd mm0, mm0      // | 0 | 0 | 255-S1a | 255-S1a |
        movq      mm5, mm3      // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
        punpckhwd mm1, mm1      // | 0 | 0 | 255-S2a | 255-S2a |
    // TDXx' = TX*DXx+128
        psrlw     mm2, 8        // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
    // TDXx" = (TX*DXx+128)+(TDXx'>>8)
        psrlw     mm3, 8        // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
        paddusw   mm4, mm2      // | TD1a" | TD1r" | TD1g" | TD1b" |
        paddusw   mm5, mm3      // | TD2a" | TD2r" | TD2g" | TD2b" |
        psrlw     mm4, 8        // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
        movq      mm2, [edi+8]  // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        psrlw     mm5, 8        // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
        movq      mm3, mm2      // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        packuswb  mm4, mm5      // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
        paddusb   mm4, [esi-8]
        punpckldq mm0, mm0      // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
        movq      [edi], mm4
        punpckldq mm1, mm1      // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
        punpcklbw mm2, mm6      // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
        add       edi, 8        // pDst++;
        dec       ecx
        jnz       FourPixelsPerPass

    TwoPixelsLeft:              // Step 5:
        punpckhbw mm3, mm6      // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
        pmullw    mm2, mm0      // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
        pmullw    mm3, mm1      // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
        paddusw   mm2, mm7      // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        paddusw   mm3, mm7      // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
        movq      mm4, mm2      // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        movq      mm5, mm3      // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
        psrlw     mm2, 8        // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
        psrlw     mm3, 8        // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
        paddusw   mm4, mm2      // | TD1a" | TD1r" | TD1g" | TD1b" |
        paddusw   mm5, mm3      // | TD2a" | TD2r" | TD2g" | TD2b" |
        psrlw     mm4, 8        // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
        psrlw     mm5, 8        // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
        packuswb  mm4, mm5      // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
        paddusb   mm4, [esi]
        movq      [edi], mm4
        add       edi, 8
        add       esi, 8

    OnePixelLeft:               // Step 6:
    // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
    // If 0, there were an even number of pixels and we're done
    // If 1, there is an odd number of pixels and we need to do one more
        test      eax, 1
        jz        Done

    Do1Pixel:                   // make as a macro if used in asm file
    // T = 255-S1x
        movd      mm0, DWORD PTR[esi] // | 0 | 0 | 0 | 0 | S1a | S1r | S1g | S1b |
        psrld     mm0, 24       // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | S1a |
        pxor      mm0, AlphaMask // | 0 | 0 | 0 | 255 | 0 | 0 | 0 |255-S1a|
        punpcklwd mm0, mm0      // | 0 | 0 | 255-S1a | 255-S1a |
        punpckldq mm0, mm0      // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
        movd      mm1, [edi]    // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b |
        punpcklbw mm1, mm6      // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
        pmullw    mm0, mm1      // | T*D1a | T*D1r | T*D1g | T*D1b |
        paddusw   mm0, mm7      // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
        movq      mm1, mm0      // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
        psrlw     mm0, 8        // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
        paddusw   mm0, mm1      // | TD1a" | TD1r" | TD1g" | TD1b" |
        psrlw     mm0, 8        // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
        movd      mm1, [esi]
        packuswb  mm0, mm0      // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
        paddusb   mm0, mm1
        movd      [edi], mm0
        add       edi, 4        // pDst++;
        add       esi, 4        // pSrc++;
        test      ecx, ecx
        jz        Done          // just processed the last pixel of the row
        dec       ecx
        jmp       QuadAligned   // just processed the first pixel of the row

    Done:
        emms                    // remove for optimizations, have calling function do emms
    }
}
/**************************************************************************\
* mmxAlphaPerPixelAndConst
*
* Used when the source has per-pixel alpha values and the
* SourceConstantAlpha is not 255.
*
*   if SrcAlpha == 255 then
*
*       Dst = Dst + ConstAlpha * (Src - Dst)
*
*   else
*
*       Src = Src * ConstAlpha
*       Dst = Src + (1 - SrcAlpha) * Dst
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
* History:
*
*   3/12/1997 Mark Enstrom [marke]
*
\**************************************************************************/
/**************************************************************************
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
DO NOT CALL THIS FUNCTION WITH WIDTH == 0

This function operates on 32 bit pixels (BGRA) in a row of a bitmap.
This function performs the following:

first,
    pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
    pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
    pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
    pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
then,
    SrcTran = 255 - pixSrc.a
    pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255);
    pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255);
    pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255);
    pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255);

pDst is assumed to be aligned to a DWORD boundary when passed to this function.

Step 1:
    Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
    as a DWORD, then do Step 2.
Step 2:
    QuadAligned
    pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
    pixel is left, do it as a DWORD.
Step 3:
    Load two source pixels, S1 and S2, as one QWORD. Expand S1 and S2 as four words into two MMX registers.
    Multiply each word in S1 and S2 by ConstAlpha. Add 128 to each result of both pixels. Copy the results
    of each pixel into an MMX register. Shift each result of both pixels by 8. Add the shifted results
    to the copied results. Shift these results by 8. Pack the results into one MMX register...this will
    be used later.
    Shift the packed results by 24 to get only the alpha value for each pixel.
Step 4:
    Get (255 - new alpha value) for each pixel, 255-S1a and 255-S2a.
    Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register.
    Load two destination pixels, D1 and D2. Expand D1 and D2 as four words into two MMX registers.
    Multiply each byte of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate
    result of both pixels. Copy the results of each pixel into an MMX register. Shift each result of
    both pixels by 8. Add the shifted results to the copied results. Shift these results by 8.
    Pack the results into one MMX register. Add the packed results to the new source pixels saved from
    above. Store the result over the destination pixels. Stay in the TwoPixelsAtOnceLoop loop until fewer than
    two pixels remain.
Step 5:
    OnePixelLeft
    If there is one pixel left (odd number of original pixels), do the last pixel as a DWORD.
**************************************************************************/
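/**************************************************************************
Illustrative sketch, not part of the original routine set: a scalar C
equivalent of the dissolve-then-over sequence performed by the MMX loop
below, assuming 32bpp BGRA and the ALPHAPIX layout used in this file.
**************************************************************************/
#if 0 // reference sketch only, not compiled
VOID
refAlphaPerPixelAndConst(ALPHAPIX *pDst, ALPHAPIX *pSrc, LONG Width, BYTE ConstAlpha)
{
    while (Width--)
    {
        ALPHAPIX pixSrc = *pSrc;
        ULONG T;

        // dissolve: scale every source channel (including alpha) by ConstAlpha
        T = (ULONG)ConstAlpha * pixSrc.pix.b + 128; pixSrc.pix.b = (BYTE)((T + (T >> 8)) >> 8);
        T = (ULONG)ConstAlpha * pixSrc.pix.g + 128; pixSrc.pix.g = (BYTE)((T + (T >> 8)) >> 8);
        T = (ULONG)ConstAlpha * pixSrc.pix.r + 128; pixSrc.pix.r = (BYTE)((T + (T >> 8)) >> 8);
        T = (ULONG)ConstAlpha * pixSrc.pix.a + 128; pixSrc.pix.a = (BYTE)((T + (T >> 8)) >> 8);

        // over: blend the scaled source onto the destination
        {
            ULONG SrcTran = 255 - pixSrc.pix.a;
            T = SrcTran * pDst->pix.b + 128; pDst->pix.b = (BYTE)(pixSrc.pix.b + ((T + (T >> 8)) >> 8));
            T = SrcTran * pDst->pix.g + 128; pDst->pix.g = (BYTE)(pixSrc.pix.g + ((T + (T >> 8)) >> 8));
            T = SrcTran * pDst->pix.r + 128; pDst->pix.r = (BYTE)(pixSrc.pix.r + ((T + (T >> 8)) >> 8));
            T = SrcTran * pDst->pix.a + 128; pDst->pix.a = (BYTE)(pixSrc.pix.a + ((T + (T >> 8)) >> 8));
        }

        pSrc++;
        pDst++;
    }
}
#endif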
VOID
mmxAlphaPerPixelAndConst(
    ALPHAPIX *pDst,
    ALPHAPIX *pSrc,
    LONG      Width,
    BLENDFUNCTION BlendFunction
    )
{
    BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
    static QWORD W128      = 0x0080008000800080;
    static QWORD AlphaMask = 0x000000FF000000FF;
    static QWORD Zeros     = 0;

    _asm
    {
        mov     esi, pSrc
        mov     edi, pDst
        movq    mm7, W128       // This register never changes
        pxor    mm4, mm4        // This register never changes
        xor     eax, eax
        mov     al, ConstAlpha
        movd      mm5, eax      // |    |    |    | CA |
        punpcklwd mm5, mm5      // |    |    | CA | CA |
        punpcklwd mm5, mm5      // | CA | CA | CA | CA |
                                // This register never changes
        mov     ecx, Width

    // Step 1:
        test    edi, 7          // Test first pixel for QWORD alignment
        jz      QuadAligned     // if unaligned,
        jmp     Do1Pixel        // do first pixel only

    QuadAligned:                // Step 2:
        mov     eax, ecx        // Save the width in eax for later (see OnePixelLeft:)
        shr     ecx, 1          // Want to do 2 pixels (1 quad) at once, so make ecx even
        test    ecx, ecx        // Make sure there is at least 1 quad to do
        jz      OnePixelLeft    // If we take this jump, width was 1 (aligned) or 2 (unaligned)

    TwoPixelsAtOnceLoop:        // Step 3:
    // Within this loop, instructions will pair as shown for the Pentium processor
    /* Dissolve
        pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
        pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
        pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
        pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
    */
        movq      mm0, [esi]    // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        movq      mm1, mm0      // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        punpcklbw mm0, mm4      // | 0 | S1a | 0 | S1r | 0 | S1g | 0 | S1b |
        punpckhbw mm1, mm4      // | 0 | S2a | 0 | S2r | 0 | S2g | 0 | S2b |
        pmullw    mm0, mm5      // | CA*S1a | CA*S1r | CA*S1g | CA*S1b |
        add       esi, 8        // pSrc++;
        pmullw    mm1, mm5      // | CA*S2a | CA*S2r | CA*S2g | CA*S2b |
        paddusw   mm1, mm7      // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 |
        paddusw   mm0, mm7      // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
        movq      mm2, mm0      // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
        psrlw     mm0, 8        // | S1a'>>8 | S1r'>>8 | S1g'>>8 | S1b'>>8 |
    // S1x' = CA*S1x+128   S2x' = CA*S2x+128
        movq      mm3, mm1      // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 |
        psrlw     mm1, 8        // | S2a'>>8 | S2r'>>8 | S2g'>>8 | S2b'>>8 |
    // S1x" = (CA*S1x+128)>>8   S2x" = (CA*S2x+128)>>8
        paddusw   mm0, mm2      // | S1a" | S1r" | S1g" | S1b" |
        paddusw   mm1, mm3      // | S2a" | S2r" | S2g" | S2b" |
        psrlw     mm0, 8        // | S1a">>8 | S1r">>8 | S1g">>8 | S1b">>8 |
    // SXx'" = ((CA*SXx+128)>>8)>>8)
        psrlw     mm1, 8        // | S2a">>8 | S2r">>8 | S2g">>8 | S2b">>8 |
        packuswb  mm0, mm1      // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"|
        movq      mm6, mm0
        psrld     mm0, 24       // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
    /* Over
        SrcTran = 255 - pixSrc.a
        pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255);
        pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255);
        pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255);
        pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255);
    */
    // Step 4:
        pxor      mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        movq      mm1, mm0      // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        punpcklwd mm0, mm0      // | 0 | 0 | 255-S1a | 255-S1a |
        movq      mm2, [edi]    // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        punpcklwd mm0, mm0      // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
        movq      mm3, mm2      // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        punpckhwd mm1, mm1      // | 0 | 0 | 255-S2a | 255-S2a |
        punpcklwd mm1, mm1      // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
        punpckhbw mm3, mm4      // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
    // T1 = 255-S1a   T2 = 255-S2a
        punpcklbw mm2, mm4      // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
        pmullw    mm1, mm3      // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
        add       edi, 8        // pDst++;
        pmullw    mm0, mm2      // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
        paddusw   mm0, mm7      // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        paddusw   mm1, mm7      // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
        movq      mm3, mm1      // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
    // TDXx' = TX*DXx+128
        psrlw     mm1, 8        // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
        movq      mm2, mm0      // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        psrlw     mm0, 8        // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
    // TDXx" = (TX*DXx+128)+(TDXx'>>8)
        paddusw   mm1, mm3      // | TD2a" | TD2r" | TD2g" | TD2b" |
        paddusw   mm0, mm2      // | TD1a" | TD1r" | TD1g" | TD1b" |
        psrlw     mm1, 8        // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
        psrlw     mm0, 8        // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
        packuswb  mm0, mm1      // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
    // SXx = SXx'"   TDXx = TDXx'"
        paddusb   mm0, mm6      // |S2a+TD2a|S2r+TD2r|S2g+TD2g|S2b+TD2b|S1a+TD1a|S1r+TD1r|S1g+TD1g|S1b+TD1b|
        movq      [edi-8], mm0
        dec       ecx
        jnz       TwoPixelsAtOnceLoop

    OnePixelLeft:               // Step 5:
    // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
    // If 0, there were an even number of pixels and we're done
    // If 1, there is an odd number of pixels and we need to do one more
        test      eax, 1
        jz        Done

    Do1Pixel:                   // make as a macro if used in asm file
    /* Dissolve
        pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
        pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
        pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
        pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
    */
        movd      mm0, [esi]    // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        punpcklbw mm0, mm4      // | 0 | S1a | 0 | S1r | 0 | S1g | 0 | S1b |
        pmullw    mm0, mm5      // | CA*S1a | CA*S1r | CA*S1g | CA*S1b |
        paddusw   mm0, mm7      // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
        movq      mm2, mm0      // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
    // S1x' = CA*S1x+128   S2x' = CA*S2x+128
        psrlw     mm0, 8        // | S1a'>>8 | S1r'>>8 | S1g'>>8 | S1b'>>8 |
    // S1x" = (CA*S1x+128)>>8   S2x" = (CA*S2x+128)>>8
        paddusw   mm0, mm2      // | S1a" | S1r" | S1g" | S1b" |
        psrlw     mm0, 8        // | S1a">>8 | S1r">>8 | S1g">>8 | S1b">>8 |
        packuswb  mm0, mm0      // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"|
        movq      mm6, mm0
        psrld     mm0, 24       // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
    /* Over
        SrcTran = 255 - pixSrc.a
        pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255);
        pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255);
        pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255);
        pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255);
    */
        pxor      mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
        punpcklwd mm0, mm0      // | 0 | 0 | 0 | 0 | 0 | 0 |255-S1a|255-S1a|
        punpckldq mm0, mm0      // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
        movd      mm2, [edi]    // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b |
        punpcklbw mm2, mm4      // | D1a | D1r | D1g | D1b |
    // T = 255-S1x
        pmullw    mm0, mm2      // | T*D1a | T*D1r | T*D1g | T*D1b |
        paddusw   mm0, mm7      // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
        movq      mm1, mm0      // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
        psrlw     mm0, 8        // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
        paddusw   mm0, mm1      // | TD1a" | TD1r" | TD1g" | TD1b" |
        psrlw     mm0, 8
        packuswb  mm0, mm0      // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
        paddusb   mm0, mm6
        movd      [edi], mm0
        add       edi, 4        // pDst++;
        add       esi, 4        // pSrc++;
        test      ecx, ecx
        jz        Done          // just processed the last pixel of the row
        dec       ecx
        jmp       QuadAligned   // just processed the first pixel of the row

    Done:
        emms                    // remove for optimizations, have calling function do emms
    }
}
/**************************************************************************\
* mmxAlphaConstOnly16_555
*
* Optimized version of mmxAlphaConstOnly used when source and destination
* are both 16_555.
*
*   Dst = Dst + ConstAlpha * (Src - Dst)
*
* Arguments:
*
*   ppixDst - address of dst pixel
*   ppixSrc - address of src pixel
*   cx - number of pixels in scan line
*   BlendFunction - blend to be done on each pixel
*
* Return Value:
*
*   None
*
\**************************************************************************/
/**************************************************************************
THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION

This function operates on 16 bit pixels (5 for Red, 5 for Green, and 5 for Blue) in a row of a bitmap.
It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
The function performs the following on each channel.

For red, green, and blue:
    tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31)
    tmp2 = tmp1 / 32
    tmp2 = tmp2 + tmp1
    tmp2 = tmp2 / 32
    Dst  = tmp2

pDst is assumed to be aligned to a DWORD boundary when passed to this function.

Red and blue are processed together in the same register. Green is processed separately.
For two pixels at once, the reds and blues for both pixels are processed in the same register,
and the greens are processed together in a separate register.

The loop structure is as follows:

Step 1:
    Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
    as a DWORD (OnePixelLeft:), then do Step 2.
Step 2:
    (QuadAligned:)
    pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
    pixel is left, do it as a DWORD.
Step 3:
    (TwoPixelsAtOnceLoop:)
    Perform the above function, using MMX instructions, on two pixels per pass of the loop.
Step 4:
    (OnePixelLeft:)
    If there is one pixel left (odd number of original pixels), do the last pixel as a DWORD.
**************************************************************************/
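/**************************************************************************
Illustrative sketch, not part of the original routine set: a direct
transcription of the per-channel 5-bit formula above in plain C, with
Alpha being the SourceConstantAlpha input. The name is illustrative.
**************************************************************************/
#if 0 // reference sketch only, not compiled
static LONG
refBlend555Channel(LONG Src, LONG Dst, LONG Alpha)
{
    LONG tmp1 = Alpha * (Src - Dst) + 16 + (Dst * 31);
    LONG tmp2 = tmp1 >> 5;          // tmp1 / 32
    tmp2 = tmp2 + tmp1;
    return (tmp2 >> 5) & 0x1F;      // tmp2 / 32, masked to the 5-bit channel
}
#endif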
VOID
mmxAlphaConstOnly16_555(
    PALPHAPIX pDst,
    PALPHAPIX pSrc,
    LONG      Width,
    BLENDFUNCTION BlendFunction
    )
{
    static QWORD RMask   = 0x007C0000007C0000;
    static QWORD GMask   = 0x0000000003E003E0;
    static QWORD BMask   = 0x0000001F0000001F;
    static QWORD RBConst = 0x0010001000100010;
    static QWORD GConst  = 0x0000000000100010;
    static QWORD RedMask = 0x001F0000001F0000;
    static QWORD CA;                // ConstAlpha in 4 words of a qword
    BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;

    _asm
    {
        mov     ecx, Width          // Make sure there is at least one pixel to do
        test    ecx, ecx
        jz      Done
        mov     esi, pSrc
        mov     edi, pDst
        xor     eax, eax
        mov     al, ConstAlpha
        movd      mm5, eax          // |    |    |    | CA |
        punpcklwd mm5, mm5          // |    |    | CA | CA |
        punpcklwd mm5, mm5          // | CA | CA | CA | CA |
        movq    CA, mm5

    // Step 1:
        test    edi, 7              // Test first pixel for QWORD alignment
        jz      QuadAligned         // if unaligned,
        jmp     Do1Pixel            // do first pixel only

    QuadAligned:                    // Step 2:
        mov     eax, ecx            // Save the width in eax for later (see OnePixelLeft:)
        shr     ecx, 1              // Want to do 2 pixels (1 quad) at once, so make ecx even
        test    ecx, ecx            // Make sure there is at least 1 quad to do
        jz      OnePixelLeft        // If we take this jump, width was 1 (aligned) or 2 (unaligned)

    TwoPixelsAtOnceLoop:            // Step 3:
        movd      mm0, [edi]        // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        pxor      mm7, mm7
        movd      mm1, [esi]        // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        movq      mm2, mm0          // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        movq      mm3, mm1          // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        punpcklbw mm0, mm7          // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        punpcklbw mm1, mm7          // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        movq      mm4, mm0          // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        pand      mm0, RMask        // | D2rrrrr00 | 0 | D1rrrrr00 | 0 |
        movq      mm5, mm1          // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        pand      mm4, BMask        // | 0 | D2bbbbb | 0 | D1bbbbb |
        psrlw     mm0, 2            // | D2rrrrr | 0 | D1rrrrr | 0 |
        pand      mm1, RMask        // | S2rrrrr00 | 0 | S1rrrrr00 | 0 |
        por       mm0, mm4          // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
        pand      mm5, BMask        // | 0 | S2bbbbb | 0 | S1bbbbb |
        movq      mm4, mm0          // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
        pand      mm2, GMask        // | 0 | 0 |D2ggggg00000|D1ggggg00000|
        psllw     mm4, 5            // | D2r*32 | D2b*32 | D1r*32 | D1b*32 |
        pand      mm3, GMask        // | 0 | 0 |S2ggggg00000|S1ggggg00000|
        psrlw     mm1, 2            // | S2rrrrr | 0 | S1rrrrr | 0 |
        por       mm5, mm1          // | S2rrrrr | S2bbbbb | S1rrrrr | S1bbbbb |
        movq      mm6, mm2          // | 0 | 0 |D2ggggg00000|D1ggggg00000|
        psubw     mm5, mm0          // | S2r-D2r | S2b-D2b | S1r-D1r | S1b-D1b |
        psrlw     mm2, 5            // | 0 | 0 | D2ggggg | D1ggggg |
        pmullw    mm5, CA           // | CA2r | CA2b | CA1r | CA1b |
        psubw     mm4, mm0          // | D2r*(32-1) | D2b*(32-1) | D1r*(32-1) | D1b*(32-1) |
        paddw     mm4, RBConst      // | CA2r+c | CA2b+c | CA1r+c | CA1b+c |
        psrlw     mm3, 5            // | 0 | 0 | S2ggggg | S1ggggg |
        psubw     mm3, mm2          // | 0 | 0 | S2g-D2g | S1g-D1g |
        add       esi, 4            // pSrc++;
        pmullw    mm3, CA           // | 0 | 0 | CA2g | CA1g |
        paddw     mm4, mm5          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psubw     mm6, mm2          // | 0 | 0 | D2g*(32-1) | D1g*(32-1) |
        add       edi, 4            // pDst++;
        paddw     mm6, GConst       // | 0 | 0 | CA2g+c | CA1g+c |
        movq      mm1, mm4          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        paddw     mm6, mm3          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        psrlw     mm4, 5            // RBtmp2 = RBtmp1 / 32
        movq      mm5, mm6          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        paddw     mm1, mm4          // RBtmp2 = RBtmp2 + RBtmp1
        psrlw     mm6, 5            // Gtmp2 = Gtmp1 / 32
        paddw     mm5, mm6          // Gtmp2 = Gtmp2 + Gtmp1
        psrlw     mm1, 5            // RBtmp2 = RBtmp2 / 32
        pand      mm5, GMask        // Gtmp2 = Gtmp2 / 32, but keep bit position
        movq      mm4, mm1          // RBtmp2 = RBtmp2 / 32
        pand      mm4, RedMask      // Mask to get red
        pand      mm1, BMask        // Mask to get blue
        psllw     mm4, 2            // Line up the red
        por       mm4, mm1          // Combine reds and blues in proper bit location
        packuswb  mm4, mm7          // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb |
        por       mm4, mm5          // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb |
        movd      [edi-4], mm4
        dec       ecx
        jnz       TwoPixelsAtOnceLoop

    OnePixelLeft:                   // Step 4:
    // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
    // If 0, there was an even number of pixels and we're done
    // If 1, there is an odd number of pixels and we need to do one more
        test      eax, 1
        jz        Done

    Do1Pixel:                       // make as a macro if used in asm file
        movzx     edx, WORD PTR[edi] // edx = D 0000 0000 0rrr rrgg gggb bbbb
        movzx     ebx, WORD PTR[esi] // ebx = S 0000 0000 0rrr rrgg gggb bbbb
        movd      mm0, edx          // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
        pxor      mm7, mm7
        movd      mm1, ebx          // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
        movq      mm2, mm0          // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
        movq      mm3, mm1          // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
        punpcklbw mm0, mm7          // | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
        punpcklbw mm1, mm7          // | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
        movq      mm4, mm0          // | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
        pand      mm0, RMask        // | 0 | 0 | D1rrrrr00 | 0 |
        movq      mm5, mm1          // | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
        pand      mm4, BMask        // | 0 | 0 | 0 | D1bbbbb |
        psrlw     mm0, 2            // | 0 | 0 | D1rrrrr | 0 |
        pand      mm1, RMask        // | 0 | 0 | S1rrrrr00 | 0 |
        por       mm0, mm4          // | 0 | 0 | D1rrrrr | D1bbbbb |
        pand      mm5, BMask        // | 0 | 0 | 0 | S1bbbbb |
        movq      mm4, mm0          // | 0 | 0 | D1rrrrr | D1bbbbb |
        pand      mm2, GMask        // | 0 | 0 | 0 |D1ggggg00000|
        psllw     mm4, 5            // | 0 | 0 | D1r*32 | D1b*32 |
        pand      mm3, GMask        // | 0 | 0 | 0 |S1ggggg00000|
        psrlw     mm1, 2            // | 0 | 0 | S1rrrrr | 0 |
        por       mm5, mm1          // | 0 | 0 | S1rrrrr | S1bbbbb |
        movq      mm6, mm2          // | 0 | 0 | 0 |D1ggggg00000|
        psubw     mm5, mm0          // | 0 | 0 | S1r-D1r | S1b-D1b |
        psrlw     mm2, 5            // | 0 | 0 | 0 | D1ggggg |
        pmullw    mm5, CA           // | 0 | 0 | CA1r | CA1b |
        psubw     mm4, mm0          // | 0 | 0 | D1r*(32-1) | D1b*(32-1) |
        paddw     mm4, RBConst      // | 0 | 0 | CA1r+c | CA1b+c |
        psrlw     mm3, 5            // | 0 | 0 | 0 | S1ggggg |
        psubw     mm3, mm2          // | 0 | 0 | 0 | S1g-D1g |
        add       esi, 2            // pSrc++;
        pmullw    mm3, CA           // | 0 | 0 | 0 | CA1g |
        paddw     mm4, mm5          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psubw     mm6, mm2          // | 0 | 0 | 0 | D1g*(32-1) |
        add       edi, 2            // pDst++;
        paddw     mm6, GConst       // | 0 | 0 | 0 | CA1g+c |
        movq      mm1, mm4          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        paddw     mm6, mm3          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        psrlw     mm4, 5            // RBtmp2 = RBtmp1 / 32
        movq      mm5, mm6          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        paddw     mm1, mm4          // RBtmp2 = RBtmp2 + RBtmp1
        psrlw     mm6, 5            // Gtmp2 = Gtmp1 / 32
        paddw     mm5, mm6          // Gtmp2 = Gtmp2 + Gtmp1
        psrlw     mm1, 5            // RBtmp2 = RBtmp2 / 32
        pand      mm5, GMask        // Gtmp2 = Gtmp2 / 32, but keep bit position
        movq      mm4, mm1          // RBtmp2 = RBtmp2 / 32
        pand      mm4, RedMask      // Mask to get red
        pand      mm1, BMask        // Mask to get blue
        psllw     mm4, 2            // Line up the red
        por       mm4, mm1          // Combine reds and blues in proper bit location
        packsswb  mm4, mm7          // | 0 | 0 | D10rrrrr00 | D1000bbbbb |
        por       mm4, mm5          // | 0 | 0 | D10rrrrrgg | D1gggbbbbb |
        movd      edx, mm4
        mov       [edi-2], dx
        test      ecx, ecx
        jz        Done              // just processed the last pixel of the row
        dec       ecx
        jmp       QuadAligned       // just processed the first pixel of the row

    Done:
        emms                        // remove for optimizations, have calling function do emms
    }
}
  1393. /**************************************************************************\
  1394. * mmxAlphaConstOnly16_565
  1395. *
  1396. * Optimized version of mmxAlphaConstOnly used when source and destination
  1397. * are both 16_565.
  1398. *
  1399. * Dst = Dst + ConstAlpha * (Src - Dst)
  1400. *
  1401. * Arguments:
  1402. *
  1403. * ppixDst - address of dst pixel
  1404. * ppixSrc - address of src pixel
  1405. * cx - number of pixels in scan line
  1406. * BlendFunction - blend to be done on each pixel
  1407. *
  1408. * Return Value:
  1409. *
  1410. * None
  1411. *
  1412. \**************************************************************************/
  1413. /**************************************************************************
  1414. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  1415. This function operates on 16 bit pixels (5 for Red, 6 for Green, and 5 for Blue) in a row of a bitmap.
  1416. It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
  1417. The function performs the following:
  1418. For red and blue:
  1419. tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31)
  1420. tmp2 = tmp1 / 32
  1421. tmp2 = tmp2 + tmp1
  1422. tmp2 = tmp2 / 32
  1423. Dst = tmp2
1424. For green:
1425. tmp1 = 2*Alpha(Src - Dst) + 32 + (Dst * 63)
1426. tmp2 = tmp1 / 64
1427. tmp2 = tmp2 + tmp1
1428. tmp2 = tmp2 / 64
1429. Dst = tmp2
  1430. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  1431. Red and blue are processed together in the same register. Green is processed separately.
1432. For two pixels at once, the reds and blues for both pixels are processed in the same register, and the
  1433. greens are processed together in a separate register.
  1434. The loop structure is as follows:
  1435. Step 1:
  1436. Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
1437. as a WORD (Do1Pixel:), then do Step 2.
  1438. Step 2:
  1439. (QuadAligned:)
  1440. pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
1441. pixel is left, do it as a WORD (Do1Pixel:).
  1442. Step 3:
  1443. (TwoPixelsAtOnceLoop:)
  1444. Perform the above function, using MMX instructions, on two pixels per pass of the loop.
  1445. Step 4:
  1446. (OnePixelLeft:)
1447. If there is one pixel left (odd number of original pixels) do the last pixel as a WORD.
  1448. **************************************************************************/
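/**************************************************************************\
* Reference sketch (not part of the original build): a plain-C restatement
* of the 5-6-5 per-pixel arithmetic described above, one channel at a
* time. The helper name wBlend565Sketch is illustrative only; it follows
* the formulas above, including the doubled alpha term and the /64 steps
* for the 6-bit green channel.
\**************************************************************************/
#if 0 // documentation only
static WORD
wBlend565Sketch(
    WORD Dst,  // rrrrr gggggg bbbbb
    WORD Src,
    LONG Alpha // BlendFunction.SourceConstantAlpha
    )
{
    LONG sr = (Src >> 11) & 0x1F, sg = (Src >> 5) & 0x3F, sb = Src & 0x1F;
    LONG dr = (Dst >> 11) & 0x1F, dg = (Dst >> 5) & 0x3F, db = Dst & 0x1F;
    LONG t1, t2;
    t1 = Alpha * (sr - dr) + 16 + dr * 31; // red: tmp1
    t2 = t1 >> 5;                          // tmp2 = tmp1 / 32
    dr = (t2 + t1) >> 5;                   // tmp2 = (tmp2 + tmp1) / 32
    t1 = Alpha * (sb - db) + 16 + db * 31; // blue: same as red
    t2 = t1 >> 5;
    db = (t2 + t1) >> 5;
    t1 = 2 * Alpha * (sg - dg) + 32 + dg * 63; // green: 6-bit variant
    t2 = t1 >> 6;                          // tmp2 = tmp1 / 64
    dg = (t2 + t1) >> 6;                   // tmp2 = (tmp2 + tmp1) / 64
    return (WORD)((dr << 11) | (dg << 5) | db);
}
#endif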
  1449. VOID
  1450. mmxAlphaConstOnly16_565(
  1451. PALPHAPIX pDst,
  1452. PALPHAPIX pSrc,
  1453. LONG Width,
  1454. BLENDFUNCTION BlendFunction
  1455. )
  1456. {
  1457. static QWORD RMask = 0x00FF000000FF0000;
  1458. static QWORD GMask = 0x0000000007E007E0;
  1459. static QWORD BMask = 0x0000001F0000001F;
  1460. static QWORD RBConst = 0x0010001000100010;
  1461. static QWORD GConst = 0x0000000000200020;
  1462. static QWORD GreenMask = 0x000000000FC00FC0;
  1463. static QWORD CA; // ConstAlpha in 4 words of a qword
  1464. BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
  1465. _asm
  1466. {
  1467. mov ecx, Width // Make sure there is at least one pixel to do
  1468. test ecx, ecx
  1469. jz Done
  1470. mov esi, pSrc
  1471. mov edi, pDst
  1472. xor eax, eax
  1473. mov al, ConstAlpha
  1474. movd mm5, eax // | | | | CA |
  1475. punpcklwd mm5, mm5 // | | | CA | CA |
  1476. punpcklwd mm5, mm5 // | CA | CA | CA | CA |
  1477. movq CA, mm5
  1478. // Step 1:
  1479. test edi, 7 // Test first pixel for QWORD alignment
  1480. jz QuadAligned // if unaligned,
  1481. jmp Do1Pixel // do first pixel only
  1482. QuadAligned: // Step 2:
  1483. mov eax, ecx // Save the width in eax for later (see OnePixelLeft:)
1484. shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so halve the pixel count
  1485. test ecx, ecx // Make sure there is at least 1 quad to do
  1486. jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned)
  1487. TwoPixelsAtOnceLoop: // Step 3:
  1488. movd mm0, [edi] // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
  1489. pxor mm7, mm7
  1490. movd mm1, [esi] // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
  1491. movq mm2, mm0 // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
  1492. movq mm3, mm1 // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
  1493. punpcklbw mm0, mm7 // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
  1494. punpcklbw mm1, mm7 // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
  1495. movq mm4, mm0 // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
1496. pand mm0, RMask // | D2rrrrrggg | 0 | D1rrrrrggg | 0 |
1497. movq mm5, mm1 // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
1498. pand mm4, BMask // | 0 | D2bbbbb | 0 | D1bbbbb |
1499. psrlw mm0, 3 // | D2rrrrr | 0 | D1rrrrr | 0 |
1500. pand mm1, RMask // | S2rrrrrggg | 0 | S1rrrrrggg | 0 |
  1501. por mm0, mm4 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
  1502. pand mm5, BMask // | 0 | S2bbbbb | 0 | S1bbbbb |
  1503. movq mm4, mm0 // | D2rrrrr | D2bbbbb | D1rrrrr | D1bbbbb |
  1504. pand mm2, GMask // | 0 | 0 |D2gggggg00000|D1gggggg00000|
  1505. psllw mm4, 5 // | D2r*32 | D2b*32 | D1r*32 | D1b*32 |
  1506. pand mm3, GMask // | 0 | 0 |S2gggggg00000|S1gggggg00000|
  1507. psrlw mm1, 3 // | S2rrrrr | 0 | S1rrrrr | 0 |
  1508. por mm5, mm1 // | S2rrrrr | S2bbbbb | S1rrrrr | S1bbbbb |
  1509. movq mm6, mm2 // | 0 | 0 |D2gggggg00000|D1gggggg00000|
  1510. psubw mm5, mm0 // | S2r-D2r | S2b-D2b | S1r-D1r | S1b-D1b |
  1511. psrlw mm2, 5 // | 0 | 0 | D2gggggg | D1gggggg |
  1512. pmullw mm5, CA // | CA2r | CA2b | CA1r | CA1b |
  1513. psubw mm4, mm0 // | D2r*(32-1) | D2b*(32-1) | D1r*(32-1) | D1b*(32-1) |
1514. paddw mm4, RBConst // | D2r*31+16 | D2b*31+16 | D1r*31+16 | D1b*31+16 |
1515. psrlw mm3, 5 // | 0 | 0 | S2gggggg | S1gggggg |
1516. psubw mm3, mm2 // | 0 | 0 | S2g-D2g | S1g-D1g |
1517. add esi, 4 // pSrc += 2 pixels;
  1518. pmullw mm3, CA // | 0 | 0 | CA2g | CA1g |
  1519. psllw mm6, 1 // | 0 | 0 |D2gggggg000000|D1gggggg000000|
  1520. paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
  1521. psubw mm6, mm2 // | 0 | 0 | D2g*(64-1) | D1g*(64-1) |
1522. paddw mm6, GConst // | 0 | 0 | D2g*63+32 | D1g*63+32 |
1523. movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
1524. add edi, 4 // pDst += 2 pixels;
1525. psllw mm3, 1 // | 0 | 0 | 2*CA2g | 2*CA1g |
1526. paddw mm6, mm3 // Gtmp1 = 2*Alpha(GSrc - GDst) + 32 + (GDst * 63)
1527. psrlw mm4, 5 // RBtmp2 = RBtmp1 / 32
1528. movq mm5, mm6 // Gtmp1 = 2*Alpha(GSrc - GDst) + 32 + (GDst * 63)
1529. paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1
1530. psrlw mm6, 6 // Gtmp2 = Gtmp1 / 64
1531. paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1
1532. psrlw mm1, 5 // RBtmp2 = RBtmp2 / 32
1533. pand mm5, GreenMask // Gtmp2 = Gtmp2 / 64, but keep bit position
  1534. movq mm4, mm1 // RBtmp2 = RBtmp2 / 32
  1535. pand mm4, RMask // Mask to get red
  1536. psrlw mm5, 1 // Align the green
  1537. pand mm1, BMask // Mask to get blue
  1538. psllw mm4, 3 // Align the red
  1539. por mm4, mm1 // Combine reds and blues in proper bit location
  1540. packuswb mm4, mm7 // | 0 | 0 | 0 | 0 | D2rrrrr000 | D2000bbbbb | D1rrrrr000 | D1000bbbbb |
  1541. por mm4, mm5 // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
  1542. movd [edi-4], mm4
  1543. dec ecx
  1544. jnz TwoPixelsAtOnceLoop
  1545. OnePixelLeft: // Step 4:
  1546. // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
  1547. // If 0, there were an even number of pixels and we're done
  1548. // If 1, there is an odd number of pixels and we need to do one more
  1549. test eax, 1
  1550. jz Done
1551. Do1Pixel: // could be made a macro if moved to an asm file
  1552. movzx edx,WORD PTR[edi] // edx = D 0000 0000 rrrr rggg gggb bbbb
  1553. movzx ebx,WORD PTR[esi] // ebx = S 0000 0000 rrrr rggg gggb bbbb
1554. movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
  1555. pxor mm7, mm7
  1556. movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
  1557. movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
  1558. movq mm3, mm1 // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
  1559. punpcklbw mm0, mm7 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
  1560. punpcklbw mm1, mm7 // | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
  1561. movq mm4, mm0 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
1562. pand mm0, RMask // | 0 | 0 | D1rrrrrggg | 0 |
1563. movq mm5, mm1 // | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
1564. pand mm4, BMask // | 0 | 0 | 0 | D1bbbbb |
1565. psrlw mm0, 3 // | 0 | 0 | D1rrrrr | 0 |
1566. pand mm1, RMask // | 0 | 0 | S1rrrrrggg | 0 |
  1567. por mm0, mm4 // | 0 | 0 | D1rrrrr | D1bbbbb |
  1568. pand mm5, BMask // | 0 | 0 | 0 | S1bbbbb |
  1569. movq mm4, mm0 // | 0 | 0 | D1rrrrr | D1bbbbb |
  1570. pand mm2, GMask // | 0 | 0 | 0 |D1gggggg00000|
  1571. psllw mm4, 5 // | 0 | 0 | D1r*32 | D1b*32 |
  1572. pand mm3, GMask // | 0 | 0 | 0 |S1gggggg00000|
  1573. psrlw mm1, 3 // | 0 | 0 | S1rrrrr | 0 |
  1574. por mm5, mm1 // | 0 | 0 | S1rrrrr | S1bbbbb |
  1575. movq mm6, mm2 // | 0 | 0 | 0 |D1gggggg00000|
  1576. psubw mm5, mm0 // | 0 | 0 | S1r-D1r | S1b-D1b |
  1577. psrlw mm2, 5 // | 0 | 0 | 0 | D1gggggg |
  1578. pmullw mm5, CA // | 0 | 0 | CA1r | CA1b |
  1579. psubw mm4, mm0 // | 0 | 0 | D1r*(32-1) | D1b*(32-1) |
1580. paddw mm4, RBConst // | 0 | 0 | D1r*31+16 | D1b*31+16 |
  1581. psrlw mm3, 5 // | 0 | 0 | 0 | S1gggggg |
  1582. psubw mm3, mm2 // | 0 | 0 | 0 | S1g-D1g |
  1583. add esi, 2 // pSrc++;
  1584. pmullw mm3, CA // | 0 | 0 | 0 | CA1g |
  1585. paddw mm4, mm5 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
  1586. psllw mm6, 1 // | 0 | 0 | 0 |D1gggggg000000|
  1587. psubw mm6, mm2 // | 0 | 0 | 0 | D1g*(64-1) |
  1588. add edi, 2 // pDst++;
1589. paddw mm6, GConst // | 0 | 0 | 0 | D1g*63+32 |
1590. movq mm1, mm4 // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
1591. psllw mm3, 1 // | 0 | 0 | 0 | 2*CA1g |
1592. paddw mm6, mm3 // Gtmp1 = 2*Alpha(GSrc - GDst) + 32 + (GDst * 63)
1593. psrlw mm4, 5 // RBtmp2 = RBtmp1 / 32
1594. movq mm5, mm6 // Gtmp1 = 2*Alpha(GSrc - GDst) + 32 + (GDst * 63)
1595. paddw mm1, mm4 // RBtmp2 = RBtmp2 + RBtmp1
1596. psrlw mm6, 6 // Gtmp2 = Gtmp1 / 64
1597. paddw mm5, mm6 // Gtmp2 = Gtmp2 + Gtmp1
1598. psrlw mm1, 5 // RBtmp2 = RBtmp2 / 32
1599. pand mm5, GreenMask // Gtmp2 = Gtmp2 / 64, but keep bit position
  1600. movq mm4, mm1 // RBtmp2 = RBtmp2 / 32
  1601. pand mm4, RMask // Mask to get red
  1602. psrlw mm5, 1 // Align the green
  1603. pand mm1, BMask // Mask to get blue
  1604. psllw mm4, 3 // Align the red
  1605. por mm4, mm1 // Combine reds and blues in proper bit location
  1606. packuswb mm4, mm7 // | 0 | 0 | D1rrrrr000 | D1000bbbbb |
  1607. por mm4, mm5 // | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
  1608. movd edx, mm4
  1609. mov [edi-2], dx
  1610. test ecx, ecx
  1611. jz Done // just processed the last pixel of the row
  1612. dec ecx
  1613. jmp QuadAligned // just processed the first pixel of the row
  1614. Done:
1615. emms // could be removed as an optimization; the calling function would then do emms
  1616. }
  1617. }
  1618. /**************************************************************************\
  1619. * mmxAlphaConstOnly24
  1620. *
  1621. * Optimized version of mmxAlphaConstOnly used when source and destination
  1622. * are both 24bpp.
  1623. *
  1624. * Dst = Dst + ConstAlpha * (Src - Dst)
  1625. *
  1626. * Arguments:
  1627. *
  1628. * ppixDst - address of dst pixel
  1629. * ppixSrc - address of src pixel
  1630. * cx - number of pixels in scan line
  1631. * BlendFunction - blend to be done on each pixel
  1632. *
  1633. * Return Value:
  1634. *
  1635. * None
  1636. *
  1637. \**************************************************************************/
  1638. /**************************************************************************
  1639. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  1640. This function operates on 24 bit pixels (8 bits each for Red, Green, and Blue) in a row of a bitmap.
  1641. It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
  1642. The function performs the following on each byte:
1643. tmp1 = Alpha(Src - Dst) + 128 + (Dst * 255)
  1644. tmp2 = tmp1 shr 8 (move high byte to low byte)
  1645. tmp2 = tmp2 + tmp1
  1646. tmp2 = tmp2 shr 8 (move high byte to low byte)
  1647. Dst = tmp2
  1648. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  1649. The loop structure is as follows:
  1650. Step 1:
  1651. Multiply width in pixels by 3 to get width in bytes. Byte count is kept in ecx and eax.
  1652. ecx is used as the loop counter.
  1653. Step 2:
  1654. Check pDst for QWORD alignment. If aligned, do Step 3. If unaligned, test to see if there
  1655. are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:) and then do Step 3.
1656. If no, there are at most 3 bytes to do, so do them one at a time (OneToThreeBytesLeft:).
  1657. Step 3:
  1658. (QuadAligned:)
  1659. pDst is QWORD aligned. We want to do 8 bytes (1 quad) at once, so divide byte count by 8 to get loop
  1660. count. If ecx is 0 at this point, there are no more quads to do; so do 0 to 7 bytes (NoQuadsLeft:),
  1661. in Step 5.
  1662. Step 4:
  1663. (Do1QUAD:)
  1664. Perform the above function, using MMX instructions, on 8 bytes per pass of the loop.
  1665. Step 5:
  1666. (NoQuadsLeft:)
  1667. Mask eax with 7 to get the byte count modulo 8, 0 to 7 bytes left. Copy eax into ecx. Test to see
  1668. if there are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:); if no, there are
1669. at most 3 bytes to do, so do them one at a time (OneToThreeBytesLeft:).
  1670. Step 6:
  1671. (Do1DWORD:)
  1672. Perform the above function, using MMX instructions, on 4 bytes. Do Step 3 (QuadAligned:) to see if
  1673. there are more bytes to do.
  1674. Step 7:
  1675. (OneToThreeBytesLeft:)
  1676. Do one byte at a time. This will happen if there are less than 4 bytes left to do.
  1677. **************************************************************************/
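/**************************************************************************\
* Reference sketch (not part of the original build): a plain-C restatement
* of the per-byte blend described above. The helper name jBlendByteSketch
* is illustrative only. The two shift-and-add steps implement the X/255
* approximation (X + (X>>8)) >> 8 with rounding, so Alpha == 0 returns Dst
* exactly and Alpha == 255 returns Src exactly.
\**************************************************************************/
#if 0 // documentation only
static BYTE
jBlendByteSketch(
    BYTE Dst,
    BYTE Src,
    LONG Alpha // BlendFunction.SourceConstantAlpha, 0..255
    )
{
    LONG tmp1 = Alpha * (Src - Dst) + 128 + Dst * 255;
    LONG tmp2 = tmp1 >> 8;         // move high byte to low byte
    tmp2 = (tmp2 + tmp1) >> 8;     // tmp2 = (tmp2 + tmp1) shr 8
    return (BYTE)tmp2;
}
#endif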
  1678. VOID
  1679. mmxAlphaConstOnly24(
  1680. PALPHAPIX pDst,
  1681. PALPHAPIX pSrc,
  1682. LONG Width,
  1683. BLENDFUNCTION BlendFunction
  1684. )
  1685. {
  1686. static QWORD WordConst = 0x0080008000800080;
  1687. static QWORD WordMask = 0xFF00FF00FF00FF00;
  1688. static QWORD ByteConst = 0x0000000000000080;
  1689. static QWORD ByteMask = 0x000000000000FF00;
  1690. static QWORD CA; // ConstAlpha in 4 words of a qword
  1691. BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
  1692. _asm
  1693. {
  1694. mov ecx, Width // Make sure there is at least one pixel to do
  1695. test ecx, ecx
  1696. jz Done
  1697. mov esi, pSrc
  1698. mov edi, pDst
  1699. xor eax, eax
  1700. mov al, ConstAlpha
  1701. movd mm5, eax // | | | | CA |
  1702. punpcklwd mm5, mm5 // | | | CA | CA |
  1703. punpcklwd mm5, mm5 // | CA | CA | CA | CA |
  1704. movq CA, mm5
  1705. // Step 1:
  1706. lea ecx, [2*ecx+ecx]// NumPixels * 3 bytes/pixel = NumBytes
  1707. // Step 2:
  1708. test edi, 7 // Test first pixel for QWORD alignment
  1709. jz QuadAligned // If unaligned,
  1710. cmp ecx, 4 // test to see if there are 4 bytes to do
  1711. jae Do1DWORD // if yes, do 4 bytes
  1712. jmp OneToThreeBytesLeft// if no, do 1 to 3 bytes
  1713. QuadAligned: // Step 3:
  1714. mov eax, ecx // Save the width in eax for later (see NoQuadsLeft:)
  1715. shr ecx, 3 // Want to do 8 bytes at once, so divide
  1716. // byte count by 8 to get loop count
  1717. test ecx, ecx // Make sure there is at least 1 QUAD (8 bytes) to do
  1718. jz NoQuadsLeft // If we take this jump, there are 0 to 7 bytes left
  1719. Do1QUAD: // Step 4:
  1720. movq mm0, [edi] // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
  1721. pxor mm7, mm7
  1722. movq mm1, [esi] // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 |
  1723. movq mm2, mm0 // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
  1724. movq mm3, mm1 // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 |
  1725. punpcklbw mm0, mm7 // | D4 | D3 | D2 | D1 |
  1726. movq mm4, mm0 // | D4 | D3 | D2 | D1 |
  1727. punpcklbw mm1, mm7 // | S4 | S3 | S2 | S1 |
  1728. punpckhbw mm2, mm7 // | D8 | D7 | D6 | D5 |
  1729. psubw mm1, mm0 // | S4-D4 | S3-D3 | S2-D2 | S1-D1 |
  1730. pmullw mm1, CA // | CA4 | CA3 | CA2 | CA1 |
  1731. punpckhbw mm3, mm7 // | S8 | S7 | S6 | S5 |
  1732. psubw mm3, mm2 // | S8-D8 | S7-D7 | S6-D6 | S5-D5 |
  1733. movq mm6, mm2 // | D8 | D7 | D6 | D5 |
  1734. pmullw mm3, CA // | CA8 | CA7 | CA6 | CA5 |
1735. psllw mm4, 8 // | D4*256 | D3*256 | D2*256 | D1*256 |
1736. psllw mm6, 8 // | D8*256 | D7*256 | D6*256 | D5*256 |
1737. psubw mm4, mm0 // | D4*255 | D3*255 | D2*255 | D1*255 |
1738. paddw mm4, WordConst // | D4*255+C| D3*255+C| D2*255+C| D1*255+C|
1739. psubw mm6, mm2 // | D8*255 | D7*255 | D6*255 | D5*255 |
1740. paddw mm6, WordConst // | D8*255+C| D7*255+C| D6*255+C| D5*255+C|
1741. paddw mm4, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 255)
1742. paddw mm6, mm3 // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 255)
1743. movq mm3, mm4 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 255)
1744. movq mm5, mm6 // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 255)
  1745. psrlw mm4, 8 // tmp3 = tmp1 shr 8 (move high byte to low byte)
  1746. psrlw mm6, 8 // tmp4 = tmp2 shr 8 (move high byte to low byte)
  1747. paddw mm4, mm3 // tmp3 = tmp3 + tmp1
  1748. paddw mm6, mm5 // tmp4 = tmp4 + tmp2
  1749. psrlw mm4, 8 // tmp3 = tmp3 shr 8 (move high byte to low byte)
  1750. psrlw mm6, 8 // tmp4 = tmp4 shr 8 (move high byte to low byte)
1751. add edi, 8 // pDst += 8 bytes;
1752. packuswb mm4, mm6 // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
1753. add esi, 8 // pSrc += 8 bytes;
  1754. movq [edi-8], mm4
  1755. dec ecx
  1756. jnz Do1QUAD
  1757. NoQuadsLeft: // Step 5:
  1758. // This tests for 0 to 7 bytes left in row - eax contains initial byte count
  1759. and eax, 7 // 0 to 7 bytes left to do
  1760. jz Done
  1761. cmp eax, 4 // Test to see if there are 4 bytes to do
  1762. mov ecx, eax
  1763. jae Do1DWORD // if yes, do 4 bytes
  1764. jmp OneToThreeBytesLeft // if no, do 1 to 3 bytes
  1765. // Step 6:
1766. Do1DWORD: // could be made a macro if moved to an asm file
  1767. movd mm0, [edi] // | 0 | 0 | 0 | 0 | D4 | D3 | D2 | D1 |
  1768. pxor mm7, mm7
  1769. movd mm1, [esi] // | 0 | 0 | 0 | 0 | S4 | S3 | S2 | S1 |
  1770. punpcklbw mm0, mm7 // | D4 | D3 | D2 | D1 |
  1771. movq mm4, mm0 // | D4 | D3 | D2 | D1 |
  1772. punpcklbw mm1, mm7 // | S4 | S3 | S2 | S1 |
1773. psllw mm4, 8 // | D4*256 | D3*256 | D2*256 | D1*256 |
1774. psubw mm1, mm0 // | S4-D4 | S3-D3 | S2-D2 | S1-D1 |
1775. pmullw mm1, CA // | CA4 | CA3 | CA2 | CA1 |
1776. psubw mm4, mm0 // | D4*255 | D3*255 | D2*255 | D1*255 |
1777. paddw mm4, WordConst // | D4*255+C| D3*255+C| D2*255+C| D1*255+C|
1778. paddw mm4, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 255)
1779. movq mm3, mm4 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 255)
  1780. psrlw mm4, 8 // tmp2 = tmp1 shr 8 (move high byte to low byte)
  1781. paddw mm4, mm3 // tmp2 = tmp2 + tmp1
  1782. psrlw mm4, 8 // tmp2 = tmp2 shr 8 (move high byte to low byte)
1783. add edi, 4 // pDst += 4 bytes;
1784. packuswb mm4, mm4 // | D4 | D3 | D2 | D1 | D4 | D3 | D2 | D1 |
1785. add esi, 4 // pSrc += 4 bytes;
  1786. movd [edi-4], mm4
  1787. sub ecx, 4 // Just did 4 bytes at the beginning or end of a scan line
  1788. jmp QuadAligned // Jump to QuadAligned to determine if there are more bytes to do
  1789. OneToThreeBytesLeft: // Step 7:
  1790. movzx edx,BYTE PTR[edi] // edx = Dest Byte
  1791. movzx ebx,BYTE PTR[esi] // ebx = Src Byte
  1792. movd mm0, edx // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Db |
  1793. pxor mm7, mm7
  1794. movd mm1, ebx // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Sb |
  1795. movq mm2, mm0 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Db |
  1796. psllw mm2, 8 // | 0 | 0 | 0 | 0 | 0 | 0 | Db| 0 |
  1797. psubw mm1, mm0 // | 0 | 0 | 0 | Sb-Db |
  1798. pmullw mm1, CA // | 0 | 0 | 0 | CAb |
1799. psubw mm2, mm0 // | 0 | 0 | 0 | Db*255|
1800. paddw mm2, ByteConst // | 0 | 0 | 0 |Db*255+128|
1801. paddw mm1, mm2 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 255)
1802. movq mm2, mm1 // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 255)
1803. psrlw mm2, 8 // tmp2 = tmp1 shr 8
  1804. paddw mm2, mm1 // tmp2 = tmp2 + tmp1
  1805. psrlw mm2, 8 // tmp2 = tmp2 shr 8
  1806. movd edx, mm2
  1807. mov BYTE PTR[edi], dl
  1808. inc edi
  1809. inc esi
  1810. dec ecx
  1811. jnz OneToThreeBytesLeft
  1812. Done:
1813. emms // could be removed as an optimization; the calling function would then do emms
  1814. }
  1815. }
  1816. #endif