Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2331 lines
80 KiB

  1. /******************************Module*Header*******************************\
  2. * Module Name: tranblt.cxx
  3. *
  4. * Transparent BLT
  5. *
  6. * Created: 21-Jun-1996
  7. * Author: Mark Enstrom [marke]
  8. *
  9. * Copyright (c) 1996-1999 Microsoft Corporation
  10. \**************************************************************************/
  11. #include "precomp.hxx"
  12. #pragma hdrstop
  13. #if !(_WIN32_WINNT >= 0x500)
  14. //
  15. // global memory DC with single scan line 32 bpp DIBSection,
  16. // use protected by semLocal
  17. //
  18. HDC ghdc32Tmp;
  19. HDC ghdc32;
  20. PULONG gpulDIB32;
  21. /**************************************************************************\
  22. * bInitAlpha
  23. *
  24. * Init global scan line DC
  25. *
  26. * Arguments:
  27. *
  28. * none
  29. *
  30. * Return Value:
  31. *
  32. * status
  33. *
  34. * History:
  35. *
  36. * 4/30/1997 Mark Enstrom [marke]
  37. *
  38. \**************************************************************************/
  39. BOOL
  40. bInitAlpha()
  41. {
  42. BOOL bRet = TRUE;
  43. BITMAPINFO bmi32;
  44. bmi32.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
  45. bmi32.bmiHeader.biWidth = SCAN_LINE_DC_WIDTH;
  46. bmi32.bmiHeader.biHeight = 1;
  47. bmi32.bmiHeader.biPlanes = 1;
  48. bmi32.bmiHeader.biBitCount = 32;
  49. bmi32.bmiHeader.biCompression = BI_RGB;
  50. bmi32.bmiHeader.biSizeImage = 0;
  51. bmi32.bmiHeader.biXPelsPerMeter = 0;
  52. bmi32.bmiHeader.biYPelsPerMeter = 0;
  53. bmi32.bmiHeader.biClrUsed = 0;
  54. bmi32.bmiHeader.biClrImportant = 0;
  55. HDC hdc32 = CreateCompatibleDC(NULL);
  56. if (hdc32 != NULL)
  57. {
  58. PULONG pulDIBSrc;
  59. HBITMAP hbmSrc = CreateDIBSection(hdc32,&bmi32,DIB_RGB_COLORS,(PVOID *)&pulDIBSrc,NULL,0);
  60. if (hbmSrc)
  61. {
  62. HBITMAP hbmOld = (HBITMAP)SelectObject(hdc32,hbmSrc);
  63. if (hbmOld != NULL)
  64. {
  65. ghdc32 = hdc32;
  66. ghdc32Tmp = hdc32;
  67. gpulDIB32 = pulDIBSrc;
  68. }
  69. else
  70. {
  71. DeleteDC(hdc32);
  72. DeleteObject(hbmSrc);
  73. bRet = FALSE;
  74. }
  75. }
  76. else
  77. {
  78. DeleteDC(hdc32);
  79. bRet = FALSE;
  80. }
  81. }
  82. else
  83. {
  84. bRet = FALSE;
  85. }
  86. return(bRet);
  87. }
  88. /**************************************************************************\
  89. * CleanupGlobals
  90. *
  91. * Free any global DIBsections, DCs, etc. from initialization.
  92. *
  93. * Arguments:
  94. * none.
  95. *
  96. * Return Value:
  97. * none.
  98. *
  99. * History:
  100. *
  101. * 1/19/2000 Donald Chinn [DChinn]
  102. *
  103. \**************************************************************************/
  104. VOID CleanupGlobals()
  105. {
  106. ASSERTGDI(ghdc32 == ghdc32Tmp, "ghdc32Tmp is still being used.");
  107. if (!ghdc32)
  108. {
  109. DeleteDC(ghdc32);
  110. }
  111. if (!gpulDIB32)
  112. {
  113. DeleteObject(gpulDIB32);
  114. }
  115. return;
  116. }
  117. /**************************************************************************\
  118. * hdcAllocateScanLineDC
  119. *
  120. * allocate tmp scan line DC. try to use fast allocator.
  121. *
  122. * Arguments:
  123. *
  124. * hdcComp - hdc for compatible bitmap
  125. * width - width of scan line
  126. * pulScanLine - return pointer to temp scan line
  127. *
  128. * Return Value:
  129. *
  130. *
  131. *
  132. * History:
  133. *
  134. * 4/30/1997 Mark Enstrom [marke]
  135. *
  136. \**************************************************************************/
  137. HDC
  138. hdcAllocateScanLineDC(
  139. LONG width,
  140. PULONG *pulScanLine
  141. )
  142. {
  143. ASSERTGDI(pulScanLine != NULL,"Scan line pointer must not be NULL");
  144. HDC hdcRet = NULL;
  145. //
  146. // try to acquire global scan line DC
  147. //
  148. if (width <= SCAN_LINE_DC_WIDTH)
  149. {
  150. hdcRet = (HDC)InterlockedExchange((PLONG)&ghdc32Tmp, 0);
  151. }
  152. if (hdcRet != NULL)
  153. {
  154. //
  155. // allocation succeded, assign pointer
  156. //
  157. *pulScanLine = gpulDIB32;
  158. }
  159. else
  160. {
  161. //
  162. // if acquire failed, allocate
  163. //
  164. BITMAPINFO bmi32;
  165. bmi32.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
  166. bmi32.bmiHeader.biWidth = width;
  167. bmi32.bmiHeader.biHeight = 1;
  168. bmi32.bmiHeader.biPlanes = 1;
  169. bmi32.bmiHeader.biBitCount = 32;
  170. bmi32.bmiHeader.biCompression = BI_RGB;
  171. bmi32.bmiHeader.biSizeImage = 0;
  172. bmi32.bmiHeader.biXPelsPerMeter = 0;
  173. bmi32.bmiHeader.biYPelsPerMeter = 0;
  174. bmi32.bmiHeader.biClrUsed = 0;
  175. bmi32.bmiHeader.biClrImportant = 0;
  176. HDC hdc32 = CreateCompatibleDC(NULL);
  177. if (hdc32 != NULL)
  178. {
  179. PULONG pulDIBSrc;
  180. HBITMAP hbmSrc = CreateDIBSection(hdc32,&bmi32,DIB_RGB_COLORS,(PVOID *)&pulDIBSrc,NULL,0);
  181. if (hbmSrc)
  182. {
  183. HBITMAP hbmOld = (HBITMAP)SelectObject(hdc32,hbmSrc);
  184. if (hbmOld != NULL)
  185. {
  186. hdcRet = hdc32;
  187. *pulScanLine = pulDIBSrc;
  188. }
  189. else
  190. {
  191. DeleteDC(hdc32);
  192. DeleteObject(hbmSrc);
  193. }
  194. }
  195. else
  196. {
  197. DeleteDC(hdc32);
  198. }
  199. }
  200. }
  201. return(hdcRet);
  202. }
  203. /**************************************************************************\
  204. * vFreeScanLineDC
  205. *
  206. * free tmp scan line dc and dibsection
  207. *
  208. * Arguments:
  209. *
  210. * hdcFree - scan line DC
  211. *
  212. * Return Value:
  213. *
  214. * none
  215. *
  216. * History:
  217. *
  218. * 4/30/1997 Mark Enstrom [marke]
  219. *
  220. \**************************************************************************/
  221. VOID
  222. vFreeScanLineDC(
  223. HDC hdcFree
  224. )
  225. {
  226. ASSERTGDI(hdcFree != NULL,"vFreeScanLineDC: DC can't be NULL");
  227. if (hdcFree == ghdc32)
  228. {
  229. //
  230. // release global hdc
  231. //
  232. ghdc32Tmp = ghdc32;
  233. }
  234. else
  235. {
  236. //
  237. // free allocated DC and bitmap
  238. //
  239. HBITMAP hbmOld = (HBITMAP)GetCurrentObject(hdcFree,OBJ_BITMAP);
  240. DeleteDC(hdcFree);
  241. if (hbmOld)
  242. {
  243. DeleteObject(hbmOld);
  244. }
  245. }
  246. }
  247. /**************************************************************************\
  248. * vPixelOver
  249. *
  250. * optimized routine used when the blend function is SRC_OVER and the
  251. * SourceConstantAlpha is 255.
  252. *
  253. * Dst = Src + (1-SrcAlpha) * Dst
  254. *
  255. * Arguments:
  256. *
  257. * ppixDst - address of dst pixel
  258. * ppixSrc - address of src pixel
  259. * cx - number of pixels in scan line
  260. * BlendFunction - blend to be done on each pixel
  261. * pwrMask - set each byte to 0 for pixel that doesn't need
  262. * to be written to dst
  263. *
  264. * Return Value:
  265. *
  266. * none
  267. *
  268. * History:
  269. *
  270. * 1/23/1997 Mark Enstrom [marke]
  271. *
  272. \**************************************************************************/
  273. #if !defined(_X86_)
  274. VOID
  275. vPixelOver(
  276. ALPHAPIX *ppixDst,
  277. ALPHAPIX *ppixSrc,
  278. LONG cx,
  279. BLENDFUNCTION BlendFunction,
  280. PBYTE pwrMask
  281. )
  282. {
  283. ALPHAPIX pixSrc;
  284. ALPHAPIX pixDst;
  285. while (cx--)
  286. {
  287. pixSrc = *ppixSrc;
  288. if (pixSrc.pix.a != 0)
  289. {
  290. pixDst = *ppixDst;
  291. if (pixSrc.pix.a == 255)
  292. {
  293. pixDst = pixSrc;
  294. }
  295. else
  296. {
  297. //
  298. // Dst = Src + (1-SrcAlpha) * Dst
  299. //
  300. ULONG Multa = 255 - pixSrc.pix.a;
  301. ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8;
  302. ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff);
  303. ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
  304. ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;
  305. ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
  306. ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;
  307. ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
  308. ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;
  309. pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB;
  310. }
  311. *ppixDst = pixDst;
  312. }
  313. else
  314. {
  315. *pwrMask = 0;
  316. }
  317. pwrMask++;
  318. ppixSrc++;
  319. ppixDst++;
  320. }
  321. }
  322. #endif
  323. /**************************************************************************\
  324. * vPixelBlendOrDissolveOver
  325. *
  326. * Blend routine when the blend function is SRC_OVER, but when
  327. * SourceConstantAlpah != 255 and The source bitmap does have alpha values
  328. *
  329. * if SrcAlpha == 255 then
  330. * (Blend)
  331. * Dst = Dst + ConstAlpha * (Src - Dst)
  332. *
  333. * else
  334. * (Dissolve)
  335. * Src = Src * ConstAlpha
  336. * (Over)
  337. * Dst = Src + (1 - SrcAlpha) Dst
  338. *
  339. * Arguments:
  340. *
  341. * ppixDst - address of dst pixel
  342. * ppixSrc - address of src pixel
  343. * cx - number of pixels in scan line
  344. * BlendFunction - blend to be done on each pixel
  345. * pwrMask - set each byte to 0 for pixel that doesn't need
  346. * to be written to dst
  347. *
  348. * Return Value:
  349. *
  350. * None
  351. *
  352. * History:
  353. *
  354. * 3/12/1997 Mark Enstrom [marke]
  355. *
  356. \**************************************************************************/
  357. VOID
  358. vPixelBlendOrDissolveOver(
  359. ALPHAPIX *ppixDst,
  360. ALPHAPIX *ppixSrc,
  361. LONG cx,
  362. BLENDFUNCTION BlendFunction,
  363. PBYTE pwrMask
  364. )
  365. {
  366. ALPHAPIX pixSrc;
  367. ALPHAPIX pixDst;
  368. BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
  369. while (cx--)
  370. {
  371. pixSrc = *ppixSrc;
  372. if (pixSrc.pix.a != 0)
  373. {
  374. pixDst = *ppixDst;
  375. if (pixSrc.pix.a == 255)
  376. {
  377. //
  378. // Blend: D = sA * S + (1-sA) * D
  379. //
  380. // red and blue
  381. //
  382. ULONG uB00rr00bb = pixDst.ul & 0x00ff00ff;
  383. ULONG uF00rr00bb = pixSrc.ul & 0x00ff00ff;
  384. ULONG uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
  385. (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
  386. ULONG uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
  387. ULONG uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;
  388. //
  389. // alpha and green
  390. //
  391. ULONG uB00aa00gg = (pixDst.ul >> 8) & 0xff00ff;
  392. ULONG uF00aa00gg = (pixSrc.ul >> 8) & 0xff00ff;
  393. ULONG uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) +
  394. (ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080;
  395. ULONG uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8;
  396. ULONG uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00;
  397. pixDst.ul = uD00rr00bb + uDaa00gg00;
  398. }
  399. else
  400. {
  401. //
  402. // disolve
  403. //
  404. ULONG ul_B_00AA00GG = (pixSrc.ul & 0xff00ff00) >> 8;
  405. ULONG ul_B_00RR00BB = (pixSrc.ul & 0x00ff00ff);
  406. ULONG ul_T_AAAAGGGG = ul_B_00AA00GG * ConstAlpha + 0x00800080;
  407. ULONG ul_T_RRRRBBBB = ul_B_00RR00BB * ConstAlpha + 0x00800080;
  408. ULONG ul_T_00AA00GG = (ul_T_AAAAGGGG & 0xFF00FF00) >> 8;
  409. ULONG ul_T_00RR00BB = (ul_T_RRRRBBBB & 0xFF00FF00) >> 8;
  410. ULONG ul_C_AA00GG00 = ((ul_T_AAAAGGGG + ul_T_00AA00GG) & 0xFF00FF00);
  411. ULONG ul_C_00RR00BB = ((ul_T_RRRRBBBB + ul_T_00RR00BB) & 0xFF00FF00) >> 8;
  412. pixSrc.ul = (ul_C_AA00GG00 | ul_C_00RR00BB);
  413. //
  414. // over
  415. //
  416. ULONG Multa = 255 - pixSrc.pix.a;
  417. ULONG _D1_00AA00GG = (pixDst.ul & 0xff00ff00) >> 8;
  418. ULONG _D1_00RR00BB = (pixDst.ul & 0x00ff00ff);
  419. ULONG _D2_AAAAGGGG = _D1_00AA00GG * Multa + 0x00800080;
  420. ULONG _D2_RRRRBBBB = _D1_00RR00BB * Multa + 0x00800080;
  421. ULONG _D3_00AA00GG = (_D2_AAAAGGGG & 0xff00ff00) >> 8;
  422. ULONG _D3_00RR00BB = (_D2_RRRRBBBB & 0xff00ff00) >> 8;
  423. ULONG _D4_AA00GG00 = (_D2_AAAAGGGG + _D3_00AA00GG) & 0xFF00FF00;
  424. ULONG _D4_00RR00BB = ((_D2_RRRRBBBB + _D3_00RR00BB) & 0xFF00FF00) >> 8;
  425. pixDst.ul = pixSrc.ul + _D4_AA00GG00 + _D4_00RR00BB;
  426. }
  427. *ppixDst = pixDst;
  428. }
  429. else
  430. {
  431. *pwrMask = 0;
  432. }
  433. pwrMask++;
  434. ppixSrc++;
  435. ppixDst++;
  436. }
  437. }
  438. #if !defined(_X86_)
  439. /******************************Public*Routine******************************\
  440. * vPixelBlend
  441. *
  442. * Blend function used then BlendFunction is SRC_OVER and
  443. * SourceConstantAlpha != 255, and Src image does NOT have
  444. * it's own alpha channel. (assume 255)
  445. *
  446. * Dst = Dst + ConstAlpha * (Src - Dst)
  447. *
  448. * Arguments:
  449. *
  450. * ppixDst - address of dst pixel
  451. * ppixSrc - address of src pixel
  452. * cx - number of pixels in scan line
  453. * BlendFunction - blend to be done on each pixel
  454. * pwrMask - set each byte to 0 for pixel that doesn't need
  455. * to be written to dst
  456. *
  457. * Return Value:
  458. *
  459. * None
  460. *
  461. * History:
  462. *
  463. * 12/2/1996 Mark Enstrom [marke]
  464. *
  465. \**************************************************************************/
  466. VOID
  467. vPixelBlend(
  468. ALPHAPIX *ppixDst,
  469. ALPHAPIX *ppixSrc,
  470. LONG cx,
  471. BLENDFUNCTION BlendFunction,
  472. PBYTE pwrMask
  473. )
  474. {
  475. PULONG pulSrc = (PULONG)ppixSrc;
  476. PULONG pulDst = (PULONG)ppixDst;
  477. PULONG pulSrcEnd = pulSrc + cx;
  478. BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
  479. //
  480. // Blend: D = sA * S + (1-sA) * D
  481. //
  482. while (pulSrc != pulSrcEnd)
  483. {
  484. ULONG ulDst = *pulDst;
  485. ULONG ulSrc = *pulSrc;
  486. ULONG uB00rr00bb = ulDst & 0x00ff00ff;
  487. ULONG uF00rr00bb = ulSrc & 0x00ff00ff;
  488. ULONG uMrrrrbbbb;
  489. ULONG uM00rr00bb;
  490. ULONG uD00rr00bb;
  491. ULONG uB00aa00gg;
  492. ULONG uF00aa00gg;
  493. ULONG uMaaaagggg;
  494. ULONG uM00aa00gg;
  495. ULONG uDaa00gg00;
  496. //
  497. // red and blue
  498. //
  499. uB00rr00bb = ulDst & 0x00ff00ff;
  500. uF00rr00bb = ulSrc & 0x00ff00ff;
  501. uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
  502. (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
  503. uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
  504. uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;
  505. //
  506. // alpha and green
  507. //
  508. uB00aa00gg = (ulDst >> 8) & 0xff00ff;
  509. uF00aa00gg = (ulSrc >> 8) & 0xff00ff;
  510. uMaaaagggg = ((uB00aa00gg <<8)-uB00aa00gg) +
  511. (ConstAlpha * (uF00aa00gg-uB00aa00gg)) + 0x00800080;
  512. uM00aa00gg = (uMaaaagggg & 0xff00ff00)>>8;
  513. uDaa00gg00 = (uMaaaagggg + uM00aa00gg) & 0xff00ff00;
  514. *pulDst = uD00rr00bb + uDaa00gg00;
  515. pulSrc++;
  516. pulDst++;
  517. }
  518. }
  519. #endif
  520. /******************************Public*Routine******************************\
  521. * vPixelBlend24
  522. *
  523. * Blend two 24 bpp images with a constant alpha value
  524. *
  525. * Arguments:
  526. *
  527. * pixDst,
  528. * pixSrc,
  529. * cx,
  530. * BlendFunction
  531. * pwrMask
  532. *
  533. * Return Value:
  534. *
  535. *
  536. *
  537. * History:
  538. *
  539. * 12/2/1996 Mark Enstrom [marke]
  540. *
  541. \**************************************************************************/
  542. VOID
  543. vPixelBlend24(
  544. ALPHAPIX *ppixDst,
  545. ALPHAPIX *ppixSrc,
  546. LONG cx,
  547. BLENDFUNCTION BlendFunction,
  548. PBYTE pwrMask
  549. )
  550. {
  551. BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;
  552. PBYTE pjSrc = (PBYTE)ppixSrc;
  553. PBYTE pjDst = (PBYTE)ppixDst;
  554. PBYTE pjSrcEnd = pjSrc + 3*cx;
  555. while (pjSrc != pjSrcEnd)
  556. {
  557. ULONG ulDst = (*pjDst) << 16;
  558. ULONG ulSrc = (*pjSrc) << 16;
  559. ULONG uB00rr00bb;
  560. ULONG uF00rr00bb;
  561. ULONG uMrrrrbbbb;
  562. ULONG uM00rr00bb;
  563. ULONG uD00rr00bb;
  564. ULONG uB000000gg;
  565. ULONG uF000000gg;
  566. ULONG uM0000gggg;
  567. ULONG uM000000gg;
  568. ULONG uD000000gg;
  569. //
  570. // red and blue
  571. //
  572. uB00rr00bb = uB00rr00bb = ulDst | (*(pjDst+1));
  573. uF00rr00bb = uF00rr00bb = ulSrc | (*(pjSrc+1));
  574. uMrrrrbbbb = ((uB00rr00bb<<8)-uB00rr00bb) +
  575. (ConstAlpha * (uF00rr00bb - uB00rr00bb)) + 0x00800080;
  576. uM00rr00bb = (uMrrrrbbbb & 0xff00ff00) >> 8;
  577. uD00rr00bb = ((uMrrrrbbbb+uM00rr00bb) & 0xff00ff00)>>8;
  578. //
  579. // green
  580. //
  581. uB000000gg = *(pjDst+2);
  582. uF000000gg = *(pjSrc+2);
  583. uM0000gggg = ((uB000000gg <<8)-uB000000gg) +
  584. (ConstAlpha * (uF000000gg-uB000000gg)) + 0x00000080;
  585. uM000000gg = (uM0000gggg & 0x0000ff00)>>8;
  586. uD000000gg = ((uM0000gggg + uM000000gg) & 0x0000ff00) >> 8;
  587. *pjDst = (BYTE)(uD00rr00bb >> 16);
  588. *(pjDst+1) = (BYTE)(uD00rr00bb);
  589. *(pjDst+2) = (BYTE)(uD000000gg);
  590. pjSrc+=3;
  591. pjDst+=3;
  592. }
  593. }
  594. #if defined(_X86_)
  595. typedef unsigned __int64 QWORD;
  596. /**************************************************************************
  597. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  598. DO NOT CALL THIS FUNCTION WITH WIDTH == 0
  599. This function operates on 32 bit pixels (BGRA) in a row of a bitmap.
  600. This function performs the following:
  601. SrcTran = 255 - pixSrc.a
  602. pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255);
  603. pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255);
  604. pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255);
  605. pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255);
  606. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  607. Step 1:
  608. Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
  609. as a DWORD, then do Step 2.
  610. Step 2:
  611. QuadAligned
  612. pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
  613. pixel left, do as a DWORD.
  614. Step 3:
  615. Load two source pixels, S1 and S2. Get (255 - alpha value) for each source pixel, 255-S1a and 255-S2a.
  616. Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register.
  617. Load two destination pixels, D1 and D2. Expand each byte in D1 into four words
  618. of an MMX register. If at least four pixels can be done, do Step 4. If not, jump over
  619. FourPixelsPerPass and finish doing two pixels at TwoPixelsLeft, Step 5.
  620. Step 4:
  621. FourPixelsPerPass
  622. Expand each byte in D2 into four words of an MMX register. Multiply each byte
  623. of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate result
  624. of both pixels. Copy the results of each pixel into an MMX register. Shift each result of
  625. both pixels by 8. Add the shifted results to the copied results. Shift these results by 8.
  626. Pack the results into one MMX register. Add the packed results to the source pixels. Store result
  627. over destination pixels. Stay in FourPixelsPerPass loop until there are less than four pixels to do.
  628. Step 5:
  629. TwoPixelsLeft
  630. Do same as Step 4 above; but do not loop.
  631. Step 6:
  632. OnePixelLeft
  633. If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
  634. **************************************************************************/
  635. VOID
  636. mmxPixelOver(
  637. ALPHAPIX *pDst,
  638. ALPHAPIX *pSrc,
  639. LONG Width,
  640. BLENDFUNCTION BlendFunction,
  641. PBYTE pwrMask)
  642. {
  643. static QWORD W128 = 0x0080008000800080;
  644. static QWORD AlphaMask = 0x000000FF000000FF;
  645. _asm
  646. {
  647. mov esi, pSrc
  648. mov edi, pDst
  649. movq mm7, W128 // | 0 | 128 | 0 | 128 | 0 | 128 | 0 | 128 |
  650. // This register never changes
  651. pxor mm6, mm6 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
  652. // This register never changes
  653. mov ecx, Width
  654. // Step 1:
  655. test edi, 7 // Test first pixel for QWORD alignment
  656. jz QuadAligned // if unaligned,
  657. jmp Do1Pixel // do first pixel only
  658. QuadAligned: // Step 2:
  659. mov eax, ecx // Save the width in eax for later (see OnePixelLeft:)
  660. shr ecx, 1 // Want to do 2 pixels (1 quad) at once, so make ecx even
  661. test ecx, ecx // Make sure there is at least 1 quad to do
  662. jz OnePixelLeft // If we take this jump, width was 1 (aligned) or 2 (unaligned)
  663. // Step 3:
  664. movq mm0, [esi] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
  665. psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
  666. pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
  667. movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
  668. punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
  669. movq mm2, [edi] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
  670. punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a |
  671. movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
  672. punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
  673. punpckldq mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
  674. punpcklbw mm2, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
  675. dec ecx
  676. jz TwoPixelsLeft
  677. FourPixelsPerPass: // Step 4:
  678. // Indenting indicates operations on the next set of pixels
  679. // Within this loop, instructions will pair as shown for the Pentium processor
  680. // T1 = 255-S1a T2 = 255-S2a
  681. punpckhbw mm3, mm6 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
  682. pmullw mm2, mm0 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
  683. movq mm0, [esi+8] // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
  684. pmullw mm3, mm1 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
  685. psrld mm0, 24 // | 0 | 0 | 0 | S2a | 0 | 0 | 0 | S1a |
  686. add esi, 8 // pSrc++;
  687. pxor mm0, AlphaMask // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
  688. paddusw mm2, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
  689. paddusw mm3, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
  690. movq mm1, mm0 // | 0 | 0 | 0 |255-S2a| 0 | 0 | 0 |255-S1a|
  691. movq mm4, mm2 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
  692. punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
  693. movq mm5, mm3 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
  694. punpckhwd mm1, mm1 // | 0 | 0 | 255-S2a | 255-S2a |
  695. // TDXx' = TX*DXx+128
  696. psrlw mm2, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
  697. // TDXx" = (TX*DXx+128)+(TDXx'>>8)
  698. psrlw mm3, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
  699. paddusw mm4, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" |
  700. paddusw mm5, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" |
  701. psrlw mm4, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
  702. movq mm2, [edi+8] // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
  703. psrlw mm5, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
  704. movq mm3, mm2 // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
  705. packuswb mm4, mm5 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
  706. paddusb mm4, [esi-8]
  707. punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
  708. movq [edi], mm4
  709. punpckldq mm1, mm1 // | 255-S2a | 255-S2a | 255-S2a | 255-S2a |
  710. punpcklbw mm2, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
  711. add edi, 8 // pDst++;
  712. dec ecx
  713. jnz FourPixelsPerPass
  714. TwoPixelsLeft: // Step 5:
  715. punpckhbw mm3, mm6 // | 0 | D2a | 0 | D2r | 0 | D2g | 0 | D2b |
  716. pmullw mm2, mm0 // | T1*D1a | T1*D1r | T1*D1g | T1*D1b |
  717. pmullw mm3, mm1 // | T2*D2a | T2*D2r | T2*D2g | T2*D2b |
  718. paddusw mm2, mm7 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
  719. paddusw mm3, mm7 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
  720. movq mm4, mm2 // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
  721. movq mm5, mm3 // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
  722. psrlw mm2, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
  723. psrlw mm3, 8 // | TD2a'>>8 | TD2r'>>8 | TD2g'>>8 | TD2b'>>8 |
  724. paddusw mm4, mm2 // | TD1a" | TD1r" | TD1g" | TD1b" |
  725. paddusw mm5, mm3 // | TD2a" | TD2r" | TD2g" | TD2b" |
  726. psrlw mm4, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
  727. psrlw mm5, 8 // | TD2a">>8 | TD2r">>8 | TD2g">>8 | TD2b">>8 |
  728. packuswb mm4, mm5 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
  729. paddusb mm4, [esi]
  730. movq [edi], mm4
  731. add edi, 8
  732. add esi, 8
  733. OnePixelLeft: // Step 6:
  734. // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
  735. // If 0, there were an even number of pixels and we're done
  736. // If 1, there is an odd number of pixels and we need to do one more
  737. test eax, 1
  738. jz Done
  739. Do1Pixel: // make as a macro if used in asm file
  740. // T = 255-S1x
  741. movd mm0, DWORD PTR[esi] // | 0 | 0 | 0 | 0 | S1a | S1r | S1g | S1b |
  742. psrld mm0, 24 // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | S1a |
  743. pxor mm0, AlphaMask // | 0 | 0 | 0 | 255 | 0 | 0 | 0 |255-S1a|
  744. punpcklwd mm0, mm0 // | 0 | 0 | 255-S1a | 255-S1a |
  745. punpckldq mm0, mm0 // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
  746. movd mm1, [edi] // | 0 | 0 | 0 | 0 | D1a | D1r | D1g | D1b |
  747. punpcklbw mm1, mm6 // | 0 | D1a | 0 | D1r | 0 | D1g | 0 | D1b |
  748. pmullw mm0, mm1 // | T*D1a | T*D1r | T*D1g | T*D1b |
  749. paddusw mm0, mm7 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
  750. movq mm1, mm0 // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
  751. psrlw mm0, 8 // | TD1a'>>8 | TD1r'>>8 | TD1g'>>8 | TD1b'>>8 |
  752. paddusw mm0, mm1 // | TD1a" | TD1r" | TD1g" | TD1b" |
  753. psrlw mm0, 8 // | TD1a">>8 | TD1r">>8 | TD1g">>8 | TD1b">>8 |
  754. movd mm1, [esi]
  755. packuswb mm0, mm0 // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
  756. paddusb mm0, mm1
  757. movd [edi], mm0
  758. add edi, 4 // pDst++;
  759. add esi, 4 // pSrc++;
  760. test ecx, ecx
  761. jz Done // just processed the last pixel of the row
  762. dec ecx
  763. jmp QuadAligned // just processed the first pixel of the row
  764. Done:
  765. emms // remove for optimizations, have calling function do emms
  766. }
  767. }
  768. /**************************************************************************\
  769. * mmxPixelBlendOrDissolveOver
  770. *
  771. * Blend routine when the blend function is SRC_OVER, but when
  772. * SourceConstantAlpah != 255 and The source bitmap does have alpha values
  773. *
  774. * if SrcAlpha == 255 then
  775. *
  776. * Dst = Dst + ConstAlpha * (Src - Dst)
  777. *
  778. * else
  779. *
  780. * Src = Src * ConstAlpha
  781. * Dst = Src + (1 - SrcAlpha) Dst
  782. *
  783. * Arguments:
  784. *
  785. * ppixDst - address of dst pixel
  786. * ppixSrc - address of src pixel
  787. * cx - number of pixels in scan line
  788. * BlendFunction - blend to be done on each pixel
  789. * pwrMask - set each byte to 0 for pixel that doesn't need
  790. * to be written to dst
  791. *
  792. * Return Value:
  793. *
  794. * None
  795. *
  796. * History:
  797. *
  798. * 3/12/1997 Mark Enstrom [marke]
  799. *
  800. \**************************************************************************/
  801. /**************************************************************************
  802. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  803. DO NOT CALL THIS FUNCTION WITH WIDTH == 0
  804. This function operates on 32 bit pixels (BGRA) in a row of a bitmap.
  805. This function performs the following:
  806. first,
  807. pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
  808. pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
  809. pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
  810. pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
  811. then,
  812. SrcTran = 255 - pixSrc.a
  813. pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+127)/255);
  814. pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+127)/255);
  815. pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+127)/255);
  816. pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+127)/255);
  817. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  818. Step 1:
  819. Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
  820. as a DWORD, then do Step 2.
  821. Step 2:
  822. QuadAligned
  823. pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
  824. pixel left, do as a DWORD.
  825. Step 3:
  826. Load two source pixels, S1 and S2, as one QWORD. Expand S1 and S2 as four words into two MMX registers.
  827. Multiply each word in S1 and S2 by ConstAlpha. Add 128 to each result of both pixels. Copy the results
  828. of each pixel into an MMX register. Shift each result of both pixels by 8. Add the shifted results
  829. to the copied results. Shift these results by 8. Pack the results into one MMX register...this will
  830. be used later.
  831. Shift the packed results by 24 to get only the alpha value for each pixel.
  832. Step 4:
  833. Get (255 - new alpha value) for each pixel, 255-S1a and 255-S2a.
  834. Copy 255-S1a as four words into an MMX register. Copy 255-S2a as four words into an MMX register.
  835. Load two destination pixels, D1 and D2. Expand D1 and D2 as four words into two MMX registers.
  836. Multiply each byte of D1 by 255-S1a. Multiply each byte of D2 by 255-S2a. Add 128 to each intermediate
  837. result of both pixels. Copy the results of each pixel into an MMX register. Shift each result of
  838. both pixels by 8. Add the shifted results to the copied results. Shift these results by 8.
  839. Pack the results into one MMX register. Add the packed results to the new source pixels saved from
  840. above. Store result over destination pixels. Stay in TwoPixelsAtOnceLoop loop until there is less than
  841. two pixels to do.
  842. Step 5:
  843. OnePixelLeft
  844. If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
  845. **************************************************************************/
//
// mmxPixelBlendOrDissolveOver
//
// MMX scanline worker for 32bpp ARGB pixels.  Each source pixel is first
// scaled by the constant alpha taken from BlendFunction ("dissolve"):
//
//     Src'.x = (ConstAlpha * Src.x + 127) / 255      (all four channels)
//
// and the result is then composited over the destination using the scaled
// source alpha ("over", premultiplied-alpha form):
//
//     Dst.x  = Src'.x + ((255 - Src'.a) * Dst.x + 128) / 255
//
// Two pixels are processed per loop pass once pDst is QWORD aligned; an
// unaligned leading pixel and an odd trailing pixel are each handled as a
// single DWORD (Do1Pixel).  pwrMask is unused by this routine.
//
// NOTE(review): no parameter validation is performed here - per the comment
// block above, callers guarantee valid pointers and Width >= 1.
//
VOID
mmxPixelBlendOrDissolveOver(
    ALPHAPIX *pDst,
    ALPHAPIX *pSrc,
    LONG Width,
    BLENDFUNCTION BlendFunction,
    PBYTE pwrMask
    )
{
    BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;

    //
    // 64-bit constants used by the MMX code below.
    //
    static QWORD W128      = 0x0080008000800080;    // per-word +128 rounding constant
    static QWORD AlphaMask = 0x000000FF000000FF;    // alpha byte of each dword; pxor with it computes 255-a
    static QWORD Zeros     = 0;                     // NOTE(review): appears unused - mm4 is zeroed with pxor instead

    _asm
    {
        mov       esi, pSrc
        mov       edi, pDst

        movq      mm7, W128         // This register never changes
        pxor      mm4, mm4          // This register never changes

        xor       eax, eax
        mov       al, ConstAlpha
        movd      mm5, eax          // |      |      |      |  CA  |
        punpcklwd mm5, mm5          // |      |      |  CA  |  CA  |
        punpcklwd mm5, mm5          // |  CA  |  CA  |  CA  |  CA  |
                                    // This register never changes
        mov       ecx, Width

    // Step 1:
        test      edi, 7            // Test first pixel for QWORD alignment
        jz        QuadAligned       // if unaligned,
        jmp       Do1Pixel          // do first pixel only

    QuadAligned:                    // Step 2:
        mov       eax, ecx          // Save the width in eax for later (see OnePixelLeft:)
        shr       ecx, 1            // Want to do 2 pixels (1 quad) at once, so make ecx even
        test      ecx, ecx          // Make sure there is at least 1 quad to do
        jz        OnePixelLeft      // If we take this jump, width was 1 (aligned) or 2 (unaligned)

    TwoPixelsAtOnceLoop:            // Step 3:
        // Within this loop, instructions will pair as shown for the Pentium processor
        /* Dissolve
            pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
            pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
            pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
            pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
        */
        movq      mm0, [esi]        // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        movq      mm1, mm0          // | S2a | S2r | S2g | S2b | S1a | S1r | S1g | S1b |
        punpcklbw mm0, mm4          // |  0  | S1a |  0  | S1r |  0  | S1g |  0  | S1b |
        punpckhbw mm1, mm4          // |  0  | S2a |  0  | S2r |  0  | S2g |  0  | S2b |
        pmullw    mm0, mm5          // |  CA*S1a   |  CA*S1r   |  CA*S1g   |  CA*S1b   |
        add       esi, 8            // pSrc++;
        pmullw    mm1, mm5          // |  CA*S2a   |  CA*S2r   |  CA*S2g   |  CA*S2b   |
        paddusw   mm1, mm7          // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 |
        paddusw   mm0, mm7          // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
        movq      mm2, mm0          // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
        psrlw     mm0, 8            // | S1a'>>8   | S1r'>>8   | S1g'>>8   | S1b'>>8   |
                                    // S1x' = CA*S1x+128     S2x' = CA*S2x+128
        movq      mm3, mm1          // |CA*S2a+128 |CA*S2r+128 |CA*S2g+128 |CA*S2b+128 |
        psrlw     mm1, 8            // | S2a'>>8   | S2r'>>8   | S2g'>>8   | S2b'>>8   |
                                    // S1x" = (CA*S1x+128)>>8     S2x" = (CA*S2x+128)>>8
        paddusw   mm0, mm2          // |   S1a"    |   S1r"    |   S1g"    |   S1b"    |
        paddusw   mm1, mm3          // |   S2a"    |   S2r"    |   S2g"    |   S2b"    |
        psrlw     mm0, 8            // | S1a">>8   | S1r">>8   | S1g">>8   | S1b">>8   |
                                    // SXx'" = ((CA*SXx+128)>>8)>>8)
        psrlw     mm1, 8            // | S2a">>8   | S2r">>8   | S2g">>8   | S2b">>8   |
        packuswb  mm0, mm1          // |S2a'"|S2r'"|S2g'"|S2b'"|S1a'"|S1r'"|S1g'"|S1b'"|
        movq      mm6, mm0          // keep dissolved source for the final add below
        psrld     mm0, 24           // |  0  |  0  |  0  | S2a |  0  |  0  |  0  | S1a |
        /* Over
            SrcTran  = 255 - pixSrc.a
            pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255);
            pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255);
            pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255);
            pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255);
        */
    // Step 4:
        pxor      mm0, AlphaMask    // |  0  |  0  |  0  |255-S2a|  0  |  0  |  0  |255-S1a|
        movq      mm1, mm0          // |  0  |  0  |  0  |255-S2a|  0  |  0  |  0  |255-S1a|
        punpcklwd mm0, mm0          // |     0     |     0     |  255-S1a  |  255-S1a  |
        movq      mm2, [edi]        // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        punpcklwd mm0, mm0          // |  255-S1a  |  255-S1a  |  255-S1a  |  255-S1a  |
        movq      mm3, mm2          // | D2a | D2r | D2g | D2b | D1a | D1r | D1g | D1b |
        punpckhwd mm1, mm1          // |     0     |     0     |  255-S2a  |  255-S2a  |
        punpcklwd mm1, mm1          // |  255-S2a  |  255-S2a  |  255-S2a  |  255-S2a  |
        punpckhbw mm3, mm4          // |  0  | D2a |  0  | D2r |  0  | D2g |  0  | D2b |
                                    // T1 = 255-S1a     T2 = 255-S2a
        punpcklbw mm2, mm4          // |  0  | D1a |  0  | D1r |  0  | D1g |  0  | D1b |
        pmullw    mm1, mm3          // |  T2*D2a   |  T2*D2r   |  T2*D2g   |  T2*D2b   |
        add       edi, 8            // pDst++;
        pmullw    mm0, mm2          // |  T1*D1a   |  T1*D1r   |  T1*D1g   |  T1*D1b   |
        paddusw   mm0, mm7          // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        paddusw   mm1, mm7          // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
        movq      mm3, mm1          // |T2*D2a+128 |T2*D2r+128 |T2*D2g+128 |T2*D2b+128 |
                                    // TDXx' = TX*DXx+128
        psrlw     mm1, 8            // | TD2a'>>8  | TD2r'>>8  | TD2g'>>8  | TD2b'>>8  |
        movq      mm2, mm0          // |T1*D1a+128 |T1*D1r+128 |T1*D1g+128 |T1*D1b+128 |
        psrlw     mm0, 8            // | TD1a'>>8  | TD1r'>>8  | TD1g'>>8  | TD1b'>>8  |
                                    // TDXx" = (TX*DXx+128)+(TDXx'>>8)
        paddusw   mm1, mm3          // |   TD2a"   |   TD2r"   |   TD2g"   |   TD2b"   |
        paddusw   mm0, mm2          // |   TD1a"   |   TD1r"   |   TD1g"   |   TD1b"   |
        psrlw     mm1, 8            // | TD2a">>8  | TD2r">>8  | TD2g">>8  | TD2b">>8  |
        psrlw     mm0, 8            // | TD1a">>8  | TD1r">>8  | TD1g">>8  | TD1b">>8  |
        packuswb  mm0, mm1          // |TD2a'"|TD2r'"|TD2g'"|TD2b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
                                    // SXx = SXx'"     TDXx = TDXx'"
        paddusb   mm0, mm6          // |S2a+TD2a|S2r+TD2r|S2g+TD2g|S2b+TD2b|S1a+TD1a|S1r+TD1r|S1g+TD1g|S1b+TD1b|
        movq      [edi-8], mm0
        dec       ecx
        jnz       TwoPixelsAtOnceLoop

    OnePixelLeft:                   // Step 5:
        // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
        // If 0, there were an even number of pixels and we're done
        // If 1, there is an odd number of pixels and we need to do one more
        test      eax, 1
        jz        Done

    Do1Pixel:                       // make as a macro if used in asm file
        /* Dissolve
            pixSrc.r = (((ConstAlpha * pixSrc.r)+127)/255);
            pixSrc.g = (((ConstAlpha * pixSrc.g)+127)/255);
            pixSrc.b = (((ConstAlpha * pixSrc.b)+127)/255);
            pixSrc.a = (((ConstAlpha * pixSrc.a)+127)/255);
        */
        movd      mm0, [esi]        // |  0  |  0  |  0  |  0  | S1a | S1r | S1g | S1b |
        punpcklbw mm0, mm4          // |  0  | S1a |  0  | S1r |  0  | S1g |  0  | S1b |
        pmullw    mm0, mm5          // |  CA*S1a   |  CA*S1r   |  CA*S1g   |  CA*S1b   |
        paddusw   mm0, mm7          // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
        movq      mm2, mm0          // |CA*S1a+128 |CA*S1r+128 |CA*S1g+128 |CA*S1b+128 |
                                    // S1x' = CA*S1x+128
        psrlw     mm0, 8            // | S1a'>>8   | S1r'>>8   | S1g'>>8   | S1b'>>8   |
                                    // S1x" = (CA*S1x+128)>>8
        paddusw   mm0, mm2          // |   S1a"    |   S1r"    |   S1g"    |   S1b"    |
        psrlw     mm0, 8            // | S1a">>8   | S1r">>8   | S1g">>8   | S1b">>8   |
        packuswb  mm0, mm0          // |S1a'"|S1r'"|S1g'"|S1b'"|S1a'"|S1r'"|S1g'"|S1b'"|
        movq      mm6, mm0          // keep dissolved source for the final add below
        psrld     mm0, 24           // |  0  |  0  |  0  | S1a |  0  |  0  |  0  | S1a |
        /* Over
            SrcTran  = 255 - pixSrc.a
            pixDst.r = pixSrc.r + (((SrcTran * pixDst.r)+128)/255);
            pixDst.g = pixSrc.g + (((SrcTran * pixDst.g)+128)/255);
            pixDst.b = pixSrc.b + (((SrcTran * pixDst.b)+128)/255);
            pixDst.a = pixSrc.a + (((SrcTran * pixDst.a)+128)/255);
        */
        pxor      mm0, AlphaMask    // |  0  |  0  |  0  |255-S1a|  0  |  0  |  0  |255-S1a|
        punpcklwd mm0, mm0          // |  0  |  0  |  0  |  0  |255-S1a|255-S1a|
        punpckldq mm0, mm0          // | 255-S1a | 255-S1a | 255-S1a | 255-S1a |
        movd      mm2, [edi]        // |  0  |  0  |  0  |  0  | D1a | D1r | D1g | D1b |
        punpcklbw mm2, mm4          // |    D1a    |    D1r    |    D1g    |    D1b    |
                                    // T = 255-S1a
        pmullw    mm0, mm2          // |   T*D1a   |   T*D1r   |   T*D1g   |   T*D1b   |
        paddusw   mm0, mm7          // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
        movq      mm1, mm0          // | T*D1a+128 | T*D1r+128 | T*D1g+128 | T*D1b+128 |
        psrlw     mm0, 8            // | TD1a'>>8  | TD1r'>>8  | TD1g'>>8  | TD1b'>>8  |
        paddusw   mm0, mm1          // |   TD1a"   |   TD1r"   |   TD1g"   |   TD1b"   |
        psrlw     mm0, 8
        packuswb  mm0, mm0          // |TD1a'"|TD1r'"|TD1g'"|TD1b'"|TD1a'"|TD1r'"|TD1g'"|TD1b'"|
        paddusb   mm0, mm6          // dissolved source + attenuated destination
        movd      [edi], mm0
        add       edi, 4            // pDst++;
        add       esi, 4            // pSrc++;
        test      ecx, ecx
        jz        Done              // just processed the last pixel of the row
        dec       ecx
        jmp       QuadAligned       // just processed the first pixel of the row

    Done:
        emms                        // remove for optimizations, have calling function do emms
    }
}
  1010. /**************************************************************************
  1011. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  1012. This function operates on 16 bit pixels (5 for Red, 5 for Green, and 5 for Blue) in a row of a bitmap.
  1013. It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
  1014. The function performs the following on each 5-bit color channel (not on whole bytes):
  1015. tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31)
  1016. tmp2 = tmp1 AND 3E0h (mask off low 5 bits)
  1017. tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
  1018. tmp2 = tmp2 + tmp1
  1019. tmp2 = tmp2 AND 3E0h (mask off low 5 bits)
  1020. tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
  1021. Dst = tmp2
  1022. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  1023. Red and blue are processed together in the same register. Green is processed separately.
  1024. For two pixels at once, the reds and blues for both pixels are processed in the same register; and the
  1025. greens are processed together in a separate register.
  1026. The loop structure is as follows:
  1027. Step 1:
  1028. Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
  1029. as a DWORD (OnePixelLeft:), then do Step 2.
  1030. Step 2:
  1031. (QuadAligned:)
  1032. pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
  1033. pixel left, do as a DWORD.
  1034. Step 3:
  1035. (TwoPixelsAtOnceLoop:)
  1036. Perform the above function, using MMX instructions, on two pixels per pass of the loop.
  1037. Step 4:
  1038. (OnePixelLeft:)
  1039. If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
  1040. **************************************************************************/
//
// mmxPixelBlend16_555
//
// MMX scanline worker for 16bpp 5-5-5 pixels (no per-pixel alpha).  Blends
// source into destination with the constant alpha from BlendFunction; each
// 5-bit channel c is computed as
//
//     tmp1 = Alpha*(Src.c - Dst.c) + 16 + Dst.c*31
//     Dst.c = (((tmp1 & 3E0h) >> 5) + tmp1) & 3E0h >> 5      (approx /31)
//
// Red and blue of both pixels share one MMX register; green is handled in a
// second register.  Two pixels per pass once pDst is QWORD aligned; the
// unaligned leading pixel and an odd trailing pixel go through Do1Pixel.
// pwrMask is unused.  No parameter validation (see comment block above).
//
VOID
mmxPixelBlend16_555(
    PALPHAPIX pDst,
    PALPHAPIX pSrc,
    LONG Width,
    BLENDFUNCTION BlendFunction,
    PBYTE pwrMask
    )
{
    //
    // Channel-extraction masks and rounding constants for 5-5-5 words
    // after punpcklbw (each source byte widened to a word).
    //
    static QWORD RMask   = 0x007C0000007C0000;
    static QWORD GMask   = 0x0000000003E003E0;
    static QWORD BMask   = 0x0000001F0000001F;
    static QWORD RBConst = 0x0010001000100010;
    static QWORD GConst  = 0x0000000000100010;
    static QWORD RGBMask = 0x03E003E003E003E0;
    static QWORD RedMask = 0x001F0000001F0000;
    static QWORD CA;    // ConstAlpha in 4 words of a qword

    BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;

    _asm
    {
        mov       ecx, Width        // Make sure there is at least one pixel to do
        test      ecx, ecx
        jz        Done
        mov       esi, pSrc
        mov       edi, pDst

        xor       eax, eax
        mov       al, ConstAlpha
        movd      mm5, eax          // |      |      |      |  CA  |
        punpcklwd mm5, mm5          // |      |      |  CA  |  CA  |
        punpcklwd mm5, mm5          // |  CA  |  CA  |  CA  |  CA  |
        movq      CA, mm5

    // Step 1:
        test      edi, 7            // Test first pixel for QWORD alignment
        jz        QuadAligned       // if unaligned,
        jmp       Do1Pixel          // do first pixel only

    QuadAligned:                    // Step 2:
        mov       eax, ecx          // Save the width in eax for later (see OnePixelLeft:)
        shr       ecx, 1            // Want to do 2 pixels (1 quad) at once, so make ecx even
        test      ecx, ecx          // Make sure there is at least 1 quad to do
        jz        OnePixelLeft      // If we take this jump, width was 1 (aligned) or 2 (unaligned)

    TwoPixelsAtOnceLoop:            // Step 3:
        movd      mm0, [edi]        // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        pxor      mm7, mm7
        movd      mm1, [esi]        // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        movq      mm2, mm0          // | 0 | 0 | 0 | 0 | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        movq      mm3, mm1          // | 0 | 0 | 0 | 0 | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        punpcklbw mm0, mm7          // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        punpcklbw mm1, mm7          // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        movq      mm4, mm0          // | D2xrrrrrgg | D2gggbbbbb | D1xrrrrrgg | D1gggbbbbb |
        pand      mm0, RMask        // | D20rrrrr00 |     0      | D10rrrrr00 |     0      |
        movq      mm5, mm1          // | S2xrrrrrgg | S2gggbbbbb | S1xrrrrrgg | S1gggbbbbb |
        pand      mm4, BMask        // |     0      | D2000bbbbb |     0      | D1000bbbbb |
        psrlw     mm0, 2            // |  D2rrrrr   |     0      |  D1rrrrr   |     0      |
        pand      mm1, RMask        // | S20rrrrr00 |     0      | S10rrrrr00 |     0      |
        por       mm0, mm4          // |  D2rrrrr   |  D2bbbbb   |  D1rrrrr   |  D1bbbbb   |
        pand      mm5, BMask        // |     0      |  S2bbbbb   |     0      |  S1bbbbb   |
        movq      mm4, mm0          // |  D2rrrrr   |  D2bbbbb   |  D1rrrrr   |  D1bbbbb   |
        pand      mm2, GMask        // |     0      |     0      |D2ggggg00000|D1ggggg00000|
        psllw     mm4, 5            // |D2rrrrr00000|D2bbbbb00000|D1rrrrr00000|D1bbbbb00000|
        pand      mm3, GMask        // |     0      |     0      |S2ggggg00000|S1ggggg00000|
        psrlw     mm1, 2            // |  S2rrrrr   |     0      |  S1rrrrr   |     0      |
        por       mm5, mm1          // |  S2rrrrr   |  S2bbbbb   |  S1rrrrr   |  S1bbbbb   |
        movq      mm6, mm2          // |     0      |     0      |D2ggggg00000|D1ggggg00000|
        psubw     mm5, mm0          // |  S2r-D2r   |  S2b-D2b   |  S1r-D1r   |  S1b-D1b   |
        psrlw     mm2, 5            // |     0      |     0      |  D2ggggg   |  D1ggggg   |
        pmullw    mm5, CA           // |    CA2r    |    CA2b    |    CA1r    |    CA1b    |
        psubw     mm4, mm0          // |   D2r*31   |   D2b*31   |   D1r*31   |   D1b*31   |
        paddw     mm4, RBConst      // | D2r*31+16  | D2b*31+16  | D1r*31+16  | D1b*31+16  |
        psrlw     mm3, 5            // |     0      |     0      |  S2ggggg   |  S1ggggg   |
        psubw     mm3, mm2          // |     0      |     0      |  S2g-D2g   |  S1g-D1g   |
        add       esi, 4            // pSrc++;
        pmullw    mm3, CA           // |     0      |     0      |    CA2g    |    CA1g    |
        paddw     mm4, mm5          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psubw     mm6, mm2          // |     0      |     0      |   D2g*31   |   D1g*31   |
        add       edi, 4            // pDst++;
        paddw     mm6, GConst       // |     0      |     0      | D2g*31+16  | D1g*31+16  |
        movq      mm1, mm4          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        pand      mm4, RGBMask      // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
        paddw     mm6, mm3          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        movq      mm5, mm6          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        psrlw     mm4, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm6, RGBMask      // Gtmp2 = Gtmp1 AND 3E0h (mask off low 5 bits)
        paddw     mm1, mm4          // RBtmp2 = RBtmp2 + RBtmp1
        pand      mm1, RGBMask      // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
        psrlw     mm6, 5            // Gtmp2 = Gtmp2 shr 5 (move high 5 bits to low 5 bits)
        paddw     mm5, mm6          // Gtmp2 = Gtmp2 + Gtmp1
        psrlw     mm1, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm5, RGBMask      // Gtmp2 = Gtmp2 AND 3E0h (mask off low 5 bits)
        movq      mm4, mm1          // copy of RBtmp2
        pand      mm4, RedMask      // Mask to get red
        pand      mm1, BMask        // Mask to get blue
        psllw     mm4, 2            // Line up the red
        por       mm4, mm1          // Combine reds and blues in proper bit location
        packuswb  mm4, mm7          // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb |
        por       mm4, mm5          // | 0 | 0 | 0 | 0 | D20rrrrrgg | D2gggbbbbb | D10rrrrrgg | D1gggbbbbb |
        movd      [edi-4], mm4
        dec       ecx
        jnz       TwoPixelsAtOnceLoop

    OnePixelLeft:                   // Step 4:
        // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
        // If 0, there was an even number of pixels and we're done
        // If 1, there is an odd number of pixels and we need to do one more
        test      eax, 1
        jz        Done

    Do1Pixel:                       // make as a macro if used in asm file
        movzx     edx, WORD PTR[edi]  ; edx = D 0000 0000 0rrr rrgg gggb bbbb
        movzx     ebx, WORD PTR[esi]  ; ebx = S 0000 0000 0rrr rrgg gggb bbbb
        movd      mm0, edx          // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
        pxor      mm7, mm7
        movd      mm1, ebx          // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
        movq      mm2, mm0          // | 0 | 0 | 0 | 0 | 0 | 0 | D1xrrrrrgg | D1gggbbbbb |
        movq      mm3, mm1          // | 0 | 0 | 0 | 0 | 0 | 0 | S1xrrrrrgg | S1gggbbbbb |
        punpcklbw mm0, mm7          // |     0      |     0      | D1xrrrrrgg | D1gggbbbbb |
        punpcklbw mm1, mm7          // |     0      |     0      | S1xrrrrrgg | S1gggbbbbb |
        movq      mm4, mm0          // |     0      |     0      | D1xrrrrrgg | D1gggbbbbb |
        pand      mm0, RMask        // |     0      |     0      | D10rrrrr00 |     0      |
        movq      mm5, mm1          // |     0      |     0      | S1xrrrrrgg | S1gggbbbbb |
        pand      mm4, BMask        // |     0      |     0      |     0      | D1000bbbbb |
        psrlw     mm0, 2            // |     0      |     0      |  D1rrrrr   |     0      |
        pand      mm1, RMask        // |     0      |     0      | S10rrrrr00 |     0      |
        por       mm0, mm4          // |     0      |     0      |  D1rrrrr   |  D1bbbbb   |
        pand      mm5, BMask        // |     0      |     0      |     0      |  S1bbbbb   |
        movq      mm4, mm0          // |     0      |     0      |  D1rrrrr   |  D1bbbbb   |
        pand      mm2, GMask        // |     0      |     0      |     0      |D1ggggg00000|
        psllw     mm4, 5            // |     0      |     0      |D1rrrrr00000|D1bbbbb00000|
        pand      mm3, GMask        // |     0      |     0      |     0      |S1ggggg00000|
        psrlw     mm1, 2            // |     0      |     0      |  S1rrrrr   |     0      |
        por       mm5, mm1          // |     0      |     0      |  S1rrrrr   |  S1bbbbb   |
        movq      mm6, mm2          // |     0      |     0      |     0      |D1ggggg00000|
        // mm1 is free
        psubw     mm5, mm0          // |     0      |     0      |  S1r-D1r   |  S1b-D1b   |
        psrlw     mm2, 5            // |     0      |     0      |     0      |  D1ggggg   |
        pmullw    mm5, CA           // |     0      |     0      |    CA1r    |    CA1b    |
        psubw     mm4, mm0          // |     0      |     0      |   D1r*31   |   D1b*31   |
        paddw     mm4, RBConst      // |     0      |     0      | D1r*31+16  | D1b*31+16  |
        psrlw     mm3, 5            // |     0      |     0      |     0      |  S1ggggg   |
        psubw     mm3, mm2          // |     0      |     0      |     0      |  S1g-D1g   |
        add       esi, 2            // pSrc++;
        pmullw    mm3, CA           // |     0      |     0      |     0      |    CA1g    |
        paddw     mm4, mm5          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psubw     mm6, mm2          // |     0      |     0      |     0      |   D1g*31   |
        add       edi, 2            // pDst++;
        paddw     mm6, GConst       // |     0      |     0      |     0      | D1g*31+16  |
        movq      mm1, mm4          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        pand      mm4, RGBMask      // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
        paddw     mm6, mm3          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        movq      mm5, mm6          // Gtmp1 = Alpha(GSrc - GDst) + 16 + (GDst * 31)
        psrlw     mm4, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm6, RGBMask      // Gtmp2 = Gtmp1 AND 3E0h (mask off low 5 bits)
        paddw     mm1, mm4          // RBtmp2 = RBtmp2 + RBtmp1
        pand      mm1, RGBMask      // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
        psrlw     mm6, 5            // Gtmp2 = Gtmp2 shr 5 (move high 5 bits to low 5 bits)
        paddw     mm5, mm6          // Gtmp2 = Gtmp2 + Gtmp1
        psrlw     mm1, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm5, RGBMask      // Gtmp2 = Gtmp2 AND 3E0h (mask off low 5 bits)
        movq      mm4, mm1          // copy of RBtmp2
        pand      mm4, RedMask      // Mask to get red
        pand      mm1, BMask        // Mask to get blue
        psllw     mm4, 2            // Line up the red
        por       mm4, mm1          // Combine reds and blues in proper bit location
        // NOTE(review): signed pack here vs. packuswb in the two-pixel loop;
        // safe because every word fits in 7 bits, but the asymmetry looks
        // unintentional - confirm before "fixing".
        packsswb  mm4, mm7          // |     0      |     0      | D10rrrrr00 | D1000bbbbb |
        por       mm4, mm5          // |     0      |     0      | D10rrrrrgg | D1gggbbbbb |
        movd      edx, mm4
        mov       [edi-2], dx
        test      ecx, ecx
        jz        Done              // just processed the last pixel of the row
        dec       ecx
        jmp       QuadAligned       // just processed the first pixel of the row

    Done:
        emms                        // remove for optimizations, have calling function do emms
    }
}
  1213. /**************************************************************************
  1214. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  1215. This function operates on 16 bit pixels (5 for Red, 6 for Green, and 5 for Blue) in a row of a bitmap.
  1216. It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
  1217. The function performs the following:
  1218. For red and blue:
  1219. tmp1 = Alpha(Src - Dst) + 16 + (Dst * 31)
  1220. tmp2 = tmp1 AND 3E0h (mask off low 5 bits)
  1221. tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
  1222. tmp2 = tmp2 + tmp1
  1223. tmp2 = tmp2 AND 3E0h (mask off low 5 bits)
  1224. tmp2 = tmp2 shr 5 (move high 5 bits to low 5 bits)
  1225. Dst = tmp2
  1226. For green:
  1227. tmp1 = Alpha(Src - Dst) + 32 + (Dst * 63)
  1228. tmp2 = tmp1 AND FC0h (mask off low 6 bits)
  1229. tmp2 = tmp2 shr 6 (move high 6 bits to low 6 bits)
  1230. tmp2 = tmp2 + tmp1
  1231. tmp2 = tmp2 AND FC0h (mask off low 6 bits)
  1232. tmp2 = tmp2 shr 6 (move high 6 bits to low 6 bits)
  1233. Dst = tmp2
  1234. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  1235. Red and blue are processed together in the same register. Green is processed separately.
  1236. For two pixels at once, the reds and blues for both pixels are processed in the same register; and the
  1237. greens are processed together in a separate register.
  1238. The loop structure is as follows:
  1239. Step 1:
  1240. Check pDst for QWORD alignment. If aligned, do Step 2. If unaligned, do first pixel
  1241. as a DWORD (OnePixelLeft:), then do Step 2.
  1242. Step 2:
  1243. (QuadAligned:)
  1244. pDst is QWORD aligned. If two pixels can be done as a QWORD, do Step 3. If only one
  1245. pixel left, do as a DWORD.
  1246. Step 3:
  1247. (TwoPixelsAtOnceLoop:)
  1248. Perform the above function, using MMX instructions, on two pixels per pass of the loop.
  1249. Step 4:
  1250. (OnePixelLeft:)
  1251. If there is one pixel left (odd number of original pixels) do last pixel as a DWORD.
  1252. **************************************************************************/
//
// mmxPixelBlend16_565
//
// MMX scanline worker for 16bpp 5-6-5 pixels (no per-pixel alpha).  Blends
// source into destination with the constant alpha from BlendFunction.
// Red/blue (5-bit) use the 555 recipe; green (6-bit) uses the wider one:
//
//     R,B: tmp1 = Alpha*(S-D) + 16 + D*31,  fold with 3E0h mask, shr 5
//     G:   tmp1 = Alpha*(S-D) + 32 + D*63,  fold with FC0h mask, shr 6
//
// Red and blue of both pixels share one MMX register; green is handled in a
// second register.  Two pixels per pass once pDst is QWORD aligned; the
// unaligned leading pixel and an odd trailing pixel go through Do1Pixel.
// pwrMask is unused.  No parameter validation (see comment block above).
//
VOID
mmxPixelBlend16_565(
    PALPHAPIX pDst,
    PALPHAPIX pSrc,
    LONG Width,
    BLENDFUNCTION BlendFunction,
    PBYTE pwrMask
    )
{
    //
    // Channel-extraction masks and rounding constants for 5-6-5 words
    // after punpcklbw (each source byte widened to a word).
    //
    static QWORD RMask     = 0x00FF000000FF0000;
    static QWORD GMask     = 0x0000000007E007E0;
    static QWORD BMask     = 0x0000001F0000001F;
    static QWORD RBConst   = 0x0010001000100010;
    static QWORD GConst    = 0x0000000000200020;
    static QWORD RBMask    = 0x03E003E003E003E0;
    static QWORD GreenMask = 0x000000000FC00FC0;
    static QWORD CA;    // ConstAlpha in 4 words of a qword

    BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;

    _asm
    {
        mov       ecx, Width        // Make sure there is at least one pixel to do
        test      ecx, ecx
        jz        Done
        mov       esi, pSrc
        mov       edi, pDst

        xor       eax, eax
        mov       al, ConstAlpha
        movd      mm5, eax          // |      |      |      |  CA  |
        punpcklwd mm5, mm5          // |      |      |  CA  |  CA  |
        punpcklwd mm5, mm5          // |  CA  |  CA  |  CA  |  CA  |
        movq      CA, mm5

    // Step 1:
        test      edi, 7            // Test first pixel for QWORD alignment
        jz        QuadAligned       // if unaligned,
        jmp       Do1Pixel          // do first pixel only

    QuadAligned:                    // Step 2:
        mov       eax, ecx          // Save the width in eax for later (see OnePixelLeft:)
        shr       ecx, 1            // Want to do 2 pixels (1 quad) at once, so make ecx even
        test      ecx, ecx          // Make sure there is at least 1 quad to do
        jz        OnePixelLeft      // If we take this jump, width was 1 (aligned) or 2 (unaligned)

    TwoPixelsAtOnceLoop:            // Step 3:
        movd      mm0, [edi]        // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
        pxor      mm7, mm7
        movd      mm1, [esi]        // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
        movq      mm2, mm0          // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
        movq      mm3, mm1          // | 0 | 0 | 0 | 0 | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
        punpcklbw mm0, mm7          // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
        punpcklbw mm1, mm7          // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
        movq      mm4, mm0          // | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
        pand      mm0, RMask        // | D2rrrrrggg |     0      | D1rrrrrggg |     0      |
        movq      mm5, mm1          // | S2rrrrrggg | S2gggbbbbb | S1rrrrrggg | S1gggbbbbb |
        pand      mm4, BMask        // |     0      | D2000bbbbb |     0      | D1000bbbbb |
        psrlw     mm0, 3            // |  D2rrrrr   |     0      |  D1rrrrr   |     0      |
        pand      mm1, RMask        // | S2rrrrrggg |     0      | S1rrrrrggg |     0      |
        por       mm0, mm4          // |  D2rrrrr   |  D2bbbbb   |  D1rrrrr   |  D1bbbbb   |
        pand      mm5, BMask        // |     0      |  S2bbbbb   |     0      |  S1bbbbb   |
        movq      mm4, mm0          // |  D2rrrrr   |  D2bbbbb   |  D1rrrrr   |  D1bbbbb   |
        pand      mm2, GMask        // |     0      |     0      |D2gggggg00000|D1gggggg00000|
        psllw     mm4, 5            // |D2rrrrr00000|D2bbbbb00000|D1rrrrr00000|D1bbbbb00000|
        pand      mm3, GMask        // |     0      |     0      |S2gggggg00000|S1gggggg00000|
        psrlw     mm1, 3            // |  S2rrrrr   |     0      |  S1rrrrr   |     0      |
        por       mm5, mm1          // |  S2rrrrr   |  S2bbbbb   |  S1rrrrr   |  S1bbbbb   |
        movq      mm6, mm2          // |     0      |     0      |D2gggggg00000|D1gggggg00000|
        psubw     mm5, mm0          // |  S2r-D2r   |  S2b-D2b   |  S1r-D1r   |  S1b-D1b   |
        psrlw     mm2, 5            // |     0      |     0      |  D2gggggg  |  D1gggggg  |
        pmullw    mm5, CA           // |    CA2r    |    CA2b    |    CA1r    |    CA1b    |
        psubw     mm4, mm0          // |   D2r*31   |   D2b*31   |   D1r*31   |   D1b*31   |
        paddw     mm4, RBConst      // | D2r*31+16  | D2b*31+16  | D1r*31+16  | D1b*31+16  |
        psrlw     mm3, 5            // |     0      |     0      |  S2gggggg  |  S1gggggg  |
        psubw     mm3, mm2          // |     0      |     0      |  S2g-D2g   |  S1g-D1g   |
        add       esi, 4            // pSrc++;
        pmullw    mm3, CA           // |     0      |     0      |    CA2g    |    CA1g    |
        psllw     mm6, 1            // |     0      |     0      |D2gggggg000000|D1gggggg000000|
        paddw     mm4, mm5          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psubw     mm6, mm2          // |     0      |     0      |   D2g*63   |   D1g*63   |
        paddw     mm6, GConst       // |     0      |     0      | D2g*63+32  | D1g*63+32  |
        movq      mm1, mm4          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        add       edi, 4            // pDst++;
        psllw     mm3, 1            // |     0      |     0      |   CA2g*2   |   CA1g*2   |
        pand      mm4, RBMask       // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
        paddw     mm6, mm3          // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
        movq      mm5, mm6          // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
        psrlw     mm4, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm6, GreenMask    // Gtmp2 = Gtmp1 AND FC0h (mask off low 6 bits)
        paddw     mm1, mm4          // RBtmp2 = RBtmp2 + RBtmp1
        pand      mm1, RBMask       // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
        psrlw     mm6, 6            // Gtmp2 = Gtmp2 shr 6 (move high 6 bits to low 6 bits)
        paddw     mm5, mm6          // Gtmp2 = Gtmp2 + Gtmp1
        psrlw     mm1, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm5, GreenMask    // Gtmp2 = Gtmp2 AND FC0h (mask off low 6 bits)
        movq      mm4, mm1          // copy of RBtmp2
        pand      mm4, RMask        // Mask to get red
        psrlw     mm5, 1            // Align the green
        pand      mm1, BMask        // Mask to get blue
        psllw     mm4, 3            // Align the red
        por       mm4, mm1          // Combine reds and blues in proper bit location
        packuswb  mm4, mm7          // | 0 | 0 | 0 | 0 | D2rrrrr000 | D2000bbbbb | D1rrrrr000 | D1000bbbbb |
        por       mm4, mm5          // | 0 | 0 | 0 | 0 | D2rrrrrggg | D2gggbbbbb | D1rrrrrggg | D1gggbbbbb |
        movd      [edi-4], mm4
        dec       ecx
        jnz       TwoPixelsAtOnceLoop

    OnePixelLeft:                   // Step 4:
        // This tests for 0 or 1 pixel left in row - eax contains real width, not width/2
        // If 0, there were an even number of pixels and we're done
        // If 1, there is an odd number of pixels and we need to do one more
        test      eax, 1
        jz        Done

    Do1Pixel:                       // make as a macro if used in asm file
        movzx     edx, WORD PTR[edi]  ; edx = D 0000 0000 rrrr rggg gggb bbbb
        movzx     ebx, WORD PTR[esi]  ; ebx = S 0000 0000 rrrr rggg gggb bbbb
        movd      mm0, edx          // | 0 | 0 | 0 | 0 | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
        pxor      mm7, mm7
        movd      mm1, ebx          // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
        movq      mm2, mm0          // | 0 | 0 | 0 | 0 | 0 | 0 | D1rrrrrggg | D1gggbbbbb |
        movq      mm3, mm1          // | 0 | 0 | 0 | 0 | 0 | 0 | S1rrrrrggg | S1gggbbbbb |
        punpcklbw mm0, mm7          // |     0      |     0      | D1rrrrrggg | D1gggbbbbb |
        punpcklbw mm1, mm7          // |     0      |     0      | S1rrrrrggg | S1gggbbbbb |
        movq      mm4, mm0          // |     0      |     0      | D1rrrrrggg | D1gggbbbbb |
        pand      mm0, RMask        // |     0      |     0      | D1rrrrrggg |     0      |
        movq      mm5, mm1          // |     0      |     0      | S1rrrrrggg | S1gggbbbbb |
        pand      mm4, BMask        // |     0      |     0      |     0      | D1000bbbbb |
        psrlw     mm0, 3            // |     0      |     0      |  D1rrrrr   |     0      |
        pand      mm1, RMask        // |     0      |     0      | S1rrrrrggg |     0      |
        por       mm0, mm4          // |     0      |     0      |  D1rrrrr   |  D1bbbbb   |
        pand      mm5, BMask        // |     0      |     0      |     0      |  S1bbbbb   |
        movq      mm4, mm0          // |     0      |     0      |  D1rrrrr   |  D1bbbbb   |
        pand      mm2, GMask        // |     0      |     0      |     0      |D1gggggg00000|
        psllw     mm4, 5            // |     0      |     0      |D1rrrrr00000|D1bbbbb00000|
        pand      mm3, GMask        // |     0      |     0      |     0      |S1gggggg00000|
        psrlw     mm1, 3            // |     0      |     0      |  S1rrrrr   |     0      |
        por       mm5, mm1          // |     0      |     0      |  S1rrrrr   |  S1bbbbb   |
        movq      mm6, mm2          // |     0      |     0      |     0      |D1gggggg00000|
        psubw     mm5, mm0          // |     0      |     0      |  S1r-D1r   |  S1b-D1b   |
        psrlw     mm2, 5            // |     0      |     0      |     0      |  D1gggggg  |
        pmullw    mm5, CA           // |     0      |     0      |    CA1r    |    CA1b    |
        psubw     mm4, mm0          // |     0      |     0      |   D1r*31   |   D1b*31   |
        paddw     mm4, RBConst      // |     0      |     0      | D1r*31+16  | D1b*31+16  |
        psrlw     mm3, 5            // |     0      |     0      |     0      |  S1gggggg  |
        psubw     mm3, mm2          // |     0      |     0      |     0      |  S1g-D1g   |
        add       esi, 2            // pSrc++;
        pmullw    mm3, CA           // |     0      |     0      |     0      |    CA1g    |
        paddw     mm4, mm5          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psllw     mm6, 1            // |     0      |     0      |     0      |D1gggggg000000|
        psubw     mm6, mm2          // |     0      |     0      |     0      |   D1g*63   |
        add       edi, 2            // pDst++;
        paddw     mm6, GConst       // |     0      |     0      |     0      | D1g*63+32  |
        movq      mm1, mm4          // RBtmp1 = Alpha(RBSrc - RBDst) + 16 + (RBDst * 31)
        psllw     mm3, 1            // |     0      |     0      |     0      |   CA1g*2   |
        pand      mm4, RBMask       // RBtmp2 = RBtmp1 AND 3E0h (mask off low 5 bits)
        paddw     mm6, mm3          // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
        movq      mm5, mm6          // Gtmp1 = Alpha(GSrc - GDst) + 32 + (GDst * 63)
        psrlw     mm4, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm6, GreenMask    // Gtmp2 = Gtmp1 AND FC0h (mask off low 6 bits)
        paddw     mm1, mm4          // RBtmp2 = RBtmp2 + RBtmp1
        pand      mm1, RBMask       // RBtmp2 = RBtmp2 AND 3E0h (mask off low 5 bits)
        psrlw     mm6, 6            // Gtmp2 = Gtmp2 shr 6 (move high 6 bits to low 6 bits)
        paddw     mm5, mm6          // Gtmp2 = Gtmp2 + Gtmp1
        psrlw     mm1, 5            // RBtmp2 = RBtmp2 shr 5 (move high 5 bits to low 5 bits)
        pand      mm5, GreenMask    // Gtmp2 = Gtmp2 AND FC0h (mask off low 6 bits)
        movq      mm4, mm1          // copy of RBtmp2
        pand      mm4, RMask        // Mask to get red
        psrlw     mm5, 1            // Align the green
        pand      mm1, BMask        // Mask to get blue
        psllw     mm4, 3            // Align the red
        por       mm4, mm1          // Combine reds and blues in proper bit location
        packuswb  mm4, mm7          // |     0      |     0      | D1rrrrr000 | D1000bbbbb |
        por       mm4, mm5          // |     0      |     0      | D1rrrrrggg | D1gggbbbbb |
        movd      edx, mm4
        mov       [edi-2], dx
        test      ecx, ecx
        jz        Done              // just processed the last pixel of the row
        dec       ecx
        jmp       QuadAligned       // just processed the first pixel of the row

    Done:
        emms                        // remove for optimizations, have calling function do emms
    }
}
  1430. /**************************************************************************
  1431. THIS FUNCTION DOES NOT DO ANY PARAMETER VALIDATION
  1432. This function operates on 24 bit pixels (8 bits each for Red, Green, and Blue) in a row of a bitmap.
  1433. It blends source and destination bitmaps, without alpha channels, using a constant alpha input.
  1434. The function performs the following on each byte:
  1435. tmp1 = Alpha(Src - Dst) + 128 + (Dst * 127)
  1436. tmp2 = tmp1 AND FF00h (mask off low byte)
  1437. tmp2 = tmp2 shr 8 (move high byte to low byte)
  1438. tmp2 = tmp2 + tmp1
  1439. tmp2 = tmp2 AND FF00h (mask off low byte)
  1440. tmp2 = tmp2 shr 8 (move high byte to low byte)
  1441. Dst = tmp2
  1442. pDst is assumed to be aligned to a DWORD boundary when passed to this function.
  1443. The loop structure is as follows:
  1444. Step 1:
  1445. Multiply width in pixels by 3 to get width in bytes. Byte count is kept in ecx and eax.
  1446. ecx is used as the loop counter.
  1447. Step 2:
  1448. Check pDst for QWORD alignment. If aligned, do Step 3. If unaligned, test to see if there
  1449. are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:) and then do Step 3.
  1450. If no, there are only 3 bytes to do; so do them one at a time (OneToThreeBytesLeft:).
  1451. Step 3:
  1452. (QuadAligned:)
  1453. pDst is QWORD aligned. We want to do 8 bytes (1 quad) at once, so divide byte count by 8 to get loop
  1454. count. If ecx is 0 at this point, there are no more quads to do; so do 0 to 7 bytes (NoQuadsLeft:),
  1455. in Step 5.
  1456. Step 4:
  1457. (Do1QUAD:)
  1458. Perform the above function, using MMX instructions, on 8 bytes per pass of the loop.
  1459. Step 5:
  1460. (NoQuadsLeft:)
  1461. Mask eax with 7 to get the byte count modulo 8, 0 to 7 bytes left. Copy eax into ecx. Test to see
  1462. if there are at least 4 bytes to do...if yes, do four bytes at once (Do1DWORD:); if no, there are
  1463. only 3 bytes to do, so do them one at a time (OneToThreeBytesLeft:).
  1464. Step 6:
  1465. (Do1DWORD:)
  1466. Perform the above function, using MMX instructions, on 4 bytes. Do Step 3 (QuadAligned:) to see if
  1467. there are more bytes to do.
  1468. Step 7:
  1469. (OneToThreeBytesLeft:)
  1470. Do one byte at a time. This will happen if there are less than 4 bytes left to do.
  1471. **************************************************************************/
//
// mmxPixelBlend24 - constant-alpha blend of a 24bpp scan line, one color
// byte at a time, using MMX. Computes D = D + Alpha*(S - D) per byte with
// the divide-by-255 approximated as ((x + 128) + ((x + 128) >> 8)) >> 8
// (see the algorithm description in the comment block above). pwrMask is
// accepted for dispatch-table signature compatibility but is not used by
// this routine. Clobbers mm0-mm7; ends with emms.
//
VOID
mmxPixelBlend24(
    PALPHAPIX pDst,
    PALPHAPIX pSrc,
    LONG Width,
    BLENDFUNCTION BlendFunction,
    PBYTE pwrMask
    )
{
    // 64-bit constants for the rounding/masking steps of the blend
    static QWORD WordConst = 0x0080008000800080;
    static QWORD WordMask = 0xFF00FF00FF00FF00;
    static QWORD ByteConst = 0x0000000000000080;
    static QWORD ByteMask = 0x000000000000FF00;
    static QWORD CA; // ConstAlpha in 4 words of a qword

    BYTE ConstAlpha = BlendFunction.SourceConstantAlpha;

    _asm
    {
        mov       ecx, Width        // Make sure there is at least one pixel to do
        test      ecx, ecx
        jz        Done

        mov       esi, pSrc
        mov       edi, pDst

        // Replicate ConstAlpha into all four words of mm5, save in CA
        xor       eax, eax
        mov       al, ConstAlpha
        movd      mm5, eax          // |    |    |    | CA |
        punpcklwd mm5, mm5          // |    |    | CA | CA |
        punpcklwd mm5, mm5          // | CA | CA | CA | CA |
        movq      CA, mm5

    // Step 1:
        lea       ecx, [2*ecx+ecx]  // NumPixels * 3 bytes/pixel = NumBytes

    // Step 2:
        test      edi, 7            // Test first pixel for QWORD alignment
        jz        QuadAligned       // If unaligned,
        cmp       ecx, 4            // test to see if there are 4 bytes to do
        jae       Do1DWORD          // if yes, do 4 bytes
        jmp       OneToThreeBytesLeft // if no, do 1 to 3 bytes

    QuadAligned:                    // Step 3:
        mov       eax, ecx          // Save the width in eax for later (see NoQuadsLeft:)
        shr       ecx, 3            // Want to do 8 bytes at once, so divide
                                    // byte count by 8 to get loop count
        test      ecx, ecx          // Make sure there is at least 1 QUAD (8 bytes) to do
        jz        NoQuadsLeft       // If we take this jump, there are 0 to 7 bytes left

    Do1QUAD:                        // Step 4:
        // Instructions will pair as shown for the Pentium processor
        movq      mm0, [edi]        // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
        pxor      mm7, mm7
        movq      mm1, [esi]        // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 |
        movq      mm2, mm0          // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
        movq      mm3, mm1          // | S8 | S7 | S6 | S5 | S4 | S3 | S2 | S1 |
        punpcklbw mm0, mm7          // |   D4    |   D3    |   D2    |   D1    |
        movq      mm4, mm0          // |   D4    |   D3    |   D2    |   D1    |
        punpcklbw mm1, mm7          // |   S4    |   S3    |   S2    |   S1    |
        punpckhbw mm2, mm7          // |   D8    |   D7    |   D6    |   D5    |
        psubw     mm1, mm0          // |  S4-D4  |  S3-D3  |  S2-D2  |  S1-D1  |
        pmullw    mm1, CA           // |   CA4   |   CA3   |   CA2   |   CA1   |
        punpckhbw mm3, mm7          // |   S8    |   S7    |   S6    |   S5    |
        psubw     mm3, mm2          // |  S8-D8  |  S7-D7  |  S6-D6  |  S5-D5  |
        movq      mm6, mm2          // |   D8    |   D7    |   D6    |   D5    |
        pmullw    mm3, CA           // |   CA8   |   CA7   |   CA6   |   CA5   |
        psllw     mm4, 8            // | D4*128  | D3*128  | D2*128  | D1*128  |
        psllw     mm6, 8            // | D8*128  | D7*128  | D6*128  | D5*128  |
        psubw     mm4, mm0          // | D4*127  | D3*127  | D2*127  | D1*127  |
        paddw     mm4, WordConst    // | D4*127+C| D3*127+C| D2*127+C| D1*127+C|
        psubw     mm6, mm2          // | D8*127  | D7*127  | D6*127  | D5*127  |
        paddw     mm6, WordConst    // | D8*127+C| D7*127+C| D6*127+C| D5*127+C|
        paddw     mm4, mm1          // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
        paddw     mm6, mm3          // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 127)
        movq      mm3, mm4          // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
        pand      mm4, WordMask     // tmp3 = tmp1 AND FF00h (mask off low bytes)
        movq      mm5, mm6          // tmp2 = Alpha(Src2 - Dst2) + 128 + (Dst2 * 127)
        pand      mm6, WordMask     // tmp4 = tmp2 AND FF00h (mask off low bytes)
        psrlw     mm4, 8            // tmp3 = tmp3 shr 8 (move high byte to low byte)
        psrlw     mm6, 8            // tmp4 = tmp4 shr 8 (move high byte to low byte)
        paddw     mm4, mm3          // tmp3 = tmp3 + tmp1
        pand      mm4, WordMask     // tmp3 = tmp3 AND FF00h (mask off low bytes)
        paddw     mm6, mm5          // tmp4 = tmp4 + tmp2
        pand      mm6, WordMask     // tmp4 = tmp4 AND FF00h (mask off low bytes)
        psrlw     mm4, 8            // tmp3 = tmp3 shr 8 (move high byte to low byte)
        psrlw     mm6, 8            // tmp4 = tmp4 shr 8 (move high byte to low byte)
        add       edi, 8            // pDst++;
        packuswb  mm4, mm6          // | D8 | D7 | D6 | D5 | D4 | D3 | D2 | D1 |
        add       esi, 8            // pSrc++;
        movq      [edi-8], mm4
        dec       ecx
        jnz       Do1QUAD

    NoQuadsLeft:                    // Step 5:
        // This tests for 0 to 7 bytes left in row - eax contains initial byte count
        and       eax, 7            // 0 to 7 bytes left to do
        jz        Done
        cmp       eax, 4            // Test to see if there are 4 bytes to do
        mov       ecx, eax
        jae       Do1DWORD          // if yes, do 4 bytes
        jmp       OneToThreeBytesLeft // if no, do 1 to 3 bytes

    // Step 6:
    Do1DWORD:                       // make as a macro if used in asm file
        movd      mm0, [edi]        // | 0  | 0  | 0  | 0  | D4 | D3 | D2 | D1 |
        pxor      mm7, mm7
        movd      mm1, [esi]        // | 0  | 0  | 0  | 0  | S4 | S3 | S2 | S1 |
        punpcklbw mm0, mm7          // |   D4    |   D3    |   D2    |   D1    |
        movq      mm4, mm0          // |   D4    |   D3    |   D2    |   D1    |
        punpcklbw mm1, mm7          // |   S4    |   S3    |   S2    |   S1    |
        psllw     mm4, 8            // | D4*128  | D3*128  | D2*128  | D1*128  |
        psubw     mm1, mm0          // |  S4-D4  |  S3-D3  |  S2-D2  |  S1-D1  |
        pmullw    mm1, CA           // |   CA4   |   CA3   |   CA2   |   CA1   |
        psubw     mm4, mm0          // | D4*127  | D3*127  | D2*127  | D1*127  |
        paddw     mm4, WordConst    // | D4*127+C| D3*127+C| D2*127+C| D1*127+C|
        paddw     mm4, mm1          // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
        movq      mm3, mm4          // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
        pand      mm4, WordMask     // tmp2 = tmp1 AND FF00h (mask off low bytes)
        psrlw     mm4, 8            // tmp2 = tmp2 shr 8 (move high byte to low byte)
        paddw     mm4, mm3          // tmp2 = tmp2 + tmp1
        pand      mm4, WordMask     // tmp2 = tmp2 AND FF00h (mask off low bytes)
        psrlw     mm4, 8            // tmp2 = tmp2 shr 8 (move high byte to low byte)
        add       edi, 4            // pDst++;
        packuswb  mm4, mm4          // | D4 | D3 | D2 | D1 | D4 | D3 | D2 | D1 |
        add       esi, 4            // pSrc++;
        movd      [edi-4], mm4
        sub       ecx, 4            // Just did 4 bytes at the beginning or end of a scan line
        jmp       QuadAligned       // Jump to QuadAligned to determine if there are more bytes to do

    OneToThreeBytesLeft:            // Step 7: one byte per pass
        movzx     edx, BYTE PTR[edi] // edx = Dest Byte
        movzx     ebx, BYTE PTR[esi] // ebx = Src Byte
        movd      mm0, edx          // | 0  | 0  | 0  | 0  | 0  | 0  | 0  | Db |
        pxor      mm7, mm7
        movd      mm1, ebx          // | 0  | 0  | 0  | 0  | 0  | 0  | 0  | Sb |
        movq      mm2, mm0          // | 0  | 0  | 0  | 0  | 0  | 0  | 0  | Db |
        psllw     mm2, 8            // | 0  | 0  | 0  | 0  | 0  | 0  | Db | 0  |
        psubw     mm1, mm0          // |    0    |    0    |    0    |  Sb-Db  |
        pmullw    mm1, CA           // |    0    |    0    |    0    |   CAb   |
        psubw     mm2, mm0          // |    0    |    0    |    0    |  Db*127 |
        paddw     mm2, ByteConst    // |    0    |    0    |    0 |Db*127+128 |
        paddw     mm1, mm2          // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
        movq      mm2, mm1          // tmp1 = Alpha(Src1 - Dst1) + 128 + (Dst1 * 127)
        pand      mm2, ByteMask     // tmp2 = tmp1 AND FF00h
        psrlw     mm2, 8            // tmp2 = tmp2 shr 8
        paddw     mm2, mm1          // tmp2 = tmp2 + tmp1
        pand      mm2, ByteMask     // tmp2 = tmp2 AND FF00h
        psrlw     mm2, 8            // tmp2 = tmp2 shr 8
        movd      edx, mm2
        mov       BYTE PTR[edi], dl
        inc       edi
        inc       esi
        dec       ecx
        jnz       OneToThreeBytesLeft

    Done:
        emms                        // remove for optimizations, have calling function do emms
    }
}
  1620. #endif
  1621. /******************************Public*Routine******************************\
  1622. * AlphaScanLineBlend
  1623. *
* Blends source and destination surfaces one scan line at a time.
  1625. *
  1626. * Allocate a scan line buffer for xlate of src to 32BGRA if needed.
  1627. * Allocate a scan line buffer for xlate of dst to 32BGRA if needed.
  1628. * Blend scan line using blend function from pAlphaDispatch
  1629. * Write scan line back to dst (if needed)
  1630. *
* Arguments:
*
* pDst - pointer to dst surface
* pDstRect - Dst output rect
* DeltaDst - dst scan line delta
* pSrc - pointer to src surface
* DeltaSrc - src scan line delta
* pptlSrc - src offset
* pAlphaDispatch - blend data and function pointers
* pDibInfoSrc - source DIB info
* pDibInfoDst - destination DIB info
  1644. *
  1645. * Return Value:
  1646. *
  1647. * ALPHA_COMPLETE: success, written to destination
  1648. * ALPHA_SEND_TEMP: success, must write tmp bmp to dest
  1649. * ALPHA_FAIL: error
  1650. *
  1651. * History:
  1652. *
  1653. * 10/14/1996 Mark Enstrom [marke]
  1654. *
  1655. \**************************************************************************/
  1656. ULONG
  1657. AlphaScanLineBlend(
  1658. PBYTE pDst,
  1659. PRECTL pDstRect,
  1660. ULONG DeltaDst,
  1661. PBYTE pSrc,
  1662. ULONG DeltaSrc,
  1663. PPOINTL pptlSrc,
  1664. PALPHA_DISPATCH_FORMAT pAlphaDispatch,
  1665. PDIBINFO pDibInfoSrc,
  1666. PDIBINFO pDibInfoDst
  1667. )
  1668. {
  1669. //
  1670. // get two scanlines of RGBA data, blend pixels, store
  1671. //
  1672. LONG cx = pDstRect->right - pDstRect->left;
  1673. LONG cy = pDstRect->bottom - pDstRect->top;
  1674. LONG ScanBufferWidth = cx * 4;
  1675. LONG WriteMaskSize = cx;
  1676. LONG AllocationSize = 0;
  1677. ULONG ulSrcBytesPerPixel = pAlphaDispatch->ulSrcBitsPerPixel/8;
  1678. ULONG ulDstBytesPerPixel = pAlphaDispatch->ulDstBitsPerPixel/8;
  1679. PBYTE pjSrcTempScanBuffer = NULL;
  1680. PBYTE pjDstTempScanBuffer = NULL;
  1681. PBYTE pjAlloc = NULL;
  1682. PBYTE pjDstTmp;
  1683. PBYTE pjSrcTmp;
  1684. PBYTE pWriteMask;
  1685. LONG lRet = ALPHA_SEND_TEMP;
  1686. HDC hdc32 = NULL;
  1687. PULONG pulDIBSrc = NULL;
  1688. //
  1689. // if there is a temp dst needed, use dc allocator
  1690. //
  1691. if (pAlphaDispatch->pfnLoadDstAndConvert != NULL)
  1692. {
  1693. hdc32 = hdcAllocateScanLineDC(cx,&pulDIBSrc);
  1694. if (hdc32 == NULL)
  1695. {
  1696. return(ALPHA_FAIL);
  1697. }
  1698. //
  1699. // set temp scan line
  1700. //
  1701. pjDstTempScanBuffer = (PBYTE)pulDIBSrc;
  1702. }
  1703. //
  1704. // calculate destination starting address
  1705. //
  1706. if (ulDstBytesPerPixel)
  1707. {
  1708. pjDstTmp = pDst + ulDstBytesPerPixel * pDstRect->left + DeltaDst * pDstRect->top;
  1709. }
  1710. else if (pAlphaDispatch->ulDstBitsPerPixel == 1)
  1711. {
  1712. pjDstTmp = pDst + pDstRect->left/8 + DeltaDst * pDstRect->top;
  1713. }
  1714. else
  1715. {
  1716. pjDstTmp = pDst + pDstRect->left/2 + DeltaDst * pDstRect->top;
  1717. }
  1718. //
  1719. // calculate source starting address
  1720. //
  1721. if (ulSrcBytesPerPixel)
  1722. {
  1723. pjSrcTmp = pSrc + ulSrcBytesPerPixel * pptlSrc->x + DeltaSrc * pptlSrc->y;
  1724. }
  1725. else if (pAlphaDispatch->ulSrcBitsPerPixel == 1)
  1726. {
  1727. pjSrcTmp = pSrc + pptlSrc->x/8 + DeltaSrc * pptlSrc->y;
  1728. }
  1729. else
  1730. {
  1731. pjSrcTmp = pSrc + pptlSrc->x/2 + DeltaSrc * pptlSrc->y;
  1732. }
  1733. //
  1734. // calculate size of needed scan line buffer
  1735. //
  1736. if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL)
  1737. {
  1738. AllocationSize += ScanBufferWidth;
  1739. }
  1740. AllocationSize += WriteMaskSize;
  1741. //
  1742. // allocate scan line buffer memory
  1743. //
  1744. pWriteMask = (PBYTE)LOCALALLOC(AllocationSize);
  1745. if (pWriteMask != NULL)
  1746. {
  1747. //
  1748. // calc offsets
  1749. //
  1750. PBYTE pjTemp = pWriteMask + WriteMaskSize;
  1751. if (pAlphaDispatch->pfnLoadSrcAndConvert != NULL)
  1752. {
  1753. pjSrcTempScanBuffer = pjTemp;
  1754. pjTemp += ScanBufferWidth;
  1755. }
  1756. //
  1757. // Blend scan lines
  1758. //
  1759. LONG yScan = 0;
  1760. while (cy--)
  1761. {
  1762. PBYTE pjSource = pjSrcTmp;
  1763. PBYTE pjDest = pjDstTmp;
  1764. //
  1765. // get src scan line if needed
  1766. //
  1767. if (pjSrcTempScanBuffer)
  1768. {
  1769. (*pAlphaDispatch->pfnLoadSrcAndConvert)(
  1770. (PULONG)pjSrcTempScanBuffer,
  1771. pjSrcTmp,
  1772. 0,
  1773. cx,
  1774. (PVOID)pDibInfoSrc);
  1775. pjSource = pjSrcTempScanBuffer;
  1776. }
  1777. //
  1778. // get dst scan line if needed
  1779. //
  1780. if (pjDstTempScanBuffer)
  1781. {
  1782. (*pAlphaDispatch->pfnLoadDstAndConvert)(
  1783. (PULONG)pjDstTempScanBuffer,
  1784. pjDstTmp,
  1785. 0,
  1786. cx,
  1787. (PVOID)pDibInfoDst);
  1788. pjDest = pjDstTempScanBuffer;
  1789. }
  1790. //
  1791. // blend
  1792. //
  1793. memset(pWriteMask,1,WriteMaskSize);
  1794. (*pAlphaDispatch->pfnGeneralBlend)(
  1795. (PALPHAPIX)pjDest,
  1796. (PALPHAPIX)pjSource,
  1797. cx,
  1798. pAlphaDispatch->BlendFunction,
  1799. pWriteMask
  1800. );
  1801. //
  1802. // write buffer back if needed
  1803. //
  1804. if (pjDstTempScanBuffer)
  1805. {
  1806. (*pAlphaDispatch->pfnConvertAndStore)(
  1807. pjDstTmp,
  1808. (PULONG)pjDstTempScanBuffer,
  1809. cx,
  1810. 0,
  1811. yScan,
  1812. (PVOID)pDibInfoDst,
  1813. pWriteMask,
  1814. hdc32
  1815. );
  1816. }
  1817. pjDstTmp += DeltaDst;
  1818. pjSrcTmp += DeltaSrc;
  1819. yScan++;
  1820. }
  1821. //
  1822. // free any temp buffer memory
  1823. //
  1824. LOCALFREE(pWriteMask);
  1825. }
  1826. else
  1827. {
  1828. lRet = ALPHA_FAIL;
  1829. }
  1830. if (hdc32)
  1831. {
  1832. vFreeScanLineDC(hdc32);
  1833. }
  1834. if (
  1835. (lRet != ALPHA_FAIL) &&
  1836. (pAlphaDispatch->pfnConvertAndStore == vConvertAndSaveBGRAToDest)
  1837. )
  1838. {
  1839. lRet = ALPHA_COMPLETE;
  1840. }
  1841. return(lRet);
  1842. }
  1843. #endif