Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

820 lines
25 KiB

  1. ;-----------------------------------------------------------------------------
  2. ;
  3. ; Monolith 14. Perspective Correct Bi-linear gouraud modulated
  4. ; 565 input texture NO Z buffered 565 output.
  5. ;
  6. ;
  7. ; Exactly the same as monolith 7 except Z buffer code removed.
  8. ;
  9. ;-----------------------------------------------------------------------------
  10. INCLUDE iammx.inc
  11. INCLUDE offs_acp.inc
  12. ; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
  13. ; at the LSB, then six bits of green, then five bits of red.
  14. ;TBD check to see if this value is correct.
  15. COLOR_SHIFT equ 8 ; right-shift count applied (psrlw) to the span colour words loaded from RASTSPAN_uB before they are multiplied with the filtered texel
  16. .586
  17. .model flat
  18. ; Big separating lines separate code into span code
  19. ; and loop code. If span and loop are not going to
  20. ; end up being combined then it will be easy to
  21. ; separate the code.
  22. .data
  23. ; Need externs for all of the variables that are needed for various beads
  24. EXTERN IncHighandLow16:MMWORD
  25. EXTERN UFracVFracMask:MMWORD
  26. EXTERN UV32to15Mask:MMWORD
  27. EXTERN Makelow16one:MMWORD
  28. EXTERN MaskKeepUValues:MMWORD
  29. EXTERN MaskKeepVValues:MMWORD
  30. EXTERN UFrac:MMWORD
  31. EXTERN VFrac:MMWORD
  32. EXTERN Zero:MMWORD
  33. EXTERN memD3DTFG_POINT:MMWORD
  34. EXTERN GiveUp:MMWORD
  35. EXTERN LastW:MMWORD
  36. EXTERN Val0x000a000a:MMWORD
  37. EXTERN Val0xffff:MMWORD
  38. EXTERN Val0x0000002000000020:MMWORD
  39. EXTERN Val0x0000ffff0000ffff:MMWORD
  40. EXTERN MaskRed565to888:MMWORD
  41. EXTERN MaskGreen565to888:MMWORD
  42. EXTERN MaskBlue565to888:MMWORD
  43. EXTERN MaskRed555to888:MMWORD
  44. EXTERN MaskGreen555to888:MMWORD
  45. EXTERN MaskBlue555to888:MMWORD
  46. EXTERN MaskAlpha1555to8888:MMWORD
  47. EXTERN MaskRed1555to8888:MMWORD
  48. EXTERN MaskGreen1555to8888:MMWORD
  49. EXTERN MaskBlue1555to8888:MMWORD
  50. ; TBD. I think that I want to do 0xffff instead of 0xff. This will
  51. ; have to be checked. There is a value very similar to this in
  52. ; buf write.
  53. EXTERN SetAlphato0xffff:MMWORD
  54. EXTERN SetAlphato0xff:MMWORD
  55. ; TODO These equates are identical to the ones in texread.mas. Maybe they should be in a common .inc file.
  56. RedShift565to888 equ 8 ; 565 group: same counts as the "= 8 / = 5 / = 3" notes on the BilinearShift*565to888 operands in the pixel loop
  57. GreenShift565to888 equ 5
  58. BlueShift565to888 equ 3
  59. RedShift555to888 equ 9 ; 555 group: not referenced by the routine visible in this file
  60. GreenShift555to888 equ 6
  61. BlueShift555to888 equ 3
  62. AlphaShift1555to8888 equ 16 ; 1555 group: not referenced by the routine visible in this file
  63. RedShift1555to8888 equ 9
  64. GreenShift1555to8888 equ 6
  65. BlueShift1555to8888 equ 3
  66. EXTERN BilinearMaskRed565to888:MMWORD
  67. EXTERN BilinearMaskGreen565to888:MMWORD
  68. EXTERN BilinearMaskBlue565to888:MMWORD
  69. EXTERN BilinearShiftRed565to888:MMWORD
  70. EXTERN BilinearShiftGreen565to888:MMWORD
  71. EXTERN BilinearShiftBlue565to888:MMWORD
  72. EXTERN Zero:MMWORD
  73. EXTERN DW_One_One:MMWORD
  74. EXTERN MaskOffAlpha:MMWORD
  75. EXTERN ShiftTA:MMWORD
  76. EXTERN Val0x00ff00ff00ff00ff:MMWORD
  77. EXTERN Val0x000000ff00ff00ff:MMWORD
  78. EXTERN Val0X0000000001000000:MMWORD
  79. EXTERN AlphaVal128:MMWORD
  80. EXTERN RGBVal128:MMWORD
  81. EXTERN g_uDitherValue:MMWORD
  82. EXTERN SetAlphato0xff:MMWORD
  83. EXTERN u888to565RedBlueMask:MMWORD
  84. EXTERN u888to565GreenMask:MMWORD
  85. EXTERN u888to565Multiplier:MMWORD
  86. EXTERN uVal0x000007ff03ff07ff:MMWORD
  87. EXTERN uVal0x0000078003c00780:MMWORD
  88. EXTERN u888to555RedBlueMask:MMWORD
  89. EXTERN u888to555GreenMask:MMWORD
  90. EXTERN u888to555Multiplier:MMWORD
  91. EXTERN uVal0x000007ff07ff07ff:MMWORD
  92. EXTERN uVal0x0000078007800780:MMWORD
  93. ;-----------------------------------------------------------------------------
  94. ; Span Variables
  95. StackPos dd ? ; esp captured right after "push ebp" on entry; reloaded into esp in the epilogue
  96. uSpans dd ? ; spans still to process for the current primitive (zero-extended from RASTPRIM_uSpans, decremented per span)
  97. ;-----------------------------------------------------------------------------
  98. ;-----------------------------------------------------------------------------
  99. ; Loop Variables
  100. iSurfaceStep dd ? ; per-pixel advance added to RASTSPAN_pSurface; negated copy of RASTCTX_iSurfaceStep when D3DI_RASTPRIM_X_DEC is set
  101. uPix dd ? ; pixels left in the current span (zero-extended from RASTSPAN_uPix, decremented per pixel)
  102. ;-----------------------------------------------------------------------------
  103. .code
  ;-----------------------------------------------------------------------------
  ; _MMXMLRast_14
  ;
  ; Monolithic span rasterizer: perspective-correct, bilinear-filtered,
  ; gouraud-modulated 565 texture, written to a 565 surface, no Z buffer.
  ; Walks every primitive in the list at pCtx->pPrim, every span that follows
  ; each primitive, and every pixel of each span.
  ;
  ; In:    [esp+4] = pCtx (fetched through eax right after "push ebp").
  ; Out:   eax = 0 (the C reference returns S_OK).
  ; Saved: ebp, ebx, esi, edi are pushed on entry and popped on exit.
  ; Clobb: ecx, edx, flags, mm0-mm7 (emms is executed before returning).
  ; Stack: "ret" has no operand, so the argument stays on the stack for the
  ;        caller to remove.
  ;
  ; Register roles inside the loops:
  ;   ebx = pCtx   ecx = pP (current RASTPRIM)   ebp = pS (current RASTSPAN)
  ;
  ; The routine keeps esp, the span/pixel counters and several temporaries in
  ; static variables (StackPos, uSpans, uPix, iSurfaceStep, LastW, GiveUp,
  ; UFrac, VFrac), so two invocations running at the same time would overwrite
  ; each other's state.
  ;-----------------------------------------------------------------------------
  PUBLIC _MMXMLRast_14
  _MMXMLRast_14:
        push    ebp
        mov     StackPos, esp
        mov     eax, esp
        sub     esp, 0Ch                ; This will need to change if stack frame size changes.
        push    ebx
        push    esi
        push    edi
  ; Put pCtx into ebx ([eax+0] = saved ebp, [eax+4] = return address)
        mov     ebx, [eax+8]
  ;PD3DI_RASTPRIM pP = pCtx->pPrim;
        mov     ecx, [ebx+RASTCTX_pPrim]
  ;while (pP)
  ;{
  PrimLoop:
        test    ecx, ecx
        jz      ExitPrimLoop
  ;UINT16 uSpans = pP->uSpans;
        movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
        mov     uSpans, eax
  ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
        mov     ebp, ecx
        add     ebp, SIZEOF_RASTPRIM
  ;while (uSpans-- > 0)
  ;{
  SpanLoop:
        mov     edx, uSpans
        mov     eax, edx
        dec     eax
        mov     uSpans, eax             ; store the post-decremented count
        test    edx, edx                ; ...but test the value from before the decrement
        jle     ExitSpanLoop
  ;pCtx->pfnBegin(pCtx, pP, pS);
  ;-----------------------------------------------------------------------------
  ; LoopAny code inserted here. This is to get rid of an extra
  ; jump.
  ;-----------------------------------------------------------------------------
  ; Setup Code begins
  ; get values to iterate
  ;uPix = pS->uPix;
        movzx   eax, word ptr [ebp+RASTSPAN_uPix]
        mov     uPix, eax
  ;pCtx->SI.iDW = 0x0;
        mov     dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0
        mov     esi, [ebp+RASTSPAN_iW]
        movq    mm5, MMWORD PTR [ebp+RASTSPAN_iUoW1]
  ;pCtx->SI.iUd_TexNum = d_WTimesUVoW(pS->iW,pS->iUoW1);
  ;pCtx->SI.iVd_TexNum = d_WTimesUVoW(pS->iW,pS->iVoW1);
        pslld   mm5, 8
        shl     esi, 4
        movd    eax, mm5                ; eax = iUoW1 << 8
        psrlq   mm5, 32
        imul    esi                     ; edx = high dword of (iW << 4) * (iUoW1 << 8)
        mov     [ebx+RASTCTX_SI+SPANITER_iU1], edx
        movd    eax, mm5                ; eax = iVoW1 << 8
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iV1], edx
  ;if (pP->iDOoWDX > 0)
  ;{
        cmp     dword ptr [ecx+RASTPRIM_iDOoWDX], 0
  ; Take the "last pixels" path only when iDOoWDX <= 0, so that the fall-through
  ; path below (-3) runs for iDOoWDX > 0, as in the C reference.
        jle     SpecialWLastMonTest
  ;// iSpecialW should be negative for the first 3 pixels of span
  ;pCtx->SI.iSpecialW = -3;
        mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], -3
        jmp     DoneSpecialWifMonTest
  ;}
  ;else
  ;{
  SpecialWLastMonTest:
  ;// iSpecialW should be negative for the last 3 pixels of span
  ;pCtx->SI.iSpecialW = 0x7fff - uPix;
        mov     eax, 07fffh
        sub     eax, uPix
  ;pCtx->SI.iSpecialW += 5; // this may wrap, but it should
        add     eax, 5
  ; iSpecialW is accessed as a word everywhere else in this routine; store only
  ; the low 16 bits so the bytes that follow the field are left alone.
        mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], ax
  ;}
  DoneSpecialWifMonTest:
  ;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
  ;{
        mov     eax, [ecx+RASTPRIM_uFlags]
        and     eax, D3DI_RASTPRIM_X_DEC ; "and" already sets ZF for the branch
        jz      LeftToRightSpan
  ;iSurfaceStep = -pCtx->iSurfaceStep;
        mov     eax, [ebx+RASTCTX_iSurfaceStep]
        neg     eax
        mov     iSurfaceStep, eax
  ;}
        jmp     DoneSpanDirif
  ;else
  ;{
  LeftToRightSpan:
  ;iSurfaceStep = pCtx->iSurfaceStep;
        mov     eax, [ebx+RASTCTX_iSurfaceStep]
        mov     iSurfaceStep, eax
  ;}
  DoneSpanDirif:
  ; Setup Code Ends
  ; ----------------------------------------------------------------------------------------------------------------
  ; Loop Code Begins
  ;//while (1)
  ;//{
  PixelLoop:
  ; texturecode
  ;---------------------------------------------------------------------------
  ;void Tex1Addr_TexAddrWrapMirror_Persp_Bilinear_NoMip(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  ; PD3DI_RASTSPAN pS)
  ;{
  ;PD3DI_SPANTEX pTex = &pCtx->Texture[0];
        mov     esi, [ebx+RASTCTX_pTexture]
  ; ----------------------------------------
  ; Doing UV calculation a little more accurate
  ; Exactly like C code.
  ; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
  ; (TEX_FINAL_SHIFT - iShiftU0 - 6). iShiftU0 = pTex->iShiftU - iLOD0
  ; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
  ; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
  ; COMMENT1**
  ; If textures have a max of 1024 then shiftU0 would be at most 10 which would
  ; make (TEXT_FINAL_SHIFT - iShiftU - 6) at most zero. This is why I choose 6
  ; It will also give bi-linear 6 bits of precision I think it was said that
  ; only five was needed.
  ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
  ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
        movq    mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
  ;iLOD0 is zero in monolithic case so no subtraction needed.
        movd    mm4, [esi+SPANTEX_iShiftU]
        psubw   mm5, mm4
        movq    mm4, mm5
        pand    mm5, MMWORD PTR Val0xffff ; mm5 = shift count taken from the low word
        psrld   mm4, 16                 ; mm4 = shift count taken from the high word
        movd    mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
        psrad   mm1, mm5
        movd    mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
        psrad   mm2, mm4
        punpckldq mm1, mm2              ; mm1 = V (high dword), U (low dword)
        psubd   mm1, MMWORD PTR Val0x0000002000000020
  ; Texture Pitch cannot be calculated so it must be looked up in the iShiftPitch table
  ; ----------------- Start of hack
  ; ATTENTION This is really hacked right now. Just to get it working
  ; Pitch would be better for me, instead of shift pitch.
  ; With actual pitch, this would be two moves and a shift.
  ;shl eax, 1
        movzx   edx, word ptr [esi+SPANTEX_iShiftPitch]
        add     edx, 16
        movd    mm2, edx
        movq    mm5, MMWORD ptr Makelow16one
        pslld   mm5, mm2
  ;pslld mm5, 16 ;. Use this after hack.
  ; not needed in hacked version since i add to shifted value.
  ; ----------------- End of hack
        por     mm5, MMWORD ptr Makelow16one
  ; Make the low 16 bits of dword one
  ; This helps in calculating texture address.
  ; Gets U and V value into mm1 so that it can be mirrored, wrapped or
  ; clamped. This can be done for two values in the point case
  ; or four values in the bilinear case.
  ;INT32 iUFrac = iU00 & 0x03f;
  ;INT32 iVFrac = iV00 & 0x03f;
  ;iU00 >>= 6;
  ;iV00 >>= 6;
        movq    mm2, mm1
        psrad   mm1, 6
  ;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
        pand    mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
  ; Going to use only 8 bits for bi-linear so that I can do a pmullw.
  ; Currently at 6 bits so shift up by 2.
        psllw   mm2, 2
        movq    mm0, mm2
  ; Replicate VFrac value for bilinear
        punpckhwd mm2, mm2
        punpcklwd mm2, mm2
  ; Replicate UFrac Value for bilinear
        punpcklwd mm0, mm0
        punpcklwd mm0, mm0
        movq    dword ptr VFrac, mm2
        movq    dword ptr UFrac, mm0
  ;INT32 iU01 = iU00 + 1;
  ;INT32 iV01 = iV00 + 1;
        packssdw mm1, mm1               ; replicate U and V value to upper 16 bit locations
        paddw   mm1, dword ptr IncHighandLow16
  ; This will make texture values be (High word to low word):
  ; iV01, iU00, iV00, iU01
  ; Need to do this to make texture look up for bilinear easier.
  ; I have to combine to get all combinations anyway. It just
  ; happens to be better for me to have iV00, iU01 pair first.
  ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
  ; put mask in mm0 and replicate to match location for wrap/mirror/clamp
        movd    mm0, [esi+SPANTEX_uMaskU] ; Load U and V mask
  ; replicate mask if doing bilinear
        punpckldq mm0, mm0
  ; Monolith cases assumed that iLOD0 was zero so no shift needed.
  ;INT16 iFlip;
  ; MM1 should contain 16 bit iU and iV for both texture locations
  ; End Result is MM1 value wrapped or mirrored
  ; in Bilinear Case, four values can be done
  ; iU00, iV00, iU01, iV01
  ; This code really does a lot for the bilinear case and is kinda wasteful
  ; in the normal mode.
  ;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
        movq    mm7, mm1
  ; Point doesn't need replication
        movd    mm4, [esi+SPANTEX_iFlipMaskU]
  ; if bilinear replicate values together, Point doesn't need this.
        punpckldq mm4, mm4
  ; Monolith cases assumed that iLOD0 was zero so no shift needed.
        pand    mm7, mm4
  ;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
        pcmpeqw mm7, MMWORD PTR Zero
  ;iFlip1 = uMaskU0 & ~ iFlip1; ;iFlip2 = uMaskV0 & ~ iFlip2; ;iFlip3 = uMaskU0 & ~ iFlip3; ;iFlip4 = uMaskV0 & ~ iFlip4;
        pandn   mm7, mm0
  ;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
        pand    mm1, mm0
  ;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
        pxor    mm1, mm7
  ; Result in mm1 now since TexAddrAll ends up that way.
  ; Making other two cases for texture addressing has to be simpler than
  ; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
  ; TBD Make this better.
  ; values are still stored as iV01, iU00, iV00, iU01
        movq    mm2, mm1
        movq    mm3, mm1
  ; Calculate address for 1st and 3rd texels
        pmaddwd mm1, mm5                ; Throw in first address calculation.
  ; Just to get it started. Calculate
  ; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
  ; values are being changed to iV01, iU01, iV00, iU00
  ; seven instructions for this seems excessive.
        pand    mm2, MMWORD ptr MaskKeepUValues
        pand    mm3, MMWORD ptr MaskKeepVValues
        movq    mm4, mm2
        psllq   mm2, 32
        psrlq   mm4, 32
        por     mm3, mm2
        por     mm3, mm4
  ; Calculate address for 2nd and 4th texels
        pmaddwd mm3, mm5                ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
        mov     edi, [esi+SPANTEX_pBits]
  ;UINT32 uTex00 = pCtx->pfnTexRead(iU00, iV00, pTex->iShiftU,
  ; pTex->pBits[iLOD0], &pCtx->Texture[0]);
  ; Combine U and V values before making call.
  ; -------------------- In Monolithic version calls are inlined.
  ;D3DCOLOR TexRead_B5G6R5_NoBorder(INT32 iU, INT32 iV, INT32 iShiftU, PUINT8 pBits, PD3DI_SPANTEX pTex)
  ;{
  ; Added code for color converting 2 pixels at a time
  ; movq mm2, MMWORD PTR Zero
        pxor    mm2, mm2                ; mm2 = 0, used below to widen bytes to words
  ; 1st (mm1) and 2nd (mm3) texel
        movd    eax, mm3                ; load 2nd texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm4, eax                ; mm4 = 2nd texel
        movd    eax, mm1                ; load 1st texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm7, eax                ; mm7 = 1st texel
  ; mm7 = 2nd texel (high 32 bits), 1st texel (low 32 bits)
        punpckldq mm7, mm4
        movq    mm5, mm7
        movq    mm4, mm7
        pand    mm5, MMWORD PTR BilinearMaskRed565to888 ; = 0x0000f8000000f800
        pand    mm7, MMWORD PTR BilinearMaskGreen565to888 ; = 0x000007e0000007e0
        pand    mm4, MMWORD PTR BilinearMaskBlue565to888 ; = 0x0000001f0000001f
        pslld   mm5, MMWORD PTR BilinearShiftRed565to888 ; = 8
        pslld   mm7, MMWORD PTR BilinearShiftGreen565to888 ; = 5
        pslld   mm4, MMWORD PTR BilinearShiftBlue565to888 ; = 3
        por     mm7, mm5                ; combine R+G
        por     mm7, mm4                ; combine (R+G) + B
        movq    mm4, mm7                ; copy 1st and 2nd texels
  ; mm4 calculated from high 32 bits of mm3 (2nd texel)
  ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  ; bits for each color component
        punpckhbw mm4, mm2
  ; mm7 calculated from low 32 bits of mm1 (1st texel)
  ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  ; bits for each color component
        punpcklbw mm7, mm2
        psrlq   mm3, 32                 ; shift 4th texel address into low 32 bits
  ; mm7 = final calc on 1st and 2nd texel
        psubw   mm7, mm4
        psllw   mm4, 8
        pmullw  mm7, dword ptr UFrac
        paddw   mm7, mm4
  ; 3rd (mm1) and 4th (mm3) texel
        movd    eax, mm3                ; load 4th texel address
        psrlq   mm1, 32                 ; shift 3rd texel address into low 32 bits
        movzx   eax, word ptr [edi+2*eax]
        movd    mm6, eax                ; mm6 = 4th texel
        movd    eax, mm1                ; load 3rd texel address
        movzx   eax, word ptr [edi+2*eax]
        movd    mm4, eax                ; mm4 = 3rd texel
  ; Per the two movd loads above, after this unpack mm6 holds the 4th texel in
  ; its low 32 bits and the 3rd texel in its high 32 bits.
        punpckldq mm6, mm4
        movq    mm5, mm6
        movq    mm4, mm6
        pand    mm5, MMWORD PTR BilinearMaskRed565to888 ; = 0x0000f8000000f800
        pand    mm6, MMWORD PTR BilinearMaskGreen565to888 ; = 0x000007e0000007e0
        pand    mm4, MMWORD PTR BilinearMaskBlue565to888 ; = 0x0000001f0000001f
        pslld   mm5, MMWORD PTR BilinearShiftRed565to888 ; = 8
        pslld   mm6, MMWORD PTR BilinearShiftGreen565to888 ; = 5
        pslld   mm4, MMWORD PTR BilinearShiftBlue565to888 ; = 3
        por     mm6, mm5                ; combine R+G
        por     mm6, mm4                ; combine (R+G) + B
        movq    mm4, mm6                ; copy 3rd and 4th texels
  ; mm4 = high 32 bits of the pair, bytes widened to words
  ; (D3DCOLOR has 16 bits for each color component)
        punpckhbw mm4, mm2
  ; mm6 = low 32 bits of the pair, bytes widened to words
        punpcklbw mm6, mm2
        psubw   mm6, mm4
        psllw   mm4, 8
        pmullw  mm6, dword ptr UFrac
        movq    mm1, mm7
  ; mm6 = final calc on 3rd and 4th texel
        paddw   mm6, mm4
  ; mm1 = final calc on 1st+2nd texel and 3rd+4th texel
        psrlw   mm6, 8
        psrlw   mm7, 8
        psubw   mm6, mm7
        pmullw  mm6, dword ptr VFrac
        paddw   mm1, mm6
        psrlw   mm1, 8
  ;modulate
  ;UINT16 uB = pS->uB>>COLOR_SHIFT;
  ;UINT16 uG = pS->uG>>COLOR_SHIFT;
  ;UINT16 uR = pS->uR>>COLOR_SHIFT;
        movq    mm4, [ebp+RASTSPAN_uB]
        psrlw   mm4, COLOR_SHIFT        ; COLOR_SHIFT is set to 8.
  ; Alpha not needed in this monolith
  ;UINT16 uTB = (UINT16)(RGBA_GETBLUE(pCtx->SI.TexCol[0]));
  ;UINT16 uTG = (UINT16)(RGBA_GETGREEN(pCtx->SI.TexCol[0]));
  ;UINT16 uTR = (UINT16)(RGBA_GETRED(pCtx->SI.TexCol[0]));
  ;UINT16 uTA = (UINT16)(RGBA_GETALPHA(pCtx->SI.TexCol[0]));
  ; this is a PMULLW, which works on unsigned 16 bit quantities
  ;pCtx->SI.uBB = uB*uTB;
  ;pCtx->SI.uBG = uG*uTG;
  ;pCtx->SI.uBR = uR*uTR;
  ;pCtx->SI.uBA = uTA<<COLOR_SHIFT;
        pmullw  mm4, mm1
  ; write
        mov     edi, [ebp+RASTSPAN_pSurface]
        psrlw   mm4, 8                  ; Convert color1 from 8.8 to 0.8
        packuswb mm4, mm7               ; pack one color (only the low dword is used below)
        movq    mm3, mm4
        pand    mm4, MMWORD PTR u888to565RedBlueMask
        pmaddwd mm4, MMWORD PTR u888to565Multiplier
        pand    mm3, MMWORD PTR u888to565GreenMask
        por     mm4, mm3
        psrld   mm4, 5
        movd    edx, mm4
        mov     [edi], dx               ; store the 16-bit pixel
  ;//if (--uPix <= 0)
  ;// break;
        dec     uPix                    ;// BUG BUG?? uPix should never start as zero should it?
  ;// if so, this is a bug.
        jle     ExitPixelLoop
  ; Doing update code after span length test so that an extra update is not done.
  ;void TestFail_Gouraud_PerspTex1_NoSpecularFog(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  ; PD3DI_RASTSPAN pS)
  ;{
  ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
  ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
        movq    mm1, [ebp+RASTSPAN_uB]
        paddw   mm1, [ecx+RASTPRIM_iDBDX]
        movq    [ebp+RASTSPAN_uB], mm1
  ;pS->iUoW1 += pP->iDUoW1DX;
  ;pS->iVoW1 += pP->iDVoW1DX;
        movq    mm5, [ebp+RASTSPAN_iUoW1]
        paddd   mm5, [ecx+RASTPRIM_iDUoW1DX]
        movq    [ebp+RASTSPAN_iUoW1], mm5 ; mm5 stays live until the U/V multiply at the end of the loop
  ;pS->iOoW += pP->iDOoWDX;
        mov     eax, [ebp+RASTSPAN_iOoW]
        add     eax, [ecx+RASTPRIM_iDOoWDX]
        mov     [ebp+RASTSPAN_iOoW], eax ; eax keeps the new iOoW for the multiplies below
  ;INT32 iWn0 = pS->iW + pCtx->SI.iDW; // 1.15.16
  ; TODO Could do this and OoW Add at same time with MMX.
        mov     edx, [ebp+RASTSPAN_iW]
        mov     LastW, edx              ; Save iW to calc iDW for next time.
        add     edx, [ebx+RASTCTX_SI+SPANITER_iDW]
  ;if (pCtx->SI.iSpecialW < 0)
  ;{
        xor     edi, edi
        cmp     di, word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
        jle     DontDoSpecialW1         ; 0 <= iSpecialW: take the fast path
  ;if (iWn0 < 0)
  ;{
        cmp     edx, edi                ; signed compare of iWn0 against 0
  ; Only a negative iWn0 is replaced by the iW/2 guess; a non-negative one is
  ; kept as the starting value, as in the C reference.
        jge     WOutOfRange1
  ;iWn0 = pS->iW >> 1; // use iW/2 as a guess, instead
        mov     edx, LastW
        sar     edx, 1
  ;}
  WOutOfRange1:
  ;VAL32 iWn1;
  ;INT16 iWnOld = iWn0 + 0x100; // make sure while fails first time
  ; Don't need to make sure it fails. I do a post test which guarantees it will execute once.
  ;INT32 iGiveUp = 7;
        mov     GiveUp, 8               ; Pre decrementing instead of post decrementing.
  ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
  ;{
  SpecW1Loop1:
  ; Could move this to bottom of loop and combine results somehow.
  ; TBD look at it more.
        dec     GiveUp
        jz      ExitSpecWLoop1
  ; Shift iOoW by one since imul cannot have sign bit set
  ; OoW cannot reach one, only 0x7fffffff
  ;shr eax, 1 ; 1.31 >> 1 = 1.30
  ; Get ready to do Two minus iOoW*iW
        mov     esi, (1 SHL 16)
  ;iWnOld = iWn0;
        mov     edi, edx
  ; Result should be close to one so we want most of the
  ; precision in the low bits. Need to give more bits
  ; leeway since these are the bad cases.
  ; iWn1 = imul32h(pS->iOoW, iWn0); // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
        imul    edx                     ; eax holds iOoW here
  ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
        sub     esi, edx
  ;while(iWn1.i < 0)
  ;{
  SpecW1Loop2:
        test    esi, esi
        jns     SpecW1ExitLoop2         ; This jump should be predicted correctly most of the time.
  ;iWn1=(iWn1+(1L<<15))>>1; // iWn1 = (iWn1 + 1.0)/2
        add     esi, (1 SHL 15)
        sar     esi, 1
        jmp     SpecW1Loop2
  ;}
  SpecW1ExitLoop2:
  ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
        mov     eax, edi
        shl     eax, 5                  ; 1.15.16 << 5 = 1.10.21 TBD Can I shift off upper bits??
        shl     esi, 12                 ; 4.15 << 12 = 4.27 ;
  ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16. No need for post shift.
        mul     esi                     ; edx = new iWn0
  ; Have to do (abs(iWnOld - iWn0) > 0x20) code here.
        sub     edi, edx
  ; These four lines are abs code.
        mov     eax, edi
        sar     eax, 31
        xor     edi, eax
        sub     edi, eax
        cmp     edi, 020h               ;Assuming that loop will only happen once.
        jbe     ExitSpecWLoop1
  ; Reload eax with iOoW.
        mov     eax, [ebp+RASTSPAN_iOoW]
        jmp     SpecW1Loop1
  ;}
  ;else
  ;{
  DontDoSpecialW1:
  ; Everything should be positive in Non-SpecialW case.
  ;INT32 iWn1;
        mov     esi, (1 SHL 16)
        mov     edi, edx
  ; This should be close to one so Low bits are most important.
  ;iWn1 = (iOoW*iWn0)>>15; // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
        mul     edx                     ; eax holds iOoW here
  ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
        sub     esi, edx
  ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
        shl     esi, 15                 ; 0.16.15 << 15 = 0.2.30
        mov     eax, esi
  ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
        mul     edi                     ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14
        shl     edx, 2                  ; 1.17.14 << 2 = 1.15.16
  ;}
  ;}
  ExitSpecWLoop1:
  ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
  ;pS->iW = iWn0;
        mov     [ebp+RASTSPAN_iW], edx
        mov     esi, edx                ; Save W for multiplying by UoW and VoW
        sub     edx, LastW
        mov     [ebx+RASTCTX_SI+SPANITER_iDW], edx
  ;pCtx->SI.iSpecialW += 1; // this is supposed to wrap past 0x7fff sometimes
        inc     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
  ;pCtx->SI.iU1 = d_WTimesUVoW(pS->iW,pS->iUoW1);
  ;pCtx->SI.iV1 = d_WTimesUVoW(pS->iW,pS->iVoW1);
        pslld   mm5, 8                  ; mm5 still holds the updated iUoW1/iVoW1 pair
        shl     esi, 4
        movd    eax, mm5
        psrlq   mm5, 32
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iU1], edx
        movd    eax, mm5
        imul    esi
        mov     [ebx+RASTCTX_SI+SPANITER_iV1], edx
  ;//pS->pSurface += iSurfaceStep;
        mov     edx, dword ptr [ebp+RASTSPAN_pSurface]
        add     edx, iSurfaceStep
        mov     dword ptr [ebp+RASTSPAN_pSurface], edx
  ;#ifdef DBG
  ;// handy for debug to see where we are
  ;//pS->uX += (INT16)pCtx->SI.iXStep;
  ;#endif
  ;// } // while
        jmp     PixelLoop
  ExitPixelLoop:
  ; Loop code ends
  ;-----------------------------------------------------------------------------
  ; LoopAny code ends here
  ;-----------------------------------------------------------------------------
  ;pS++;
        add     ebp, SIZEOF_RASTSPAN
  ;}
        jmp     SpanLoop
  ExitSpanLoop:
  ;pP = pP->pNext;
        mov     ecx, [ecx+RASTPRIM_pNext]
  ;}
        jmp     PrimLoop
  ExitPrimLoop:
  ;_asm{
        emms
  ;}
  ;return S_OK;
        xor     eax, eax
  ;}
        pop     edi
        pop     esi
        pop     ebx
        mov     esp, StackPos           ; drops the 0Ch bytes reserved on entry
        pop     ebp
        ret
  636. END