Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

914 lines
27 KiB

  1. ;-----------------------------------------------------------------------------
  2. ;
  3. ; Monolith 7. Perspective Correct Bi-linear gouraud modulated
  4. ; 565 input texture 16 bit Z buffered (LE or GT)
  5. ; 565 output.
  6. ;
  7. ;
  8. ; Globals
  9. ;
  10. ; StackPos - stack pos holder
  11. ; uSpans - Number of spans to process
  12. ; iSurfaceStep - what to add to screen pointer
  13. ; iZStep - what to add to Z buffer pointer
  14. ; uPix - Pixel Count
  15. ;
  16. ; Changes from general MMX assembly.
  17. ; 1) Registers renamed to remove additional moves
  18. ; 2) Since there are 4 texels used in bi-linear and
  19. ; the 565 - 888 color conversion can convert 2
  20. ; texels at a time, two texels are loaded, combined
  21. ; and then converted at once then moved into separate
  22. ; registers.
  23. ; 3) Most register renaming was done in the bi-linear calculation
  24. ; since the original code always read into mm1 which
  25. ; caused a lot of additional moves.
  26. ; 4) Texcolor is not written to since it is just loaded
  27. ; and then written.
  28. ;
  29. ;-----------------------------------------------------------------------------
  30. INCLUDE iammx.inc
  31. INCLUDE offs_acp.inc
  32. ; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
  33. ; at the LSB, then six bits of green, then five bits of red.
  34. ;TBD check to see if this value is correct.
  35. COLOR_SHIFT equ 8
  36. .586
  37. .model flat
  38. ; Big separating lines separate code into span code
  39. ; and loop code. If span and loop are not going to
  40. ; end up being combined then it will be easy to
  41. ; separate the code.
  42. .data
  43. ; Need externs for all of the variables that are needed for various beads
  44. EXTERN IncHighandLow16:MMWORD
  45. EXTERN UFracVFracMask:MMWORD
  46. EXTERN UV32to15Mask:MMWORD
  47. EXTERN Makelow16one:MMWORD
  48. EXTERN MaskKeepUValues:MMWORD
  49. EXTERN MaskKeepVValues:MMWORD
  50. EXTERN UFrac:MMWORD
  51. EXTERN VFrac:MMWORD
  52. EXTERN Zero:MMWORD
  53. EXTERN memD3DTFG_POINT:MMWORD
  54. EXTERN GiveUp:MMWORD
  55. EXTERN LastW:MMWORD
  56. EXTERN Val0x000a000a:MMWORD
  57. EXTERN Val0xffff:MMWORD
  58. EXTERN Val0x0000002000000020:MMWORD
  59. EXTERN Val0x0000ffff0000ffff:MMWORD
  60. EXTERN MaskRed565to888:MMWORD
  61. EXTERN MaskGreen565to888:MMWORD
  62. EXTERN MaskBlue565to888:MMWORD
  63. EXTERN MaskRed555to888:MMWORD
  64. EXTERN MaskGreen555to888:MMWORD
  65. EXTERN MaskBlue555to888:MMWORD
  66. EXTERN MaskAlpha1555to8888:MMWORD
  67. EXTERN MaskRed1555to8888:MMWORD
  68. EXTERN MaskGreen1555to8888:MMWORD
  69. EXTERN MaskBlue1555to8888:MMWORD
  70. ; TBD. I think that I want to do 0xffff instead of 0xff. This will
  71. ; have to be checked. There is a value very similar to this in
  72. ; buf write.
  73. EXTERN SetAlphato0xffff:MMWORD
  74. EXTERN SetAlphato0xff:MMWORD
  75. ; TODO These equates are identical to the ones in texread.mas. Maybe they should be in a common .inc file.
  76. RedShift565to888 equ 8
  77. GreenShift565to888 equ 5
  78. BlueShift565to888 equ 3
  79. RedShift555to888 equ 9
  80. GreenShift555to888 equ 6
  81. BlueShift555to888 equ 3
  82. AlphaShift1555to8888 equ 16
  83. RedShift1555to8888 equ 9
  84. GreenShift1555to8888 equ 6
  85. BlueShift1555to8888 equ 3
  86. EXTERN BilinearMaskRed565to888:MMWORD
  87. EXTERN BilinearMaskGreen565to888:MMWORD
  88. EXTERN BilinearMaskBlue565to888:MMWORD
  89. EXTERN BilinearShiftRed565to888:MMWORD
  90. EXTERN BilinearShiftGreen565to888:MMWORD
  91. EXTERN BilinearShiftBlue565to888:MMWORD
  92. EXTERN Zero:MMWORD
  93. EXTERN DW_One_One:MMWORD
  94. EXTERN MaskOffAlpha:MMWORD
  95. EXTERN ShiftTA:MMWORD
  96. EXTERN Val0x00ff00ff00ff00ff:MMWORD
  97. EXTERN Val0x000000ff00ff00ff:MMWORD
  98. EXTERN Val0X0000000001000000:MMWORD
  99. EXTERN AlphaVal128:MMWORD
  100. EXTERN RGBVal128:MMWORD
  101. EXTERN g_uDitherValue:MMWORD
  102. EXTERN SetAlphato0xff:MMWORD
  103. EXTERN u888to565RedBlueMask:MMWORD
  104. EXTERN u888to565GreenMask:MMWORD
  105. EXTERN u888to565Multiplier:MMWORD
  106. EXTERN uVal0x000007ff03ff07ff:MMWORD
  107. EXTERN uVal0x0000078003c00780:MMWORD
  108. EXTERN u888to555RedBlueMask:MMWORD
  109. EXTERN u888to555GreenMask:MMWORD
  110. EXTERN u888to555Multiplier:MMWORD
  111. EXTERN uVal0x000007ff07ff07ff:MMWORD
  112. EXTERN uVal0x0000078007800780:MMWORD
  113. ;-----------------------------------------------------------------------------
  114. ; Span Variables
  115. StackPos dd ?
  116. uSpans dd ?
  117. ;-----------------------------------------------------------------------------
  118. ;-----------------------------------------------------------------------------
  119. ; Loop Variables
  120. iSurfaceStep dd ?
  121. iZStep dd ?
  122. uPix dd ?
  123. ;-----------------------------------------------------------------------------
  124. .code
  125. PUBLIC _MMXMLRast_7
  126. _MMXMLRast_7: ; Entry: walks the RASTPRIM list at pCtx->pPrim and, for each one, the uSpans spans stored right after it.
  127. push ebp
  128. mov StackPos, esp ; Save esp (taken after push ebp) in a global; the epilogue reloads esp from it.
  129. mov eax, esp ; eax -> saved ebp; [eax+4] is the return address, [eax+8] the first stack argument.
  130. sub esp, 0Ch ; This will need to change if stack frame size changes.
  131. push ebx
  132. push esi
  133. push edi
  134. ; Put pCtx into ebx (ebx keeps pCtx for the rest of the routine)
  135. mov ebx, [eax+8]
  136. ;PD3DI_RASTPRIM pP = pCtx->pPrim;
  137. mov ecx, [ebx+RASTCTX_pPrim] ; ecx = pP for the rest of the routine.
  138. ;while (pP)
  139. ;{
  140. PrimLoop:
  141. cmp ecx, 0
  142. je ExitPrimLoop
  143. ;UINT16 uSpans = pP->uSpans;
  144. movzx eax, word ptr [ecx+RASTPRIM_uSpans]
  145. mov uSpans, eax
  146. ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
  147. mov ebp, ecx
  148. add ebp, SIZEOF_RASTPRIM ; ebp = pS; from here on ebp is the span pointer, not a frame pointer.
  149. ;while (uSpans-- > 0)
  150. ;{
  151. SpanLoop:
  152. mov edx, uSpans
  153. mov eax, edx
  154. dec eax
  155. mov uSpans, eax ; Store the decremented count, then test the value from before the decrement.
  156. test edx, edx
  157. jle ExitSpanLoop
  158. ;pCtx->pfnBegin(pCtx, pP, pS);
  159. ;-----------------------------------------------------------------------------
  160. ; LoopAny code inserted here. This is to get rid of an extra
  161. ; jump.
  162. ;-----------------------------------------------------------------------------
  163. ; Setup Code begins (per span: pixel count, iDW reset, initial iU1/iV1, iSpecialW, Z/surface step direction)
  164. ; get values to iterate
  165. ;uPix = pS->uPix;
  166. movzx eax, word ptr [ebp+RASTSPAN_uPix]
  167. mov uPix, eax
  168. ;pCtx->SI.iDW = 0x0;
  169. mov dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0
  170. mov esi, [ebp+RASTSPAN_iW]
  171. movq mm5, MMWORD PTR [ebp+RASTSPAN_iUoW1] ; Loads 8 bytes: the low dword feeds iU1 and the high dword feeds iV1 below.
  172. ;pCtx->SI.iUd_TexNum = d_WTimesUVoW(pS->iW,pS->iUoW1);
  173. ;pCtx->SI.iVd_TexNum = d_WTimesUVoW(pS->iW,pS->iVoW1);
  174. pslld mm5, 8
  175. shl esi, 4
  176. movd eax, mm5
  177. psrlq mm5, 32 ; Bring the high dword down for the second multiply.
  178. imul esi ; edx:eax = (low dword << 8) * (iW << 4); only the high half (edx) is kept.
  179. mov [ebx+RASTCTX_SI+SPANITER_iU1], edx
  180. movd eax, mm5
  181. imul esi
  182. mov [ebx+RASTCTX_SI+SPANITER_iV1], edx
  183. ;if (pP->iDOoWDX > 0)
  184. ;{
  185. cmp dword ptr [ecx+RASTPRIM_iDOoWDX], 0
  186. jg SpecialWLastMonTest ; NOTE(review): iDOoWDX > 0 takes the 0x7fff - uPix path and iDOoWDX <= 0 takes the -3 path, the opposite of the C reference in these comments. Confirm which is intended.
  187. ;// iSpecialW should be negative for the first 3 pixels of span
  188. ;pCtx->SI.iSpecialW = -3;
  189. mov word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], -3
  190. jmp DoneSpecialWifMonTest
  191. ;}
  192. ;else
  193. ;{
  194. SpecialWLastMonTest:
  195. ;// iSpecialW should be negative for the last 3 pixels of span
  196. ;pCtx->SI.iSpecialW = 0x7fff - uPix;
  197. mov eax, 07fffh
  198. sub eax, uPix
  199. ;pCtx->SI.iSpecialW += 5; // this may wrap, but it should
  200. add eax, 5
  201. mov word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], ax ; Word store: iSpecialW is accessed as a 16-bit field everywhere else in this routine, so a dword store would also overwrite the two bytes after it.
  202. ;}
  203. DoneSpecialWifMonTest:
  204. ;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
  205. ;{
  206. mov eax, [ecx+RASTPRIM_uFlags]
  207. and eax, D3DI_RASTPRIM_X_DEC
  208. test eax, eax
  209. jz LeftToRightSpan
  210. ;iZStep = -pCtx->iZStep;
  211. mov eax, [ebx+RASTCTX_iZStep]
  212. neg eax
  213. mov iZStep, eax
  214. ;iSurfaceStep = -pCtx->iSurfaceStep;
  215. mov eax, [ebx+RASTCTX_iSurfaceStep]
  216. neg eax
  217. mov iSurfaceStep, eax
  218. ;}
  219. jmp DoneSpanDirif
  220. ;else
  221. ;{
  222. LeftToRightSpan:
  223. ;iZStep = pCtx->iZStep;
  224. mov eax, [ebx+RASTCTX_iZStep]
  225. mov iZStep, eax
  226. ;iSurfaceStep = pCtx->iSurfaceStep;
  227. mov eax, [ebx+RASTCTX_iSurfaceStep]
  228. mov iSurfaceStep, eax
  229. ;}
  230. DoneSpanDirif:
  231. ; Setup Code Ends
  232. ; ----------------------------------------------------------------------------------------------------------------
  233. ; Loop Code Begins
  234. ;//while (1)
  235. ;//{
  236. PixelLoop:
  237. ; Ztestcode: compare the span Z with the Z buffer value; on failure jump to FailLabel, skipping texturing and the colour write.
  238. ; edx is uZ
  239. ; eax is uZB
  240. ; 16 bit unsigned format
  241. ;UINT16 uZ = (UINT16)(pS->uZ>>15);
  242. ;UINT16 uZB = *((UINT16*)pS->pZ);
  243. mov edx, [ebp+RASTSPAN_uZ]
  244. movd mm4, edx ; Keep the unshifted uZ in mm4 so it can be stepped by iDZDX below.
  245. mov esi, [ebp+RASTSPAN_pZ]
  246. shr edx, 15
  247. movzx eax, word ptr [esi]
  248. ;pS->uZ += pP->iDZDX;
  249. ;if ((pCtx->iZXorMask)^(uZ > uZB))
  250. ; !(uZ > uZB) <==>
  251. ; (uZ <= uZB) <==>
  252. ; (uZ < uZB+1) <==>
  253. ;
  254. sub eax, edx ; eax = uZB - uZ
  255. paddd mm4, [ecx+RASTPRIM_iDZDX]
  256. movd [ebp+RASTSPAN_uZ], mm4 ; uZ is stepped here, before the test, so it advances whether or not the pixel passes.
  257. xor eax, [ebx+RASTCTX_iZXorMask] ; The mask can flip the sign bit of the difference, presumably to select between the LE and GT tests named in the file header.
  258. test eax, eax
  259. js FailLabel ; Fail when the sign bit of (uZB - uZ) XOR iZXorMask is set.
  260. mov word ptr [esi], dx ; Test passed: write the new 16-bit Z.
  261. ; texturecode: compute four texel addresses, read and convert the 565 texels, filter bi-linearly, modulate by the span colour and write one 565 pixel.
  262. ;---------------------------------------------------------------------------
  263. ;void Tex1Addr_TexAddrWrapMirror_Persp_Bilinear_NoMip(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  264. ; PD3DI_RASTSPAN pS)
  265. ;{
  266. ;PD3DI_SPANTEX pTex = &pCtx->Texture[0];
  267. mov esi, [ebx+RASTCTX_pTexture] ; esi = pTex until the update code after FailLabel reuses esi.
  268. ; Doing UV calculation a little more accurate
  269. ; Exactly like C code.
  270. ; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
  271. ; (TEX_FINAL_SHIFT - iShiftU0 - 6). iShiftU0 = pTex->iShiftU - iLOD0
  272. ; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
  273. ; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
  274. ; COMMENT1**
  275. ; If textures have a max of 1024 then shiftU0 would be at most 10 which would
  276. ; make (TEXT_FINAL_SHIFT - iShiftU - 6) at most zero. This is why I choose 6
  277. ; It will also give bi-linear 6 bits of precision I think it was said that
  278. ; only five was needed.
  279. ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
  280. ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
  281. movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
  282. ;iLOD0 is zero in monolithic case so no subtraction needed.
  283. movd mm4, [esi+SPANTEX_iShiftU] ; One dword load; presumably iShiftU in the low word and iShiftV in the high word - TODO confirm against the SPANTEX layout.
  284. psubw mm5, mm4
  285. movq mm4, mm5
  286. pand mm5, MMWORD PTR Val0xffff ; mm5 = low-word shift count (applied to iU1)
  287. psrld mm4, 16 ; mm4 = high-word shift count (applied to iV1)
  288. movd mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
  289. psrad mm1, mm5
  290. movd mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
  291. psrad mm2, mm4
  292. punpckldq mm1, mm2 ; mm1 = (shifted iV1 : shifted iU1)
  293. psubd mm1, MMWORD PTR Val0x0000002000000020
  294. ; Texture Pitch cannot be calculated so it must be looked up in the iShiftPitch table
  295. ; ----------------- Start of hack
  296. ; ATTENTION This is really hacked right now. Just to get it working
  297. ; Pitch would be better for me, instead of shift pitch.
  298. ; With actual pitch, this would be two moves and a shift.
  299. ;shl eax, 1
  300. movzx edx, word ptr [esi+SPANTEX_iShiftPitch]
  301. add edx, 16
  302. movd mm2, edx
  303. movq mm5, MMWORD ptr Makelow16one
  304. pslld mm5, mm2
  305. ;pslld mm5, 16 ;. Use this after hack.
  306. ; not needed in hacked version since i add to shifted value.
  307. ; ----------------- End of hack
  308. por mm5, MMWORD ptr Makelow16one ; mm5 is the multiplier operand for both pmaddwd address calculations below.
  309. ; Make the low 16 bits of dword one
  310. ; This helps in calculating texture address.
  311. ; Gets U and V value into mm1 so that it can be mirrored, wrapped or
  312. ; clamped. This can be done for two values in the point case
  313. ; or four values in the bilinear case.
  314. ;INT32 iUFrac = iU00 & 0x03f;
  315. ;INT32 iVFrac = iV00 & 0x03f;
  316. ;iU00 >>= 6;
  317. ;iV00 >>= 6;
  318. movq mm2, mm1
  319. psrad mm1, 6
  320. ;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
  321. pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
  322. ; Going to use only 8 bits for bi-linear so that I can do a pmullw.
  323. ; Currently at 6 bits so shift up by 2.
  324. psllw mm2, 2
  325. movq mm0, mm2
  326. ; Replicate VFrac value for bilinear
  327. punpckhwd mm2, mm2
  328. punpcklwd mm2, mm2
  329. ; Replicate UFrac Value for bilinear
  330. punpcklwd mm0, mm0
  331. punpcklwd mm0, mm0
  332. movq dword ptr VFrac, mm2 ; Spilled to memory; reloaded as the pmullw operand in the filter below.
  333. movq dword ptr UFrac, mm0
  334. ;INT32 iU01 = iU00 + 1;
  335. ;INT32 iV01 = iV00 + 1;
  336. packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
  337. paddw mm1, dword ptr IncHighandLow16
  338. ; This will make texture values be (High word to low word):
  339. ; iV01, iU00, iV00, iU01
  340. ; Need to do this to make texture look up for bilinear easier.
  341. ; I have to combine to get all combinations anyway. It just
  342. ; happens to be better for me to have iV00, iU01 pair first.
  343. ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
  344. ; put mask in mm0 and replicate to match location for wrap/mirror/clamp
  345. movd mm0, [esi+SPANTEX_uMaskU] ; Load U and V mask
  346. ; replicate mask if doing bilinear
  347. punpckldq mm0, mm0
  348. ; Monolith cases assumed that iLOD0 was zero so no shift needed.
  349. ;INT16 iFlip;
  350. ; MM1 should contain 16 bit iU and iV for both texture locations
  351. ; End Result is MM1 value wrapped or mirrored
  352. ; in Bilinear Case, four values can be done
  353. ; iU00, iV00, iU01, iV01
  354. ; This code really does a lot for the bilinear case and is kinda wasteful
  355. ; in the normal mode.
  356. ;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
  357. movq mm7, mm1
  358. ; Point doesnt need replication
  359. movd mm4, [esi+SPANTEX_iFlipMaskU]
  360. ; if bilinear replicate values together, Point doesnt need this.
  361. punpckldq mm4, mm4
  362. ; Monolith cases assumed that iLOD0 was zero so no shift needed.
  363. pand mm7, mm4
  364. ;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
  365. pcmpeqw mm7, MMWORD PTR Zero
  366. ;iFlip1 = uMaskU0 & ~ iFlip1; ;iFlip2 = uMaskV0 & ~ iFlip2; ;iFlip3 = uMaskU0 & ~ iFlip3; ;iFlip4 = uMaskV0 & ~ iFlip4;
  367. pandn mm7, mm0
  368. ;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
  369. pand mm1, mm0
  370. ;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
  371. pxor mm1, mm7
  372. ; Result in mm1 now since TexAddrAll ends up that way.
  373. ; Making other two cases for texture addressing has to be simpler than
  374. ; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
  375. ; TBD Make this better.
  376. ; values are still stored as iV01, iU00, iV00, iU01
  377. movq mm2, mm1
  378. movq mm3, mm1
  379. ; Calculate 1st and 3rd texel addresses
  380. pmaddwd mm1, mm5 ; Throw in first address calculation.
  381. ; Just to get it started. Calculate
  382. ; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
  383. ; values are being changed to iV01, iU01, iV00, iU00
  384. ; seven instructions for this seems excessive.
  385. pand mm2, MMWORD ptr MaskKeepUValues
  386. pand mm3, MMWORD ptr MaskKeepVValues
  387. movq mm4, mm2
  388. psllq mm2, 32
  389. psrlq mm4, 32
  390. por mm3, mm2
  391. por mm3, mm4 ; mm3 = V words kept in place, U words swapped between the two dwords.
  392. ; From here until mov edi is code that is needed for border.
  393. ; all sign bits are stored in bytes so that border code can tell if uv went below zero.
  394. ; Calculate 2nd and 4th texel addresses
  395. pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
  396. mov edi, [esi+SPANTEX_pBits] ; edi = texture base used by the four texel loads below.
  397. ; was esi. Cant change to esi because it is the pointer to pTex
  398. ; which is used by Border and ColorKey. Use edi for now and
  399. ; call routines through memory. Figure out if this is bad.
  400. ; load the read texture routine address into a register early
  401. ;mov edi, [ebx+RASTCTX_pfnTexRead]
  402. ;UINT32 uTex00 = pCtx->pfnTexRead(iU00, iV00, pTex->iShiftU,
  403. ; pTex->pBits[iLOD0], &pCtx->Texture[0]);
  404. ; Combine U and V values before making call.
  405. ;call edi
  406. ; -------------------- In Monolithic version calls are inlined.
  407. ;D3DCOLOR TexRead_B5G6R5_NoBorder(INT32 iU, INT32 iV, INT32 iShiftU, PUINT8 pBits, PD3DI_SPANTEX pTex)
  408. ;{
  409. ; Color convert 2 pixels at a time
  410. ; iV0 iU1 address should be done by now.
  411. ; movq mm2, MMWORD PTR Zero
  412. pxor mm2, mm2 ; mm2 = 0, the zero operand for the byte-to-word unpacks below.
  413. ; 1st (mm1) and 2nd (mm3) texel
  414. movd eax, mm3 ; load 2nd texel address
  415. movzx eax, word ptr [edi+2*eax] ; 16-bit texels, so the texel index is scaled by 2.
  416. movd mm4, eax ; mm4 = 2nd texel
  417. movd eax, mm1 ; load 1st texel address
  418. movzx eax, word ptr [edi+2*eax]
  419. movd mm7, eax ; mm7 = 1st texel
  420. ; mm7 = 2nd texel (high 32 bits), 1st texel (low 32 bits)
  421. punpckldq mm7, mm4
  422. movq mm5, mm7
  423. movq mm4, mm7
  424. pand mm5, MMWORD PTR BilinearMaskRed565to888 ; = 0x0000f8000000f800
  425. pand mm7, MMWORD PTR BilinearMaskGreen565to888 ; = 0x000007e0000007e0
  426. pand mm4, MMWORD PTR BilinearMaskBlue565to888 ; = 0x0000001f0000001f
  427. pslld mm5, MMWORD PTR BilinearShiftRed565to888 ; = 8
  428. pslld mm7, MMWORD PTR BilinearShiftGreen565to888 ; = 5
  429. pslld mm4, MMWORD PTR BilinearShiftBlue565to888 ; = 3
  430. por mm7, mm5 ; combine R+G
  431. por mm7, mm4 ; combine (R+G) + B
  432. movq mm4, mm7 ; copy 1st and 2nd texels
  433. ; mm4 = high 32 bits of mm7 (2nd texel, fetched via the low dword of mm3)
  434. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  435. ; bits for each color component
  436. punpckhbw mm4, mm2
  437. ; mm7 = low 32 bits of mm7 (1st texel, fetched via the low dword of mm1)
  438. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  439. ; bits for each color component
  440. punpcklbw mm7, mm2
  441. psrlq mm3, 32 ; shift 4th texel address into low 32 bits
  442. ; mm7 = final calc on 1st and 2nd texel: (t1 - t2)*UFrac + (t2 << 8)
  443. psubw mm7, mm4
  444. psllw mm4, 8
  445. pmullw mm7, dword ptr UFrac
  446. paddw mm7, mm4
  447. ; 3rd (mm1) and 4th (mm3) texel
  448. movd eax, mm3 ; load 4th texel address
  449. psrlq mm1, 32 ; shift 3rd texel address into low 32 bits
  450. movzx eax, word ptr [edi+2*eax]
  451. movd mm6, eax ; mm6 = 4th texel
  452. movd eax, mm1 ; load 3rd texel address
  453. movzx eax, word ptr [edi+2*eax]
  454. movd mm4, eax ; mm4 = 3rd texel
  455. ; mm6 = 3rd texel (high 32 bits), 4th texel (low 32 bits)
  456. punpckldq mm6, mm4
  457. movq mm5, mm6
  458. movq mm4, mm6
  459. pand mm5, MMWORD PTR BilinearMaskRed565to888 ; = 0x0000f8000000f800
  460. pand mm6, MMWORD PTR BilinearMaskGreen565to888 ; = 0x000007e0000007e0
  461. pand mm4, MMWORD PTR BilinearMaskBlue565to888 ; = 0x0000001f0000001f
  462. pslld mm5, MMWORD PTR BilinearShiftRed565to888 ; = 8
  463. pslld mm6, MMWORD PTR BilinearShiftGreen565to888 ; = 5
  464. pslld mm4, MMWORD PTR BilinearShiftBlue565to888 ; = 3
  465. por mm6, mm5 ; combine R+G
  466. por mm6, mm4 ; combine (R+G) + B
  467. movq mm4, mm6 ; copy 3rd and 4th texels
  468. ; mm4 = high 32 bits of mm6 (3rd texel)
  469. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  470. ; bits for each color component
  471. punpckhbw mm4, mm2
  472. ; mm6 = low 32 bits of mm6 (4th texel)
  473. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  474. ; bits for each color component
  475. punpcklbw mm6, mm2
  476. psubw mm6, mm4
  477. psllw mm4, 8
  478. pmullw mm6, dword ptr UFrac
  479. movq mm1, mm7 ; Keep the first pair's 8.8 result; mm7 itself is shifted down below.
  480. ; mm6 = final calc on 3rd and 4th texel: (t4 - t3)*UFrac + (t3 << 8)
  481. paddw mm6, mm4
  482. ; mm1 = final calc on 1st+2nd texel and 3rd+4th texel
  483. psrlw mm6, 8
  484. psrlw mm7, 8
  485. psubw mm6, mm7
  486. pmullw mm6, dword ptr VFrac
  487. paddw mm1, mm6
  488. psrlw mm1, 8 ; mm1 = filtered texel, one 16-bit lane per component.
  489. ;modulate
  490. ; ATTENTION shouldnt have to move to and from memory in monolithic case. Use registers
  491. ;UINT16 uB = pS->uB>>COLOR_SHIFT;
  492. ;UINT16 uG = pS->uG>>COLOR_SHIFT;
  493. ;UINT16 uR = pS->uR>>COLOR_SHIFT;
  494. movq mm4, [ebp+RASTSPAN_uB] ; One 8-byte load covers four 16-bit colour words starting at uB.
  495. psrlw mm4, COLOR_SHIFT ; COLOR_SHIFT is set to 8.
  496. ;UINT16 uTB = (UINT16)(RGBA_GETBLUE(pCtx->SI.TexCol[0]));
  497. ;UINT16 uTG = (UINT16)(RGBA_GETGREEN(pCtx->SI.TexCol[0]));
  498. ;UINT16 uTR = (UINT16)(RGBA_GETRED(pCtx->SI.TexCol[0]));
  499. ;UINT16 uTA = (UINT16)(RGBA_GETALPHA(pCtx->SI.TexCol[0]));
  500. ; this is a PMULLW, which works on unsigned 16 bit quantities
  501. ;pCtx->SI.uBB = uB*uTB;
  502. ;pCtx->SI.uBG = uG*uTG;
  503. ;pCtx->SI.uBR = uR*uTR;
  504. ;pCtx->SI.uBA = uTA<<COLOR_SHIFT;
  505. pmullw mm4, mm1
  506. ; write
  507. ;*(PUINT16)pS->pSurface =
  508. ; ((pCtx->SI.uBR >> 0) & 0xf800) |
  509. ; ((pCtx->SI.uBG >> 5) & 0x07e0) |
  510. ; ((pCtx->SI.uBB >> 11) & 0x001f);
  511. mov edi, [ebp+RASTSPAN_pSurface]
  512. psrlw mm4, 8 ; Convert color1 from 8.8 to 0.8
  513. packuswb mm4, mm7 ; pack one color (mm7 only fills the high dword, which is never read)
  514. movq mm3, mm4
  515. pand mm4, MMWORD PTR u888to565RedBlueMask
  516. pmaddwd mm4, MMWORD PTR u888to565Multiplier
  517. pand mm3, MMWORD PTR u888to565GreenMask
  518. por mm4, mm3
  519. psrld mm4, 5
  520. movd edx, mm4
  521. mov [edi], dx ; Only the low 16 bits are stored to the surface.
  522. FailLabel: ; Reached both by Z-test failures and by falling through after the pixel write.
  523. ;if (--uPix <= 0)
  524. ; break;
  525. dec uPix
  526. jle ExitPixelLoop
  527. ; Doing update code after span length test so that an extra update is not done.
  528. ;void TestFail_Gouraud_PerspTex1_NoSpecularFog(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  529. ; PD3DI_RASTSPAN pS)
  530. ;{
  531. ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
  532. ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
  533. movq mm1, [ebp+RASTSPAN_uB]
  534. paddw mm1, [ecx+RASTPRIM_iDBDX] ; Four 16-bit colour words stepped in one add.
  535. movq [ebp+RASTSPAN_uB], mm1
  536. ;pS->iUoW1 += pP->iDUoW1DX;
  537. ;pS->iVoW1 += pP->iDVoW1DX;
  538. movq mm5, [ebp+RASTSPAN_iUoW1]
  539. paddd mm5, [ecx+RASTPRIM_iDUoW1DX]
  540. movq [ebp+RASTSPAN_iUoW1], mm5 ; mm5 must stay intact until the iU1/iV1 multiply after ExitSpecWLoop1.
  541. ;pS->iOoW += pP->iDOoWDX;
  542. mov eax, [ebp+RASTSPAN_iOoW]
  543. add eax, [ecx+RASTPRIM_iDOoWDX]
  544. mov [ebp+RASTSPAN_iOoW], eax ; eax keeps the new iOoW for the multiplies below.
  545. ;INT32 iWn0 = pS->iW + pCtx->SI.iDW; // 1.15.16
  546. ; TODO Could do this and OoW Add at same time with MMX.
  547. mov edx, [ebp+RASTSPAN_iW]
  548. mov LastW, edx ; Save iW to calc iDW for next time.
  549. add edx, [ebx+RASTCTX_SI+SPANITER_iDW]
  550. ;if (pCtx->SI.iSpecialW < 0)
  551. ;{
  552. xor edi, edi
  553. cmp di, word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
  554. jle DontDoSpecialW1 ; Taken when 0 <= iSpecialW (signed 16-bit compare).
  555. ;DoSpecialW1:
  556. ; This label is a left over from when
  557. ;if (iWn0 < 0)
  558. ;{
  559. cmp edx, edi
  560. jl WOutOfRange1 ; NOTE(review): taken when iWn0 < 0, which skips the iW>>1 guess below, whereas the C reference above applies the guess when iWn0 < 0. Confirm which is intended.
  561. ;iWn0 = pS->iW >> 1; // use iW/2 as a guess, instead
  562. mov edx, LastW
  563. sar edx, 1
  564. ;}
  565. WOutOfRange1:
  566. ;VAL32 iWn1;
  567. ;INT16 iWnOld = iWn0 + 0x100; // make sure while fails first time
  568. ; Dont need to make sure it fails. I do a post test which guarantees it will execute once.
  569. ;INT32 iGiveUp = 7;
  570. mov GiveUp, 8 ; Pre decrementing instead of post decrementing.
  571. ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
  572. ;{
  573. SpecW1Loop1:
  574. ; Could move this to bottom of loop and combine results somehow.
  575. ; TBD look at it more.
  576. dec GiveUp
  577. jz ExitSpecWLoop1 ; At most 7 refinement passes; on exit edx holds the latest iWn0.
  578. ; Shift iOoW by one since imul cannot have sign bit set
  579. ; OoW cannot reach one, only 0x7fffffff
  580. ;shr eax, 1 ; 1.31 >> 1 = 1.30
  581. ; Get ready to do Two minus iOoW*iW
  582. mov esi, (1 SHL 16)
  583. ;iWnOld = iWn0;
  584. mov edi, edx
  585. ; Result should be close to one so we want most of the
  586. ; precision in the low bits. Need to give more bits
  587. ; leeway since these are the bad cases.
  588. ; iWn1 = imul32h(pS->iOoW, iWn0); // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
  589. imul edx ; eax = iOoW here; the product's high half lands in edx.
  590. ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
  591. sub esi, edx
  592. ;while(iWn1.i < 0)
  593. ;{
  594. SpecW1Loop2:
  595. test esi, esi
  596. jns SpecW1ExitLoop2 ; This jump should be predicted correctly most of the time.
  597. ;iWn1=(iWn1+(1L<<15))>>1; // iWn1 = (iWn1 + 1.0)/2
  598. add esi, (1 SHL 15)
  599. sar esi, 1
  600. jmp SpecW1Loop2
  601. ;}
  602. SpecW1ExitLoop2:
  603. ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
  604. mov eax, edi
  605. shl eax, 5 ; 1.15.16 << 5 = 1.10.21 TBD Can I shift off upper bits??
  606. shl esi, 12 ; 4.15 << 12 = 4.27 ;
  607. ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  608. ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16. No need for post shift.
  609. mul esi ; edx = new iWn0 (high half of the unsigned product).
  610. ; Have to do (abs(iWnOld - iWn0) > 0x20) code here.
  611. sub edi, edx
  612. ; These four lines are abs code.
  613. mov eax, edi
  614. sar eax, 31
  615. xor edi, eax
  616. sub edi, eax
  617. cmp edi, 020h ;Assuming that loop will only happen once.
  618. jbe ExitSpecWLoop1
  619. ; Reload eax with iOoW.
  620. mov eax, [ebp+RASTSPAN_iOoW]
  621. jmp SpecW1Loop1
  622. ;}
  623. ;else
  624. ;{
  625. DontDoSpecialW1:
  626. ; Everything should be positive in Non-SpecialW case.
  627. ;INT32 iWn1;
  628. mov esi, (1 SHL 16)
  629. mov edi, edx
  630. ; This should be close to one so Low bits are most important.
  631. ;iWn1 = (iOoW*iWn0)>>15; // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
  632. mul edx ; eax = iOoW here; unsigned multiply, high half in edx.
  633. ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
  634. sub esi, edx
  635. ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
  636. shl esi, 15 ; 0.16.15 << 15 = 0.2.30
  637. mov eax, esi
  638. ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  639. mul edi ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14
  640. shl edx, 2 ; 1.17.14 << 2 = 1.15.16
  641. ;}
  642. ;}
  643. ExitSpecWLoop1:
  644. ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
  645. ;pS->iW = iWn0;
  646. mov [ebp+RASTSPAN_iW], edx
  647. mov esi, edx ; Save W for multiplying by UoW and VoW
  648. sub edx, LastW
  649. mov [ebx+RASTCTX_SI+SPANITER_iDW], edx
  650. ;pCtx->SI.iSpecialW += 1; // this is supposed to wrap past 0x7fff sometimes
  651. inc word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
  652. ;pCtx->SI.iU1 = d_WTimesUVoW(pS->iW,pS->iUoW1);
  653. ;pCtx->SI.iV1 = d_WTimesUVoW(pS->iW,pS->iVoW1);
  654. pslld mm5, 8 ; mm5 still holds the stepped iUoW1 pair loaded at the top of the update code.
  655. shl esi, 4
  656. movd eax, mm5
  657. psrlq mm5, 32
  658. imul esi
  659. mov [ebx+RASTCTX_SI+SPANITER_iU1], edx
  660. movd eax, mm5
  661. imul esi
  662. mov [ebx+RASTCTX_SI+SPANITER_iV1], edx
  663. ;//pS->pZ += iZStep;
  664. ;//pS->pSurface += iSurfaceStep;
  665. mov eax, dword ptr [ebp+RASTSPAN_pZ]
  666. mov edx, dword ptr [ebp+RASTSPAN_pSurface]
  667. add eax, iZStep
  668. add edx, iSurfaceStep
  669. mov dword ptr [ebp+RASTSPAN_pZ], eax
  670. mov dword ptr [ebp+RASTSPAN_pSurface], edx
  671. ;#ifdef DBG
  672. ;// handy for debug to see where we are
  673. ;//pS->uX += (INT16)pCtx->SI.iXStep;
  674. ;#endif
  675. ;// } // while
  676. jmp PixelLoop
  677. ExitPixelLoop: ; Span finished: advance to the next span of the same primitive.
  678. ; Loop code ends
  679. ;-----------------------------------------------------------------------------
  680. ; LoopAny code ends here
  681. ;-----------------------------------------------------------------------------
  682. ;pS++;
  683. add ebp, SIZEOF_RASTSPAN
  684. ;}
  685. jmp SpanLoop
  686. ExitSpanLoop:
  687. ;pP = pP->pNext;
  688. mov ecx, [ecx+RASTPRIM_pNext]
  689. ;}
  690. jmp PrimLoop
  691. ExitPrimLoop:
  692. ;_asm{
  693. emms ; Leave MMX state before returning to the caller.
  694. ;}
  695. ;return S_OK;
  696. xor eax, eax ; Return value 0.
  697. ;}
  698. pop edi
  699. pop esi
  700. pop ebx
  701. mov esp, StackPos ; Restore the esp saved in the prologue; this also drops the 0Ch bytes reserved there.
  702. pop ebp
  703. ret
  704. END