Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

888 lines
27 KiB

  1. ;-----------------------------------------------------------------------------
  2. ;
  3. ; Monolith 4. Perspective Correct Bi-linear
  4. ; 565 input texture 16 bit Z buffered (LE or GT)
  5. ; 565 output.
  6. ;
  7. ;
  8. ; Globals
  9. ;
  10. ; StackPos - stack pos holder
  11. ; uSpans - Number of spans to process
  12. ; iSurfaceStep - what to add to screen pointer
  13. ; iZStep - what to add to Z buffer pointer
  14. ; uPix - Pixel Count
  15. ;
  16. ; Changes from general MMX assembly.
  17. ; 1) Registers renamed to remove additional moves
  18. ; 2) Since there are 4 texels used in bi-linear and
  19. ; the 565 - 888 color conversion can convert 2
  20. ; texels at a time, two texels are loaded, combined
  21. ; and then converted at once, then moved into separate
  22. ; registers.
  23. ; 3) Most register renaming was done in the bi-linear calculation
  24. ; since the original code always read into mm1, which
  25. ; caused a lot of additional moves.
  26. ; 4) Texcolor is not written to since it is just loaded
  27. ; and then written.
  28. ;
  29. ;-----------------------------------------------------------------------------
  30. INCLUDE iammx.inc
  31. INCLUDE offs_acp.inc
  ;-----------------------------------------------------------------------------
  ; Module-wide declarations: assembler mode, external constants, colour
  ; conversion constants exported by this module, and the scratch variables
  ; used by the span and pixel loops.
  ;-----------------------------------------------------------------------------

  ; Pixel-format names list the channels from the least significant bit up:
  ; B5G6R5 is five bits of blue at the LSB, then six of green, then five of red.
  ; TBD: check to see if this value is correct.
  COLOR_SHIFT             equ     8

  .586
  .model flat

  ; The long dashed rules in this file divide per-span code from per-pixel
  ; (loop) code, so the two can be pulled apart easily if they are not merged.
  .data

  ; Externally defined constants and scratch qwords used by the various beads.
          EXTERN  IncHighandLow16:MMWORD
          EXTERN  UFracVFracMask:MMWORD
          EXTERN  UV32to15Mask:MMWORD
          EXTERN  Makelow16one:MMWORD
          EXTERN  MaskKeepUValues:MMWORD
          EXTERN  MaskKeepVValues:MMWORD
          EXTERN  UFrac:MMWORD
          EXTERN  VFrac:MMWORD
          EXTERN  Zero:MMWORD
          EXTERN  memD3DTFG_POINT:MMWORD
          EXTERN  GiveUp:MMWORD
          EXTERN  LastW:MMWORD
          EXTERN  Val0x000a000a:MMWORD
          EXTERN  Val0xffff:MMWORD
          EXTERN  Val0x0000002000000020:MMWORD
          EXTERN  Val0x0000ffff0000ffff:MMWORD

  ; Texture-read channel masks (565, 555 and 1555 sources).
          EXTERN  MaskRed565to888:MMWORD
          EXTERN  MaskGreen565to888:MMWORD
          EXTERN  MaskBlue565to888:MMWORD
          EXTERN  MaskRed555to888:MMWORD
          EXTERN  MaskGreen555to888:MMWORD
          EXTERN  MaskBlue555to888:MMWORD
          EXTERN  MaskAlpha1555to8888:MMWORD
          EXTERN  MaskRed1555to8888:MMWORD
          EXTERN  MaskGreen1555to8888:MMWORD
          EXTERN  MaskBlue1555to8888:MMWORD

  ; TBD: 0xffff is probably wanted rather than 0xff; needs checking. A very
  ; similar value exists in the buffer-write code.
          EXTERN  SetAlphato0xffff:MMWORD
          EXTERN  SetAlphato0xff:MMWORD

  ; TODO: these equates duplicate the ones in texread.mas; they probably belong
  ; in a shared .inc file.
  RedShift565to888        equ     8
  GreenShift565to888      equ     5
  BlueShift565to888       equ     3
  RedShift555to888        equ     9
  GreenShift555to888      equ     6
  BlueShift555to888       equ     3
  AlphaShift1555to8888    equ     16
  RedShift1555to8888      equ     9
  GreenShift1555to8888    equ     6
  BlueShift1555to8888     equ     3

  ; Two-texel (one per dword) 565 channel masks and the matching left-shift
  ; counts, kept as qwords so they can be used directly as MMX memory operands.
          PUBLIC  BilinearMaskRed565to888, BilinearMaskGreen565to888
          PUBLIC  BilinearMaskBlue565to888
          PUBLIC  BilinearShiftRed565to888, BilinearShiftGreen565to888
          PUBLIC  BilinearShiftBlue565to888

  BilinearMaskRed565to888         dq      0000f8000000f800h
  BilinearMaskGreen565to888       dq      000007e0000007e0h
  BilinearMaskBlue565to888        dq      0000001f0000001fh
  BilinearShiftRed565to888        dq      RedShift565to888        ; = 8
  BilinearShiftGreen565to888      dq      GreenShift565to888      ; = 5
  BilinearShiftBlue565to888       dq      BlueShift565to888       ; = 3

  ; Second group of external constants (a few repeat declarations from above).
          EXTERN  Zero:MMWORD
          EXTERN  DW_One_One:MMWORD
          EXTERN  MaskOffAlpha:MMWORD
          EXTERN  ShiftTA:MMWORD
          EXTERN  Val0x00ff00ff00ff00ff:MMWORD
          EXTERN  Val0x000000ff00ff00ff:MMWORD
          EXTERN  Val0X0000000001000000:MMWORD
          EXTERN  AlphaVal128:MMWORD
          EXTERN  RGBVal128:MMWORD
          EXTERN  g_uDitherValue:MMWORD
          EXTERN  SetAlphato0xff:MMWORD
          EXTERN  u888to565RedBlueMask:MMWORD
          EXTERN  u888to565GreenMask:MMWORD
          EXTERN  u888to565Multiplier:MMWORD
          EXTERN  uVal0x000007ff03ff07ff:MMWORD
          EXTERN  uVal0x0000078003c00780:MMWORD
          EXTERN  u888to555RedBlueMask:MMWORD
          EXTERN  u888to555GreenMask:MMWORD
          EXTERN  u888to555Multiplier:MMWORD
          EXTERN  uVal0x000007ff07ff07ff:MMWORD
          EXTERN  uVal0x0000078007800780:MMWORD

  ;-----------------------------------------------------------------------------
  ; Span variables
  StackPos                dd      ?       ; esp captured right after "push ebp"
  uSpans                  dd      ?       ; spans left in the current primitive
  ;-----------------------------------------------------------------------------
  ;-----------------------------------------------------------------------------
  ; Loop variables
  iSurfaceStep            dd      ?       ; added to the surface pointer per pixel
  iZStep                  dd      ?       ; added to the Z-buffer pointer per pixel
  uPix                    dd      ?       ; pixels left in the current span
  ;-----------------------------------------------------------------------------
  129. ;-----------------------------------------------------------------------------
  130. .code
  ;-----------------------------------------------------------------------------
  ; _MMXMLRast_4
  ;
  ; Monolithic MMX rasterizer ("Monolith 4"): perspective-correct bilinear
  ; texturing from a 565 texture, 16-bit Z test, 565 output. Walks every
  ; primitive in pCtx->pPrim, every span of each primitive and every pixel of
  ; each span.
  ;
  ; In:     [esp+4] = pCtx (PD3DI_RASTCTX)
  ; Out:    eax = 0 (S_OK)
  ; Stack:  plain "ret", so the argument is left for the caller to remove.
  ; Saved:  ebx, esi, edi, ebp. ebp is used as the span pointer, so esp is
  ;         recovered from the StackPos global rather than from a frame pointer.
  ; State:  uses the module variables StackPos, uSpans, uPix, iZStep,
  ;         iSurfaceStep and the external scratch cells LastW, GiveUp, UFrac,
  ;         VFrac.
  ;         NOTE(review): all of these are static storage, so the routine looks
  ;         non-reentrant - confirm callers never run it concurrently.
  ;
  ; Register roles while looping:
  ;   ebx = pCtx     ecx = pP (current RASTPRIM)     ebp = pS (current RASTSPAN)
  ;   eax, edx, esi, edi, mm0-mm7 = scratch
  ;
  ; Fixes relative to the original listing:
  ;   * iSpecialW is stored as a 16-bit value everywhere (one path used to write
  ;     32 bits into the field that is otherwise accessed as a word).
  ;   * The "iDOoWDX > 0" and "iWn0 < 0" branches now match the C pseudo-code
  ;     they are annotated with (both were inverted).
  ;   * Redundant "test eax, eax" after and/xor removed; "cmp reg, 0" -> test.
  ;-----------------------------------------------------------------------------
  PUBLIC _MMXMLRast_4
  _MMXMLRast_4:
          push    ebp
          mov     StackPos, esp           ; remembered for the epilogue
          mov     eax, esp
          sub     esp, 0Ch                ; This will need to change if stack frame size changes.
          push    ebx
          push    esi
          push    edi

          mov     ebx, [eax+8]            ; ebx = pCtx (eax+0 = saved ebp, eax+4 = return address)
          ; PD3DI_RASTPRIM pP = pCtx->pPrim;
          mov     ecx, [ebx+RASTCTX_pPrim]

  ; while (pP)
  ; {
  PrimLoop:
          test    ecx, ecx
          jz      ExitPrimLoop

          ; UINT16 uSpans = pP->uSpans;
          movzx   eax, word ptr [ecx+RASTPRIM_uSpans]
          mov     uSpans, eax
          ; PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
          mov     ebp, ecx
          add     ebp, SIZEOF_RASTPRIM

  ; while (uSpans-- > 0)
  ; {
  SpanLoop:
          mov     edx, uSpans
          mov     eax, edx
          dec     eax
          mov     uSpans, eax             ; post-decrement: store first, test the old value
          test    edx, edx
          jle     ExitSpanLoop

  ;-----------------------------------------------------------------------------
  ; LoopAny code inserted here (instead of pCtx->pfnBegin(pCtx, pP, pS)) to get
  ; rid of an extra jump.
  ;-----------------------------------------------------------------------------
  ; Setup code begins

          ; uPix = pS->uPix;
          movzx   eax, word ptr [ebp+RASTSPAN_uPix]
          mov     uPix, eax
          ; pCtx->SI.iDW = 0x0;
          mov     dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0

          ; pCtx->SI.iU1 = d_WTimesUVoW(pS->iW, pS->iUoW1);
          ; pCtx->SI.iV1 = d_WTimesUVoW(pS->iW, pS->iVoW1);
          ; Each result is the high dword of (UVoW << 8) * (iW << 4).
          mov     esi, [ebp+RASTSPAN_iW]
          movq    mm5, MMWORD PTR [ebp+RASTSPAN_iUoW1]    ; low dword = iUoW1, high dword = iVoW1
          pslld   mm5, 8
          shl     esi, 4
          movd    eax, mm5
          psrlq   mm5, 32
          imul    esi                     ; edx:eax = (iUoW1 << 8) * (iW << 4)
          mov     [ebx+RASTCTX_SI+SPANITER_iU1], edx
          movd    eax, mm5
          imul    esi                     ; edx:eax = (iVoW1 << 8) * (iW << 4)
          mov     [ebx+RASTCTX_SI+SPANITER_iV1], edx

          ; if (pP->iDOoWDX > 0)
          ; {
          ;     // iSpecialW should be negative for the first 3 pixels of span
          ;     pCtx->SI.iSpecialW = -3;
          ; }
          ; FIX: the original branched with "jg", which ran the else-body when
          ; iDOoWDX > 0 - the opposite of the pseudo-code above.
          cmp     dword ptr [ecx+RASTPRIM_iDOoWDX], 0
          jle     SpecialWLastMonTest
          mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], -3
          jmp     DoneSpecialWifMonTest

          ; else
          ; {
          ;     // iSpecialW should be negative for the last 3 pixels of span
          ;     pCtx->SI.iSpecialW = 0x7fff - uPix;
          ;     pCtx->SI.iSpecialW += 5;   // this may wrap, but it should
          ; }
  SpecialWLastMonTest:
          mov     eax, 07fffh
          sub     eax, uPix
          add     eax, 5
          ; FIX: store 16 bits only. Every other access to iSpecialW in this
          ; routine is word-sized; the original dword store also overwrote the
          ; two bytes that follow the field.
          mov     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], ax
  DoneSpecialWifMonTest:

          ; if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
          ; {
          mov     eax, [ecx+RASTPRIM_uFlags]
          test    eax, D3DI_RASTPRIM_X_DEC
          jz      LeftToRightSpan
          ;     iZStep = -pCtx->iZStep;
          mov     eax, [ebx+RASTCTX_iZStep]
          neg     eax
          mov     iZStep, eax
          ;     iSurfaceStep = -pCtx->iSurfaceStep;
          mov     eax, [ebx+RASTCTX_iSurfaceStep]
          neg     eax
          mov     iSurfaceStep, eax
          jmp     DoneSpanDirif
          ; }
          ; else
          ; {
  LeftToRightSpan:
          ;     iZStep = pCtx->iZStep;
          mov     eax, [ebx+RASTCTX_iZStep]
          mov     iZStep, eax
          ;     iSurfaceStep = pCtx->iSurfaceStep;
          mov     eax, [ebx+RASTCTX_iSurfaceStep]
          mov     iSurfaceStep, eax
          ; }
  DoneSpanDirif:
  ; Setup code ends
  ;-----------------------------------------------------------------------------
  ; Loop code begins
  ; while (1)
  ; {
  PixelLoop:
          ;---------------------------------------------------------------------
          ; Z test, 16-bit unsigned.  edx = uZ, eax = uZB.
          ;   UINT16 uZ  = (UINT16)(pS->uZ >> 15);
          ;   UINT16 uZB = *((UINT16*)pS->pZ);
          ;   pS->uZ += pP->iDZDX;
          ;   if ((pCtx->iZXorMask) ^ (uZ > uZB)) -> fail
          ; Implemented as sign(uZB - uZ) XOR iZXorMask: a negative result
          ; fails the pixel. Presumably iZXorMask flips the sign bit to select
          ; the LE or GT compare named in the file header.
          ;---------------------------------------------------------------------
          mov     edx, [ebp+RASTSPAN_uZ]
          movd    mm4, edx
          mov     esi, [ebp+RASTSPAN_pZ]
          shr     edx, 15
          movzx   eax, word ptr [esi]
          sub     eax, edx                ; eax = uZB - uZ
          paddd   mm4, [ecx+RASTPRIM_iDZDX]
          movd    [ebp+RASTSPAN_uZ], mm4  ; Z is stepped whether or not the test passes
          xor     eax, [ebx+RASTCTX_iZXorMask]    ; xor sets SF, no separate test needed
          js      FailLabel
          mov     word ptr [esi], dx      ; passed: write the new depth

          ;---------------------------------------------------------------------
          ; Texture address (inlined Tex1Addr_TexAddrWrapMirror_Persp_Bilinear_NoMip)
          ;   PD3DI_SPANTEX pTex = &pCtx->Texture[0];
          ;---------------------------------------------------------------------
          mov     esi, [ebx+RASTCTX_pTexture]

          ; iU/iV are shifted right by (TEX_FINAL_SHIFT - iShiftU0 - 6) rather
          ; than (TEX_FINAL_SHIFT - iShiftU0), leaving 6 fraction bits for the
          ; bilinear weights. iLOD0 is zero in the monolithic case, so
          ; iShiftU0 == pTex->iShiftU and no subtraction of the LOD is needed.
          movq    mm5, MMWORD PTR Val0x000a000a   ; This is TEX_FINAL_SHIFT - 6 = 10.
          movd    mm4, [esi+SPANTEX_iShiftU]      ; one dword read: two packed 16-bit shift values
          psubw   mm5, mm4
          movq    mm4, mm5
          pand    mm5, MMWORD PTR Val0xffff       ; mm5 = shift count from the low word (applied to iU1)
          psrld   mm4, 16                         ; mm4 = shift count from the high word (applied to iV1)
          movd    mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
          psrad   mm1, mm5
          movd    mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
          psrad   mm2, mm4
          punpckldq mm1, mm2                      ; mm1 = iV : iU (high : low dword)
          psubd   mm1, MMWORD PTR Val0x0000002000000020   ; subtract 0x20 from both: half a texel in x.6

          ; The texture pitch cannot be computed, so build the pmaddwd
          ; multiplier (1 << iShiftPitch) : 1 in each dword; this gives
          ; address = U + V * pitch.
          movzx   edx, word ptr [esi+SPANTEX_iShiftPitch]
          add     edx, 16
          movd    mm2, edx
          movq    mm5, MMWORD ptr Makelow16one
          pslld   mm5, mm2
          por     mm5, MMWORD ptr Makelow16one    ; low word of each dword = 1

          ; INT32 iUFrac = iU00 & 0x03f;   INT32 iVFrac = iV00 & 0x03f;
          ; iU00 >>= 6;                    iV00 >>= 6;
          movq    mm2, mm1
          psrad   mm1, 6
          ; (original had "pand mm1, Val0x0000ffff0000ffff" commented out here)
          pand    mm2, dword ptr UFracVFracMask   ; UFracVFracMask = 0x0000003f0000003f
          ; Only 8 bits are used for the bilinear weights so pmullw can be
          ; used; the fractions have 6 bits, so shift up by 2.
          psllw   mm2, 2
          movq    mm0, mm2
          ; Replicate the VFrac value into all four words
          punpckhwd mm2, mm2
          punpcklwd mm2, mm2
          ; Replicate the UFrac value into all four words
          punpcklwd mm0, mm0
          punpcklwd mm0, mm0
          movq    dword ptr VFrac, mm2
          movq    dword ptr UFrac, mm0

          ; INT32 iU01 = iU00 + 1;   INT32 iV01 = iV00 + 1;
          packssdw mm1, mm1               ; replicate U and V into the upper two words
          paddw   mm1, dword ptr IncHighandLow16
          ; Per the original notes the words are now (high to low):
          ;   iV01, iU00, iV00, iU01
          ; which makes the bilinear texel lookup easier.

          ; UINT16 uMaskU0 = pTex->uMaskU >> iLOD0;  UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
          ; (iLOD0 == 0 in the monolith, so no shift.)
          movd    mm0, [esi+SPANTEX_uMaskU]       ; U and V mask in one dword
          punpckldq mm0, mm0                      ; replicate for all four coordinates

          ; Wrap / mirror all four coordinates at once:
          ;   iFlipN  = coord & iFlipMask{U,V};
          ;   iFlipN  = MMX_cmpeqw(iFlipN, 0);
          ;   iFlipN  = uMask{U,V}0 & ~iFlipN;
          ;   coord  &= uMask{U,V}0;
          ;   coord  ^= iFlipN;
          movq    mm7, mm1
          movd    mm4, [esi+SPANTEX_iFlipMaskU]
          punpckldq mm4, mm4
          pand    mm7, mm4
          pcmpeqw mm7, MMWORD PTR Zero
          pandn   mm7, mm0
          pand    mm1, mm0
          pxor    mm1, mm7

          ; mm1 still holds iV01, iU00, iV00, iU01. Build the other pairing
          ; (iV01, iU01, iV00, iU00) in mm3 by swapping the two U words.
          ; TBD Make this better - seven instructions for this seems excessive.
          movq    mm2, mm1
          movq    mm3, mm1
          pmaddwd mm1, mm5                ; mm1 = iU00 + iV01*pitch : iU01 + iV00*pitch
          pand    mm2, MMWORD ptr MaskKeepUValues
          pand    mm3, MMWORD ptr MaskKeepVValues
          movq    mm4, mm2
          psllq   mm2, 32
          psrlq   mm4, 32
          por     mm3, mm2
          por     mm3, mm4
          pmaddwd mm3, mm5                ; mm3 = iU01 + iV01*pitch : iU00 + iV00*pitch

          ; edi rather than esi for the texel base: esi must keep pointing at
          ; pTex (the original notes say Border and ColorKey code relies on it).
          mov     edi, [esi+SPANTEX_pBits]

          ;---------------------------------------------------------------------
          ; Texel fetch and filter (inlined TexRead_B5G6R5_NoBorder).
          ; Two texels are colour-converted at a time, one per dword.
          ;---------------------------------------------------------------------
          pxor    mm2, mm2                ; zero, used to widen bytes to words

          ; Row V0: texels addressed by the low dwords of mm3 and mm1.
          movd    eax, mm3
          movzx   eax, word ptr [edi+2*eax]
          movd    mm4, eax                ; texel (iU00, iV00)
          movd    eax, mm1
          movzx   eax, word ptr [edi+2*eax]
          movd    mm7, eax                ; texel (iU01, iV00)
          punpckldq mm7, mm4              ; mm7 = T(U0,V0) : T(U1,V0)

          ; 565 -> 888 for both dwords
          movq    mm5, mm7
          movq    mm4, mm7
          pand    mm5, MMWORD PTR BilinearMaskRed565to888     ; = 0x0000f8000000f800
          pand    mm7, MMWORD PTR BilinearMaskGreen565to888   ; = 0x000007e0000007e0
          pand    mm4, MMWORD PTR BilinearMaskBlue565to888    ; = 0x0000001f0000001f
          pslld   mm5, MMWORD PTR BilinearShiftRed565to888    ; = 8
          pslld   mm7, MMWORD PTR BilinearShiftGreen565to888  ; = 5
          pslld   mm4, MMWORD PTR BilinearShiftBlue565to888   ; = 3
          por     mm7, mm5                ; combine R+G
          por     mm7, mm4                ; combine (R+G) + B
          movq    mm4, mm7

          ; Widen each 8-bit channel to 16 bits (D3DCOLOR channels are handled
          ; as words from here on).
          punpckhbw mm4, mm2              ; mm4 = T(U0,V0) channels
          punpcklbw mm7, mm2              ; mm7 = T(U1,V0) channels
          psrlq   mm3, 32                 ; bring the V1-row address of mm3 into the low dword

          ; mm7 = (T(U1,V0) - T(U0,V0)) * UFrac + (T(U0,V0) << 8)      (8.8)
          psubw   mm7, mm4
          psllw   mm4, 8
          pmullw  mm7, dword ptr UFrac
          paddw   mm7, mm4

          ; Row V1: texels addressed by the high dwords of mm3 and mm1.
          movd    eax, mm3
          psrlq   mm1, 32
          movzx   eax, word ptr [edi+2*eax]
          movd    mm6, eax                ; texel (iU01, iV01)
          movd    eax, mm1
          movzx   eax, word ptr [edi+2*eax]
          movd    mm4, eax                ; texel (iU00, iV01)
          punpckldq mm6, mm4              ; mm6 = T(U0,V1) : T(U1,V1)

          movq    mm5, mm6
          movq    mm4, mm6
          pand    mm5, MMWORD PTR BilinearMaskRed565to888     ; = 0x0000f8000000f800
          pand    mm6, MMWORD PTR BilinearMaskGreen565to888   ; = 0x000007e0000007e0
          pand    mm4, MMWORD PTR BilinearMaskBlue565to888    ; = 0x0000001f0000001f
          pslld   mm5, MMWORD PTR BilinearShiftRed565to888    ; = 8
          pslld   mm6, MMWORD PTR BilinearShiftGreen565to888  ; = 5
          pslld   mm4, MMWORD PTR BilinearShiftBlue565to888   ; = 3
          por     mm6, mm5                ; combine R+G
          por     mm6, mm4                ; combine (R+G) + B
          movq    mm4, mm6

          punpckhbw mm4, mm2              ; mm4 = T(U0,V1) channels
          punpcklbw mm6, mm2              ; mm6 = T(U1,V1) channels

          ; mm6 = (T(U1,V1) - T(U0,V1)) * UFrac + (T(U0,V1) << 8)      (8.8)
          psubw   mm6, mm4
          psllw   mm4, 8
          pmullw  mm6, dword ptr UFrac
          movq    mm1, mm7                ; keep the 8.8 V0-row result
          paddw   mm6, mm4

          ; Blend the two rows: result = rowV0 + (rowV1 - rowV0) * VFrac
          psrlw   mm6, 8
          psrlw   mm7, 8
          psubw   mm6, mm7
          pmullw  mm6, dword ptr VFrac
          paddw   mm6, mm1

          ;---------------------------------------------------------------------
          ; Write: pack the filtered colour back to 565.
          ;   *(PUINT16)pS->pSurface =
          ;       ((pCtx->SI.uBR >>  0) & 0xf800) |
          ;       ((pCtx->SI.uBG >>  5) & 0x07e0) |
          ;       ((pCtx->SI.uBB >> 11) & 0x001f);
          ;---------------------------------------------------------------------
          mov     edi, [ebp+RASTSPAN_pSurface]
          psrlw   mm6, 8                  ; 8.8 -> 0.8 per channel
          packuswb mm6, mm7               ; only the low dword (the packed colour) is used
          movq    mm3, mm6
          pand    mm6, MMWORD PTR u888to565RedBlueMask
          pmaddwd mm6, MMWORD PTR u888to565Multiplier
          pand    mm3, MMWORD PTR u888to565GreenMask
          por     mm6, mm3
          psrld   mm6, 5
          movd    edx, mm6
          mov     [edi], dx

  FailLabel:
          ; if (--uPix <= 0)
          ;     break;
          dec     uPix                    ; BUG BUG?? uPix should never start as zero should it?
                                          ; if so, this is a bug.
          jle     ExitPixelLoop

          ;---------------------------------------------------------------------
          ; Per-pixel update, done after the length test so the last pixel
          ; does not pay for an unused update.
          ;---------------------------------------------------------------------
          ; pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
          ; pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
          movq    mm1, [ebp+RASTSPAN_uB]
          paddw   mm1, [ecx+RASTPRIM_iDBDX]
          movq    [ebp+RASTSPAN_uB], mm1

          ; pS->iUoW1 += pP->iDUoW1DX;
          ; pS->iVoW1 += pP->iDVoW1DX;
          movq    mm5, [ebp+RASTSPAN_iUoW1]
          paddd   mm5, [ecx+RASTPRIM_iDUoW1DX]
          movq    [ebp+RASTSPAN_iUoW1], mm5       ; mm5 stays live until the W * UVoW multiply below

          ; pS->iOoW += pP->iDOoWDX;
          mov     eax, [ebp+RASTSPAN_iOoW]
          add     eax, [ecx+RASTPRIM_iDOoWDX]
          mov     [ebp+RASTSPAN_iOoW], eax        ; eax = iOoW stays live into the W iteration

          ; INT32 iWn0 = pS->iW + pCtx->SI.iDW;   // 1.15.16
          ; TODO Could do this and the OoW add at the same time with MMX.
          mov     edx, [ebp+RASTSPAN_iW]
          mov     LastW, edx              ; Save iW to calc iDW for next time.
          add     edx, [ebx+RASTCTX_SI+SPANITER_iDW]

          ; if (pCtx->SI.iSpecialW < 0)
          ; {
          xor     edi, edi
          cmp     di, word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
          jle     DontDoSpecialW1         ; 0 <= iSpecialW: take the fast path

          ;     if (iWn0 < 0)
          ;     {
          ;         iWn0 = pS->iW >> 1;   // use iW/2 as a guess, instead
          ;     }
          ; FIX: the original used "jl", which kept a negative guess and
          ; replaced a valid one - the opposite of the pseudo-code above.
          cmp     edx, edi
          jge     WOutOfRange1
          mov     edx, LastW
          sar     edx, 1
  WOutOfRange1:
          ;     INT32 iGiveUp = 7;
          ; The loop below is post-tested, so it always runs at least once and
          ; the C code's iWnOld bias is unnecessary.
          mov     GiveUp, 8               ; Pre decrementing instead of post decrementing.

          ;     while ((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
          ;     {
  SpecW1Loop1:
          ; Could move this to bottom of loop and combine results somehow.
          ; TBD look at it more.
          dec     GiveUp
          jz      ExitSpecWLoop1

          mov     esi, (1 SHL 16)         ; 2.0 in 1.16.15, for "2 - iOoW*iW"
          ;         iWnOld = iWn0;
          mov     edi, edx
          ;         iWn1 = imul32h(pS->iOoW, iWn0);  // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
          imul    edx                     ; eax = iOoW on entry
          ;         iWn1 = (1L<<16) - iWn1;          // 2.0 - iWn1
          sub     esi, edx

          ;         while (iWn1.i < 0)
          ;         {
  SpecW1Loop2:
          test    esi, esi
          jns     SpecW1ExitLoop2         ; This jump should be predicted correctly most of the time.
          ;             iWn1 = (iWn1 + (1L<<15)) >> 1;   // iWn1 = (iWn1 + 1.0)/2
          add     esi, (1 SHL 15)
          sar     esi, 1
          jmp     SpecW1Loop2
          ;         }
  SpecW1ExitLoop2:
          ;         iWn0 = imul32h(iWn1, iWn0) << 2;
          ; Done here as 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16, so no
          ; post-shift is required.
          mov     eax, edi
          shl     eax, 5                  ; 1.15.16 << 5 = 1.10.21 TBD Can I shift off upper bits??
          shl     esi, 12                 ; 4.15 << 12 = 4.27
          mul     esi                     ; edx = new iWn0

          ; abs(iWnOld - iWn0) > 0x20 ?
          sub     edi, edx
          mov     eax, edi
          sar     eax, 31
          xor     edi, eax
          sub     edi, eax                ; edi = |iWnOld - iWn0|
          cmp     edi, 020h               ; Assuming that loop will only happen once.
          jbe     ExitSpecWLoop1
          mov     eax, [ebp+RASTSPAN_iOoW]        ; reload iOoW for the next pass
          jmp     SpecW1Loop1
          ;     }
          ; }
          ; else
          ; {
  DontDoSpecialW1:
          ; Everything should be positive in the non-SpecialW case, so unsigned
          ; multiplies are used.
          mov     esi, (1 SHL 16)
          mov     edi, edx
          ;     iWn1 = (iOoW*iWn0) >> 15;         // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
          mul     edx
          ;     iWn1 = (1L<<16) - iWn1;           // 2.0 - iWn1
          sub     esi, edx
          ;     iWn1 <<= 15;
          shl     esi, 15                 ; 0.16.15 << 15 = 0.2.30
          mov     eax, esi
          ;     iWn0 = imul32h(iWn1, iWn0) << 2;
          mul     edi                     ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14
          shl     edx, 2                  ; 1.17.14 << 2 = 1.15.16
          ; }
  ExitSpecWLoop1:
          ; pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
          ; pS->iW = iWn0;
          mov     [ebp+RASTSPAN_iW], edx
          mov     esi, edx                ; Save W for multiplying by UoW and VoW
          sub     edx, LastW
          mov     [ebx+RASTCTX_SI+SPANITER_iDW], edx
          ; pCtx->SI.iSpecialW += 1;      // this is supposed to wrap past 0x7fff sometimes
          inc     word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]

          ; pCtx->SI.iU1 = d_WTimesUVoW(pS->iW, pS->iUoW1);
          ; pCtx->SI.iV1 = d_WTimesUVoW(pS->iW, pS->iVoW1);
          pslld   mm5, 8                  ; mm5 = stepped iVoW1 : iUoW1 from above
          shl     esi, 4
          movd    eax, mm5
          psrlq   mm5, 32
          imul    esi
          mov     [ebx+RASTCTX_SI+SPANITER_iU1], edx
          movd    eax, mm5
          imul    esi
          mov     [ebx+RASTCTX_SI+SPANITER_iV1], edx

          ; pS->pZ += iZStep;
          ; pS->pSurface += iSurfaceStep;
          mov     eax, dword ptr [ebp+RASTSPAN_pZ]
          mov     edx, dword ptr [ebp+RASTSPAN_pSurface]
          add     eax, iZStep
          add     edx, iSurfaceStep
          mov     dword ptr [ebp+RASTSPAN_pZ], eax
          mov     dword ptr [ebp+RASTSPAN_pSurface], edx

          ; #ifdef DBG
          ; // handy for debug to see where we are
          ; //pS->uX += (INT16)pCtx->SI.iXStep;
          ; #endif
          ; } // while
          jmp     PixelLoop
  ExitPixelLoop:
  ; Loop code ends
  ;-----------------------------------------------------------------------------
  ; LoopAny code ends here
  ;-----------------------------------------------------------------------------
          ; pS++;
          add     ebp, SIZEOF_RASTSPAN
          ; }
          jmp     SpanLoop
  ExitSpanLoop:
          ; pP = pP->pNext;
          mov     ecx, [ecx+RASTPRIM_pNext]
          ; }
          jmp     PrimLoop
  ExitPrimLoop:
          emms                            ; leave MMX state before returning
          ; return S_OK;
          xor     eax, eax
          pop     edi
          pop     esi
          pop     ebx
          mov     esp, StackPos           ; discards the 0Ch bytes reserved in the prologue
          pop     ebp
          ret
  686. END