Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

814 lines
24 KiB

  1. ;-----------------------------------------------------------------------------
  2. ;
  3. ;
  4. ; Monolith 18. Perspective Correct Bi-linear
  5. ; X888 input texture 16 bit Z buffered (LE or GT)
  6. ; X888 output.
  7. ;
  8. ; Exactly the same as monolith 4 except color input is 32 bits and
  9. ; output is 32 bits
  10. ;
  11. ;-----------------------------------------------------------------------------
  ; Assembler setup for the _MMXMLRast_18 monolithic span routine: include
  ; files, CPU/model directives, externally defined MMX constants, local
  ; equates, and the module-level scratch variables used by the span and
  ; pixel loops.
  12. INCLUDE iammx.inc
  13. INCLUDE offs_acp.inc
  14. ; Names are read LSB to MSB, so B5G6R5 means five bits of blue starting
  15. ; at the LSB, then six bits of green, then five bits of red.
  16. ;TBD check to see if this value is correct.
  ; NOTE(review): COLOR_SHIFT is not referenced by the code visible in this file.
  17. COLOR_SHIFT equ 8
  18. .586
  19. .model flat
  20. ; Big separating lines separate code into span code
  21. ; and loop code. If span and loop are not going to
  22. ; end up being combined then it will be easy to
  23. ; separate the code.
  24. .data
  25. ; Need externs for all of the variables that are needed for various beads
  ; All of these are declared as 64-bit (MMWORD) objects defined elsewhere.
  ; Only a subset is actually used by the routine in this file.
  26. EXTERN IncHighandLow16:MMWORD
  27. EXTERN UFracVFracMask:MMWORD
  28. EXTERN UV32to15Mask:MMWORD
  29. EXTERN Makelow16one:MMWORD
  30. EXTERN MaskKeepUValues:MMWORD
  31. EXTERN MaskKeepVValues:MMWORD
  32. EXTERN UFrac:MMWORD
  33. EXTERN VFrac:MMWORD
  34. EXTERN Zero:MMWORD
  35. EXTERN memD3DTFG_POINT:MMWORD
  36. EXTERN GiveUp:MMWORD
  37. EXTERN LastW:MMWORD
  38. EXTERN Val0x000a000a:MMWORD
  39. EXTERN Val0xffff:MMWORD
  40. EXTERN Val0x0000002000000020:MMWORD
  41. EXTERN Val0x0000ffff0000ffff:MMWORD
  42. EXTERN MaskRed565to888:MMWORD
  43. EXTERN MaskGreen565to888:MMWORD
  44. EXTERN MaskBlue565to888:MMWORD
  45. EXTERN MaskRed555to888:MMWORD
  46. EXTERN MaskGreen555to888:MMWORD
  47. EXTERN MaskBlue555to888:MMWORD
  48. EXTERN MaskAlpha1555to8888:MMWORD
  49. EXTERN MaskRed1555to8888:MMWORD
  50. EXTERN MaskGreen1555to8888:MMWORD
  51. EXTERN MaskBlue1555to8888:MMWORD
  52. ; TBD. I think that I want to do 0xffff instead of 0xff. This will
  53. ; have to be checked. There is a value very similar to this in
  54. ; buf write.
  55. EXTERN SetAlphato0xffff:MMWORD
  56. EXTERN SetAlphato0x00:MMWORD
  57. ; TODO These equates are identical to the ones in texread.mas. Maybe they should be in a common .inc file.
  ; NOTE(review): none of the shift equates below is referenced by the code
  ; visible in this file.
  58. RedShift565to888 equ 8
  59. GreenShift565to888 equ 5
  60. BlueShift565to888 equ 3
  61. RedShift555to888 equ 9
  62. GreenShift555to888 equ 6
  63. BlueShift555to888 equ 3
  64. AlphaShift1555to8888 equ 16
  65. RedShift1555to8888 equ 9
  66. GreenShift1555to8888 equ 6
  67. BlueShift1555to8888 equ 3
  68. EXTERN BilinearMaskRed565to888:MMWORD
  69. EXTERN BilinearMaskGreen565to888:MMWORD
  70. EXTERN BilinearMaskBlue565to888:MMWORD
  71. ; These are not needed as qwords since they can be done with constants
  72. EXTERN BilinearShiftRed565to888:MMWORD
  73. EXTERN BilinearShiftGreen565to888:MMWORD
  74. EXTERN BilinearShiftBlue565to888:MMWORD
  ; NOTE(review): Zero was already declared earlier in this list (duplicate EXTERN).
  75. EXTERN Zero:MMWORD
  76. EXTERN DW_One_One:MMWORD
  77. EXTERN MaskOffAlpha:MMWORD
  78. EXTERN ShiftTA:MMWORD
  79. EXTERN Val0x00ff00ff00ff00ff:MMWORD
  80. EXTERN Val0x000000ff00ff00ff:MMWORD
  81. EXTERN Val0X0000000001000000:MMWORD
  82. EXTERN AlphaVal128:MMWORD
  83. EXTERN RGBVal128:MMWORD
  84. EXTERN g_uDitherValue:MMWORD
  ; NOTE(review): SetAlphato0x00 was already declared earlier in this list
  ; (duplicate EXTERN).
  85. EXTERN SetAlphato0x00:MMWORD
  86. EXTERN u888to565RedBlueMask:MMWORD
  87. EXTERN u888to565GreenMask:MMWORD
  88. EXTERN u888to565Multiplier:MMWORD
  89. EXTERN uVal0x000007ff03ff07ff:MMWORD
  90. EXTERN uVal0x0000078003c00780:MMWORD
  91. EXTERN u888to555RedBlueMask:MMWORD
  92. EXTERN u888to555GreenMask:MMWORD
  93. EXTERN u888to555Multiplier:MMWORD
  94. EXTERN uVal0x000007ff07ff07ff:MMWORD
  95. EXTERN uVal0x0000078007800780:MMWORD
  96. ;-----------------------------------------------------------------------------
  97. ; Span Variables
  ; These scratch variables live in .data rather than on the stack, so every
  ; invocation of the routine shares the same storage.
  ; StackPos: esp captured at routine entry (after push ebp); reloaded into esp
  ;           in the epilogue.
  ; uSpans:   number of spans still to process in the current primitive.
  98. StackPos dd ?
  99. uSpans dd ?
  100. ;-----------------------------------------------------------------------------
  101. ;-----------------------------------------------------------------------------
  102. ; Loop Variables
  ; iSurfaceStep / iZStep: per-pixel byte steps for pS->pSurface and pS->pZ,
  ;                        negated when the span runs in the X-decreasing direction.
  ; uPix:                  pixels still to process in the current span.
  103. iSurfaceStep dd ?
  104. iZStep dd ?
  105. uPix dd ?
  106. ;-----------------------------------------------------------------------------
  107. .code
  ;-----------------------------------------------------------------------------
  ; _MMXMLRast_18 -- entry point, primitive/span loop headers and per-span setup.
  ;
  ; In:   first stack argument (read from [esp+8] after 'push ebp') = pCtx
  ; Out:  eax = 0 (set in the epilogue at ExitPrimLoop)
  ;
  ; Register roles established here and kept for the rest of the routine:
  ;   ebx = pCtx (rasterizer context)
  ;   ecx = pP   (current primitive, walked via RASTPRIM_pNext)
  ;   ebp = pS   (current span; the first span follows the primitive header)
  ; ebx/esi/edi/ebp are saved here and restored in the epilogue.
  ; Locals live in module-level variables (StackPos, uSpans, uPix, iZStep,
  ; iSurfaceStep), not on the stack.
  ;-----------------------------------------------------------------------------
  108. PUBLIC _MMXMLRast_18
  109. _MMXMLRast_18:
  110. push ebp
  ; Remember esp so the epilogue can drop the frame with a single mov.
  111. mov StackPos, esp
  112. mov eax, esp
  ; Reserves 12 bytes; no esp-relative access to them is visible in this file.
  113. sub esp, 0Ch ; This will need to change if stack frame size changes.
  114. push ebx
  115. push esi
  116. push edi
  117. ; Put pCtx into ebx
  ; eax = esp after 'push ebp': [eax] = saved ebp, [eax+4] = return address,
  ; [eax+8] = first argument.
  118. mov ebx, [eax+8]
  119. ;PD3DI_RASTPRIM pP = pCtx->pPrim;
  120. mov ecx, [ebx+RASTCTX_pPrim]
  121. ;while (pP)
  122. ;{
  123. PrimLoop:
  124. cmp ecx, 0
  125. je ExitPrimLoop
  126. ;UINT16 uSpans = pP->uSpans;
  127. movzx eax, word ptr [ecx+RASTPRIM_uSpans]
  128. mov uSpans, eax
  129. ;PD3DI_RASTSPAN pS = (PD3DI_RASTSPAN)(pP + 1);
  130. mov ebp, ecx
  131. add ebp, SIZEOF_RASTPRIM
  132. ;while (uSpans-- > 0)
  133. ;{
  134. SpanLoop:
  ; Post-decrement: the pre-decrement value (edx) is tested, the decremented
  ; value is stored back.
  135. mov edx, uSpans
  136. mov eax, edx
  137. dec eax
  138. mov uSpans, eax
  139. test edx, edx
  140. jle ExitSpanLoop
  141. ;pCtx->pfnBegin(pCtx, pP, pS);
  142. ;-----------------------------------------------------------------------------
  143. ; LoopAny code inserted here. This is to get rid of an extra
  144. ; jump.
  145. ;-----------------------------------------------------------------------------
  146. ; Setup Code begins
  147. ; get values to iterate
  148. ;uPix = pS->uPix;
  149. movzx eax, word ptr [ebp+RASTSPAN_uPix]
  150. mov uPix, eax
  151. ;pCtx->SI.iDW = 0x0;
  152. mov dword ptr [ebx+RASTCTX_SI+SPANITER_iDW], 0
  153. mov esi, [ebp+RASTSPAN_iW]
  ; One qword load starting at iUoW1; the high dword is used as iVoW1 below
  ; (assumes iVoW1 immediately follows iUoW1 in the span -- TODO confirm).
  154. movq mm5, MMWORD PTR [ebp+RASTSPAN_iUoW1]
  155. ;pCtx->SI.iUd_TexNum = d_WTimesUVoW(pS->iW,pS->iUoW1);
  156. ;pCtx->SI.iVd_TexNum = d_WTimesUVoW(pS->iW,pS->iVoW1);
  ; Pre-shift both operands, then keep the high 32 bits (edx) of each signed
  ; 64-bit product as the initial iU1 / iV1.
  157. pslld mm5, 8
  158. shl esi, 4
  159. movd eax, mm5
  160. psrlq mm5, 32
  161. imul esi
  162. mov [ebx+RASTCTX_SI+SPANITER_iU1], edx
  163. movd eax, mm5
  164. imul esi
  165. mov [ebx+RASTCTX_SI+SPANITER_iV1], edx
  166. ;if (pP->iDOoWDX > 0)
  167. ;{
  ; NOTE(review): as coded, iDOoWDX > 0 takes the "last 3 pixels" path and
  ; iDOoWDX <= 0 takes the "first 3 pixels" path, which is the opposite of the
  ; C pseudo-code in these comments. Left unchanged -- confirm which is intended.
  168. cmp dword ptr [ecx+RASTPRIM_iDOoWDX], 0
  169. jg SpecialWLastMonTest
  170. ;// iSpecialW should be negative for the first 3 pixels of span
  171. ;pCtx->SI.iSpecialW = -3;
  172. mov word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], -3
  173. jmp DoneSpecialWifMonTest
  174. ;}
  175. ;else
  176. ;{
  177. SpecialWLastMonTest:
  178. ;// iSpecialW should be negative for the last 3 pixels of span
  179. ;pCtx->SI.iSpecialW = 0x7fff - uPix;
  180. mov eax, 07fffh
  181. sub eax, uPix
  182. ;pCtx->SI.iSpecialW += 5; // this may wrap, but it should
  183. add eax, 5
  ; Store only the low 16 bits. iSpecialW is accessed as a word everywhere else
  ; in this routine, and a dword store here would also overwrite the two bytes
  ; that follow the field in the context.
  184. mov word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW], ax
  185. ;}
  186. DoneSpecialWifMonTest:
  187. ;if (pP->uFlags & D3DI_RASTPRIM_X_DEC)
  188. ;{
  189. mov eax, [ecx+RASTPRIM_uFlags]
  190. and eax, D3DI_RASTPRIM_X_DEC
  191. test eax, eax
  192. jz LeftToRightSpan
  ; X-decreasing span: walk the Z and colour buffers backwards.
  193. ;iZStep = -pCtx->iZStep;
  194. mov eax, [ebx+RASTCTX_iZStep]
  195. neg eax
  196. mov iZStep, eax
  197. ;iSurfaceStep = -pCtx->iSurfaceStep;
  198. mov eax, [ebx+RASTCTX_iSurfaceStep]
  199. neg eax
  200. mov iSurfaceStep, eax
  201. ;}
  202. jmp DoneSpanDirif
  203. ;else
  204. ;{
  205. LeftToRightSpan:
  206. ;iZStep = pCtx->iZStep;
  207. mov eax, [ebx+RASTCTX_iZStep]
  208. mov iZStep, eax
  209. ;iSurfaceStep = pCtx->iSurfaceStep;
  210. mov eax, [ebx+RASTCTX_iSurfaceStep]
  211. mov iSurfaceStep, eax
  212. ;}
  213. DoneSpanDirif:
  214. ; Setup Code Ends
  215. ; ----------------------------------------------------------------------------------------------------------------
  216. ; Loop Code Begins
  217. ;//while (1)
  218. ;//{
  219. PixelLoop:
  ; Per-pixel body: (1) 16-bit Z test and Z write, (2) compute the four
  ; wrapped/mirrored texel addresses for bilinear filtering, (3) fetch four
  ; 32-bit texels and blend them with the U/V fractions, (4) store one 32-bit
  ; pixel. Expects ebx = pCtx, ecx = pP, ebp = pS. A failed Z test jumps to
  ; FailLabel; a passed one falls through into it after the pixel store.
  220. ; Ztestcode
  221. ; edx is uZ
  222. ; eax is uZB
  223. ; 16 bit unsigned format
  224. ;UINT16 uZ = (UINT16)(pS->uZ>>15);
  225. ;UINT16 uZB = *((UINT16*)pS->pZ);
  226. mov edx, [ebp+RASTSPAN_uZ]
  227. movd mm4, edx
  228. mov esi, [ebp+RASTSPAN_pZ]
  229. shr edx, 15
  230. movzx eax, word ptr [esi]
  231. ;pS->uZ += pP->iDZDX;
  232. ;if ((pCtx->iZXorMask)^(uZ > uZB))
  233. ; !(uZ > uZB) <==>
  234. ; (uZ <= uZB) <==>
  235. ; (uZ < uZB+1) <==>
  236. ;
  ; eax = uZB - uZ; its sign (optionally flipped by iZXorMask) decides the test.
  237. sub eax, edx
  ; uZ is stepped before the branch, so both the pass and fail paths see the
  ; updated value. Only the low dword of the sum is stored back.
  238. paddd mm4, [ecx+RASTPRIM_iDZDX]
  239. movd [ebp+RASTSPAN_uZ], mm4
  240. xor eax, [ebx+RASTCTX_iZXorMask]
  241. test eax, eax
  242. js FailLabel
  ; Z test passed: write the new 16-bit depth.
  243. mov word ptr [esi], dx
  244. ; texturecode
  245. ;---------------------------------------------------------------------------
  246. ;void Tex1Addr_TexAddrWrapMirror_Persp_Bilinear_NoMip(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  247. ; PD3DI_RASTSPAN pS)
  248. ;{
  249. ;PD3DI_SPANTEX pTex = &pCtx->Texture[0];
  ; esi = pTex for the rest of the texture code.
  250. mov esi, [ebx+RASTCTX_pTexture]
  251. ; ----------------------------------------
  252. ; Doing UV calculation a little more accurate
  253. ; Exactly like C code.
  254. ; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
  255. ; (TEX_FINAL_SHIFT - iShiftU0 - 6). iShiftU0 = pTex->iShiftU - iLOD0
  256. ; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
  257. ; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
  258. ; COMMENT1**
  259. ; If textures have a max of 1024 then shiftU0 would be at most 10 which would
  260. ; make (TEXT_FINAL_SHIFT - iShiftU - 6) at most zero. This is why I choose 6
  261. ; It will also give bi-linear 6 bits of precision I think it was said that
  262. ; only five was needed.
  263. ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
  264. ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
  265. movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
  266. ;iLOD0 is zero in monolithic case so no subtraction needed.
  ; One dword load; the low word is used as the U shift and the high word as
  ; the V shift (assumes iShiftV is the word following iShiftU -- TODO confirm).
  267. movd mm4, [esi+SPANTEX_iShiftU]
  268. psubw mm5, mm4
  ; Split the two word results: mm5 = U shift count, mm4 = V shift count.
  269. movq mm4, mm5
  270. pand mm5, MMWORD PTR Val0xffff
  271. psrld mm4, 16
  272. movd mm1, [ebx+RASTCTX_SI+SPANITER_iU1]
  273. psrad mm1, mm5
  274. movd mm2, [ebx+RASTCTX_SI+SPANITER_iV1]
  275. psrad mm2, mm4
  ; mm1 = { high dword: V, low dword: U }, each with 6 fraction bits.
  276. punpckldq mm1, mm2
  ; Going by its name, the constant subtracts 0x20 from each dword; with 6
  ; fraction bits that would be one half (presumably the bilinear centre offset).
  277. psubd mm1, MMWORD PTR Val0x0000002000000020
  278. ; Texture Pitch cannot be calculated so it must be looked up in the iShiftPitch table
  ; mm5 = (Makelow16one << (iShiftPitch + 16)) | Makelow16one, used below as
  ; the pmaddwd multiplier pair for address generation.
  279. movzx edx, word ptr [esi+SPANTEX_iShiftPitch]
  280. add edx, 16
  281. movd mm2, edx
  282. movq mm5, MMWORD ptr Makelow16one
  283. pslld mm5, mm2
  284. por mm5, MMWORD ptr Makelow16one
  285. ; Make the low 16 bits of dword one
  286. ; This helps in calculating texture address.
  287. ; Gets U and V value into mm1 so that it can be mirrored, wrapped or
  288. ; clamped. This can be done for two values in the point case
  289. ; or four values in the bilinear case.
  290. ;INT32 iUFrac = iU00 & 0x03f;
  291. ;INT32 iVFrac = iV00 & 0x03f;
  292. ;iU00 >>= 6;
  293. ;iV00 >>= 6;
  ; mm2 keeps the fractions, mm1 becomes the integer texel coordinates.
  294. movq mm2, mm1
  295. psrad mm1, 6
  296. ;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
  297. pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
  298. ; Going to use only 8 bits for bi-linear so that I can do a pmullw.
  299. ; Currently at 6 bits so shift up by 2.
  300. psllw mm2, 2
  301. movq mm0, mm2
  302. ; Replicate VFrac value for bilinear
  303. punpckhwd mm2, mm2
  304. punpcklwd mm2, mm2
  305. ; Replicate UFrac Value for bilinear
  306. punpcklwd mm0, mm0
  307. punpcklwd mm0, mm0
  ; The replicated fractions are spilled to the external VFrac/UFrac qwords and
  ; read back as pmullw operands in the filter below.
  308. movq dword ptr VFrac, mm2
  309. movq dword ptr UFrac, mm0
  310. ;INT32 iU01 = iU00 + 1;
  311. ;INT32 iV01 = iV00 + 1;
  312. packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
  313. paddw mm1, dword ptr IncHighandLow16
  314. ; This will make texture values be (High word to low word):
  315. ; iV01, iU00, iV00, iU01
  316. ; Need to do this to make texture look up for bilinear easier.
  317. ; I have to combine to get all combinations anyway. It just
  318. ; happens to be better for me to have iV00, iU01 pair first.
  319. ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
  320. ; put mask in mm3 and replicate to match location for wrap/mirror/clamp
  ; NOTE(review): the comment above says mm3, but the mask is loaded into mm0.
  321. movd mm0, [esi+SPANTEX_uMaskU] ; Load U and V mask
  322. ; replicate mask if doing bilinear
  323. punpckldq mm0, mm0
  324. ; Monolith cases assumed that iLOD0 was zero so no shift needed.
  325. ;INT16 iFlip;
  326. ; MM1 should contain 16 bit iU and iV for both texture locations
  327. ; End Result is MM1 value wrapped or mirrored
  328. ; in Bilinear Case, four values can be done
  329. ; iU00, iV00, iU01, iV01
  330. ; This code really does a lot for the bilinear case and is kinda wasteful
  331. ; in the normal mode.
  332. ;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
  333. movq mm7, mm1
  334. ; Point doesnt need replication
  335. movd mm4, [esi+SPANTEX_iFlipMaskU]
  336. ; if bilinear replicate values together, Point doesnt need this.
  337. punpckldq mm4, mm4
  338. ; Monolith cases assumed that iLOD0 was zero so no shift needed.
  339. pand mm7, mm4
  340. ;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
  341. pcmpeqw mm7, MMWORD PTR Zero
  342. ;iFlip1 = uMaskU0 & ~ iFlip1; ;iFlip2 = uMaskV0 & ~ iFlip2; ;iFlip3 = uMaskU0 & ~ iFlip3; ;iFlip4 = uMaskV0 & ~ iFlip4;
  343. pandn mm7, mm0
  344. ;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
  345. pand mm1, mm0
  346. ;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
  347. pxor mm1, mm7
  348. ; Result in mm1 now since TexAddrAll ends up that way.
  349. ; Making other two cases for texture addressing has to be simpler than
  350. ; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
  351. ; TBD Make this better.
  352. ; values are still stored as iV01, iU00, iV00, iU01
  353. movq mm2, mm1
  354. movq mm3, mm1
  355. ; Calculate 1st and 3rd texel address.
  356. pmaddwd mm1, mm5 ; Throw in first address calculation.
  357. ; Just to get it started. Calculate
  358. ; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
  359. ; values are being changed to iV01, iU01, iV00, iU00
  360. ; seven instructions for this seems excessive.
  ; Swap the two U words between the dword halves while keeping the V words in
  ; place, giving the other two U/V pairings in mm3.
  361. pand mm2, MMWORD ptr MaskKeepUValues
  362. pand mm3, MMWORD ptr MaskKeepVValues
  363. movq mm4, mm2
  364. psllq mm2, 32
  365. psrlq mm4, 32
  366. por mm3, mm2
  367. por mm3, mm4
  368. ; From here until mov edi is code that is needed for border.
  369. ; all sign bits are stored in bytes so that border code can tell if uv went below zero.
  370. ; Calculate 2nd and 4th texel address.
  371. pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
  ; edi = base of the texel data; texels are indexed as dwords ([edi+4*index]).
  372. mov edi, [esi+SPANTEX_pBits]
  373. ; was esi. Cant change to esi because it is the pointer to pTex
  374. ; which is used by Border and ColorKey. Use edi for now and
  375. ; call routines through memory. Figure out if this is bad.
  376. ; load the read texture routine address into a register early
  377. ;mov edi, [ebx+RASTCTX_pfnTexRead]
  378. ;UINT32 uTex00 = pCtx->pfnTexRead(iU00, iV00, pTex->iShiftU,
  379. ; pTex->pBits[iLOD0], &pCtx->Texture[0]);
  380. ; Combine U and V values before making call.
  381. ;call edi
  382. ; -------------------- In Monolithic version calls are inlined.
  383. ;D3DCOLOR TexRead_B5G6R5_NoBorder(INT32 iU, INT32 iV, INT32 iShiftU, PUINT8 pBits, PD3DI_SPANTEX pTex)
  384. ;{
  ; NOTE(review): the C prototype above names a B5G6R5 reader, but the loads
  ; below fetch 32-bit texels, matching the X888 input stated in the file header.
  385. ; iV0 iU1 address should be done by now.
  ; mm2 = 0, used to zero-extend texel bytes to words.
  386. pxor mm2, mm2
  387. ; 1st (mm1) and 2nd (mm3) texel
  388. movd eax, mm3 ; load 2nd texel address
  389. movd mm4, dword ptr [edi+4*eax]
  390. movd eax, mm1 ; load 1st texel address
  391. movd mm7, dword ptr [edi+4*eax]
  392. ; mm4 calculated from high 32 bits of mm3 (2nd texel)
  393. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  394. ; bits for each color component
  395. punpcklbw mm4, mm2
  396. ; mm7 calculated from low 32 bits of mm1 (1st texel)
  397. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  398. ; bits for each color component
  399. punpcklbw mm7, mm2
  400. psrlq mm3, 32 ; shift 4th texel address into low 32 bits
  401. ; mm7 = final calc on 1st and 2nd texel
  ; mm7 = (texel1 - texel2) * UFrac + (texel2 << 8), per 16-bit channel (8.8 result).
  402. psubw mm7, mm4
  403. psllw mm4, 8
  404. pmullw mm7, dword ptr UFrac
  405. paddw mm7, mm4
  406. ; 3rd (mm1) and 4th (mm3) texel
  407. movd eax, mm3 ; load 4th texel address
  408. psrlq mm1, 32 ; shift 3rd texel address into low 32 bits
  409. movd mm6, dword ptr [edi+4*eax] ; mm6 = 4th texel
  410. movd eax, mm1 ; load 3rd texel address
  411. movd mm4, dword ptr [edi+4*eax]
  412. ; mm6 = 4th texel (high 32 bits), 3rd texel (low 32 bits)
  413. ; mm4 calculated from high 32 bits of mm3 (4th texel)
  414. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  415. ; bits for each color component
  416. punpcklbw mm4, mm2
  417. ; mm6 calculated from low 32 bits of mm1 (3rd texel)
  418. ; pad high 8 bits of each component with zeros because D3DCOLOR has 16
  419. ; bits for each color component
  420. punpcklbw mm6, mm2
  ; Same horizontal blend for the second texel pair, accumulated in mm6.
  421. psubw mm6, mm4
  422. psllw mm4, 8
  423. pmullw mm6, dword ptr UFrac
  ; Keep the 8.8 form of the first blend in mm1 for the final add.
  424. movq mm1, mm7
  425. ; mm6 = final calc on 3rd and 4th texel
  426. paddw mm6, mm4
  427. ; mm4 = final calc on 1st+2nd texel and 3rd+4th texel
  ; Vertical blend: mm6 = ((row2 >> 8) - (row1 >> 8)) * VFrac + row1 (8.8).
  428. psrlw mm6, 8
  429. psrlw mm7, 8
  430. psubw mm6, mm7
  431. pmullw mm6, dword ptr VFrac
  432. paddw mm6, mm1
  433. ; write
  434. ;*(PUINT16)pS->pSurface =
  435. ; ((pCtx->SI.uBR >> 0) & 0xf800) |
  436. ; ((pCtx->SI.uBG >> 5) & 0x07e0) |
  437. ; ((pCtx->SI.uBB >> 11) & 0x001f);
  ; NOTE(review): the C fragment above describes a 16-bit 565 store; the code
  ; below stores one 32-bit pixel, matching the X888 output in the file header.
  438. mov edi, [ebp+RASTSPAN_pSurface]
  439. psrlw mm6, 8 ; Convert color1 from 8.8 to 0.8
  ; The low four bytes of the pack result come from mm6; the bytes packed from
  ; mm7 land in the high dword, which is not stored.
  440. packuswb mm6, mm7 ; pack one color
  441. pand mm6, MMWORD PTR SetAlphato0x00 ; = 0x00ffffff
  442. movd [edi], mm6
  443. FailLabel:
  ; Common tail for every pixel, whether the Z test passed or failed: count
  ; down the span, step the iterated values, refine W for the next pixel,
  ; recompute iU1/iV1 from W, advance the Z and surface pointers, then loop.
  ; Expects ebx = pCtx, ecx = pP, ebp = pS.
  444. ;//if (--uPix <= 0)
  445. ;// break;
  446. dec uPix ;// BUG BUG?? uPix should never start as zero should it?
  447. ;// if so, this is a bug.
  448. jle ExitPixelLoop
  449. ; Doing update code after span length test so that an extra update is not done.
  450. ;void TestFail_Gouraud_PerspTex1_NoSpecularFog(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  451. ; PD3DI_RASTSPAN pS)
  452. ;{
  453. ;pS->uB += pP->iDBDX; pS->uG += pP->iDGDX;
  454. ;pS->uR += pP->iDRDX; pS->uA += pP->iDADX;
  ; Four 16-bit colour channels are stepped with one qword paddw. The result is
  ; stored back but not otherwise used by the code visible in this file.
  455. movq mm1, [ebp+RASTSPAN_uB]
  456. paddw mm1, [ecx+RASTPRIM_iDBDX]
  457. movq [ebp+RASTSPAN_uB], mm1
  458. ;pS->iUoW1 += pP->iDUoW1DX;
  459. ;pS->iVoW1 += pP->iDVoW1DX;
  ; mm5 keeps the stepped iUoW1/iVoW1 pair; it is reused after ExitSpecWLoop1.
  460. movq mm5, [ebp+RASTSPAN_iUoW1]
  461. paddd mm5, [ecx+RASTPRIM_iDUoW1DX]
  462. movq [ebp+RASTSPAN_iUoW1], mm5
  463. ;pS->iOoW += pP->iDOoWDX;
  ; eax = stepped iOoW; it is the implicit multiplicand of the imul/mul below.
  464. mov eax, [ebp+RASTSPAN_iOoW]
  465. add eax, [ecx+RASTPRIM_iDOoWDX]
  466. mov [ebp+RASTSPAN_iOoW], eax
  467. ;INT32 iWn0 = pS->iW + pCtx->SI.iDW; // 1.15.16
  468. ; TODO Could do this and OoW Add at same time with MMX.
  ; edx = first guess for the new W: the previous W plus the previous delta.
  469. mov edx, [ebp+RASTSPAN_iW]
  470. mov LastW, edx ; Save iW to calc iDW for next time.
  471. add edx, [ebx+RASTCTX_SI+SPANITER_iDW]
  472. ;if (pCtx->SI.iSpecialW < 0)
  473. ;{
  ; edi = 0; the special path is taken only when iSpecialW (16-bit) is negative.
  474. xor edi, edi
  475. cmp di, word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
  476. jle DontDoSpecialW1
  477. ;DoSpecialW1:
  478. ; This label is a left over from when
  479. ;if (iWn0 < 0)
  480. ;{
  ; NOTE(review): as coded, a negative guess skips the iW/2 replacement and a
  ; non-negative guess is replaced by iW/2, which is the opposite of the C
  ; pseudo-code in these comments. Left unchanged -- confirm which is intended.
  481. cmp edx, edi
  482. jl WOutOfRange1
  483. ;iWn0 = pS->iW >> 1; // use iW/2 as a guess, instead
  484. mov edx, LastW
  485. sar edx, 1
  486. ;}
  487. WOutOfRange1:
  488. ;VAL32 iWn1;
  489. ;INT16 iWnOld = iWn0 + 0x100; // make sure while fails first time
  490. ; Dont need to make sure it fails. I do a post test which guarantees it will execute once.
  491. ;INT32 iGiveUp = 7;
  492. mov GiveUp, 8 ; Pre decrementing instead of post decrementing.
  493. ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
  494. ;{
  495. SpecW1Loop1:
  ; Iterative refinement of W. On entry to each pass eax = iOoW and
  ; edx = current W estimate. Stops after at most 7 passes or once successive
  ; estimates differ by no more than 0x20.
  496. ; Could move this to bottom of loop and combine results somehow.
  497. ; TBD look at it more.
  498. dec GiveUp
  499. jz ExitSpecWLoop1
  500. ; Shift iOoW by one since imul cannot have sign bit set
  501. ; OoW cannot reach one, only 0x7fffffff
  502. ;shr eax, 1 ; 1.31 >> 1 = 1.30
  503. ; Get ready to do Two minus iOoW*iW
  504. mov esi, (1 SHL 16)
  505. ;iWnOld = iWn0;
  506. mov edi, edx
  507. ; Result should be close to one so we want most of the
  508. ; precision in the low bits. Need to give more bits
  509. ; leeway since these are the bad cases.
  510. ; iWn1 = imul32h(pS->iOoW, iWn0); // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
  511. imul edx
  512. ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
  513. sub esi, edx
  514. ;while(iWn1.i < 0)
  515. ;{
  516. SpecW1Loop2:
  ; Pull a negative correction factor back towards 1.0 until it is non-negative.
  517. test esi, esi
  518. jns SpecW1ExitLoop2 ; This jump should be predicted correctly most of the time.
  519. ;iWn1=(iWn1+(1L<<15))>>1; // iWn1 = (iWn1 + 1.0)/2
  520. add esi, (1 SHL 15)
  521. sar esi, 1
  522. jmp SpecW1Loop2
  523. ;}
  524. SpecW1ExitLoop2:
  525. ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
  526. mov eax, edi
  527. shl eax, 5 ; 1.15.16 << 5 = 1.10.21 TBD Can I shift off upper bits??
  528. shl esi, 12 ; 4.15 << 12 = 4.27 ;
  529. ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  530. ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16. No need for post shift.
  ; Unsigned multiply; edx = new W estimate (high 32 bits of the product).
  531. mul esi
  532. ; Have to do (abs(iWnOld - iWn0) > 0x20) code here.
  533. sub edi, edx
  534. ; These four lines are abs code.
  535. mov eax, edi
  536. sar eax, 31
  537. xor edi, eax
  538. sub edi, eax
  539. cmp edi, 020h ;Assuming that loop will only happen once.
  540. jbe ExitSpecWLoop1
  541. ; Reload eax with iOoW.
  542. mov eax, [ebp+RASTSPAN_iOoW]
  543. jmp SpecW1Loop1
  544. ;}
  545. ;else
  546. ;{
  547. DontDoSpecialW1:
  ; Normal path: a single refinement step using unsigned multiplies.
  ; On entry eax = iOoW and edx = W guess.
  548. ; Everything should be positive in Non-SpecialW case.
  549. ;INT32 iWn1;
  550. mov esi, (1 SHL 16)
  551. mov edi, edx
  552. ; This should be close to one so Low bits are most important.
  553. ;iWn1 = (iOoW*iWn0)>>15; // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
  554. mul edx
  555. ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
  556. sub esi, edx
  557. ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
  558. shl esi, 15 ; 0.16.15 << 15 = 0.2.30
  559. mov eax, esi
  560. ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  561. mul edi ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14
  562. shl edx, 2 ; 1.17.14 << 2 = 1.15.16
  563. ;}
  564. ;}
  565. ExitSpecWLoop1:
  ; Both W paths arrive here with edx = new W.
  566. ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
  567. ;pS->iW = iWn0;
  568. mov [ebp+RASTSPAN_iW], edx
  569. mov esi, edx ; Save W for multiplying by UoW and VoW
  ; iDW = new W - previous W, used as the delta for the next pixel's guess.
  570. sub edx, LastW
  571. mov [ebx+RASTCTX_SI+SPANITER_iDW], edx
  572. ;pCtx->SI.iSpecialW += 1; // this is supposed to wrap past 0x7fff sometimes
  573. inc word ptr [ebx+RASTCTX_SI+SPANITER_iSpecialW]
  574. ;pCtx->SI.iU1 = d_WTimesUVoW(pS->iW,pS->iUoW1);
  575. ;pCtx->SI.iV1 = d_WTimesUVoW(pS->iW,pS->iVoW1);
  ; Same W * UoW / W * VoW sequence as the span setup; mm5 still holds the
  ; stepped iUoW1/iVoW1 pair loaded at the top of this block.
  576. pslld mm5, 8
  577. shl esi, 4
  578. movd eax, mm5
  579. psrlq mm5, 32
  580. imul esi
  581. mov [ebx+RASTCTX_SI+SPANITER_iU1], edx
  582. movd eax, mm5
  583. imul esi
  584. mov [ebx+RASTCTX_SI+SPANITER_iV1], edx
  585. ;//pS->pZ += iZStep;
  586. ;//pS->pSurface += iSurfaceStep;
  587. mov eax, dword ptr [ebp+RASTSPAN_pZ]
  588. mov edx, dword ptr [ebp+RASTSPAN_pSurface]
  589. add eax, iZStep
  590. add edx, iSurfaceStep
  591. mov dword ptr [ebp+RASTSPAN_pZ], eax
  592. mov dword ptr [ebp+RASTSPAN_pSurface], edx
  593. ;#ifdef DBG
  594. ;// handy for debug to see where we are
  595. ;//pS->uX += (INT16)pCtx->SI.iXStep;
  596. ;#endif
  597. ;// } // while
  598. jmp PixelLoop
  599. ExitPixelLoop:
  ; Loop tails and routine epilogue: advance to the next span, then to the
  ; next primitive, and finally clear the MMX state, return 0 and restore the
  ; registers saved at entry.
  600. ; Loop code ends
  601. ;-----------------------------------------------------------------------------
  602. ; LoopAny code ends here
  603. ;-----------------------------------------------------------------------------
  604. ;pS++;
  605. add ebp, SIZEOF_RASTSPAN
  606. ;}
  607. jmp SpanLoop
  608. ExitSpanLoop:
  609. ;pP = pP->pNext;
  610. mov ecx, [ecx+RASTPRIM_pNext]
  611. ;}
  612. jmp PrimLoop
  613. ExitPrimLoop:
  614. ;_asm{
  ; Leave MMX state before returning to the caller.
  615. emms
  616. ;}
  617. ;return S_OK;
  618. xor eax, eax
  619. ;}
  ; Pops mirror the pushes at entry (ebx, esi, edi) in reverse order.
  620. pop edi
  621. pop esi
  622. pop ebx
  ; StackPos holds esp as it was right after 'push ebp', so this discards the
  ; 12-byte local area in one step.
  623. mov esp, StackPos
  624. pop ebp
  ; Plain ret: the stack argument is not popped by this routine.
  625. ret
  626. END