Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

847 lines
27 KiB

  1. dnl-----------------------------------------------------------------------------
  2. dnl
  3. dnl This file contains the macro for generating texture addressing routines.
  4. dnl
  5. dnl-----------------------------------------------------------------------------
  6. dnl
  7. dnl
  8. dnl
  9. dnl d_TexAddr
  10. dnl
  11. dnl Generates all the differentiated texture address routines.
  12. dnl
  13. dnl It takes 5 parameters.
  14. dnl
  15. dnl $1 is one of 0 or 1. 0 is single texture, and 1 is the first multi-texture
  16. dnl $2 is one of TexAddrWrapMirror TexAddrAll
  17. dnl $3 is one of NoPersp Persp
  18. dnl $4 is one of Point Bilinear MaybeBilinear
  19. dnl $5 is one of NoLOD LOD
  20. dnl
  21. dnl Note that even when we are not mip mapping, we use iLOD to get to the nearest mip map
  22. dnl (so iLOD must be 0 if the texture has no mip levels)
  23. dnl
  24. dnl Counters used to build unique jump labels; each is bumped once per expansion of the code that uses it.
  25. define(`d_WDIVcnt', 0)dnl
  26. define(`d_MaybeBilinearcnt', 0)dnl
  27. define(`d_MaxCLODcnt', 0)dnl
  28. dnl
  29. dnl Variables needed for texturing
  30. dnl
  31. dnl
  32. define(`texaddraVars', `
      ; Constant masks and values (never written by the code in this file).
  33. EXTERN Zero:MMWORD
  34. EXTERN Makelow16one:MMWORD
  35. EXTERN IncHighandLow16:MMWORD
  36. EXTERN UFracVFracMask:MMWORD
  37. EXTERN UV32to15Mask:MMWORD
  38. EXTERN MaskKeepUValues:MMWORD
  39. EXTERN MaskKeepVValues:MMWORD
  40. EXTERN memD3DTFG_POINT:MMWORD
  41. EXTERN Val0x000a000a:MMWORD
  42. EXTERN Val0xffff:MMWORD
  43. EXTERN Val0x0000002000000020:MMWORD
  44. EXTERN Val0x0000ffff0000ffff:MMWORD
      ; Scratch storage that the generated code writes while it runs.
  45. EXTERN UFrac:MMWORD
  46. EXTERN VFrac:MMWORD
  47. EXTERN GiveUp:MMWORD
  48. EXTERN LastW:MMWORD
  49. ')
  50. dnl
  51. dnl d_UpdateUoWandVoW
  52. dnl increments UoW and VoW for textures and can be used
  53. dnl in several different files.
  54. dnl
  55. define(`d_UpdateUoWandVoW', `
  56. ; Step both texture iterators with one packed add: (iUoW, iVoW) += (iDUoWDX, iDVoWDX)
  57. ; for the texture selected by edi. mm5 is left holding the new pair.
  58. movq mm5, XpP(DUVoWDX + edi * SIZEOF_UV_UNION)
  59. paddd mm5, XpS(UVoW + edi * SIZEOF_UV_UNION)
  60. movq XpS(UVoW + edi * SIZEOF_UV_UNION), mm5
  61. ')
  62. define(`d_UpdateLOD', `
  63. ; pS->iLOD += pS->iDLOD, done as a 16 bit add.
  64. ; The new LOD is left zero extended in eax.
  65. ; (Grouping this with other 16 bit adds might be a better way to do it.)
  66. xor eax, eax
  67. mov ax, XpS(iDLOD)
  68. add ax, XpS(iLOD)
  69. mov XpS(iLOD), ax
  70. ')
  71. define(`d_UpdateOoW', `
  72. ; pS->iOoW += pP->iDOoWDX; the new iOoW is left in eax.
  73. mov eax, XpP(iDOoWDX)
  74. add eax, XpS(iOoW)
  75. mov XpS(iOoW), eax
  76. ')
  77. dnl d_UoWVoWTimesW exists so that the same code can be used in several different locations.
  78. dnl These four locations are texaddr1.mas, tstfail.mas, Setup Code (Monolithic and regular)
  79. dnl
  80. dnl mm5 is UoW and VoW for either texture 1 or texture two
  81. dnl esi is W
  82. dnl $1 is 1 or 2 depending on result is for texture one or texture 2
  83. dnl
  84. dnl
  85. dnl Does integer W * U or V computation
  86. dnl define(`d_WTimesUVoW', `imul32h_s20(($1), ($2))')dnl
  87. dnl
  88. dnl iW = 1.15.16 << 4 = 1.11.20
  89. dnl UoW = 1.11.20 << 8 = 1.3.28
  90. dnl
  91. dnl 1.11.20 * 1.3.28 == 1.15.48 >> 32 == 1.15.16
  92. dnl inline INT32 imul32h_s20(INT32 x, INT32 y)
  93. dnl {
  94. dnl #ifdef _X86_
  95. dnl _asm
  96. dnl {
  97. dnl mov eax, x
  98. dnl mov edx, y
  99. dnl imul edx
  100. dnl shr eax, 20
  101. dnl shl edx, 12
  102. dnl and eax, 000000fffh
  103. dnl and edx, 0fffff000h
  104. dnl or eax, edx
  105. dnl }
  106. dnl #else
  107. dnl return (INT32)(((LONGLONG)x * y) >> 20);
  108. dnl #endif
  109. dnl }
  110. define(`d_UoWVoWTimesW', `
       ; Multiplies UoW and VoW by W and stores the results as the U and V of one TexUV entry.
       ; In: mm5 = UoW in the low dword and VoW in the high dword, esi = W,
       ;     edi selects the TexUV entry.
       ; Each signed product is 64 bits wide in edx:eax; shrd keeps bits 20 to 51 of it.
       ; Changes eax, edx and mm5.
  111. ;pCtx->SI.TexCol[i].iU = d_WTimesUVoW(pS->iW,pS->iUoW`'$1`');
  112. ;pCtx->SI.TexCol[i].iV = d_WTimesUVoW(pS->iW,pS->iVoW`'$1`');
  113. movd eax, mm5
  114. psrlq mm5, 32
  115. imul esi
  116. shrd eax, edx, 20
  117. mov XpCtxSI(TexUV + edi * SIZEOF_UV_UNION), eax
  118. movd eax, mm5
  119. imul esi
  120. shrd eax, edx, 20
  121. mov XpCtxSI(TexUV + edi * SIZEOF_UV_UNION + SIZEOF_INT32), eax
  122. ')
  123. define(`d_UpdateNonPersp', `
       ; Non perspective case: the final U and V are the iterated values shifted down.
       ; In: mm5 = iUoW and iVoW pair, edi selects the TexUV entry. Changes mm5.
  124. ;pCtx->SI.iU`'$1`' = pS->iUoW`'$1`'>>TEX_TO_FINAL_SHIFT; // 1.11.20 >> 4 == 1.15.16
  125. ;pCtx->SI.iV`'$1`' = pS->iVoW`'$1`'>>TEX_TO_FINAL_SHIFT;
  126. ; mm5 still contains iUoW and iVoW which are the iU and iV values for
  127. ; non perspective correct.
  128. psrad mm5, TEX_TO_FINAL_SHIFT
  129. movq XpCtxSI(TexUV + edi * SIZEOF_UV_UNION), mm5
  130. ')
  131. dnl d_WDivide
  132. dnl
  133. dnl Does incremental W divide calculation
  134. dnl
  135. define(`d_WDivide', `
  136. dnl Increment counter for jump address calc stuff.
  137. define(`d_WDIVcnt', eval(d_WDIVcnt+1))dnl
  138. dnl This was deemed too annoying
  139. dnl #if DBG
  140. dnl if (iOoW <= 0)
  141. dnl {
  142. dnl D3D_WARN(0, "WDivide, iOoW (%d) out of Range!", iOoW);
  143. dnl DDASSERT(0);
  144. dnl }
  145. dnl #endif
  146. dnl Note: iOoW comes in as eax. So it is ready for first multiply
  147. dnl iOoW is actual iOoW value in 1.31 form instead of 1.15 form. good.
  148. dnl In SpecialW case I have to reload it at the end.
       ; Computes the next W from iOoW, starting from a guess based on the previous W.
       ; In:  eax = pS->iOoW.
       ; Out: esi = edx-before-subtract = new iW, which is also stored to pS->iW;
       ;      pCtx->SI.iDW and pCtx->SI.iSpecialW are updated.
       ; Changes eax, edx, esi, edi, LastW and (in the SpecialW path) GiveUp.
  149. ;INT32 iWn0 = pS->iW + pCtx->SI.iDW; // 1.15.16
  150. ; TODO Could do this and OoW Add at same time with MMX.
  151. mov edx, XpS(iW)
  152. mov LastW, edx ; Save iW to calc iDW for next time.
  153. add edx, XpCtxSI(iDW)
  154. ;if (pCtx->SI.iSpecialW < 0)
  155. ;{
  156. xor edi, edi
  157. cmp di, word ptr XpCtxSI(iSpecialW)
  158. jle DontDoSpecialW`'d_WDIVcnt`'
  159. ;DoSpecialW`'d_WDIVcnt`':
  160. ; (The label above is commented out; nothing in this file jumps to it.)
  161. ;if (iWn0 < 0)
  162. ;{
  163. cmp edx, edi
  164. jge WOutOfRange`'d_WDIVcnt`' ; guess is not negative, so keep it (edi is zero here)
  165. ;iWn0 = pS->iW >> 1; // use iW/2 as a guess, instead
  166. mov edx, LastW
  167. sar edx, 1
  168. ;}
  169. WOutOfRange`'d_WDIVcnt`':
  170. ;VAL32 iWn1;
  171. ;INT16 iWnOld = iWn0 + 0x100; // make sure while fails first time
  172. ; Dont need to make sure it fails. I do a post test which guarantees it will execute once.
  173. ;INT32 iGiveUp = 7;
  174. mov GiveUp, 8 ; Pre decrementing instead of post decrementing.
  175. ;while((abs(iWnOld - iWn0) > 0x20) && (iGiveUp-- > 0))
  176. ;{
  177. SpecW`'d_WDIVcnt`'Loop1:
  178. ; Could move this to bottom of loop and combine results somehow.
  179. ; TBD look at it more.
  180. dec GiveUp
  181. jz ExitSpecWLoop`'d_WDIVcnt`'
  182. ; Shift iOoW by one since imul cannot have sign bit set
  183. ; OoW cannot reach one, only 0x7fffffff
  184. ;shr eax, 1 ; 1.31 >> 1 = 1.30
  185. ; Get ready to do Two minus iOoW*iW
  186. mov esi, (1 SHL 16)
  187. ;iWnOld = iWn0;
  188. mov edi, edx
  189. ; Result should be close to one so we want most of the
  190. ; precision in the low bits. Need to give more bits
  191. ; leeway since these are the bad cases.
  192. ; iWn1 = imul32h(pS->iOoW, iWn0); // 1.31*1.15.16 = 1.16.47 >> 32 = 1.16.15
  193. imul edx
  194. ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
  195. sub esi, edx
  196. ;while(iWn1.i < 0)
  197. ;{
  198. SpecW`'d_WDIVcnt`'Loop2:
  199. test esi, esi
  200. jns SpecW`'d_WDIVcnt`'ExitLoop2 ; This jump should be predicted correctly most of the time.
  201. ;iWn1=(iWn1+(1L<<15))>>1; // iWn1 = (iWn1 + 1.0)/2
  202. add esi, (1 SHL 15)
  203. sar esi, 1
  204. jmp SpecW`'d_WDIVcnt`'Loop2
  205. ;}
  206. SpecW`'d_WDIVcnt`'ExitLoop2:
  207. ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
  208. mov eax, edi
  209. shl eax, 5 ; 1.15.16 << 5 = 1.10.21 TBD Can I shift off upper bits??
  210. shl esi, 12 ; 4.15 << 12 = 4.27 ;
  211. ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  212. ; Actually 4.27 * 1.10.21 = 1.14.48 >> 32 = 1.14.16. No need for post shift.
  213. mul esi
  214. ; Have to do (abs(iWnOld - iWn0) > 0x20) code here.
  215. sub edi, edx
  216. ; These four lines are abs code.
  217. mov eax, edi
  218. sar eax, 31
  219. xor edi, eax
  220. sub edi, eax
  221. cmp edi, 020h ;Assuming that loop will only happen once.
  222. jbe ExitSpecWLoop`'d_WDIVcnt`'
  223. ; Reload eax with iOoW.
  224. mov eax, XpS(iOoW)
  225. jmp SpecW`'d_WDIVcnt`'Loop1
  226. ;}
  227. ;else
  228. ;{
  229. DontDoSpecialW`'d_WDIVcnt`':
  230. ; Everything should be positive in Non-SpecialW case.
  231. ;INT32 iWn1;
  232. mov esi, (1 SHL 16)
  233. mov edi, edx
  234. ; This should be close to one so Low bits are most important.
  235. ;iWn1 = (iOoW*iWn0)>>15; // 1.31*0.15.16 == 0.16.47 >> 32 = 0.16.15
  236. mul edx
  237. ;iWn1 = (1L<<16) - iWn1; // 2.0 - iWn1
  238. sub esi, edx
  239. ;iWn1 <<= 15; // 1.16.15 << 15 = 1.1.30
  240. shl esi, 15 ; 0.16.15 << 15 = 0.2.30
  241. mov eax, esi
  242. ;iWn0 = imul32h(iWn1, iWn0)<<2; // 1.1.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14 << 2 = 1.15.16
  243. mul edi ; 0.2.30 * 1.15.16 = 1.17.46 >> 32 = 1.17.14
  244. shl edx, 2 ; 1.17.14 << 2 = 1.15.16
  245. ;}
  246. ;}
  247. ExitSpecWLoop`'d_WDIVcnt`':
  248. ;pCtx->SI.iDW = iWn0 - (UINT16)pS->iW;
  249. ;pS->iW = iWn0;
  250. mov XpS(iW), edx
  251. mov esi, edx ; Save W for multiplying by UoW and VoW
  252. sub edx, LastW
  253. mov XpCtxSI(iDW), edx
  254. ;pCtx->SI.iSpecialW += 1; // this is supposed to wrap past 0x7fff sometimes
  255. inc word ptr XpCtxSI(iSpecialW)
  256. ')
  257. dnl
  258. define(`d_TexAddr', `
  259. ;---------------------------------------------------------------------------
       ; Texture address stage for the texture selected by iTex.
       ; The MaybeBilinear variant only jumps to the matching Point or Bilinear routine.
       ; The other variants convert the stored U and V to texel coordinates, apply the
       ; addressing mode, read the texel(s) through pfnTexRead, store the color in
       ; TexCol, then step the span iterators and compute U and V for the next pixel.
  260. ;void TexAddr_$2_$3_$4_$5(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  261. ; PD3DI_RASTSPAN pS, INT32 iTex)
  262. ;{
  263. PUBLIC _MMX_TexAddr_$2_$3_$4_$5
  264. _MMX_TexAddr_$2_$3_$4_$5:
  265. ;PD3DI_SPANTEX pTex = &pCtx->pTexture[$1];
  266. mov edx, iTex
  267. mov esi, XpCtx(pTexture + edx * SIZEOF_PSPANTEX)
  268. ifelse(`$4', `MaybeBilinear', `
  269. ; In maybe bilinear just jump to point or bi-linear depending on iLOD.
  270. define(`d_MaybeBilinearcnt', eval(d_MaybeBilinearcnt+1))dnl
  271. ;if ((((UINT16)pS->iLOD) >> 15) ^ (INT16)(pTex->uMagFilter == D3DTFG_POINT))
  272. ;{
  273. ; TODO check to see if MMX really needed here.
  274. movd mm1, XpTex(uMagFilter)
  275. pcmpeqd mm1, mmword ptr memD3DTFG_POINT
  276. movsx edx, word ptr XpS(iLOD)
  277. movd eax, mm1
  278. sar edx, 15 ; Generates mask based on sign.
  279. xor edx, eax
  280. jz DoPoint`'d_MaybeBilinearcnt
  281. ;// if magnify matches Mag filter, bilinear, else point
  282. ;C_TexAddr_$2_$3_Bilinear_$5(pCtx, pP, pS);
  283. jmp _MMX_TexAddr_$2_$3_Bilinear_$5
  284. DoPoint`'d_MaybeBilinearcnt`':
  285. jmp _MMX_TexAddr_$2_$3_Point_$5
  286. ', `
  287. ifelse(`$5', `LOD', `
  288. ;INT16 iLOD0 = min(max(pS->iLOD >> 11, 0), pTex->cLOD);
  289. movsx eax, word ptr XpS(iLOD)
  290. sar eax, 11
  291. mov edx, eax
  292. sar edx, 31
  293. xor edx, 0ffffffffh
  294. and eax, edx
  295. define(`d_MaxCLODcnt', eval(d_MaxCLODcnt+1))dnl
  296. cmp eax, XpTex(cLOD)
  297. jb NotMax`'d_MaxCLODcnt`'
  298. mov eax, XpTex(cLOD)
  299. NotMax`'d_MaxCLODcnt`':
  300. movd mm3, eax
  301. ')
  302. ; ----------------------------------------
  303. ; Doing UV calculation a little more accurate
  304. ; Exactly like C code.
  305. ; I shift iU and iV to the right not by (TEX_FINAL_SHIFT - iShiftU0) but by
  306. ; (TEX_FINAL_SHIFT - iShiftU0 - 6). iShiftU0 = pTex->iShiftU - iLOD0
  307. ; (TEX_FINAL_SHIFT - (pTex->iShiftU - iLOD0))
  308. ; (TEX_FINAL_SHIFT + iLOD0 - pTex->iShiftU)
  309. ; COMMENT1**
  310. ; If textures have a max of 1024 then shiftU0 would be at most 10 which would
  311. ; make (TEXT_FINAL_SHIFT - iShiftU - 6) at most zero. This is why I choose 6
  312. ; It will also give bi-linear 6 bits of precision I think it was said that
  313. ; only five was needed.
  314. ;INT16 iShiftU0 = pTex->iShiftU - iLOD0;
  315. ;INT16 iShiftV0 = pTex->iShiftV - iLOD0;
  316. movq mm5, MMWORD PTR Val0x000a000a ; This is TEX_FINAL_SHIFT - 6 = 10.
  317. ifelse(`$5', `NoLOD', `
  318. ;iLOD0 is zero so no subtraction needed and LOD doesnt need to be subtracted from U and V.
  319. ', `
  320. punpcklwd mm3, mm3 ; Make two copys of iLOD to subtract U and V
  321. ')dnl
  322. movd mm4, XpTex(iShiftU)
  323. ifelse(`$5', `LOD', `
  324. psubw mm4, mm3
  325. ')dnl
  326. psubw mm5, mm4
  327. movq mm4, mm5
  328. pand mm5, MMWORD PTR Val0xffff
  329. ifelse(`$5', `LOD', `
  330. pand mm3, MMWORD PTR Val0xffff ; Make iLOD back to only one copy
  331. ')
  332. psrld mm4, 16
  333. mov edx, iTex
  334. movd mm1, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION)
  335. psrad mm1, mm5
  336. movd mm2, XpCtxSI(TexUV + edx * SIZEOF_UV_UNION + SIZEOF_INT32)
  337. psrad mm2, mm4
  338. punpckldq mm1, mm2
  339. ifelse(`$4', `Bilinear', `
  340. psubd mm1, MMWORD PTR Val0x0000002000000020
  341. ')
  342. ; Texture Pitch cannot be calculated so it must be looked up in the iShiftPitch table
  343. ; ----------------- Start of hack
  344. ; ATTENTION This is really hacked right now. Just to get it working
  345. ; Pitch would be better for me, instead of shift pitch.
  346. ; With actual pitch, this would be two moves and a shift.
  347. ;shl eax, 1
  348. ifelse(`$5', `LOD', `
  349. movzx edx, word ptr XpTex(iShiftPitch+eax*2)
  350. ', `
  351. movzx edx, word ptr XpTex(iShiftPitch)
  352. ')dnl
  353. add edx, 16
  354. movd mm2, edx
  355. movq mm5, MMWORD ptr Makelow16one
  356. pslld mm5, mm2
  357. ;pslld mm5, 16 ;. Use this after hack.
  358. ; not needed in hacked version since i add to shifted value.
  359. ; ----------------- End of hack
  360. por mm5, MMWORD ptr Makelow16one
  361. ; Make the low 16 bits of dword one
  362. ; This helps in calculating texture address.
  363. ; Gets U and V value into mm1 so that it can be mirrored, wrapped or
  364. ; clamped. This can be done for two values in the point case
  365. ; or four values in the bilinear case.
  366. ifelse(`$4', `Point', `
  367. ;iU00 >>= 6;
  368. ;iV00 >>= 6;
  369. psrad mm1, 6
  370. packssdw mm1, mm1 ; Value needs to be packed since all wrap/mirror
  371. ; operations assume UV in low 32 bits.
  372. ', `
  373. ;INT32 iUFrac = iU00 & 0x03f;
  374. ;INT32 iVFrac = iV00 & 0x03f;
  375. ;iU00 >>= 6;
  376. ;iV00 >>= 6;
  377. movq mm2, mm1
  378. psrad mm1, 6
  379. ;pand mm1, MMWORD PTR Val0x0000ffff0000ffff
  380. pand mm2, dword ptr UFracVFracMask ; UFracVFracMask = 0x0000003f0000003f
  381. ; Going to use only 8 bits for bi-linear so that I can do a pmullw.
  382. ; Currently at 6 bits so shift up by 2.
  383. psllw mm2, 2
  384. movq mm0, mm2
  385. ; Replicate VFrac value for bilinear
  386. punpckhwd mm2, mm2
  387. punpcklwd mm2, mm2
  388. ; Replicate UFrac Value for bilinear
  389. punpcklwd mm0, mm0
  390. punpcklwd mm0, mm0
  391. movq dword ptr VFrac, mm2
  392. movq dword ptr UFrac, mm0
  393. ;INT32 iU01 = iU00 + 1;
  394. ;INT32 iV01 = iV00 + 1;
  395. packssdw mm1, mm1 ; replicate U and V value to upper 16 bit locations
  396. paddw mm1, dword ptr IncHighandLow16
  397. ; This will make texture values be (High word to low word):
  398. ; iV01, iU00, iV00, iU01
  399. ; Need to do this to make texture look up for bilinear easier.
  400. ; I have to combine to get all combinations anyway. It just
  401. ; happens to be better for me to have iV00, iU01 pair first.
  402. ')
  403. ;UINT16 uMaskU0 = pTex->uMaskU >> iLOD0; UINT16 uMaskV0 = pTex->uMaskV >> iLOD0;
  404. ; put mask in mm3 and replicate to match location for wrap/mirror/clamp
  405. movd mm0, XpTex(uMaskU) ; Load U and V mask
  406. ifelse(`$4', `Bilinear', `
  407. ; replicate mask if doing bilinear
  408. punpckldq mm0, mm0
  409. ')
  410. ifelse(`$5', `NoLOD', `
  411. ; iLOD0 is zero so no shift needed.
  412. ' , `
  413. ; iLOD0 shift value left over from above. TBD. Put this in in mip case
  414. ; Could do this one before or after the unpack also.
  415. psrlw mm0, mm3
  416. ')
  417. ifelse(`$2', `TexAddrWrapMirror', `
  418. ;INT16 iFlip;
  419. ; MM1 should contain 16 bit iU and iV for both texture locations
  420. ; End Result is MM1 value wrapped or mirrored
  421. ; in Bilinear Case, four values can be done
  422. ; iU00, iV00, iU01, iV01
  423. ; This code really does alot for the bilinear case and is kinda wasteful
  424. ; in the normal mode.
  425. ;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
  426. movq mm7, mm1
  427. ; Point doesnt need replication
  428. movd mm4, XpTex(iFlipMaskU)
  429. ; if bilinear replicate values together, Point doesnt need this.
  430. ifelse(`$4', `Bilinear', `
  431. punpckldq mm4, mm4
  432. ')
  433. ifelse(`$5', `NoLOD', `
  434. ; iLOD0 is zero so no shift needed.
  435. ' , `
  436. psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
  437. ')
  438. pand mm7, mm4
  439. ;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
  440. pcmpeqw mm7, MMWORD PTR Zero
  441. ;iFlip1 = uMaskU0 & ~ iFlip1; ;iFlip2 = uMaskV0 & ~ iFlip2; ;iFlip3 = uMaskU0 & ~ iFlip3; ;iFlip4 = uMaskV0 & ~ iFlip4;
  442. pandn mm7, mm0
  443. ;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
  444. pand mm1, mm0
  445. ;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
  446. pxor mm1, mm7
  447. ; Result in mm4 now since TexAddrAll ends up that way.
  448. ; Still need to look at register usage more.
  449. movq mm4, mm1
  450. ') dnl
  451. ifelse(`$2', `TexAddrAll', `
  452. ;INT16 iFlip, iClamp1, iClamp2, iClampMinT, iClampMaxT;
  453. ;INT16 iUoWAdj = (INT16)(pS->iUoW`'d_TexNum >> 12); // adjust to match W
  454. ;INT16 iVoWAdj = (INT16)(pS->iVoW`'d_TexNum >> 12);
  455. ;movq mm6, XpS(UVoW + iTex * SIZEOF_UV_UNION)
  456. ;movq mm6, MMWORD PTR Zero
  457. pxor mm6, mm6
  458. ; TBD Data in SPANTEX needs to be rearange to make life simpler.
  459. ; I have rearranged some of it, but there still needs to be some
  460. ; fixes to it.
  461. ;iFlip1 = iU00 & pTex->iFlipMaskU; ;iFlip2 = iV00 & pTex->iFlipMaskV; ;iFlip3 = iU01 & pTex->iFlipMaskU; ;iFlip4 = iV01 & pTex->iFlipMaskV;
  462. movq mm7, mm1
  463. movd mm4, XpTex(iFlipMaskU) ; This should copy U and V mask at the same time.
  464. ifelse(`$4', `Bilinear', `
  465. dnl Only replicate if U and V if doing bilinear
  466. punpckldq mm4, mm4 ; copy UV
  467. ')
  468. ifelse(`$5', `NoLOD', `
  469. ; iLOD0 is zero so no shift needed.
  470. ' , `
  471. psrlw mm4, mm3 ; Shifts mirror mask to correct bit location
  472. ')
  473. pand mm7, mm4
  474. ;iFlip1 = MMX_cmpeqw(iFlip1, 0); ;iFlip2 = MMX_cmpeqw(iFlip2, 0); ;iFlip3 = MMX_cmpeqw(iFlip3, 0); ;iFlip4 = MMX_cmpeqw(iFlip4, 0);
  475. pcmpeqw mm7, MMWORD PTR Zero
  476. ;iFlip1 = uMaskU0 &~ iFlip1; ;iFlip2 = uMaskV0 &~ iFlip2; ;iFlip3 = uMaskU0 &~ iFlip3; ;iFlip4 = uMaskV0 &~ iFlip4;
  477. pandn mm7, mm0
  478. ;iU00 &= uMaskU0; ;iV00 &= uMaskV0; ;iU01 &= uMaskU0; ;iV01 &= uMaskV0;
  479. pand mm1, mm0
  480. ;iU00 ^= iFlip1; ;iV00 ^= iFlip2; ;iU01 ^= iFlip3; ;iV01 ^= iFlip4;
  481. pxor mm1, mm7
  482. ;iClamp11 = MMX_cmpgtw(0, iUoWAdj); ;iClamp12 = MMX_cmpgtw(0, iVoWAdj);
  483. mov edx, iTex
  484. pcmpgtd mm6, XpS(UVoW + edx * SIZEOF_UV_UNION)
  485. packssdw mm6, mm6
  486. ;iClamp21 = MMX_cmpgtw(iOoWAdj, iUoWAdj); ;iClamp22 = MMX_cmpgtw(iOoWAdj, iVoWAdj);
  487. movd mm7, XpS(iOoW)
  488. punpckldq mm7, mm7 ; Make a copy of OoW to compare both UoW and VoW.
  489. psrld mm7, 11 ; Make OoWs Precision Match UoWs.
  490. pcmpgtd mm7, XpS(UVoW + edx * SIZEOF_UV_UNION)
  491. packssdw mm7, mm7
  492. ;iClampMinT1 = pTex->iClampMinU & iClamp11; ;iClampMinT2 = pTex->iClampMinV & iClamp12; ;iClampMinT3 = pTex->iClampMinU & iClamp13; ;iClampMinT4 = pTex->iClampMinV & iClamp14;
  493. movd mm0, XpTex(iClampMinU)
  494. ifelse(`$4', `Bilinear', `
  495. punpckldq mm0, mm0
  496. ')
  497. pand mm0, mm6
  498. ; Save clamp2 because pandn will destroy value.
  499. movq mm4, mm7
  500. ;iClampMaxT1 = pTex->iClampMaxU &~ iClamp21; ;iClampMaxT2 = pTex->iClampMaxV &~ iClamp22; ;iClampMaxT3 = pTex->iClampMaxU &~ iClamp23; ;iClampMaxT4 = pTex->iClampMaxV &~ iClamp24;
  501. movd mm2, XpTex(iClampMaxU)
  502. ifelse(`$4', `Bilinear', `
  503. punpckldq mm2, mm2
  504. ')
  505. ifelse(`$5', `NoLOD', `
  506. ; iLOD0 is zero so no shift needed.
  507. ' , `
  508. psraw mm2, mm3 ; Shifts clamp max to correct bit location
  509. ')
  510. pandn mm7, mm2 ; mm7 = iClampMax AND NOT iClamp2; pandn supplies the negation.
  511. ;iClamp21 &= ~iClamp11; ;iClamp22 &= ~iClamp12; ;iClamp23 &= ~iClamp13; ;iClamp24 &= ~iClamp14;
  512. pandn mm6, mm4
  513. ;iClamp21 = pTex->iClampEnU &~ iClamp21; ;iClamp22 = pTex->iClampEnU &~ iClamp22; ;iClamp23 = pTex->iClampEnU &~ iClamp23; ;iClamp24 = pTex->iClampEnU &~ iClamp24;
  514. movd mm2, XpTex(iClampEnU)
  515. ifelse(`$4', `Bilinear', `
  516. punpckldq mm2, mm2
  517. ')
  518. pandn mm6, mm2
  519. ;iU00 &= ~iClamp21; ;iV00 &= ~iClamp22; ;iU01 &= ~iClamp23; ;iV01 &= ~iClamp24;
  520. pandn mm6, mm1
  521. ;iU00 |= iClampMinT1; ;iV00 |= iClampMinT2; ;iU01 |= iClampMinT3; ;iV01 |= iClampMinT4;
  522. por mm6, mm0
  523. ;iU00 |= iClampMaxT1; ;iV00 |= iClampMaxT2; ;iU01 |= iClampMaxT3; ;iV01 |= iClampMaxT4;
  524. por mm6, mm7
  525. movq mm4, mm6
  526. ') dnl
  527. ; Making other two cases for texture addressing has to be simpler than
  528. ; this and not use so many registers. Puts U1 V0 U0 V1 into mm3.
  529. ; TBD Make this better.
  530. ; values are still stored as iV01, iU00, iV00, iU01
  531. ifelse(`$4', `Bilinear', `
  532. movq mm2, mm4
  533. movq mm3, mm4
  534. ') dnl Bilinear
  535. dnl ifelse(`$2', `TexAddrAll', `
  536. movq mm0, mm4
  537. dnl ') dnl border code
  538. pmaddwd mm4, mm5 ; Throw in first address calculation.
  539. ; Just to get it started. Calculate
  540. ; iU0+iV1*iShiftU0 and iU1+iV0*iShiftU0
  541. ifelse(`$4', `Bilinear', `
  542. ; values are being changed to iV01, iU01, iV00, iU00
  543. ; seven instructions for this seems excessive.
  544. pand mm2, MMWORD ptr MaskKeepUValues
  545. pand mm3, MMWORD ptr MaskKeepVValues
  546. movq mm1, mm2
  547. psllq mm2, 32
  548. psrlq mm1, 32
  549. por mm3, mm2
  550. por mm3, mm1
  551. ') dnl Bilinear
  552. ; From here until mov edi is code that is needed for border.
  553. ; all sign bits are stored in bytes so that border code can tell if uv went below zero.
  554. dnl ifelse(`$2', `TexAddrAll', `
  555. ifelse(`$4', `Point', `
  556. ; Point needs to be in same format as bilinear for border
  557. packsswb mm0, mm0
  558. ') dnl point
  559. ifelse(`$4', `Bilinear', `
  560. ; mm0 = iV01, iU00, iV00, iU01
  561. ; mm3 = iV01, iU01, iV00, iU00
  562. ; Need to rearrange values to be like so v1 u0 v1 u1 v0 u0 v0 u1 in bytes
  563. ; This is really bad. Just doing whatever to get it to work.
  564. movq mm1, mm0
  565. punpckldq mm1, mm3 ; This will make mm1 = v0 u0 v0 u1
  566. movq mm2, mm3
  567. punpckhdq mm2, mm0 ; This will make mm2 = v1 u0 v1 u1
  568. packsswb mm1, mm2
  569. movq mm0, mm1
  570. ') dnl Bilinear
  571. dnl ') dnl TexAddrAll
  572. ifelse(`$4', `Bilinear', `
  573. pmaddwd mm3, mm5 ; Calculates iU1+iV0*iShiftU0 and iU0+iV1*iShiftU0
  574. ') dnl Bilinear
  575. dnl ; Load pTex->pBits[iLOD0] into esi. It will be needed.
  576. dnl ; Convient that eax is still around as iLOD0. TBD make sure eax positive.
  577. ifelse(`$5', `NoLOD', `
  578. mov edi, XpTex(pBits)
  579. ',`
  580. mov edi, XpTex(pBits+eax*4)
  581. ')dnl
  582. ; was esi. Cant change to esi because it is the pointer to pTex
  583. ; which is used by Border and ColorKey. Use edi for now and
  584. ; call routines through memory. Figure out if this is bad.
  585. ; load the read texture routine address into a register early
  586. ;mov edi, XpCtx(pfnTexRead + $1*SIZEOF_PFNTEXREAD)
  587. ifelse(`$4', `Bilinear', `
  588. ; iV0 iU1 address should be done by now.
  589. movd eax, mm4
  590. ;UINT32 uTex00 = pCtx->pfnTexRead[$1](iU00, iV00, pTex->iShiftU,
  591. ; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
  592. ; Combine U and V values before making call.
  593. ;call edi
  594. mov edx, iTex
  595. call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
  596. movd eax, mm3
  597. movq mm7, mm1 ; Put TColor[iU0, uV0] in mm7
  598. ;UINT32 uTex10 = pCtx->pfnTexRead[$1](iU01, iV00, pTex->iShiftU,
  599. ; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
  600. ;call edi
  601. mov edx, iTex
  602. call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
  603. psrlq mm3, 32
  604. psubw mm7, mm1
  605. psllw mm1, 8
  606. pmullw mm7, dword ptr UFrac
  607. paddw mm7, mm1 ; Should I copy mm1 to another variable and do shift/add later?
  608. movd eax, mm3
  609. ;UINT32 uTex01 = pCtx->pfnTexRead[$1](iU00, iV01, pTex->iShiftU,
  610. ; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
  611. ;call edi
  612. mov edx, iTex
  613. call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
  614. psrlq mm4, 32
  615. movq mm6, mm1
  616. movd eax, mm4
  617. ;UINT32 uTex11 = pCtx->pfnTexRead[$1](iU01, iV01, pTex->iShiftU,
  618. ; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
  619. ;call edi
  620. mov edx, iTex
  621. call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
  622. ;TexFiltBilinear(&pCtx->SI.TexCol[$1], iUFrac, iVFrac, uTex00, uTex10, uTex01, uTex11);
  623. ; The amount of shifting instructions for this makes the other approach
  624. ; look pretty good.
  625. psubw mm6, mm1
  626. psllw mm1, 8
  627. pmullw mm6, dword ptr UFrac ; TBD explain this code better.
  628. movq mm4, mm7
  629. paddw mm6, mm1
  630. psrlw mm6, 8
  631. psrlw mm7, 8
  632. psubw mm6, mm7
  633. pmullw mm6, dword ptr VFrac
  634. paddw mm4, mm6
  635. psrlw mm4, 8
  636. ; TBD shouldnt have to pack and then unpack later. Should keep in a register
  637. packuswb mm4, mm4
  638. mov edx, iTex
  639. movd XpCtxSI(TexCol+edx*4), mm4
  640. ') dnl
  641. ifelse(`$4', `Point', `
  642. ; iV0 iU1 address should be done by now.
  643. movd eax, mm4
  644. ;pCtx->SI.TexCol[$1] = pCtx->pfnTexRead[$1](iU00, iV00, pTex->iShiftU,
  645. ; pTex->pBits[iLOD0], &pCtx->Texture[$1]);
  646. ;call edi
  647. mov edx, iTex
  648. call dword ptr XpCtx(pfnTexRead + edx*SIZEOF_PFNTEXREAD)
  649. ; TBD Currently have to pack and then unpack later. Should be able
  650. ; to leave the value in some register for a while. I would think.
  651. packuswb mm1, mm1
  652. mov edx, iTex
  653. movd XpCtxSI(TexCol+edx*4), mm1
  654. ') dnl
  655. dnl only do update code in non-monolithic case. Monolithic code updates are done
  656. dnl by tstfail routine.
  657. push edi ; edi is borrowed as the texture number for the update code below
  658. mov edi, iTex
  659. d_UpdateUoWandVoW()
  660. pop edi
  661. ifelse(`$5', `LOD', `
  662. cmp iTex, 0
  663. jne SkipLOD$2$3$4$5
  664. d_UpdateLOD()
  665. SkipLOD$2$3$4$5:
  666. ')
  667. ifelse(`$3', `Persp', `
  668. cmp iTex, 0
  669. jne TexStoreW$2$3$4$5
  670. d_UpdateOoW()
  671. ;pS->iW = 0x00800000/(pS->iOoW>>16); // 9.23/1.15 = 8.8
  672. d_WDivide()
  673. jmp Tex$2$3$4$5
  674. TexStoreW$2$3$4$5:
  675. ; In Texaddr1, W is calculated and result is in esi. I need to get the W value back into esi for the multiply.
  676. mov esi, XpS(iW)
  677. Tex$2$3$4$5:
  678. push edi
  679. mov edi, iTex
  680. d_UoWVoWTimesW()
  681. pop edi
  682. ', `
  683. push edi
  684. mov edi, iTex
  685. d_UpdateNonPersp()
  686. pop edi
  687. ')
  688. ; load the next bead address into a register early. Not early anymore
  689. ; since so much regular non-mmx code being done for WDIV
  690. ; mov eax, XpCtx(pfnTex`'d_TexNum`'AddrEnd)
  691. ; pCtx->pfnTex`'d_TexNum`'AddrEnd(pCtx, pP, pS);
  692. ; jmp eax
  693. ; We now need to return
  694. ret
  695. ')')
  696. dnl
  697. dnl
  698. dnl d_TexAddrHdr
  699. dnl
  700. dnl Generates the C prototype MMX_TexAddr_$2_$3_$4_$5, taking the same parameters as d_TexAddr.
  701. dnl
  702. define(`d_TexAddrHdr', `
  703. void MMX_TexAddr_$2_$3_$4_$5(PD3DI_RASTCTX pCtx, PD3DI_RASTPRIM pP,
  704. PD3DI_RASTSPAN pS, INT32 iTex);')dnl
  705. dnl