Counter-Strike: Global Offensive Source Code

1609 lines
53 KiB

  1. //========= Copyright c 1996-2008, Valve Corporation, All rights reserved. ============//
  2. #include "studiorender.h"
  3. #include "studio.h"
  4. #include "materialsystem/imesh.h"
  5. #include "materialsystem/imaterialsystemhardwareconfig.h"
  6. #include "materialsystem/imaterialvar.h"
  7. #include "materialsystem/imorph.h"
  8. #include "materialsystem/itexture.h"
  9. #include "materialsystem/imaterial.h"
  10. #include "optimize.h"
  11. #include "mathlib/mathlib.h"
  12. #include "mathlib/vector.h"
  13. #include "mathlib/vmatrix.h"
  14. #include "studiorendercontext.h"
  15. #include "tier2/tier2.h"
  16. #include "tier0/vprof.h"
  17. #include "tier0/miniprofiler.h"
  18. #include <algorithm>
  19. #include "filesystem.h"
  // Compile-time switch for the mini-profiler instrumentation declared below (0 = off).
  20. #define PROFILE_THIS_FILE 0
  // Imported from another module (DLL_IMPORT); its address is passed to every
  // CLinkedMiniProfiler constructed below.
  21. DLL_IMPORT CLinkedMiniProfiler *g_pOtherMiniProfilers;
  22. #if PROFILE_THIS_FILE
  // Profiling builds require ENABLE_HARDWARE_PROFILER; otherwise the build is stopped here.
  23. #if !ENABLE_HARDWARE_PROFILER
  24. #error "can't profile without profiler enabled"
  25. #endif
  // One named profiler per measured piece of the morph code.
  // NOTE(review): the assembly routines in this file read and write the 32-bit words at byte
  // offsets 0 and 4 of g_mp_morph_Vx / g_mp_morph_Vw directly; whether that matches the layout
  // of CLinkedMiniProfiler cannot be told from this file - confirm before enabling.
  26. CLinkedMiniProfiler g_mp_morph_Vx("morph_Vx", &g_pOtherMiniProfilers);
  27. CLinkedMiniProfiler g_mp_morph_Vw("morph_Vw", &g_pOtherMiniProfilers);
  28. CLinkedMiniProfiler g_mp_morph_lower_bound("morph_lower_bound", &g_pOtherMiniProfilers);
  29. CLinkedMiniProfiler g_mp_morph("morph", &g_pOtherMiniProfilers);
  30. CLinkedMiniProfiler g_mp_morph_V1("morph_V1", &g_pOtherMiniProfilers);
  31. CLinkedMiniProfiler g_mp_morph_V2("morph_V2", &g_pOtherMiniProfilers);
  32. CLinkedMiniProfiler g_mp_morph_V3("morph_V3", &g_pOtherMiniProfilers);
  33. CLinkedMiniProfiler g_mp_morph_V4("morph_V4", &g_pOtherMiniProfilers);
  34. CLinkedMiniProfiler g_mp_morph_V5("morph_V5", &g_pOtherMiniProfilers);
  35. CLinkedMiniProfiler g_mp_morph_V6("morph_V6", &g_pOtherMiniProfilers);
  36. CLinkedMiniProfiler g_mp_morph_V7("morph_V7", &g_pOtherMiniProfilers);
  // Lookup table: slot 0 is NULL, slots 1..7 point at the V1..V7 profilers.
  // Presumably indexed by the implementation variant number - TODO confirm against the reader.
  37. CLinkedMiniProfiler* g_mp_ComputeFlexedVertex_StreamOffset[8] =
  38. {
  39. NULL,
  40. &g_mp_morph_V1,
  41. &g_mp_morph_V2,
  42. &g_mp_morph_V3,
  43. &g_mp_morph_V4,
  44. &g_mp_morph_V5,
  45. &g_mp_morph_V6,
  46. &g_mp_morph_V7
  47. };
  48. #else
  // Profiling disabled: plain two-word accumulators under the same names, so the assembly
  // routines still have something to write to. They add the elapsed time-base ticks to word [0]
  // and numVertsToProcess to word [1] (Vx: the V6/V7 routines, Vw: the wrinkle V7 routine).
  49. uint32 g_mp_morph_Vx[2];
  50. uint32 g_mp_morph_Vw[2];
  51. #endif
  // Console variables "morph_path" (default "7") and "morph_debug" (default "0").
  // Their readers are not in this part of the file; morph_path presumably selects which
  // ComputeFlexedVertex variant runs - TODO confirm.
  52. ConVar g_cv_morph_path("morph_path", "7");
  53. ConVar g_cv_morph_debug("morph_debug", "0");
  54. #ifdef _X360
  // Constant vectors consumed by the assembly routines below.
  //
  // The g_perm_* arrays are byte-select controls for vperm. In the routines the first vperm source
  // is an all-zero vector and the second is the packed vertex record, so (with vperm's usual
  // semantics) a selector byte 0x00-0x0F yields zero and 0x10+k yields byte k of the record.
  //
  // Each word becomes {0,0,0,record byte 2} / {0,0,0,record byte 3}, repeated twice.
  // The routines call the result "speed/side" (permuteSpeedSide).
  55. const ALIGN16 int32 g_perm_speed_side[4] = {0x12, 0x13, 0x12, 0x13};
  // Record bytes 4-5, 6-7, 8-9 placed in the upper half of words 0..2; word 3 is zero (f3Delta).
  56. const ALIGN16 int32 g_perm_delta[4] = {0x14150000, 0x16170000, 0x18190000, 0};
  // Same as g_perm_delta, but word 3 additionally takes bytes 0-1 of the second vperm source.
  57. const ALIGN16 int32 g_perm_delta_wrinkle[4] = {0x14150000, 0x16170000, 0x18190000, 0x10110000}; // includes the f3PreDelta's W that's in the X component
  // Record bytes 10-11, 12-13, 14-15 placed in the upper half of words 0..2; word 3 is zero (f3NDelta).
  58. const ALIGN16 int32 g_perm_ndelta[4] = {0x1A1B0000, 0x1C1D0000, 0x1E1F0000, 0};
  59. //const ALIGN16 int32 g_perm_w0[4] = {0x00010203,0x08090A0B,0x00010203,0x08090A0B};
  // Only referenced from commented-out code in the visible part of this file.
  60. const ALIGN16 int32 g_perm_w1[4] = {0x0C0D0E0F,0x0C0D0E0F,0x04050607,0x04050607};
  // Scale and bias for "f4sb = v * g_sc256_255_special + g_f40011": words 0,1 become v*256/255 and
  // words 2,3 become 1 - v*256/255.
  61. const fltx4 g_sc256_255_special = {256.0f/255.0f,256.0f/255.0f,-256.0f/255.0f,-256.0f/255.0f};
  62. const fltx4 g_f40011 = {0,0,1,1};
  // 32-byte scratch target: the pipelined routines point their deferred-store registers here
  // initially, so stores issued before the first real result is ready land in this buffer.
  63. fltx4 g_dummy2[2];
  // Byte offset used with dcbt (cache prefetch hint) by the V4 and V6 routines.
  64. int g_nStreamOffset_prefetch = 256;
  65. //
  66. // V4 rolled - latency of x4, manually scheduled for nearly optimal dual-issue and no automatic stalls
  67. // the ~15 nops mean 1 instruction is issued at that cycle, instead of theoretically possible 2 per cycle
  68. //
  // ComputeFlexedVertex_StreamOffset_V7: naked, hand-scheduled PowerPC/VMX128 routine (_X360 only).
  //
  // For each of numVertsToProcess 16-byte records at pVert (see "pVert++"):
  //   1. n = 16-bit index at offset 0 of the record; oldCache = pFirstThinFlexIndex[n].
  //   2. The entry counts as valid when (oldCache ^ nCurrentTag) <= 0xFFFF. Otherwise a new slot is
  //      claimed: entry = nThinFlexVertexCount | nCurrentTag, and nThinFlexVertexCount is incremented.
  //   3. The low 16 bits of the entry select a 32-byte slot in pThinFlexVerts
  //      (vector at +0 = position, vector at +16 = normal).
  //   4. scWeight is computed from record bytes 2 and 3 and w1234; then
  //      position += scWeight * f3Delta and normal += scWeight * f3NDelta.
  // Slot [nThinFlexVertexCount + 2] is zero-filled every iteration (presumably so that a slot
  // claimed later already reads as zero when the pipeline loads it).
  //
  // The loop ends after numVertsToProcess iterations (ctr) or after the iteration that started
  // with nThinFlexVertexCount > MAXSTUDIOFLEXVERTS - 4.
  // r3 holds the updated nThinFlexVertexCount at blr (presumably the int result under the
  // platform calling convention).
  //
  // Software pipeline: vector work is tagged /*S:1*/, /*S:2*/, /*S:3*/. The slot pointer moves
  // r21 -> r22 -> r23 across iterations; S:2 loads the slot via r22 and S:3 stores via r23.
  // r20..r23 start at g_dummy2, so stores issued while the pipeline fills go to scratch.
  // NOTE(review): a slot is loaded for one record before the deferred store of the previous
  // records has happened; consecutive records mapping to the same slot would lose a
  // contribution - confirm that indices are unique within one call.
  //
  // Side effect: adds elapsed time-base ticks to word 0 and numVertsToProcess to word 1 of g_mp_morph_Vx.
  69. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V7(
  70. int nThinFlexVertexCount, //r3
  71. CachedPosNorm_t *pThinFlexVerts,//r4
  72. int32 *pFirstThinFlexIndex, //r5
  73. mstudiovertanim_t * pVert, //r6
  74. uint32 nCurrentTag, //r7
  75. uint32 numVertsToProcess, //r8
  76. fltx4 w1234 //vr1
  77. )
  78. {
  79. __asm
  80. {
  // Prologue: save r14..r25 below the stack pointer (slot for rN is -8*(N-13)(r1)).
  81. std r14, -0x08(r1)
  82. std r15, -0x10(r1)
  83. std r16, -0x18(r1)
  84. std r17, -0x20(r1)
  85. std r18, -0x28(r1)
  86. std r19, -0x30(r1)
  87. std r20, -0x38(r1)
  88. std r21, -0x40(r1)
  89. std r22, -0x48(r1)
  90. std r23, -0x50(r1)
  91. std r24, -0x58(r1)
  92. std r25, -0x60(r1)
  93. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  // Load the constant vectors: vr2 = scale, vr3 = bias, vr4/vr5/vr6 = vperm controls.
  94. lau r14, g_sc256_255_special
  95. lal r14, r14, g_sc256_255_special
  96. lvx vr2, r0,r14
  97. lau r15, g_f40011
  98. lal r15, r15, g_f40011
  99. lvx vr3, r0,r15
  100. lau r16, g_perm_speed_side
  101. lal r16, r16, g_perm_speed_side
  102. lvx vr4, r0,r16
  103. lau r17, g_perm_delta
  104. lal r17, r17, g_perm_delta
  105. lvx vr5, r0,r17
  106. lau r18, g_perm_ndelta
  107. lal r18, r18, g_perm_ndelta
  108. lvx vr6, r0,r18
  // Point all pipeline slot pointers (r20..r23) at the g_dummy2 scratch buffer.
  109. lau r20, g_dummy2
  110. lal r20,r20, g_dummy2
  111. mr r21, r20
  112. mr r22, r21
  113. mr r23, r22
  114. li r10, -1
  115. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  116. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  // vr8 = zero vector; r15 = 16 (offset of the second vector); r11 = 0x100 (dcbt prefetch offset);
  // r24 = early-exit limit; ctr = iteration count; r25 = start time-base for the profiling tail.
  117. vxor vr8,vr8,vr8
  118. li r15, 16
  119. li r11,0x100
  120. li r24, MAXSTUDIOFLEXVERTS - 4
  121. mtctr r8
  122. mftb r25
  // Clear vr19/vr20 so the first pipelined S:1/S:2 operations work on zeros.
  123. vxor vr19,vr19,vr19
  124. vxor vr20,vr20,vr20
  125. nop // align!
  126. nop
  127. nop
  128. label_start_V7: // 52 instructions run in 45 cycles, although compiler predicts 38 cycles
  129. ////////////////
  130. // IMPORTANT: DO NOT REMOVE NOPS UNLESS YOU KNOW WHAT YOU ARE DOING AND WHY!
  131. // nops are essential here, removing them will make the code about 2% slower because dual-issue will be broken
  132. ////////////////
  // "addi r31,r31,0" and "vpermwi128 vrN,vrN,0x1B" below appear to be the value-preserving filler
  // instructions ("nops") the warning above refers to.
  133. lhz r14, 0(r6) // int n = pVert->index;
  134. addi r16, r3, 2
  135. dcbt r11,r6
  136. cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 4 (r24); consumed by bgt at the loop bottom
  137. lvlx vr9,r0,r6
  138. rldicl r14, r14, 2, 0 // r14 = n*4
  139. lvrx vr10,r15,r6
  140. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  141. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  142. addi r31,r31,0//vpermwi128 vr40,vr40,0x1B //mr r31,r31
  143. add r16, r16, r4
  144. vpermwi128 vr40,vr40,0x1B //mr r30,r30
  145. addi r6, r6, 0x10 // pVert++
  146. vpermwi128 vr41,vr41,0x1B//nop
  147. lwzx r17, r14, r5 // r17 = oldCache
  148. //addi r30,r30,0//nop
  149. vperm vr10, vr8, vr9, vr4
  150. //addi r29,r29,0//nop
  151. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  152. vperm vr11, vr8, vr9, vr5
  153. stvx vr8, r0,r16
  154. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  155. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  156. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  157. stvx vr8, r15,r16
  158. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  159. vcsxwfp vr10,vr10,8
  160. or r19,r3,r7
  161. vperm vr12, vr8, vr9, vr6
  162. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  163. /*S:3*/ stvx vr30, r0,r23
  164. //nop
  165. /*S:3*/ stvx vr31, r15,r23
  166. //nop
  167. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  168. //nop
  169. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  170. //nop
  171. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  172. //nop
  173. /*S:2*/mr r23,r22
  174. //nop
  175. or r19, r19, r17 // r19 = updateCache
  176. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  177. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  178. //nop
  179. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  180. //nop
  181. /*S:1*/ vmulfp128 vr19, vr25, vr26
  182. /*S:1*/mr r22, r21
  183. vmaddfp vr20, vr10, vr2, vr3 // vr20 = f4sb
  184. add r21, r17, r4 // r21 = pFlexedVertex, goes to Stage:1
  185. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  186. stwx r19, r14, r5
  187. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  188. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  189. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  190. vcsxwfp128 vr32, vr11, 28
  191. //nop
  192. vcsxwfp128 vr33, vr12, 28
  // Early exit uses the cmpw result from the top of this iteration; otherwise prefetch and loop on ctr.
  193. bgt label_end_V7
  194. dcbt r11, r21
  195. bdnz label_start_V7
  196. label_end_V7:
  // Drain the pipeline: replay the remaining S:1/S:2/S:3 steps for the records still in flight.
  197. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  198. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  199. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  200. /*S:3*/ stvx vr30, r0,r23
  201. /*S:3*/ stvx vr31, r15,r23
  202. /*S:2*/mr r23,r22
  203. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  204. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  205. /*S:1*/ vmulfp128 vr19, vr25, vr26
  206. /*S:1*/mr r22, r21
  207. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  208. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  209. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  210. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  211. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  212. /*S:3*/ stvx vr30, r0,r23
  213. /*S:3*/ stvx vr31, r15,r23
  214. /*S:2*/mr r23,r22
  215. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  216. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  217. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  218. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  219. /*S:3*/ stvx vr30, r0,r23
  220. /*S:3*/ stvx vr31, r15,r23
  // Profiling tail: g_mp_morph_Vx word 0 += elapsed time-base ticks, word 1 += numVertsToProcess (r8).
  221. mftb r17
  222. subf r17, r25, r17
  223. lau r18, g_mp_morph_Vx
  224. lal r18, r18, g_mp_morph_Vx
  225. lwz r23, 0(r18)
  226. add r23,r23,r17
  227. stw r23, 0(r18)
  228. lwz r23, 4(r18)
  229. add r23,r23,r8
  230. stw r23, 4(r18)
  // Epilogue: restore r14..r25 and return.
  231. ld r14, -0x08(r1)
  232. ld r15, -0x10(r1)
  233. ld r16, -0x18(r1)
  234. ld r17, -0x20(r1)
  235. ld r18, -0x28(r1)
  236. ld r19, -0x30(r1)
  237. ld r20, -0x38(r1)
  238. ld r21, -0x40(r1)
  239. ld r22, -0x48(r1)
  240. ld r23, -0x50(r1)
  241. ld r24, -0x58(r1)
  242. ld r25, -0x60(r1)
  243. blr
  244. }
  245. }
  // ComputeFlexedVertexWrinkle_StreamOffset_V7: wrinkle variant of the 3-stage pipelined routine.
  //
  // Differences from ComputeFlexedVertex_StreamOffset_V7 that are visible in the code:
  //   - records are 0x12 (18) bytes apart (see "pVert++");
  //   - an extra lvlx at pVert+16 plus vrlimi128 build vr27 ("f3PreDelta"), and f3Delta is permuted
  //     from vr27 with g_perm_delta_wrinkle, which also fills word 3 of f3Delta (presumably the
  //     wrinkle value stored in record bytes 16-17 - TODO confirm against mstudiovertanim_wrinkle_t);
  //   - the profiling tail accumulates into g_mp_morph_Vw instead of g_mp_morph_Vx.
  //
  // Per record: n = 16-bit index at offset 0; the cache entry pFirstThinFlexIndex[n] is valid when
  // (entry ^ nCurrentTag) <= 0xFFFF, otherwise it becomes nThinFlexVertexCount | nCurrentTag and
  // nThinFlexVertexCount is incremented. The low 16 bits of the entry select a 32-byte slot in
  // pThinFlexVerts (vector at +0 = position, +16 = normal), which receives
  // position += scWeight * f3Delta and normal += scWeight * f3NDelta.
  // Slot [nThinFlexVertexCount + 2] is zero-filled every iteration.
  //
  // The loop ends after numVertsToProcess iterations (ctr) or after the iteration that started
  // with nThinFlexVertexCount > MAXSTUDIOFLEXVERTS - 4. r3 holds the updated count at blr
  // (presumably the int result under the platform calling convention).
  //
  // Pipeline: slot pointer moves r21 -> r22 -> r23; all start at g_dummy2 so early stores hit scratch.
  246. __declspec(naked) int ComputeFlexedVertexWrinkle_StreamOffset_V7(
  247. int nThinFlexVertexCount, //r3
  248. CachedPosNorm_t *pThinFlexVerts,//r4
  249. int32 *pFirstThinFlexIndex, //r5
  250. mstudiovertanim_wrinkle_t * pVert, //r6
  251. uint32 nCurrentTag, //r7
  252. uint32 numVertsToProcess, //r8
  253. fltx4 w1234 //vr1
  254. )
  255. {
  256. __asm
  257. {
  // Prologue: save r14..r25 below the stack pointer.
  258. std r14, -0x08(r1)
  259. std r15, -0x10(r1)
  260. std r16, -0x18(r1)
  261. std r17, -0x20(r1)
  262. std r18, -0x28(r1)
  263. std r19, -0x30(r1)
  264. std r20, -0x38(r1)
  265. std r21, -0x40(r1)
  266. std r22, -0x48(r1)
  267. std r23, -0x50(r1)
  268. std r24, -0x58(r1)
  269. std r25, -0x60(r1)
  270. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  // Load the constant vectors: vr2 = scale, vr3 = bias, vr4/vr5/vr6 = vperm controls
  // (vr5 is the wrinkle-aware delta control here).
  271. lau r14, g_sc256_255_special
  272. lal r14, r14, g_sc256_255_special
  273. lvx vr2, r0,r14
  274. lau r15, g_f40011
  275. lal r15, r15, g_f40011
  276. lvx vr3, r0,r15
  277. lau r16, g_perm_speed_side
  278. lal r16, r16, g_perm_speed_side
  279. lvx vr4, r0,r16
  280. lau r17, g_perm_delta_wrinkle
  281. lal r17, r17, g_perm_delta_wrinkle
  282. lvx vr5, r0,r17
  283. lau r18, g_perm_ndelta
  284. lal r18, r18, g_perm_ndelta
  285. lvx vr6, r0,r18
  // Point all pipeline slot pointers (r20..r23) at the g_dummy2 scratch buffer.
  286. lau r20, g_dummy2
  287. lal r20,r20, g_dummy2
  288. mr r21, r20
  289. mr r22, r21
  290. mr r23, r22
  291. li r10, -1
  292. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  293. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  // vr8 = zero vector; r15 = 16; r11 = 0x100 (dcbt offset); r24 = early-exit limit;
  // ctr = iteration count; r25 = start time-base for the profiling tail.
  294. vxor vr8,vr8,vr8
  295. li r15, 16
  296. li r11,0x100
  297. li r24, MAXSTUDIOFLEXVERTS - 4
  298. mtctr r8
  299. mftb r25
  // Clear vr19/vr20 so the first pipelined S:1/S:2 operations work on zeros.
  300. vxor vr19,vr19,vr19
  301. vxor vr20,vr20,vr20
  302. nop // align!
  303. nop
  304. nop
  305. label_start_V7: // 52 instructions run in 45 cycles, although compiler predicts 38 cycles
  306. ////////////////
  307. // IMPORTANT: DO NOT REMOVE NOPS UNLESS YOU KNOW WHAT YOU ARE DOING AND WHY!
  308. // nops are essential here, removing them will make the code about 2% slower because dual-issue will be broken
  309. ////////////////
  310. lhz r14, 0(r6) // int n = pVert->index;
  311. addi r16, r3, 2
  312. dcbt r11,r6
  313. cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 4 (r24); consumed by bgt at the loop bottom
  314. lvlx vr9,r0,r6
  315. rldicl r14, r14, 2, 0 // r14 = n*4
  316. lvrx vr10,r15,r6
  317. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  318. lvlx vr27,r15,r6 // f3PreDelta
  319. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  320. addi r31,r31,0//vpermwi128 vr40,vr40,0x1B //mr r31,r31
  321. add r16, r16, r4
  322. vpermwi128 vr40,vr40,0x1B //mr r30,r30
  323. addi r6, r6, 0x12 // pVert++
  324. vpermwi128 vr41,vr41,0x1B//nop
  325. lwzx r17, r14, r5 // r17 = oldCache
  326. //addi r30,r30,0//nop
  327. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  // Merge packedVert into vr27 (presumably words 1..3, keeping word 0 from the pVert+16 load -
  // TODO confirm vrlimi128 mask semantics).
  328. vrlimi128 vr27,vr9,7,0// f3PreDelta
  329. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  330. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  331. stvx vr8, r0,r16
  332. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  333. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  334. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  335. stvx vr8, r15,r16
  336. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  337. vcsxwfp vr10,vr10,8
  338. or r19,r3,r7
  339. vperm vr11, vr8, vr27, vr5 //f3Delta = __vperm(f4Zero, f3PreDelta, permuteDelta)
  340. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  341. /*S:3*/ stvx vr30, r0,r23
  342. //nop
  343. /*S:3*/ stvx vr31, r15,r23
  344. //nop
  345. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  346. //nop
  347. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  348. //nop
  349. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  350. //nop
  351. /*S:2*/mr r23,r22
  352. //nop
  353. or r19, r19, r17 // r19 = updateCache
  354. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  355. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  356. //nop
  357. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  358. //nop
  359. /*S:1*/ vmulfp128 vr19, vr25, vr26
  360. /*S:1*/mr r22, r21
  361. vmaddfp vr20, vr10, vr2, vr3 // vr20 = f4sb
  362. add r21, r17, r4 // r21 = pFlexedVertex, goes to Stage:1
  363. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  364. stwx r19, r14, r5
  365. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  366. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  367. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  368. vcsxwfp128 vr32, vr11, 28
  369. //nop
  370. vcsxwfp128 vr33, vr12, 28
  // Early exit uses the cmpw result from the top of this iteration; otherwise prefetch and loop on ctr.
  371. bgt label_end_V7
  372. dcbt r11, r21
  373. bdnz label_start_V7
  374. label_end_V7:
  // Drain the pipeline: replay the remaining S:1/S:2/S:3 steps for the records still in flight.
  375. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  376. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  377. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  378. /*S:3*/ stvx vr30, r0,r23
  379. /*S:3*/ stvx vr31, r15,r23
  380. /*S:2*/mr r23,r22
  381. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  382. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  383. /*S:1*/ vmulfp128 vr19, vr25, vr26
  384. /*S:1*/mr r22, r21
  385. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  386. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  387. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  388. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  389. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  390. /*S:3*/ stvx vr30, r0,r23
  391. /*S:3*/ stvx vr31, r15,r23
  392. /*S:2*/mr r23,r22
  393. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  394. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  395. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  396. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  397. /*S:3*/ stvx vr30, r0,r23
  398. /*S:3*/ stvx vr31, r15,r23
  // Profiling tail: g_mp_morph_Vw word 0 += elapsed time-base ticks, word 1 += numVertsToProcess (r8).
  399. mftb r17
  400. subf r17, r25, r17
  401. lau r18, g_mp_morph_Vw
  402. lal r18, r18, g_mp_morph_Vw
  403. lwz r23, 0(r18)
  404. add r23,r23,r17
  405. stw r23, 0(r18)
  406. lwz r23, 4(r18)
  407. add r23,r23,r8
  408. stw r23, 4(r18)
  // Epilogue: restore r14..r25 and return.
  409. ld r14, -0x08(r1)
  410. ld r15, -0x10(r1)
  411. ld r16, -0x18(r1)
  412. ld r17, -0x20(r1)
  413. ld r18, -0x28(r1)
  414. ld r19, -0x30(r1)
  415. ld r20, -0x38(r1)
  416. ld r21, -0x40(r1)
  417. ld r22, -0x48(r1)
  418. ld r23, -0x50(r1)
  419. ld r24, -0x58(r1)
  420. ld r25, -0x60(r1)
  421. blr
  422. }
  423. }
  424. // V4 rolled - latency of x3
  // ComputeFlexedVertex_StreamOffset_V6: naked PowerPC/VMX128 routine, rolled loop with the
  // multiply-add and the store of a record's result deferred to later iterations.
  //
  // Per 16-byte record at pVert: n = 16-bit index at offset 0; the cache entry
  // pFirstThinFlexIndex[n] is valid when (entry ^ nCurrentTag) <= 0xFFFF, otherwise it becomes
  // nThinFlexVertexCount | nCurrentTag and nThinFlexVertexCount is incremented. The low 16 bits of
  // the entry select a 32-byte slot in pThinFlexVerts (vector at +0 = position, +16 = normal),
  // which receives position += scWeight * f3Delta and normal += scWeight * f3NDelta.
  // Slot [nThinFlexVertexCount + 2] is zero-filled every iteration.
  //
  // Deferred work: r21 = slot of the record decoded last, r22 = slot whose result (vr23/vr24) is
  // stored next. Both start at g_dummy2, so the stores of the first iterations go to scratch.
  // NOTE(review): the slot for a record is loaded before the previous record's result is stored;
  // consecutive records mapping to the same slot would lose a contribution - confirm uniqueness.
  //
  // The loop ends after numVertsToProcess iterations (ctr) or after the iteration that started
  // with nThinFlexVertexCount > MAXSTUDIOFLEXVERTS - 2. r3 holds the updated count at blr
  // (presumably the int result under the platform calling convention).
  // Side effect: g_mp_morph_Vx word 0 += elapsed time-base ticks, word 1 += numVertsToProcess.
  425. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V6(
  426. int nThinFlexVertexCount, //r3
  427. CachedPosNorm_t *pThinFlexVerts,//r4
  428. int32 *pFirstThinFlexIndex, //r5
  429. mstudiovertanim_t * pVert, //r6
  430. uint32 nCurrentTag, //r7
  431. uint32 numVertsToProcess, //r8
  432. fltx4 w1234 //vr1
  433. )
  434. {
  435. __asm
  436. {
  // Prologue: save r14..r24 below the stack pointer.
  437. std r14, -0x08(r1)
  438. std r15, -0x10(r1)
  439. std r16, -0x18(r1)
  440. std r17, -0x20(r1)
  441. std r18, -0x28(r1)
  442. std r19, -0x30(r1)
  443. std r20, -0x38(r1)
  444. std r21, -0x40(r1)
  445. std r22, -0x48(r1)
  446. std r23, -0x50(r1)
  447. std r24, -0x58(r1)
  448. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  // Load the constant vectors: vr2 = scale, vr3 = bias, vr4/vr5/vr6 = vperm controls.
  449. lau r14, g_sc256_255_special
  450. lal r14, r14, g_sc256_255_special
  451. lvx vr2, r0,r14
  452. lau r15, g_f40011
  453. lal r15, r15, g_f40011
  454. lvx vr3, r0,r15
  455. lau r16, g_perm_speed_side
  456. lal r16, r16, g_perm_speed_side
  457. lvx vr4, r0,r16
  458. lau r17, g_perm_delta
  459. lal r17, r17, g_perm_delta
  460. lvx vr5, r0,r17
  461. lau r18, g_perm_ndelta
  462. lal r18, r18, g_perm_ndelta
  463. lvx vr6, r0,r18
  // r21 and r22 (deferred slot pointers) start at the g_dummy2 scratch buffer.
  464. lau r20, g_dummy2
  465. lal r20,r20, g_dummy2
  466. mr r21, r20
  467. mr r22, r21
  468. li r10, -1
  469. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  470. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  471. vxor vr8,vr8,vr8
  472. li r15, 16
  // r11 = g_nStreamOffset_prefetch (dcbt offset); r24 = early-exit limit; r23 = start time-base.
  473. lau r14,g_nStreamOffset_prefetch
  474. lal r14,r14,g_nStreamOffset_prefetch
  475. lwz r11,0(r14)
  476. li r24, MAXSTUDIOFLEXVERTS - 2
  477. mtctr r8
  478. mftb r23
  479. label_start:
  480. lhz r14, 0(r6) // int n = pVert->index;
  481. dcbt r11,r6
  482. addi r16, r3, 2
  483. cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 2
  484. lvlx vr9,r0,r6
  485. lvrx vr10,r15,r6
  486. rldicl r14, r14, 2, 0 // r14 = n*4
  487. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  488. add r16, r16, r4
  489. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  490. stvx vr8, r0,r16
  491. lwzx r17, r14, r5 // r17 = oldCache
  492. stvx vr8, r15,r16
  493. vmsum4fp128 vr19,vr19, vr1 // vr19 = scWeight (for the record decoded in the previous iteration)
  494. vperm vr10, vr8, vr9, vr4
  495. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  496. vperm vr11, vr8, vr9, vr5
  497. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  498. vcsxwfp vr10,vr10,8
  499. vperm vr12, vr8, vr9, vr6
  // Deferred store of the result computed in the previous iteration (scratch at first).
  500. stvx vr23, r0,r22
  501. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  502. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  503. stvx vr24, r15,r22
  504. or r19,r3,r7
  505. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  506. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  507. vpermwi128 vr15, vr10, 0x22
  508. or r19, r19, r17 // r19 = updateCache
  509. vpermwi128 vr16, vr10, 0xF5
  510. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  511. vmaddfp vr24, vr19, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  512. vmaddfp vr23, vr19, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  513. vmulfp128 vr19, vr15, vr16
  514. add r17, r17, r4 // r17 = pFlexedVertex
  515. stwx r19, r14, r5
  516. subf r3, r18, r3// nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  517. lvx vr13, r0,r17 // vr13 = vfPosition
  518. addi r6, r6, 0x10 // pVert++
  519. lvx vr14, r15,r17 // vr14 = vfNormal
  520. vcsxwfp vr21, vr11, 28
  521. mr r22,r21
  522. vcsxwfp vr22, vr12, 28
  523. mr r21,r17
  // Early exit uses the cmpw result from the top of this iteration; otherwise prefetch and loop on ctr.
  524. bgt label_end
  525. dcbt r11, r17
  526. bdnz label_start
  527. label_end:
  // Profiling tail: g_mp_morph_Vx word 0 += elapsed time-base ticks, word 1 += numVertsToProcess (r8).
  528. mftb r17
  529. subf r17, r23, r17
  530. lau r18, g_mp_morph_Vx
  531. lal r18, r18, g_mp_morph_Vx
  532. lwz r23, 0(r18)
  533. add r23,r23,r17
  534. stw r23, 0(r18)
  535. lwz r23, 4(r18)
  536. add r23,r23,r8
  537. stw r23, 4(r18)
  // Drain: store the pending result to r22, then finish and store the last record's result to r21.
  538. vmsum4fp128 vr19,vr19, vr1 // vr19 = scWeight
  539. stvx vr23, r0,r22
  540. stvx vr24, r15,r22
  541. vmaddfp vr24, vr19, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  542. vmaddfp vr23, vr19, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  543. stvx vr23, r0,r21
  544. stvx vr24, r15,r21
  // Epilogue: restore r14..r24 and return.
  545. ld r14, -0x08(r1)
  546. ld r15, -0x10(r1)
  547. ld r16, -0x18(r1)
  548. ld r17, -0x20(r1)
  549. ld r18, -0x28(r1)
  550. ld r19, -0x30(r1)
  551. ld r20, -0x38(r1)
  552. ld r21, -0x40(r1)
  553. ld r22, -0x48(r1)
  554. ld r23, -0x50(r1)
  555. ld r24, -0x58(r1)
  556. blr
  557. }
  558. }
  559. // 2-stages
  // ComputeFlexedVertex_StreamOffset_V5: naked PowerPC/VMX128 routine with a 2-stage loop.
  // The multiply-add and store for a record ("stage 1") run in the following iteration.
  //
  // Per 16-byte record at pVert: n = 16-bit index at offset 0; the cache entry
  // pFirstThinFlexIndex[n] is valid when (entry ^ nCurrentTag) <= 0xFFFF, otherwise it becomes
  // nThinFlexVertexCount | nCurrentTag and nThinFlexVertexCount is incremented. The low 16 bits of
  // the entry select a 32-byte slot in pThinFlexVerts (vector at +0 = position, +16 = normal),
  // which receives position += scWeight * f3Delta and normal += scWeight * f3NDelta.
  // Slot [nThinFlexVertexCount + 2] is zero-filled every iteration.
  //
  // r20 is the deferred store target; it starts at g_dummy2, so the first iteration's store of
  // not-yet-computed vr17/vr18 goes to scratch.
  // Unlike the V4/V6/V7 routines there is no comparison against MAXSTUDIOFLEXVERTS, no dcbt
  // prefetch and no profiling: the loop always runs numVertsToProcess iterations (ctr).
  // NOTE(review): the slot for a record is loaded before the previous record's deferred store;
  // consecutive records mapping to the same slot would lose a contribution - confirm uniqueness.
  // r3 holds the updated nThinFlexVertexCount at blr (presumably the int result).
  560. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V5(
  561. int nThinFlexVertexCount, //r3
  562. CachedPosNorm_t *pThinFlexVerts,//r4
  563. int32 *pFirstThinFlexIndex, //r5
  564. mstudiovertanim_t * pVert, //r6
  565. uint32 nCurrentTag, //r7
  566. uint32 numVertsToProcess, //r8
  567. fltx4 w1234 //vr1
  568. )
  569. {
  570. __asm
  571. {
  // Prologue: save r14..r20 below the stack pointer.
  572. std r14, -0x08(r1)
  573. std r15, -0x10(r1)
  574. std r16, -0x18(r1)
  575. std r17, -0x20(r1)
  576. std r18, -0x28(r1)
  577. std r19, -0x30(r1)
  578. std r20, -0x38(r1)
  579. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  // Load the constant vectors: vr2 = scale, vr3 = bias, vr4/vr5/vr6 = vperm controls.
  580. lau r14, g_sc256_255_special
  581. lal r14, r14, g_sc256_255_special
  582. lvx vr2, r0,r14
  583. lau r15, g_f40011
  584. lal r15, r15, g_f40011
  585. lvx vr3, r0,r15
  586. lau r16, g_perm_speed_side
  587. lal r16, r16, g_perm_speed_side
  588. lvx vr4, r0,r16
  589. lau r17, g_perm_delta
  590. lal r17, r17, g_perm_delta
  591. lvx vr5, r0,r17
  592. lau r18, g_perm_ndelta
  593. lal r18, r18, g_perm_ndelta
  594. lvx vr6, r0,r18
  // r20 (deferred store target) starts at the g_dummy2 scratch buffer.
  595. lau r20, g_dummy2
  596. lal r20,r20, g_dummy2
  597. vxor vr8,vr8,vr8
  598. li r10, -1
  599. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  600. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  601. mtctr r8
  602. li r15, 16
  603. label_start_schlp:
  604. lhz r14, 0(r6) // int n = pVert->index;
  605. addi r16, r3, 2 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  606. lvlx vr9,r0,r6
  607. rldicl r14, r14, 2, 0 // r14 = n*4
  608. lvrx vr10,r15,r6
  609. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  610. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  611. add r16, r16, r4
  612. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  613. addi r6, r6, 0x10 // pVert++
  614. vcsxwfp vr10,vr10,8
  // Stage 1 for the previous record: uses vr11..vr15 left over from the previous iteration,
  // before they are overwritten for the current record below.
  615. vmaddfp vr17, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) - stage 1
  616. vmaddfp vr18, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) - stage 1
  617. vperm vr11, vr8, vr9, vr5 //f3Delta = __vperm(f4Zero, packedVert, permuteDelta)
  618. vcsxwfp vr11, vr11, 28
  619. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  620. vcsxwfp vr12, vr12, 28
  621. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  622. lwzx r17, r14, r5 // r17 = oldCache
  623. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  624. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  625. or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount
  626. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  627. vpermwi128 vr15, vr10, 0x22
  628. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  629. vpermwi128 vr16, vr10, 0xF5
  630. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  631. stvx vr8, r0, r16
  632. or r19, r19, r17 // r19 = updateCache
  633. stvx vr8, r15, r16
  634. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  635. add r17, r17, r4 // r17 = pFlexedVertex
  636. vmulfp128 vr15, vr15, vr16
  637. lvx vr13, r0,r17 // vr13 = vfPosition
  638. lvx vr14, r15,r17 // vr14 = vfNormal
  639. vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight
  640. stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache
  641. subf r3, r18, r3// nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  642. stvx vr17, r0,r20 // stage 1
  643. stvx vr18, r15,r20 // stage 1
  // The current slot becomes the deferred store target for the next iteration.
  644. mr r20, r17
  645. bdnz label_start_schlp
  // Drain: finish stage 1 for the last record and store it.
  646. vmaddfp vr17, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) - stage 1
  647. vmaddfp vr18, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) - stage 1
  648. stvx vr17, r0,r20 // stage 1; deferred storing saves 15 cycles (10%!)
  649. stvx vr18, r15,r20
  // Epilogue: restore r14..r20 and return.
  650. ld r14, -0x08(r1)
  651. ld r15, -0x10(r1)
  652. ld r16, -0x18(r1)
  653. ld r17, -0x20(r1)
  654. ld r18, -0x28(r1)
  655. ld r19, -0x30(r1)
  656. ld r20, -0x38(r1)
  657. blr
  658. }
  659. }
  660. // V3 in asm
  // ComputeFlexedVertex_StreamOffset_V4: naked PowerPC/VMX128 routine, straight (unpipelined)
  // assembly form of the loop; each record is fully processed within its own iteration.
  //
  // Per 16-byte record at pVert: n = 16-bit index at offset 0; the cache entry
  // pFirstThinFlexIndex[n] is valid when (entry ^ nCurrentTag) <= 0xFFFF, otherwise it becomes
  // nThinFlexVertexCount | nCurrentTag and nThinFlexVertexCount is incremented. The low 16 bits of
  // the entry select a 32-byte slot in pThinFlexVerts (vector at +0 = position, +16 = normal),
  // which receives position += scWeight * f3Delta and normal += scWeight * f3NDelta.
  // Slot [nThinFlexVertexCount + 2] is zero-filled at the start of every iteration.
  //
  // The loop ends after numVertsToProcess iterations (ctr), or as soon as the updated
  // nThinFlexVertexCount exceeds MAXSTUDIOFLEXVERTS - 3. r3 holds the updated count at blr
  // (presumably the int result under the platform calling convention).
  //
  // Fix: r24 holds the loop limit but was not part of the original save/restore set, so the
  // caller's r24 (a callee-saved register) was clobbered. It is now saved and restored at
  // -0x58(r1), the slot the other routines in this file use for r24.
  661. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V4(
  662. int nThinFlexVertexCount, //r3
  663. CachedPosNorm_t *pThinFlexVerts,//r4
  664. int32 *pFirstThinFlexIndex, //r5
  665. mstudiovertanim_t * pVert, //r6
  666. uint32 nCurrentTag, //r7
  667. uint32 numVertsToProcess, //r8
  668. fltx4 w1234 //vr1
  669. )
  670. {
  671. __asm
  672. {
  // Prologue: save every callee-saved register this routine writes (r14..r19 and r24).
  673. std r14, -0x08(r1)
  674. std r15, -0x10(r1)
  675. std r16, -0x18(r1)
  676. std r17, -0x20(r1)
  677. std r18, -0x28(r1)
  678. std r19, -0x30(r1)
  std r24, -0x58(r1) // r24 is written below (loop limit), so it must be preserved too
  679. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  // Load the constant vectors: vr2 = scale, vr3 = bias, vr4/vr5/vr6 = vperm controls.
  680. lau r14, g_sc256_255_special
  681. lal r14, r14, g_sc256_255_special
  682. lvx vr2, r0,r14
  683. lau r15, g_f40011
  684. lal r15, r15, g_f40011
  685. lvx vr3, r0,r15
  686. lau r16, g_perm_speed_side
  687. lal r16, r16, g_perm_speed_side
  688. lvx vr4, r0,r16
  689. lau r17, g_perm_delta
  690. lal r17, r17, g_perm_delta
  691. lvx vr5, r0,r17
  692. lau r18, g_perm_ndelta
  693. lal r18, r18, g_perm_ndelta
  694. lvx vr6, r0,r18
  695. li r10, -1
  696. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  697. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  // r11 = g_nStreamOffset_prefetch (dcbt offset).
  698. lau r14,g_nStreamOffset_prefetch
  699. lal r14,r14,g_nStreamOffset_prefetch
  700. lwz r11,0(r14)
  701. vxor vr8,vr8,vr8
  702. li r15, 16
  703. li r24, MAXSTUDIOFLEXVERTS - 3 // critical number at which to stop processing
  704. mtctr r8
  705. label_start:
  706. lhz r14, 0(r6) // int n = pVert->index;
  // NOTE(review): r16 still holds its previous value here (a constant's address on the first
  // iteration, the previous zero-fill slot afterwards); the V6 routine prefetches off r6 at this
  // point instead. dcbt is only a hint, so this is left unchanged - confirm the intent.
  707. dcbt r11,r16
  708. rldicl r14, r14, 2, 0 // r14 = n*4
  709. addi r16, r3, 2
  710. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  711. add r16, r16, r4
  712. stvx vr8, r0,r16
  713. stvx vr8, r15,r16
  714. lvlx vr9,r0,r6
  715. lvrx vr10,r15,r6
  716. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  717. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  718. vcsxwfp vr10,vr10,8
  719. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  720. vperm vr11, vr8, vr9, vr5 //f3Delta = __vperm(f4Zero, packedVert, permuteDelta)
  721. vcsxwfp vr11, vr11, 28
  722. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  723. vcsxwfp vr12, vr12, 28
  724. lwzx r17, r14, r5 // r17 = oldCache
  725. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  726. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  727. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  728. or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount
  729. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  730. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  731. or r19, r19, r17 // r19 = updateCache
  732. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  733. add r17, r17, r4 // r17 = pFlexedVertex
  734. lvx vr13, r0,r17 // vr13 = vfPosition
  735. lvx vr14, r15,r17 // vr14 = vfNormal
  736. dcbt r11,r17
  737. vpermwi128 vr15, vr10, 0x22
  738. vpermwi128 vr16, vr10, 0xF5
  739. vmulfp128 vr15, vr15, vr16
  740. vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight
  741. stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache
  742. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  743. vmaddfp vr14, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  744. vmaddfp vr13, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  745. stvx vr13, r0,r17
  746. stvx vr14, r15,r17
  // Stop once the updated count exceeds the limit in r24; otherwise advance and loop on ctr.
  747. cmpw r3, r24
  748. bgt label_end
  749. addi r6, r6, 0x10 // pVert++
  750. bdnz label_start
  751. label_end:
  // Epilogue: restore the saved registers (including r24) and return.
  752. ld r14, -0x08(r1)
  753. ld r15, -0x10(r1)
  754. ld r16, -0x18(r1)
  755. ld r17, -0x20(r1)
  756. ld r18, -0x28(r1)
  757. ld r19, -0x30(r1)
  ld r24, -0x58(r1)
  758. blr
  759. }
  760. }
  761. // V3 in asm
  // ComputeFlexedVertexWrinkle_StreamOffset_V4: wrinkle variant of the unpipelined assembly loop.
  //
  // Differences from ComputeFlexedVertex_StreamOffset_V4 that are visible in the code:
  //   - records are 0x12 (18) bytes apart (see "pVert++");
  //   - an extra lvlx at pVert+16 plus vrlimi128 build vr27 ("f3PreDelta"), and f3Delta is permuted
  //     from vr27 with g_perm_delta_wrinkle, which also fills word 3 of f3Delta (presumably the
  //     wrinkle value stored in record bytes 16-17 - TODO confirm against mstudiovertanim_wrinkle_t).
  //
  // Per record: n = 16-bit index at offset 0; the cache entry pFirstThinFlexIndex[n] is valid when
  // (entry ^ nCurrentTag) <= 0xFFFF, otherwise it becomes nThinFlexVertexCount | nCurrentTag and
  // nThinFlexVertexCount is incremented. The low 16 bits of the entry select a 32-byte slot in
  // pThinFlexVerts (vector at +0 = position, +16 = normal), which receives
  // position += scWeight * f3Delta and normal += scWeight * f3NDelta.
  // Slot [nThinFlexVertexCount + 2] is zero-filled at the start of every iteration.
  //
  // The loop ends after numVertsToProcess iterations (ctr), or as soon as the updated
  // nThinFlexVertexCount exceeds MAXSTUDIOFLEXVERTS - 3. r3 holds the updated count at blr
  // (presumably the int result under the platform calling convention).
  //
  // Fix: r24 holds the loop limit but was not part of the original save/restore set, so the
  // caller's r24 (a callee-saved register) was clobbered. It is now saved and restored at
  // -0x58(r1), the slot the other routines in this file use for r24.
  762. __declspec(naked) int ComputeFlexedVertexWrinkle_StreamOffset_V4(
  763. int nThinFlexVertexCount, //r3
  764. CachedPosNorm_t *pThinFlexVerts,//r4
  765. int32 *pFirstThinFlexIndex, //r5
  766. mstudiovertanim_wrinkle_t * pVert,//r6
  767. uint32 nCurrentTag, //r7
  768. uint32 numVertsToProcess, //r8
  769. fltx4 w1234 //vr1
  770. )
  771. {
  772. __asm
  773. {
  // Prologue: save every callee-saved register this routine writes (r14..r19 and r24).
  774. std r14, -0x08(r1)
  775. std r15, -0x10(r1)
  776. std r16, -0x18(r1)
  777. std r17, -0x20(r1)
  778. std r18, -0x28(r1)
  779. std r19, -0x30(r1)
  std r24, -0x58(r1) // r24 is written below (loop limit), so it must be preserved too
  780. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  // Load the constant vectors: vr2 = scale, vr3 = bias, vr4/vr5/vr6 = vperm controls
  // (vr5 is the wrinkle-aware delta control here).
  781. lau r14, g_sc256_255_special
  782. lal r14, r14, g_sc256_255_special
  783. lvx vr2, r0,r14
  784. lau r15, g_f40011
  785. lal r15, r15, g_f40011
  786. lvx vr3, r0,r15
  787. lau r16, g_perm_speed_side
  788. lal r16, r16, g_perm_speed_side
  789. lvx vr4, r0,r16
  790. lau r17, g_perm_delta_wrinkle
  791. lal r17, r17, g_perm_delta_wrinkle
  792. lvx vr5, r0,r17
  793. lau r18, g_perm_ndelta
  794. lal r18, r18, g_perm_ndelta
  795. lvx vr6, r0,r18
  796. li r10, -1
  797. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  798. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  // r11 = g_nStreamOffset_prefetch (dcbt offset).
  799. lau r14,g_nStreamOffset_prefetch
  800. lal r14,r14,g_nStreamOffset_prefetch
  801. lwz r11,0(r14)
  802. vxor vr8,vr8,vr8
  803. li r15, 16
  804. li r24, MAXSTUDIOFLEXVERTS - 3 // critical number at which to stop processing
  805. mtctr r8
  806. label_start:
  807. lhz r14, 0(r6) // int n = pVert->index;
  // NOTE(review): r16 still holds its previous value here (a constant's address on the first
  // iteration, the previous zero-fill slot afterwards). dcbt is only a hint, so this is left
  // unchanged - confirm the intent.
  808. dcbt r11,r16
  809. rldicl r14, r14, 2, 0 // r14 = n*4
  810. addi r16, r3, 2
  811. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  812. add r16, r16, r4
  813. stvx vr8, r0,r16
  814. stvx vr8, r15,r16
  815. lvlx vr27,r15,r6 // f3PreDelta
  816. lvlx vr9,r0,r6
  817. lvrx vr10,r15,r6
  818. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  // Merge packedVert into vr27 (presumably words 1..3, keeping word 0 from the pVert+16 load -
  // TODO confirm vrlimi128 mask semantics).
  819. vrlimi128 vr27,vr9,7,0// f3PreDelta
  820. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  821. vcsxwfp vr10,vr10,8
  822. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  823. vperm vr11, vr8, vr27, vr5 //f3Delta = __vperm(f4Zero, f3PreDelta, permuteDelta)
  824. vcsxwfp vr11, vr11, 28
  825. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  826. vcsxwfp vr12, vr12, 28
  827. lwzx r17, r14, r5 // r17 = oldCache
  828. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  829. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  830. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  831. or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount
  832. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  833. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  834. or r19, r19, r17 // r19 = updateCache
  835. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  836. add r17, r17, r4 // r17 = pFlexedVertex
  837. lvx vr13, r0,r17 // vr13 = vfPosition
  838. lvx vr14, r15,r17 // vr14 = vfNormal
  839. dcbt r11,r17
  840. vpermwi128 vr15, vr10, 0x22
  841. vpermwi128 vr16, vr10, 0xF5
  842. vmulfp128 vr15, vr15, vr16
  843. vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight
  844. stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache
  845. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  846. vmaddfp vr14, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  847. vmaddfp vr13, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  848. stvx vr13, r0,r17
  849. stvx vr14, r15,r17
  // Stop once the updated count exceeds the limit in r24; otherwise advance and loop on ctr.
  850. cmpw r3, r24
  851. bgt label_end
  852. addi r6, r6, 0x12 // pVert++
  853. bdnz label_start
  854. label_end:
  // Epilogue: restore the saved registers (including r24) and return.
  855. ld r14, -0x08(r1)
  856. ld r15, -0x10(r1)
  857. ld r16, -0x18(r1)
  858. ld r17, -0x20(r1)
  859. ld r18, -0x28(r1)
  860. ld r19, -0x30(r1)
  ld r24, -0x58(r1)
  861. blr
  862. }
  863. }
  864. // base for asm
  865. int ComputeFlexedVertex_StreamOffset_V3(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  866. {
  867. fltx4 sc256_255_special = g_sc256_255_special;
  868. fltx4 f40011 = g_f40011;
  869. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  870. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta);
  871. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  872. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  873. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  874. fltx4 f4Zero = Four_Zeros;
  875. do
  876. {
  877. int n = pVert->index;
  878. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  879. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  880. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  881. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011);
  882. // f4sb = {s,b,1-s,1-b}
  883. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  884. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  885. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  886. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  887. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  888. int64 isCacheValid = ~isCacheInvalid;
  889. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  890. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  891. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  892. int nVertexIndex = updateCache & 0xFFFF;
  893. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  894. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  895. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  896. // here we need to form the following vector to compute final w:
  897. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  898. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1));
  899. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5));
  900. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  901. pFirstThinFlexIndex[n] = updateCache;
  902. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal));
  903. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition));
  904. pVert ++;
  905. }
  906. while(--numVertsToProcess); // why doesn't this use bdnz??
  907. return nThinFlexVertexCount;
  908. }
  909. // base for asm
  910. int ComputeFlexedVertexWrinkle_StreamOffset_V3(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_wrinkle_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  911. {
  912. fltx4 sc256_255_special = g_sc256_255_special;
  913. fltx4 f40011 = g_f40011;
  914. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  915. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta_wrinkle);
  916. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  917. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  918. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  919. fltx4 f4Zero = Four_Zeros;
  920. do
  921. {
  922. int n = pVert->index;
  923. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  924. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  925. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  926. fltx4 f3PreDelta = __lvlx(pVert, 16); // f3Delta now contains only packed W component in high X halfword...
  927. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011);
  928. // f4sb = {s,b,1-s,1-b}
  929. f3PreDelta = __vrlimi(f3PreDelta, packedVert, 7, 0); // don't rotate and move bytes 4..15 from packed vert to f3PreDelta
  930. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  931. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, f3PreDelta, permuteDelta), 12+16);
  932. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  933. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  934. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  935. int64 isCacheValid = ~isCacheInvalid;
  936. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  937. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  938. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  939. int nVertexIndex = updateCache & 0xFFFF;
  940. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  941. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  942. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  943. // here we need to form the following vector to compute final w:
  944. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  945. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1));
  946. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5));
  947. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  948. pFirstThinFlexIndex[n] = updateCache;
  949. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal));
  950. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition));
  951. pVert ++;
  952. }
  953. while(--numVertsToProcess); // why doesn't this use bdnz??
  954. return nThinFlexVertexCount;
  955. }
  956. // tried to pipeline in C++
  957. int ComputeFlexedVertex_StreamOffset_V2(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  958. {
  959. Assert(0 == (uint32(pVert) & 0xF));
  960. fltx4 sc256_255_special = g_sc256_255_special;
  961. fltx4 f40011 = g_f40011;
  962. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  963. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta);
  964. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  965. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  966. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  967. fltx4 f4Zero = Four_Zeros;
  968. fltx4 f4sb_st1, f3Delta_st1, f3NDelta_st1;
  969. int32 updateCache_st1;
  970. mstudiovertanim_t *pVertEnd = pVert + numVertsToProcess;
  971. {
  972. // stage 0
  973. int n = pVert->index;
  974. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  975. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  976. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  977. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); // to be completely correct, we'll ned to multiply this with 256/255
  978. // f4sb = {s,b,1-s,1-b}
  979. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  980. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  981. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  982. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  983. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  984. int64 isCacheValid = ~isCacheInvalid;
  985. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  986. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  987. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  988. pFirstThinFlexIndex[n] = updateCache;
  989. // prime next stage 1
  990. f4sb_st1 = f4sb;
  991. f3Delta_st1 = f3Delta;
  992. f3NDelta_st1 = f3NDelta;
  993. updateCache_st1 = updateCache;
  994. pVert ++;
  995. }
  996. while(pVert < pVertEnd)
  997. {
  998. // stage 1
  999. {
  1000. int nVertexIndex = updateCache_st1 & 0xFFFF;
  1001. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  1002. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  1003. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  1004. // here we need to form the following vector to compute final w:
  1005. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  1006. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb_st1,f4sb_st1,permuteW0), __vperm(f4sb_st1,f4sb_st1,permuteW1));
  1007. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb_st1,0x22), __vpermwi(f4sb_st1,0xF5));
  1008. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  1009. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta_st1, vfNormal));
  1010. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta_st1, vfPosition));
  1011. }
  1012. // stage 0
  1013. {
  1014. int n = pVert->index;
  1015. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  1016. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  1017. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  1018. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); // to be completely correct, we'll ned to multiply this with 256/255
  1019. // f4sb = {s,b,1-s,1-b}
  1020. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  1021. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  1022. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  1023. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  1024. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  1025. int64 isCacheValid = ~isCacheInvalid;
  1026. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  1027. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  1028. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  1029. pFirstThinFlexIndex[n] = updateCache; // this may be put wherever it doesn't mess up the other stores
  1030. // prime next stage 1
  1031. f4sb_st1 = f4sb;
  1032. updateCache_st1 = updateCache;
  1033. f3Delta_st1 = f3Delta;
  1034. f3NDelta_st1 = f3NDelta;
  1035. }
  1036. pVert ++;
  1037. }
  1038. // stage 1
  1039. {
  1040. int nVertexIndex = updateCache_st1 & 0xFFFF;
  1041. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  1042. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  1043. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  1044. // here we need to form the following vector to compute final w:
  1045. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  1046. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb_st1,f4sb_st1,permuteW0), __vperm(f4sb_st1,f4sb_st1,permuteW1));
  1047. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb_st1,0x22), __vpermwi(f4sb_st1,0xF5));
  1048. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  1049. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta_st1, vfNormal));
  1050. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta_st1, vfPosition));
  1051. }
  1052. return nThinFlexVertexCount;
  1053. }
  1054. // branchless
  1055. int ComputeFlexedVertex_StreamOffset_V1(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  1056. {
  1057. Assert(0 == (uint32(pVert) & 0xF));
  1058. fltx4 sc256_255_special = g_sc256_255_special;
  1059. fltx4 f40011 = g_f40011;
  1060. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  1061. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta);
  1062. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  1063. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  1064. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  1065. fltx4 f4Zero = Four_Zeros;
  1066. mstudiovertanim_t *pVertEnd = pVert + numVertsToProcess;
  1067. do
  1068. {
  1069. int n = pVert->index;
  1070. pThinFlexVerts[nThinFlexVertexCount].m_Position.InitZero();
  1071. pThinFlexVerts[nThinFlexVertexCount].m_Normal.InitZero();
  1072. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  1073. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011);
  1074. // f4sb = {s,b,1-s,1-b}
  1075. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  1076. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  1077. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  1078. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  1079. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  1080. int32 isCacheValid = ~isCacheInvalid;
  1081. int32 newCache = nCurrentTag | nThinFlexVertexCount;
  1082. int32 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  1083. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  1084. int nVertexIndex = updateCache & 0xFFFF;
  1085. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  1086. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  1087. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  1088. // here we need to form the following vector to compute final w:
  1089. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  1090. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1));
  1091. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5));
  1092. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  1093. pFirstThinFlexIndex[n] = updateCache;
  1094. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal));
  1095. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition));
  1096. pVert ++;
  1097. }
  1098. while(pVert < pVertEnd); // why doesn't this use CTR??
  1099. return nThinFlexVertexCount;
  1100. }
  1101. typedef int (*Fn_ComputeFlexedVertex_StreamOffset)(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234);
  1102. Fn_ComputeFlexedVertex_StreamOffset g_fn_ComputeFlexedVertex_StreamOffset[8] =
  1103. {
  1104. NULL,
  1105. ComputeFlexedVertex_StreamOffset_V1,
  1106. ComputeFlexedVertex_StreamOffset_V2,
  1107. ComputeFlexedVertex_StreamOffset_V3,
  1108. ComputeFlexedVertex_StreamOffset_V4,
  1109. ComputeFlexedVertex_StreamOffset_V5,
  1110. ComputeFlexedVertex_StreamOffset_V6,
  1111. ComputeFlexedVertex_StreamOffset_V7
  1112. };
  1113. typedef int (*Fn_ComputeFlexedVertexWrinkle_StreamOffset)(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_wrinkle_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234);
  1114. Fn_ComputeFlexedVertexWrinkle_StreamOffset g_fn_ComputeFlexedVertexWrinkle_StreamOffset[8] =
  1115. {
  1116. NULL,
  1117. ComputeFlexedVertexWrinkle_StreamOffset_V3,
  1118. ComputeFlexedVertexWrinkle_StreamOffset_V3,
  1119. ComputeFlexedVertexWrinkle_StreamOffset_V3,
  1120. ComputeFlexedVertexWrinkle_StreamOffset_V4,
  1121. ComputeFlexedVertexWrinkle_StreamOffset_V4,
  1122. ComputeFlexedVertexWrinkle_StreamOffset_V4,
  1123. ComputeFlexedVertexWrinkle_StreamOffset_V7
  1124. };
  1125. inline float Diff(const CachedPosNorm_t&a, const CachedPosNorm_t&b)
  1126. {
  1127. return a.m_Position.DistTo(b.m_Position) + a.m_Normal.DistTo(b.m_Normal);
  1128. }
  1129. bool g_bBreakOnAssert = true;
  1130. void AlwaysAssert(bool mustBeTrue)
  1131. {
  1132. if(!mustBeTrue)
  1133. {
  1134. Plat_DebugString("AlwaysAssert\n");
  1135. if(g_bBreakOnAssert)
  1136. DebugBreak();
  1137. }
  1138. }
  1139. #endif
  1140. template
  1141. void CCachedRenderData::ComputeFlexedVertex_StreamOffset<mstudiovertanim_t>( studiohdr_t *pStudioHdr, mstudioflex_t *pflex,
  1142. mstudiovertanim_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 );
  1143. template
  1144. void CCachedRenderData::ComputeFlexedVertex_StreamOffset<mstudiovertanim_wrinkle_t>( studiohdr_t *pStudioHdr, mstudioflex_t *pflex,
  1145. mstudiovertanim_wrinkle_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 );
  1146. // vectorized
  1147. void CCachedRenderData::ComputeFlexedVertex_StreamOffset_Optimized( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, mstudiovertanim_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 )
  1148. {
  1149. #if PROFILE_THIS_FILE
  1150. CMiniProfilerGuard mpguard(&g_mp_morph);
  1151. #endif
  1152. #ifdef _X360
  1153. int nMorphPath = g_cv_morph_path.GetInt();
  1154. if(nMorphPath)
  1155. {
  1156. mstudiovertanim_t vertCountStruct;
  1157. vertCountStruct.index = vertCount;
  1158. /*for(uint32 i = 1; i< pflex->numverts; ++i)
  1159. if(pvanim[i-1].index > pvanim[i].index)
  1160. DebugBreak();*/
  1161. mstudiovertanim_t * pVertEnd;
  1162. {
  1163. #if PROFILE_THIS_FILE
  1164. CMiniProfilerGuard mpguard_lower_bound(&g_mp_morph_lower_bound);
  1165. #endif
  1166. pVertEnd = std::lower_bound(pvanim, pvanim + pflex->numverts, vertCountStruct, mstudiovertanim_t::CSortByIndex());
  1167. }
  1168. if(pvanim < pVertEnd)
  1169. {
  1170. union
  1171. {
  1172. fltx4 f4;
  1173. float f1[4];
  1174. } weights;
  1175. weights.f1[0] = w1;
  1176. weights.f1[1] = w2;
  1177. weights.f1[2] = w3;
  1178. weights.f1[3] = w4;
  1179. uint32 nCurrentTag = uint32(m_CurrentTag)<<16;
  1180. int nThinFlexVertexCount = m_ThinFlexVertexCount;
  1181. int32 *pFirstThinFlexIndex = (int32*)m_pFirstThinFlexIndex;
  1182. CachedPosNorm_t *pThinFlexVerts = m_pThinFlexVerts;
  1183. uint64 numVertsToProcess = pVertEnd - pvanim;
  1184. nMorphPath = MIN(7,nMorphPath);
  1185. /*static int maxVertsSaved = 0;
  1186. if(numVertsToProcess > maxVertsSaved)
  1187. {
  1188. maxVertsSaved = numVertsToProcess;
  1189. FileHandle_t fh = g_pFullFileSystem->Open( "vertices.bin", "wb" );
  1190. if(fh != FILESYSTEM_INVALID_HANDLE)
  1191. {
  1192. g_pFullFileSystem->Write(pvanim, sizeof(*pvanim) * numVertsToProcess, fh);
  1193. g_pFullFileSystem->Close(fh);
  1194. }
  1195. }*/
  1196. #ifdef _DEBUG
  1197. if(0 == g_cv_morph_debug.GetInt())
  1198. #endif
  1199. {
  1200. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1201. {
  1202. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1203. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1204. }
  1205. nThinFlexVertexCount = g_fn_ComputeFlexedVertex_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1206. }
  1207. #ifdef _DEBUG
  1208. else // Validation path inactive in release, since these static arrays consume 1MB
  1209. {
  1210. bool repeat = false;
  1211. static CachedPosNorm_t backupThinFlexVerts[MAXSTUDIOFLEXVERTS+1], checkThinFlexVerts[MAXSTUDIOFLEXVERTS+1];
  1212. static CacheIndex_t backupFirstThinFlexIndex[MAXSTUDIOVERTS+1],checkFirstThinFlexIndex[MAXSTUDIOVERTS+1];
  1213. int newThinFlexVertexCount ;
  1214. static int numRuns = 0;
  1215. ++numRuns;
  1216. memcpy(backupThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1217. memcpy(backupFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1218. do
  1219. {
  1220. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1221. {
  1222. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1223. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1224. }
  1225. newThinFlexVertexCount = g_fn_ComputeFlexedVertex_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1226. memcpy(checkThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1227. memcpy(checkFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1228. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1229. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1230. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1231. AlwaysAssert(m_ThinFlexVertexCount == newThinFlexVertexCount);
  1232. for(int i = 0; i < newThinFlexVertexCount; ++i)
  1233. AlwaysAssert(Diff(checkThinFlexVerts[i], m_pThinFlexVerts[i]) < 1e-5f);
  1234. int indexOffset = m_pFirstThinFlexIndex - m_pThinFlexIndex;
  1235. for(int i = 0; i < numVertsToProcess; ++i)
  1236. AlwaysAssert(*(int*)&checkFirstThinFlexIndex[indexOffset + pvanim[i].index] == *(int*)&m_pThinFlexIndex[indexOffset + pvanim[i].index]);
  1237. if(repeat)
  1238. {
  1239. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1240. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1241. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1242. }
  1243. }
  1244. while(repeat);
  1245. nThinFlexVertexCount = newThinFlexVertexCount;
  1246. }
  1247. #endif
  1248. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1249. }
  1250. }
  1251. else
  1252. #endif
  1253. {
  1254. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1255. }
  1256. }
  1257. void CCachedRenderData::ComputeFlexedVertexWrinkle_StreamOffset_Optimized( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, mstudiovertanim_wrinkle_t *pvanim, int vertCount, float w1, float w2, float w3, float w4)
  1258. {
  1259. #if PROFILE_THIS_FILE
  1260. CMiniProfilerGuard mpguard(&g_mp_morph);
  1261. #endif
  1262. #ifdef _X360
  1263. int nMorphPath = g_cv_morph_path.GetInt();
  1264. if(nMorphPath)
  1265. {
  1266. mstudiovertanim_wrinkle_t vertCountStruct;
  1267. vertCountStruct.index = vertCount;
  1268. mstudiovertanim_wrinkle_t * pVertEnd;
  1269. {
  1270. #if PROFILE_THIS_FILE
  1271. CMiniProfilerGuard mpguard_lower_bound(&g_mp_morph_lower_bound);
  1272. #endif
  1273. pVertEnd = std::lower_bound(pvanim, pvanim + pflex->numverts, vertCountStruct, mstudiovertanim_wrinkle_t::CSortByIndex());
  1274. }
  1275. if(pvanim < pVertEnd)
  1276. {
  1277. union
  1278. {
  1279. fltx4 f4;
  1280. float f1[4];
  1281. } weights;
  1282. weights.f1[0] = w1;
  1283. weights.f1[1] = w2;
  1284. weights.f1[2] = w3;
  1285. weights.f1[3] = w4;
  1286. uint32 nCurrentTag = uint32(m_CurrentTag)<<16;
  1287. int nThinFlexVertexCount = m_ThinFlexVertexCount;
  1288. int32 *pFirstThinFlexIndex = (int32*)m_pFirstThinFlexIndex;
  1289. CachedPosNorm_t *pThinFlexVerts = m_pThinFlexVerts;
  1290. uint64 numVertsToProcess = pVertEnd - pvanim;
  1291. nMorphPath = MIN(7,nMorphPath);
  1292. #ifdef _DEBUG
  1293. if(0 == g_cv_morph_debug.GetInt())
  1294. #endif
  1295. {
  1296. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1297. {
  1298. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1299. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1300. }
  1301. nThinFlexVertexCount = g_fn_ComputeFlexedVertexWrinkle_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1302. }
  1303. #ifdef _DEBUG
  1304. else // Validation path inactive in release, since these static arrays consume 1MB
  1305. {
  1306. bool repeat = false;
  1307. static CachedPosNorm_t backupThinFlexVerts[MAXSTUDIOFLEXVERTS+1], checkThinFlexVerts[MAXSTUDIOFLEXVERTS+1];
  1308. static CacheIndex_t backupFirstThinFlexIndex[MAXSTUDIOVERTS+1],checkFirstThinFlexIndex[MAXSTUDIOVERTS+1];
  1309. int newThinFlexVertexCount ;
  1310. static int numRuns = 0;
  1311. ++numRuns;
  1312. memcpy(backupThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1313. memcpy(backupFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1314. do
  1315. {
  1316. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1317. {
  1318. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1319. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1320. }
  1321. newThinFlexVertexCount = g_fn_ComputeFlexedVertexWrinkle_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1322. memcpy(checkThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1323. memcpy(checkFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1324. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1325. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1326. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1327. AlwaysAssert(m_ThinFlexVertexCount == newThinFlexVertexCount);
  1328. for(int i = 0; i < newThinFlexVertexCount; ++i)
  1329. AlwaysAssert(Diff(checkThinFlexVerts[i], m_pThinFlexVerts[i]) < 1e-5f);
  1330. int indexOffset = m_pFirstThinFlexIndex - m_pThinFlexIndex;
  1331. for(int i = 0; i < numVertsToProcess; ++i)
  1332. AlwaysAssert(*(int*)&checkFirstThinFlexIndex[indexOffset + pvanim[i].index] == *(int*)&m_pThinFlexIndex[indexOffset + pvanim[i].index]);
  1333. if(repeat)
  1334. {
  1335. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1336. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1337. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1338. }
  1339. }
  1340. while(repeat);
  1341. nThinFlexVertexCount = newThinFlexVertexCount;
  1342. }
  1343. #endif
  1344. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1345. }
  1346. }
  1347. else
  1348. #endif
  1349. {
  1350. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1351. }
  1352. }