Team Fortress 2 Source Code as of 22/4/2020


  1. //========= Copyright © 1996-2008, Valve Corporation, All rights reserved. ============//
  2. #include "tier0/platform.h"
  3. #ifdef PLATFORM_WINDOWS
  4. #include "studiorender.h"
  5. #include "studio.h"
  6. #include "materialsystem/imesh.h"
  7. #include "materialsystem/imaterialsystemhardwareconfig.h"
  8. #include "materialsystem/imaterialvar.h"
  9. #include "materialsystem/imorph.h"
  10. #include "materialsystem/itexture.h"
  11. #include "materialsystem/imaterial.h"
  12. #include "optimize.h"
  13. #include "mathlib/mathlib.h"
  14. #include "mathlib/vector.h"
  15. #include <malloc.h>
  16. #include "mathlib/vmatrix.h"
  17. #include "studiorendercontext.h"
  18. #include "tier2/tier2.h"
  19. #include "tier0/vprof.h"
  20. //#include "tier0/miniprofiler.h"
  21. #include <algorithm>
  22. #include "filesystem.h"
  23. #define PROFILE_THIS_FILE 0
  24. //DLL_IMPORT CLinkedMiniProfiler *g_pOtherMiniProfilers;
  25. #if PROFILE_THIS_FILE
  26. #if !ENABLE_HARDWARE_PROFILER
  27. #error "can't profile without profiler enabled"
  28. #endif
  29. CLinkedMiniProfiler g_mp_morph_Vx("morph_Vx", &g_pOtherMiniProfilers);
  30. CLinkedMiniProfiler g_mp_morph_Vw("morph_Vw", &g_pOtherMiniProfilers);
  31. CLinkedMiniProfiler g_mp_morph_lower_bound("morph_lower_bound", &g_pOtherMiniProfilers);
  32. CLinkedMiniProfiler g_mp_morph("morph", &g_pOtherMiniProfilers);
  33. CLinkedMiniProfiler g_mp_morph_V1("morph_V1", &g_pOtherMiniProfilers);
  34. CLinkedMiniProfiler g_mp_morph_V2("morph_V2", &g_pOtherMiniProfilers);
  35. CLinkedMiniProfiler g_mp_morph_V3("morph_V3", &g_pOtherMiniProfilers);
  36. CLinkedMiniProfiler g_mp_morph_V4("morph_V4", &g_pOtherMiniProfilers);
  37. CLinkedMiniProfiler g_mp_morph_V5("morph_V5", &g_pOtherMiniProfilers);
  38. CLinkedMiniProfiler g_mp_morph_V6("morph_V6", &g_pOtherMiniProfilers);
  39. CLinkedMiniProfiler g_mp_morph_V7("morph_V7", &g_pOtherMiniProfilers);
  40. CLinkedMiniProfiler* g_mp_ComputeFlexedVertex_StreamOffset[8] =
  41. {
  42. NULL,
  43. &g_mp_morph_V1,
  44. &g_mp_morph_V2,
  45. &g_mp_morph_V3,
  46. &g_mp_morph_V4,
  47. &g_mp_morph_V5,
  48. &g_mp_morph_V6,
  49. &g_mp_morph_V7
  50. };
  51. #else
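// When mini-profiling is compiled out, these two-word counters still back the asm timing code:
// element [0] accumulates the elapsed timebase ticks (mftb delta) and element [1] the number of
// vertices processed, as updated at the tail of the V6/V7 loops below.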
  52. uint32 g_mp_morph_Vx[2];
  53. uint32 g_mp_morph_Vw[2];
  54. #endif
  55. #ifdef _X360
  56. ConVar g_cv_morph_path("morph_path", "7");
  57. #ifdef _DEBUG
  58. ConVar g_cv_morph_debug("morph_debug", "0");
  59. #endif // _DEBUG
  60. #endif // _X360
  61. #ifdef _X360
  62. const ALIGN16 int32 g_perm_speed_side[4] = {0x12, 0x13, 0x12, 0x13};
  63. const ALIGN16 int32 g_perm_delta[4] = {0x14150000, 0x16170000, 0x18190000, 0};
  64. const ALIGN16 int32 g_perm_delta_wrinkle[4] = {0x14150000, 0x16170000, 0x18190000, 0x10110000}; // includes the f3PreDelta's W that's in the X component
  65. const ALIGN16 int32 g_perm_ndelta[4] = {0x1A1B0000, 0x1C1D0000, 0x1E1F0000, 0};
  66. //const ALIGN16 int32 g_perm_w0[4] = {0x00010203,0x08090A0B,0x00010203,0x08090A0B};
  67. const ALIGN16 int32 g_perm_w1[4] = {0x0C0D0E0F,0x0C0D0E0F,0x04050607,0x04050607};
  68. const fltx4 g_sc256_255_special = {256.0f/255.0f,256.0f/255.0f,-256.0f/255.0f,-256.0f/255.0f};
  69. const fltx4 g_f40011 = {0,0,1,1};
  70. fltx4 g_dummy2[2];
  71. int g_nStreamOffset_prefetch = 256;
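// g_nStreamOffset_prefetch is the dcbt prefetch distance in bytes; the V4 and V6 loops load it
// into r11 and use it as the dcbt offset when prefetching ahead of the data they walk
// (V7 uses a hard-coded 0x100 instead).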
  72. //
  73. // V4 rolled - latency of x4, manually scheduled for nearly optimal dual-issue and no automatic stalls
  74. // the ~15 nops mean only 1 instruction is issued in that cycle, instead of the theoretically possible 2 per cycle
  75. //
  76. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V7(
  77. int nThinFlexVertexCount, //r3
  78. CachedPosNorm_t *pThinFlexVerts,//r4
  79. int32 *pFirstThinFlexIndex, //r5
  80. mstudiovertanim_t * pVert, //r6
  81. uint32 nCurrentTag, //r7
  82. uint32 numVertsToProcess, //r8
  83. fltx4 w1234 //vr1
  84. )
  85. {
  86. __asm
  87. {
  88. std r14, -0x08(r1)
  89. std r15, -0x10(r1)
  90. std r16, -0x18(r1)
  91. std r17, -0x20(r1)
  92. std r18, -0x28(r1)
  93. std r19, -0x30(r1)
  94. std r20, -0x38(r1)
  95. std r21, -0x40(r1)
  96. std r22, -0x48(r1)
  97. std r23, -0x50(r1)
  98. std r24, -0x58(r1)
  99. std r25, -0x60(r1)
  100. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  101. lau r14, g_sc256_255_special
  102. lal r14, r14, g_sc256_255_special
  103. lvx vr2, r0,r14
  104. lau r15, g_f40011
  105. lal r15, r15, g_f40011
  106. lvx vr3, r0,r15
  107. lau r16, g_perm_speed_side
  108. lal r16, r16, g_perm_speed_side
  109. lvx vr4, r0,r16
  110. lau r17, g_perm_delta
  111. lal r17, r17, g_perm_delta
  112. lvx vr5, r0,r17
  113. lau r18, g_perm_ndelta
  114. lal r18, r18, g_perm_ndelta
  115. lvx vr6, r0,r18
  116. lau r20, g_dummy2
  117. lal r20,r20, g_dummy2
  118. mr r21, r20
  119. mr r22, r21
  120. mr r23, r22
  121. li r10, -1
  122. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  123. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  124. vxor vr8,vr8,vr8
  125. li r15, 16
  126. li r11,0x100
  127. li r24, MAXSTUDIOFLEXVERTS - 4
  128. mtctr r8
  129. mftb r25
  130. vxor vr19,vr19,vr19
  131. vxor vr20,vr20,vr20
  132. nop // align!
  133. nop
  134. nop
  135. label_start_V7: // 52 instructions run in 45 cycles, although compiler predicts 38 cycles
  136. ////////////////
  137. // IMPORTANT: DO NOT REMOVE NOPS UNLESS YOU KNOW WHAT YOU ARE DOING AND WHY!
  138. // nops are essential here, removing them will make the code about 2% slower because dual-issue will be broken
  139. ////////////////
  140. lhz r14, 0(r6) // int n = pVert->index;
  141. addi r16, r3, 2
  142. dcbt r11,r6
  143. cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 4
  144. lvlx vr9,r0,r6
  145. rldicl r14, r14, 2, 0 // r14 = n*4
  146. lvrx vr10,r15,r6
  147. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  148. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  149. addi r31,r31,0//vpermwi128 vr40,vr40,0x1B //mr r31,r31
  150. add r16, r16, r4
  151. vpermwi128 vr40,vr40,0x1B //mr r30,r30
  152. addi r6, r6, 0x10 // pVert++
  153. vpermwi128 vr41,vr41,0x1B//nop
  154. lwzx r17, r14, r5 // r17 = oldCache
  155. //addi r30,r30,0//nop
  156. vperm vr10, vr8, vr9, vr4
  157. //addi r29,r29,0//nop
  158. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  159. vperm vr11, vr8, vr9, vr5
  160. stvx vr8, r0,r16
  161. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  162. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  163. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  164. stvx vr8, r15,r16
  165. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  166. vcsxwfp vr10,vr10,8
  167. or r19,r3,r7
  168. vperm vr12, vr8, vr9, vr6
  169. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  170. /*S:3*/ stvx vr30, r0,r23
  171. //nop
  172. /*S:3*/ stvx vr31, r15,r23
  173. //nop
  174. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  175. //nop
  176. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  177. //nop
  178. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  179. //nop
  180. /*S:2*/mr r23,r22
  181. //nop
  182. or r19, r19, r17 // r19 = updateCache
  183. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  184. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  185. //nop
  186. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  187. //nop
  188. /*S:1*/ vmulfp128 vr19, vr25, vr26
  189. /*S:1*/mr r22, r21
  190. vmaddfp vr20, vr10, vr2, vr3 // vr20 = f4sb
  191. add r21, r17, r4 // r21 = pFlexedVertex, goes to Stage:1
  192. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  193. stwx r19, r14, r5
  194. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  195. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  196. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  197. vcsxwfp128 vr32, vr11, 28
  198. //nop
  199. vcsxwfp128 vr33, vr12, 28
  200. bgt label_end_V7
  201. dcbt r11, r21
  202. bdnz label_start_V7
  203. label_end_V7:
  204. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  205. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  206. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  207. /*S:3*/ stvx vr30, r0,r23
  208. /*S:3*/ stvx vr31, r15,r23
  209. /*S:2*/mr r23,r22
  210. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  211. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  212. /*S:1*/ vmulfp128 vr19, vr25, vr26
  213. /*S:1*/mr r22, r21
  214. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  215. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  216. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  217. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  218. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  219. /*S:3*/ stvx vr30, r0,r23
  220. /*S:3*/ stvx vr31, r15,r23
  221. /*S:2*/mr r23,r22
  222. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  223. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  224. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  225. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  226. /*S:3*/ stvx vr30, r0,r23
  227. /*S:3*/ stvx vr31, r15,r23
  228. mftb r17
  229. subf r17, r25, r17
  230. lau r18, g_mp_morph_Vx
  231. lal r18, r18, g_mp_morph_Vx
  232. lwz r23, 0(r18)
  233. add r23,r23,r17
  234. stw r23, 0(r18)
  235. lwz r23, 4(r18)
  236. add r23,r23,r8
  237. stw r23, 4(r18)
  238. ld r14, -0x08(r1)
  239. ld r15, -0x10(r1)
  240. ld r16, -0x18(r1)
  241. ld r17, -0x20(r1)
  242. ld r18, -0x28(r1)
  243. ld r19, -0x30(r1)
  244. ld r20, -0x38(r1)
  245. ld r21, -0x40(r1)
  246. ld r22, -0x48(r1)
  247. ld r23, -0x50(r1)
  248. ld r24, -0x58(r1)
  249. ld r25, -0x60(r1)
  250. blr
  251. }
  252. }
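// The S:1/S:2/S:3 tags above mark a software pipeline roughly four iterations deep: each pass
// through label_start_V7 begins unpacking vertex i while it finishes the weight/madd/store work
// of vertices i-1..i-3, and the repeated stage instructions after label_end_V7 simply drain the
// iterations still in flight when the loop exits. A self-contained C++ sketch of that
// prologue/kernel/drain shape follows; the stage arithmetic and names are illustrative only,
// not the engine's math.
static inline void SoftwarePipeline_Sketch( const float *pIn, float *pOut, int count )
{
	float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f;	// work in flight for vertices i, i-1, i-2
	int iStore = -3;						// index whose finished result is stored this pass
	for ( int i = 0; i < count; ++i, ++iStore )
	{
		if ( iStore >= 0 )
			pOut[iStore] = s2;	// stage 3: store vertex i-3
		s2 = s1 + 1.0f;			// stage 2: finish vertex i-2
		s1 = s0 * 2.0f;			// stage 1: advance vertex i-1
		s0 = pIn[i];			// stage 0: start vertex i
	}
	for ( int k = 0; k < 3; ++k, ++iStore )	// drain the last three iterations
	{
		if ( iStore >= 0 && iStore < count )
			pOut[iStore] = s2;
		s2 = s1 + 1.0f;
		s1 = s0 * 2.0f;
	}
}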
  253. __declspec(naked) int ComputeFlexedVertexWrinkle_StreamOffset_V7(
  254. int nThinFlexVertexCount, //r3
  255. CachedPosNorm_t *pThinFlexVerts,//r4
  256. int32 *pFirstThinFlexIndex, //r5
  257. mstudiovertanim_wrinkle_t * pVert, //r6
  258. uint32 nCurrentTag, //r7
  259. uint32 numVertsToProcess, //r8
  260. fltx4 w1234 //vr1
  261. )
  262. {
  263. __asm
  264. {
  265. std r14, -0x08(r1)
  266. std r15, -0x10(r1)
  267. std r16, -0x18(r1)
  268. std r17, -0x20(r1)
  269. std r18, -0x28(r1)
  270. std r19, -0x30(r1)
  271. std r20, -0x38(r1)
  272. std r21, -0x40(r1)
  273. std r22, -0x48(r1)
  274. std r23, -0x50(r1)
  275. std r24, -0x58(r1)
  276. std r25, -0x60(r1)
  277. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  278. lau r14, g_sc256_255_special
  279. lal r14, r14, g_sc256_255_special
  280. lvx vr2, r0,r14
  281. lau r15, g_f40011
  282. lal r15, r15, g_f40011
  283. lvx vr3, r0,r15
  284. lau r16, g_perm_speed_side
  285. lal r16, r16, g_perm_speed_side
  286. lvx vr4, r0,r16
  287. lau r17, g_perm_delta_wrinkle
  288. lal r17, r17, g_perm_delta_wrinkle
  289. lvx vr5, r0,r17
  290. lau r18, g_perm_ndelta
  291. lal r18, r18, g_perm_ndelta
  292. lvx vr6, r0,r18
  293. lau r20, g_dummy2
  294. lal r20,r20, g_dummy2
  295. mr r21, r20
  296. mr r22, r21
  297. mr r23, r22
  298. li r10, -1
  299. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  300. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  301. vxor vr8,vr8,vr8
  302. li r15, 16
  303. li r11,0x100
  304. li r24, MAXSTUDIOFLEXVERTS - 4
  305. mtctr r8
  306. mftb r25
  307. vxor vr19,vr19,vr19
  308. vxor vr20,vr20,vr20
  309. nop // align!
  310. nop
  311. nop
  312. label_start_V7: // 52 instructions run in 45 cycles, although compiler predicts 38 cycles
  313. ////////////////
  314. // IMPORTANT: DO NOT REMOVE NOPS UNLESS YOU KNOW WHAT YOU ARE DOING AND WHY!
  315. // nops are essential here, removing them will make the code about 2% slower because dual-issue will be broken
  316. ////////////////
  317. lhz r14, 0(r6) // int n = pVert->index;
  318. addi r16, r3, 2
  319. dcbt r11,r6
  320. cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 4
  321. lvlx vr9,r0,r6
  322. rldicl r14, r14, 2, 0 // r14 = n*4
  323. lvrx vr10,r15,r6
  324. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  325. lvlx vr27,r15,r6 // f3PreDelta
  326. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  327. addi r31,r31,0//vpermwi128 vr40,vr40,0x1B //mr r31,r31
  328. add r16, r16, r4
  329. vpermwi128 vr40,vr40,0x1B //mr r30,r30
  330. addi r6, r6, 0x12 // pVert++
  331. vpermwi128 vr41,vr41,0x1B//nop
  332. lwzx r17, r14, r5 // r17 = oldCache
  333. //addi r30,r30,0//nop
  334. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  335. vrlimi128 vr27,vr9,7,0// f3PreDelta
  336. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  337. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  338. stvx vr8, r0,r16
  339. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  340. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  341. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  342. stvx vr8, r15,r16
  343. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  344. vcsxwfp vr10,vr10,8
  345. or r19,r3,r7
  346. vperm vr11, vr8, vr27, vr5 //f3Delta = __vperm(f4Zero, f3PreDelta, permuteDelta)
  347. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  348. /*S:3*/ stvx vr30, r0,r23
  349. //nop
  350. /*S:3*/ stvx vr31, r15,r23
  351. //nop
  352. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  353. //nop
  354. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  355. //nop
  356. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  357. //nop
  358. /*S:2*/mr r23,r22
  359. //nop
  360. or r19, r19, r17 // r19 = updateCache
  361. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  362. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  363. //nop
  364. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  365. //nop
  366. /*S:1*/ vmulfp128 vr19, vr25, vr26
  367. /*S:1*/mr r22, r21
  368. vmaddfp vr20, vr10, vr2, vr3 // vr20 = f4sb
  369. add r21, r17, r4 // r21 = pFlexedVertex, goes to Stage:1
  370. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  371. stwx r19, r14, r5
  372. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  373. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  374. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  375. vcsxwfp128 vr32, vr11, 28
  376. //nop
  377. vcsxwfp128 vr33, vr12, 28
  378. bgt label_end_V7
  379. dcbt r11, r21
  380. bdnz label_start_V7
  381. label_end_V7:
  382. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  383. /*S:1*/ vpermwi128 vr25, vr20, 0x22 // depends on vmadd vr20 = f4sb
  384. /*S:1*/ vpermwi128 vr26, vr20, 0xF5
  385. /*S:3*/ stvx vr30, r0,r23
  386. /*S:3*/ stvx vr31, r15,r23
  387. /*S:2*/mr r23,r22
  388. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  389. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  390. /*S:1*/ vmulfp128 vr19, vr25, vr26
  391. /*S:1*/mr r22, r21
  392. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  393. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  394. /*S:1*/ vpermwi128 vr21, vr32, 0x1B
  395. /*S:1*/ vpermwi128 vr22, vr33, 0x1B
  396. /*S:2*/ vmsum4fp128 vr29,vr19, vr1 // vr29 = scWeight
  397. /*S:3*/ stvx vr30, r0,r23
  398. /*S:3*/ stvx vr31, r15,r23
  399. /*S:2*/mr r23,r22
  400. /*S:2*/ lvx vr13, r0,r22 // vr13 = vfPosition
  401. /*S:2*/ lvx vr14, r15,r22 // vr14 = vfNormal
  402. /*S:2*/ vmaddfp vr30, vr29, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  403. /*S:2*/ vmaddfp vr31, vr29, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  404. /*S:3*/ stvx vr30, r0,r23
  405. /*S:3*/ stvx vr31, r15,r23
  406. mftb r17
  407. subf r17, r25, r17
  408. lau r18, g_mp_morph_Vw
  409. lal r18, r18, g_mp_morph_Vw
  410. lwz r23, 0(r18)
  411. add r23,r23,r17
  412. stw r23, 0(r18)
  413. lwz r23, 4(r18)
  414. add r23,r23,r8
  415. stw r23, 4(r18)
  416. ld r14, -0x08(r1)
  417. ld r15, -0x10(r1)
  418. ld r16, -0x18(r1)
  419. ld r17, -0x20(r1)
  420. ld r18, -0x28(r1)
  421. ld r19, -0x30(r1)
  422. ld r20, -0x38(r1)
  423. ld r21, -0x40(r1)
  424. ld r22, -0x48(r1)
  425. ld r23, -0x50(r1)
  426. ld r24, -0x58(r1)
  427. ld r25, -0x60(r1)
  428. blr
  429. }
  430. }
  431. // V4 rolled - latency of x3
  432. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V6(
  433. int nThinFlexVertexCount, //r3
  434. CachedPosNorm_t *pThinFlexVerts,//r4
  435. int32 *pFirstThinFlexIndex, //r5
  436. mstudiovertanim_t * pVert, //r6
  437. uint32 nCurrentTag, //r7
  438. uint32 numVertsToProcess, //r8
  439. fltx4 w1234 //vr1
  440. )
  441. {
  442. __asm
  443. {
  444. std r14, -0x08(r1)
  445. std r15, -0x10(r1)
  446. std r16, -0x18(r1)
  447. std r17, -0x20(r1)
  448. std r18, -0x28(r1)
  449. std r19, -0x30(r1)
  450. std r20, -0x38(r1)
  451. std r21, -0x40(r1)
  452. std r22, -0x48(r1)
  453. std r23, -0x50(r1)
  454. std r24, -0x58(r1)
  455. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  456. lau r14, g_sc256_255_special
  457. lal r14, r14, g_sc256_255_special
  458. lvx vr2, r0,r14
  459. lau r15, g_f40011
  460. lal r15, r15, g_f40011
  461. lvx vr3, r0,r15
  462. lau r16, g_perm_speed_side
  463. lal r16, r16, g_perm_speed_side
  464. lvx vr4, r0,r16
  465. lau r17, g_perm_delta
  466. lal r17, r17, g_perm_delta
  467. lvx vr5, r0,r17
  468. lau r18, g_perm_ndelta
  469. lal r18, r18, g_perm_ndelta
  470. lvx vr6, r0,r18
  471. lau r20, g_dummy2
  472. lal r20,r20, g_dummy2
  473. mr r21, r20
  474. mr r22, r21
  475. li r10, -1
  476. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  477. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  478. vxor vr8,vr8,vr8
  479. li r15, 16
  480. lau r14,g_nStreamOffset_prefetch
  481. lal r14,r14,g_nStreamOffset_prefetch
  482. lwz r11,0(r14)
  483. li r24, MAXSTUDIOFLEXVERTS - 2
  484. mtctr r8
  485. mftb r23
  486. label_start:
  487. lhz r14, 0(r6) // int n = pVert->index;
  488. dcbt r11,r6
  489. addi r16, r3, 2
  490. cmpw r3, r24 // compare nThinFlexVertexCount to MAXSTUDIOFLEXVERTS - 2
  491. lvlx vr9,r0,r6
  492. lvrx vr10,r15,r6
  493. rldicl r14, r14, 2, 0 // r14 = n*4
  494. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  495. add r16, r16, r4
  496. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  497. stvx vr8, r0,r16
  498. lwzx r17, r14, r5 // r17 = oldCache
  499. stvx vr8, r15,r16
  500. vmsum4fp128 vr19,vr19, vr1 // vr19 = scWeight
  501. vperm vr10, vr8, vr9, vr4
  502. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  503. vperm vr11, vr8, vr9, vr5
  504. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  505. vcsxwfp vr10,vr10,8
  506. vperm vr12, vr8, vr9, vr6
  507. stvx vr23, r0,r22
  508. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  509. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  510. stvx vr24, r15,r22
  511. or r19,r3,r7
  512. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  513. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  514. vpermwi128 vr15, vr10, 0x22
  515. or r19, r19, r17 // r19 = updateCache
  516. vpermwi128 vr16, vr10, 0xF5
  517. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  518. vmaddfp vr24, vr19, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  519. vmaddfp vr23, vr19, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  520. vmulfp128 vr19, vr15, vr16
  521. add r17, r17, r4 // r17 = pFlexedVertex
  522. stwx r19, r14, r5
  523. subf r3, r18, r3// nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  524. lvx vr13, r0,r17 // vr13 = vfPosition
  525. addi r6, r6, 0x10 // pVert++
  526. lvx vr14, r15,r17 // vr14 = vfNormal
  527. vcsxwfp vr21, vr11, 28
  528. mr r22,r21
  529. vcsxwfp vr22, vr12, 28
  530. mr r21,r17
  531. bgt label_end
  532. dcbt r11, r17
  533. bdnz label_start
  534. label_end:
  535. mftb r17
  536. subf r17, r23, r17
  537. lau r18, g_mp_morph_Vx
  538. lal r18, r18, g_mp_morph_Vx
  539. lwz r23, 0(r18)
  540. add r23,r23,r17
  541. stw r23, 0(r18)
  542. lwz r23, 4(r18)
  543. add r23,r23,r8
  544. stw r23, 4(r18)
  545. vmsum4fp128 vr19,vr19, vr1 // vr19 = scWeight
  546. stvx vr23, r0,r22
  547. stvx vr24, r15,r22
  548. vmaddfp vr24, vr19, vr22, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  549. vmaddfp vr23, vr19, vr21, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  550. stvx vr23, r0,r21
  551. stvx vr24, r15,r21
  552. ld r14, -0x08(r1)
  553. ld r15, -0x10(r1)
  554. ld r16, -0x18(r1)
  555. ld r17, -0x20(r1)
  556. ld r18, -0x28(r1)
  557. ld r19, -0x30(r1)
  558. ld r20, -0x38(r1)
  559. ld r21, -0x40(r1)
  560. ld r22, -0x48(r1)
  561. ld r23, -0x50(r1)
  562. ld r24, -0x58(r1)
  563. blr
  564. }
  565. }
  566. // 2-stages
  567. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V5(
  568. int nThinFlexVertexCount, //r3
  569. CachedPosNorm_t *pThinFlexVerts,//r4
  570. int32 *pFirstThinFlexIndex, //r5
  571. mstudiovertanim_t * pVert, //r6
  572. uint32 nCurrentTag, //r7
  573. uint32 numVertsToProcess, //r8
  574. fltx4 w1234 //vr1
  575. )
  576. {
  577. __asm
  578. {
  579. std r14, -0x08(r1)
  580. std r15, -0x10(r1)
  581. std r16, -0x18(r1)
  582. std r17, -0x20(r1)
  583. std r18, -0x28(r1)
  584. std r19, -0x30(r1)
  585. std r20, -0x38(r1)
  586. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  587. lau r14, g_sc256_255_special
  588. lal r14, r14, g_sc256_255_special
  589. lvx vr2, r0,r14
  590. lau r15, g_f40011
  591. lal r15, r15, g_f40011
  592. lvx vr3, r0,r15
  593. lau r16, g_perm_speed_side
  594. lal r16, r16, g_perm_speed_side
  595. lvx vr4, r0,r16
  596. lau r17, g_perm_delta
  597. lal r17, r17, g_perm_delta
  598. lvx vr5, r0,r17
  599. lau r18, g_perm_ndelta
  600. lal r18, r18, g_perm_ndelta
  601. lvx vr6, r0,r18
  602. lau r20, g_dummy2
  603. lal r20,r20, g_dummy2
  604. vxor vr8,vr8,vr8
  605. li r10, -1
  606. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  607. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  608. mtctr r8
  609. li r15, 16
  610. label_start_schlp:
  611. lhz r14, 0(r6) // int n = pVert->index;
  612. addi r16, r3, 2 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  613. lvlx vr9,r0,r6
  614. rldicl r14, r14, 2, 0 // r14 = n*4
  615. lvrx vr10,r15,r6
  616. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  617. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  618. add r16, r16, r4
  619. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  620. addi r6, r6, 0x10 // pVert++
  621. vcsxwfp vr10,vr10,8
  622. vmaddfp vr17, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) - stage 1
  623. vmaddfp vr18, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) - stage 1
  624. vperm vr11, vr8, vr9, vr5 //f3Delta = __vperm(f4Zero, packedVert, permuteDelta)
  625. vcsxwfp vr11, vr11, 28
  626. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  627. vcsxwfp vr12, vr12, 28
  628. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  629. lwzx r17, r14, r5 // r17 = oldCache
  630. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  631. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  632. or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount
  633. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  634. vpermwi128 vr15, vr10, 0x22
  635. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  636. vpermwi128 vr16, vr10, 0xF5
  637. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  638. stvx vr8, r0, r16
  639. or r19, r19, r17 // r19 = updateCache
  640. stvx vr8, r15, r16
  641. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  642. add r17, r17, r4 // r17 = pFlexedVertex
  643. vmulfp128 vr15, vr15, vr16
  644. lvx vr13, r0,r17 // vr13 = vfPosition
  645. lvx vr14, r15,r17 // vr14 = vfNormal
  646. vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight
  647. stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache
  648. subf r3, r18, r3// nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  649. stvx vr17, r0,r20 // stage 1
  650. stvx vr18, r15,r20 // stage 1
  651. mr r20, r17
  652. bdnz label_start_schlp
  653. vmaddfp vr17, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition) - stage 1
  654. vmaddfp vr18, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal) - stage 1
  655. stvx vr17, r0,r20 // stage 1; deferred storing saves 15 cycles (10%!)
  656. stvx vr18, r15,r20
  657. ld r14, -0x08(r1)
  658. ld r15, -0x10(r1)
  659. ld r16, -0x18(r1)
  660. ld r17, -0x20(r1)
  661. ld r18, -0x28(r1)
  662. ld r19, -0x30(r1)
  663. ld r20, -0x38(r1)
  664. blr
  665. }
  666. }
  667. // V3 in asm
  668. __declspec(naked) int ComputeFlexedVertex_StreamOffset_V4(
  669. int nThinFlexVertexCount, //r3
  670. CachedPosNorm_t *pThinFlexVerts,//r4
  671. int32 *pFirstThinFlexIndex, //r5
  672. mstudiovertanim_t * pVert, //r6
  673. uint32 nCurrentTag, //r7
  674. uint32 numVertsToProcess, //r8
  675. fltx4 w1234 //vr1
  676. )
  677. {
  678. __asm
  679. {
  680. std r14, -0x08(r1)
  681. std r15, -0x10(r1)
  682. std r16, -0x18(r1)
  683. std r17, -0x20(r1)
  684. std r18, -0x28(r1)
  685. std r19, -0x30(r1)
  686. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  687. lau r14, g_sc256_255_special
  688. lal r14, r14, g_sc256_255_special
  689. lvx vr2, r0,r14
  690. lau r15, g_f40011
  691. lal r15, r15, g_f40011
  692. lvx vr3, r0,r15
  693. lau r16, g_perm_speed_side
  694. lal r16, r16, g_perm_speed_side
  695. lvx vr4, r0,r16
  696. lau r17, g_perm_delta
  697. lal r17, r17, g_perm_delta
  698. lvx vr5, r0,r17
  699. lau r18, g_perm_ndelta
  700. lal r18, r18, g_perm_ndelta
  701. lvx vr6, r0,r18
  702. li r10, -1
  703. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  704. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  705. lau r14,g_nStreamOffset_prefetch
  706. lal r14,r14,g_nStreamOffset_prefetch
  707. lwz r11,0(r14)
  708. vxor vr8,vr8,vr8
  709. li r15, 16
  710. li r24, MAXSTUDIOFLEXVERTS - 3 // critical number at which to stop processing
  711. mtctr r8
  712. label_start:
  713. lhz r14, 0(r6) // int n = pVert->index;
  714. dcbt r11,r16
  715. rldicl r14, r14, 2, 0 // r14 = n*4
  716. addi r16, r3, 2
  717. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  718. add r16, r16, r4
  719. stvx vr8, r0,r16
  720. stvx vr8, r15,r16
  721. lvlx vr9,r0,r6
  722. lvrx vr10,r15,r6
  723. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  724. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  725. vcsxwfp vr10,vr10,8
  726. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  727. vperm vr11, vr8, vr9, vr5 //f3Delta = __vperm(f4Zero, packedVert, permuteDelta)
  728. vcsxwfp vr11, vr11, 28
  729. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  730. vcsxwfp vr12, vr12, 28
  731. lwzx r17, r14, r5 // r17 = oldCache
  732. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  733. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  734. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  735. or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount
  736. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  737. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  738. or r19, r19, r17 // r19 = updateCache
  739. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  740. add r17, r17, r4 // r17 = pFlexedVertex
  741. lvx vr13, r0,r17 // vr13 = vfPosition
  742. lvx vr14, r15,r17 // vr14 = vfNormal
  743. dcbt r11,r17
  744. vpermwi128 vr15, vr10, 0x22
  745. vpermwi128 vr16, vr10, 0xF5
  746. vmulfp128 vr15, vr15, vr16
  747. vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight
  748. stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache
  749. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  750. vmaddfp vr14, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  751. vmaddfp vr13, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  752. stvx vr13, r0,r17
  753. stvx vr14, r15,r17
  754. cmpw r3, r24
  755. bgt label_end
  756. addi r6, r6, 0x10 // pVert++
  757. bdnz label_start
  758. label_end:
  759. ld r14, -0x08(r1)
  760. ld r15, -0x10(r1)
  761. ld r16, -0x18(r1)
  762. ld r17, -0x20(r1)
  763. ld r18, -0x28(r1)
  764. ld r19, -0x30(r1)
  765. blr
  766. }
  767. }
  768. // V3 in asm
  769. __declspec(naked) int ComputeFlexedVertexWrinkle_StreamOffset_V4(
  770. int nThinFlexVertexCount, //r3
  771. CachedPosNorm_t *pThinFlexVerts,//r4
  772. int32 *pFirstThinFlexIndex, //r5
  773. mstudiovertanim_wrinkle_t * pVert,//r6
  774. uint32 nCurrentTag, //r7
  775. uint32 numVertsToProcess, //r8
  776. fltx4 w1234 //vr1
  777. )
  778. {
  779. __asm
  780. {
  781. std r14, -0x08(r1)
  782. std r15, -0x10(r1)
  783. std r16, -0x18(r1)
  784. std r17, -0x20(r1)
  785. std r18, -0x28(r1)
  786. std r19, -0x30(r1)
  787. // let the compiler schedule the instructions, just use several registers to avoid dependencies
  788. lau r14, g_sc256_255_special
  789. lal r14, r14, g_sc256_255_special
  790. lvx vr2, r0,r14
  791. lau r15, g_f40011
  792. lal r15, r15, g_f40011
  793. lvx vr3, r0,r15
  794. lau r16, g_perm_speed_side
  795. lal r16, r16, g_perm_speed_side
  796. lvx vr4, r0,r16
  797. lau r17, g_perm_delta_wrinkle
  798. lal r17, r17, g_perm_delta_wrinkle
  799. lvx vr5, r0,r17
  800. lau r18, g_perm_ndelta
  801. lal r18, r18, g_perm_ndelta
  802. lvx vr6, r0,r18
  803. li r10, -1
  804. rldicl r7,r7,0,32 // currentTag &= 0xFFFFFFFF ; just to make sure we don't mess up isCacheInvalid computation
  805. rldicl r10,r10,0,48 // r10 = 0x0000FFFF
  806. lau r14,g_nStreamOffset_prefetch
  807. lal r14,r14,g_nStreamOffset_prefetch
  808. lwz r11,0(r14)
  809. vxor vr8,vr8,vr8
  810. li r15, 16
  811. li r24, MAXSTUDIOFLEXVERTS - 3 // critical number at which to stop processing
  812. mtctr r8
  813. label_start:
  814. lhz r14, 0(r6) // int n = pVert->index;
  815. dcbt r11,r16
  816. rldicl r14, r14, 2, 0 // r14 = n*4
  817. addi r16, r3, 2
  818. rldicl r16, r16, 5, 0 // r16 = (nThinFlexVertexCount+2) * 32 + pThinFlexVerts
  819. add r16, r16, r4
  820. stvx vr8, r0,r16
  821. stvx vr8, r15,r16
  822. lvlx vr27,r15,r6 // f3PreDelta
  823. lvlx vr9,r0,r6
  824. lvrx vr10,r15,r6
  825. vor vr9,vr9,vr10 // vr9 = packedVert = LoadUnalignedSIMD(pVert)
  826. vrlimi128 vr27,vr9,7,0// f3PreDelta
  827. vperm vr10, vr8, vr9, vr4 //__vperm(f4Zero, packedVert, permuteSpeedSide)
  828. vcsxwfp vr10,vr10,8
  829. vmaddfp vr10, vr10, vr2, vr3 // vr10 = f4sb
  830. vperm vr11, vr8, vr27, vr5 //f3Delta = __vperm(f4Zero, f3PreDelta, permuteDelta)
  831. vcsxwfp vr11, vr11, 28
  832. vperm vr12, vr8, vr9, vr6 //f3NDelta = __vperm(f4Zero, packedVert, permuteNDelta)
  833. vcsxwfp vr12, vr12, 28
  834. lwzx r17, r14, r5 // r17 = oldCache
  835. xor r18, r17, r7 // cacheVertexIndex = oldCache^nCurrentTag
  836. subf r18,r18,r10 // (0xFFFF-cacheVertexIndex) >> 32
  837. sradi r18,r18,32 // r18 = isCacheInvalid : form mask
  838. or r19,r3,r7 // newCache = nCurrentTag | nThinFlexVertexCount
  839. and r19,r19,r18 // r19 = newCache & isCacheInvalid
  840. andc r17, r17, r18 // r17 = oldCache & ~isCacheInvalid
  841. or r19, r19, r17 // r19 = updateCache
  842. rldicl r17, r19, 5,43 // r17 = (updateCache & 0xFFFF) * 32 = nVertexIndex * 32
  843. add r17, r17, r4 // r17 = pFlexedVertex
  844. lvx vr13, r0,r17 // vr13 = vfPosition
  845. lvx vr14, r15,r17 // vr14 = vfNormal
  846. dcbt r11,r17
  847. vpermwi128 vr15, vr10, 0x22
  848. vpermwi128 vr16, vr10, 0xF5
  849. vmulfp128 vr15, vr15, vr16
  850. vmsum4fp128 vr15,vr15, vr1 // vr15 = scWeight
  851. stwx r19, r14, r5 // pFirstThinFlexIndex[n] = updateCache
  852. subf r3, r18, r3 // nThinFlexVertexCount = nThinFlexVertexCount + (isCacheInvalid&1);
  853. vmaddfp vr14, vr15, vr12, vr14 // MaddSIMD(scWeight,f3NDelta, vfNormal)
  854. vmaddfp vr13, vr15, vr11, vr13 // MaddSIMD(scWeight,f3Delta, vfPosition)
  855. stvx vr13, r0,r17
  856. stvx vr14, r15,r17
  857. cmpw r3, r24
  858. bgt label_end
  859. addi r6, r6, 0x12 // pVert++
  860. bdnz label_start
  861. label_end:
  862. ld r14, -0x08(r1)
  863. ld r15, -0x10(r1)
  864. ld r16, -0x18(r1)
  865. ld r17, -0x20(r1)
  866. ld r18, -0x28(r1)
  867. ld r19, -0x30(r1)
  868. blr
  869. }
  870. }
  871. // base for asm
  872. int ComputeFlexedVertex_StreamOffset_V3(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  873. {
  874. fltx4 sc256_255_special = g_sc256_255_special;
  875. fltx4 f40011 = g_f40011;
  876. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  877. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta);
  878. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  879. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  880. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  881. fltx4 f4Zero = Four_Zeros;
  882. do
  883. {
  884. int n = pVert->index;
  885. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  886. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  887. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  888. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011);
  889. // f4sb = {s,b,1-s,1-b}
  890. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  891. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  892. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  893. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  894. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  895. int64 isCacheValid = ~isCacheInvalid;
  896. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  897. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  898. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  899. int nVertexIndex = updateCache & 0xFFFF;
  900. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  901. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  902. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  903. // here we need to form the following vector to compute final w:
  904. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  905. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1));
  906. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5));
  907. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  908. pFirstThinFlexIndex[n] = updateCache;
  909. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal));
  910. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition));
  911. pVert ++;
  912. }
  913. while(--numVertsToProcess); // why doesn't this use bdnz??
  914. return nThinFlexVertexCount;
  915. }
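// Scalar reference for the SIMD weight math used by every variant above: the packed speed and
// side bytes become s = speed/255 and b = side/255 (that is what the 256/255 scale and the
// {0,0,1,1} addend recover from the divide-by-256 fixed-point convert), and the per-vertex
// weight is the dot of f4sbProd = {s(1-b), (1-s)(1-b), sb, (1-s)b} with w1234. The helper below
// is an illustrative sketch, not part of the engine.
static inline float ComputeScalarFlexWeight_Sketch( unsigned char speed, unsigned char side, float w1, float w2, float w3, float w4 )
{
	const float s = speed * ( 1.0f / 255.0f );
	const float b = side * ( 1.0f / 255.0f );
	return w1 * s * ( 1.0f - b )
		+ w2 * ( 1.0f - s ) * ( 1.0f - b )
		+ w3 * s * b
		+ w4 * ( 1.0f - s ) * b;
}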
  916. // base for asm
  917. int ComputeFlexedVertexWrinkle_StreamOffset_V3(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_wrinkle_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  918. {
  919. fltx4 sc256_255_special = g_sc256_255_special;
  920. fltx4 f40011 = g_f40011;
  921. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  922. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta_wrinkle);
  923. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  924. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  925. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  926. fltx4 f4Zero = Four_Zeros;
  927. do
  928. {
  929. int n = pVert->index;
  930. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  931. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  932. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  933. fltx4 f3PreDelta = __lvlx(pVert, 16); // f3Delta now contains only packed W component in high X halfword...
  934. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011);
  935. // f4sb = {s,b,1-s,1-b}
  936. f3PreDelta = __vrlimi(f3PreDelta, packedVert, 7, 0); // don't rotate and move bytes 4..15 from packed vert to f3PreDelta
  937. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  938. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, f3PreDelta, permuteDelta), 12+16);
  939. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  940. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  941. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  942. int64 isCacheValid = ~isCacheInvalid;
  943. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  944. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  945. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  946. int nVertexIndex = updateCache & 0xFFFF;
  947. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  948. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  949. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  950. // here we need to form the following vector to compute final w:
  951. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  952. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1));
  953. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5));
  954. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  955. pFirstThinFlexIndex[n] = updateCache;
  956. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal));
  957. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition));
  958. pVert ++;
  959. }
  960. while(--numVertsToProcess); // why doesn't this use bdnz??
  961. return nThinFlexVertexCount;
  962. }
  963. // tried to pipeline in C++
  964. int ComputeFlexedVertex_StreamOffset_V2(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  965. {
  966. Assert(0 == (uint32(pVert) & 0xF));
  967. fltx4 sc256_255_special = g_sc256_255_special;
  968. fltx4 f40011 = g_f40011;
  969. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  970. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta);
  971. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  972. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  973. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  974. fltx4 f4Zero = Four_Zeros;
  975. fltx4 f4sb_st1, f3Delta_st1, f3NDelta_st1;
  976. int32 updateCache_st1;
  977. mstudiovertanim_t *pVertEnd = pVert + numVertsToProcess;
  978. {
  979. // stage 0
  980. int n = pVert->index;
  981. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  982. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  983. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  984. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); // to be completely correct, we'll need to multiply this by 256/255
  985. // f4sb = {s,b,1-s,1-b}
  986. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  987. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  988. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  989. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  990. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  991. int64 isCacheValid = ~isCacheInvalid;
  992. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  993. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  994. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  995. pFirstThinFlexIndex[n] = updateCache;
  996. // prime next stage 1
  997. f4sb_st1 = f4sb;
  998. f3Delta_st1 = f3Delta;
  999. f3NDelta_st1 = f3NDelta;
  1000. updateCache_st1 = updateCache;
  1001. pVert ++;
  1002. }
  1003. while(pVert < pVertEnd)
  1004. {
  1005. // stage 1
  1006. {
  1007. int nVertexIndex = updateCache_st1 & 0xFFFF;
  1008. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  1009. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  1010. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  1011. // here we need to form the following vector to compute final w:
  1012. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  1013. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb_st1,f4sb_st1,permuteW0), __vperm(f4sb_st1,f4sb_st1,permuteW1));
  1014. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb_st1,0x22), __vpermwi(f4sb_st1,0xF5));
  1015. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  1016. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta_st1, vfNormal));
  1017. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta_st1, vfPosition));
  1018. }
  1019. // stage 0
  1020. {
  1021. int n = pVert->index;
  1022. pThinFlexVerts[nThinFlexVertexCount+2].m_Position.InitZero();
  1023. pThinFlexVerts[nThinFlexVertexCount+2].m_Normal.InitZero();
  1024. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  1025. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011); // to be completely correct, we'll need to multiply this by 256/255
  1026. // f4sb = {s,b,1-s,1-b}
  1027. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  1028. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  1029. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  1030. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  1031. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  1032. int64 isCacheValid = ~isCacheInvalid;
  1033. int64 newCache = nCurrentTag | nThinFlexVertexCount;
  1034. int64 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  1035. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  1036. pFirstThinFlexIndex[n] = updateCache; // this may be put wherever it doesn't mess up the other stores
  1037. // prime next stage 1
  1038. f4sb_st1 = f4sb;
  1039. updateCache_st1 = updateCache;
  1040. f3Delta_st1 = f3Delta;
  1041. f3NDelta_st1 = f3NDelta;
  1042. }
  1043. pVert ++;
  1044. }
  1045. // stage 1
  1046. {
  1047. int nVertexIndex = updateCache_st1 & 0xFFFF;
  1048. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  1049. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  1050. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  1051. // here we need to form the following vector to compute final w:
  1052. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  1053. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb_st1,f4sb_st1,permuteW0), __vperm(f4sb_st1,f4sb_st1,permuteW1));
  1054. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb_st1,0x22), __vpermwi(f4sb_st1,0xF5));
  1055. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  1056. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta_st1, vfNormal));
  1057. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta_st1, vfPosition));
  1058. }
  1059. return nThinFlexVertexCount;
  1060. }
  1061. // branchless
  1062. int ComputeFlexedVertex_StreamOffset_V1(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234)
  1063. {
  1064. Assert(0 == (uint32(pVert) & 0xF));
  1065. fltx4 sc256_255_special = g_sc256_255_special;
  1066. fltx4 f40011 = g_f40011;
  1067. fltx4 permuteSpeedSide = LoadAlignedSIMD((const float*)g_perm_speed_side);
  1068. fltx4 permuteDelta = LoadAlignedSIMD((const float*)g_perm_delta);
  1069. fltx4 permuteNDelta = LoadAlignedSIMD((const float*)g_perm_ndelta);
  1070. //fltx4 permuteW0 = LoadAlignedSIMD((const float*)g_perm_w0);
  1071. //fltx4 permuteW1 = LoadAlignedSIMD((const float*)g_perm_w1);
  1072. fltx4 f4Zero = Four_Zeros;
  1073. mstudiovertanim_t *pVertEnd = pVert + numVertsToProcess;
  1074. do
  1075. {
  1076. int n = pVert->index;
  1077. pThinFlexVerts[nThinFlexVertexCount].m_Position.InitZero();
  1078. pThinFlexVerts[nThinFlexVertexCount].m_Normal.InitZero();
  1079. fltx4 packedVert = LoadUnalignedSIMD((const float*)pVert);
  1080. fltx4 f4sb = MaddSIMD(__vcfsx(__vperm(f4Zero, packedVert, permuteSpeedSide), 8), sc256_255_special, f40011);
  1081. // f4sb = {s,b,1-s,1-b}
  1082. fltx4 f3Delta = __vcfsx(__vperm(f4Zero, packedVert, permuteDelta), 12+16);
  1083. fltx4 f3NDelta = __vcfsx(__vperm(f4Zero, packedVert, permuteNDelta), 12+16);
  1084. uint64 oldCache = uint32(pFirstThinFlexIndex[n]);
  1085. uint64 cacheVertexIndex = oldCache^nCurrentTag; // if there is trash in high (2^16) bits, we need to update the cache
  1086. int64 isCacheInvalid = int64(0xFFFF-cacheVertexIndex)>>32; // the second shift must be arithmetic to form a valid mask
  1087. int32 isCacheValid = ~isCacheInvalid;
  1088. int32 newCache = nCurrentTag | nThinFlexVertexCount;
  1089. int32 updateCache = (newCache & isCacheInvalid) | (oldCache & isCacheValid);
  1090. nThinFlexVertexCount = nThinFlexVertexCount - isCacheInvalid;
  1091. int nVertexIndex = updateCache & 0xFFFF;
  1092. CachedPosNorm_t *pFlexedVertex = pThinFlexVerts + nVertexIndex; // will be overridden
  1093. fltx4 vfNormal = LoadAlignedSIMD((float*)&pFlexedVertex->m_Normal);
  1094. fltx4 vfPosition = LoadAlignedSIMD((float*)&pFlexedVertex->m_Position);
  1095. // here we need to form the following vector to compute final w:
  1096. // {s(1-b), (1-s)(1-b), sb, (1-s)b}
  1097. //fltx4 f4sbProd = MulSIMD(__vperm(f4sb,f4sb,permuteW0), __vperm(f4sb,f4sb,permuteW1));
  1098. fltx4 f4sbProd = MulSIMD(__vpermwi(f4sb,0x22), __vpermwi(f4sb,0xF5));
  1099. fltx4 scWeight = __vmsum4fp(f4sbProd,w1234);
  1100. pFirstThinFlexIndex[n] = updateCache;
  1101. StoreAlignedSIMD((float*)&pFlexedVertex->m_Normal, MaddSIMD(scWeight,f3NDelta, vfNormal));
  1102. StoreAlignedSIMD((float*)&pFlexedVertex->m_Position, MaddSIMD(scWeight,f3Delta, vfPosition));
  1103. pVert ++;
  1104. }
  1105. while(pVert < pVertEnd); // why doesn't this use CTR??
  1106. return nThinFlexVertexCount;
  1107. }
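// The cache test in these loops is branchless: each pFirstThinFlexIndex entry packs the current
// tag in its high 16 bits and the thin-flex-vertex slot in its low 16 bits. XOR-ing with
// nCurrentTag clears the tag exactly when it matches, so the entry is valid iff the result still
// fits in 16 bits; (0xFFFF - x) >> 32 with an arithmetic shift then yields 0 (valid) or all-ones
// (invalid), the mask selects between the old slot and a newly allocated one, and subtracting
// the mask bumps nThinFlexVertexCount by one only when a new slot was taken. Self-contained
// illustration (the helper name is hypothetical, not engine API):
static inline int64 SelectCacheEntry_Sketch( int64 oldCache, int64 newCache, uint32 nCurrentTag )
{
	int64 cacheVertexIndex = oldCache ^ (int64)nCurrentTag;
	int64 isCacheInvalid = ( (int64)0xFFFF - cacheVertexIndex ) >> 32;	// 0 if tags match, ~0 otherwise
	return ( newCache & isCacheInvalid ) | ( oldCache & ~isCacheInvalid );
}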
  1108. typedef int (*Fn_ComputeFlexedVertex_StreamOffset)(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234);
  1109. Fn_ComputeFlexedVertex_StreamOffset g_fn_ComputeFlexedVertex_StreamOffset[8] =
  1110. {
  1111. NULL,
  1112. ComputeFlexedVertex_StreamOffset_V1,
  1113. ComputeFlexedVertex_StreamOffset_V2,
  1114. ComputeFlexedVertex_StreamOffset_V3,
  1115. ComputeFlexedVertex_StreamOffset_V4,
  1116. ComputeFlexedVertex_StreamOffset_V5,
  1117. ComputeFlexedVertex_StreamOffset_V6,
  1118. ComputeFlexedVertex_StreamOffset_V7
  1119. };
  1120. typedef int (*Fn_ComputeFlexedVertexWrinkle_StreamOffset)(int nThinFlexVertexCount, CachedPosNorm_t *pThinFlexVerts, int32 *pFirstThinFlexIndex, mstudiovertanim_wrinkle_t * pVert, uint32 nCurrentTag, uint32 numVertsToProcess, fltx4 w1234);
  1121. Fn_ComputeFlexedVertexWrinkle_StreamOffset g_fn_ComputeFlexedVertexWrinkle_StreamOffset[8] =
  1122. {
  1123. NULL,
  1124. ComputeFlexedVertexWrinkle_StreamOffset_V3,
  1125. ComputeFlexedVertexWrinkle_StreamOffset_V3,
  1126. ComputeFlexedVertexWrinkle_StreamOffset_V3,
  1127. ComputeFlexedVertexWrinkle_StreamOffset_V4,
  1128. ComputeFlexedVertexWrinkle_StreamOffset_V4,
  1129. ComputeFlexedVertexWrinkle_StreamOffset_V4,
  1130. ComputeFlexedVertexWrinkle_StreamOffset_V7
  1131. };
  1132. inline float Diff(const CachedPosNorm_t&a, const CachedPosNorm_t&b)
  1133. {
  1134. return a.m_Position.DistTo(b.m_Position) + a.m_Normal.DistTo(b.m_Normal);
  1135. }
  1136. bool g_bBreakOnAssert = true;
  1137. void AlwaysAssert(bool mustBeTrue)
  1138. {
  1139. if(!mustBeTrue)
  1140. {
  1141. Plat_DebugString("AlwaysAssert\n");
  1142. if(g_bBreakOnAssert)
  1143. DebugBreak();
  1144. }
  1145. }
  1146. #endif
  1147. template
  1148. void CCachedRenderData::ComputeFlexedVertex_StreamOffset<mstudiovertanim_t>( studiohdr_t *pStudioHdr, mstudioflex_t *pflex,
  1149. mstudiovertanim_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 );
  1150. template
  1151. void CCachedRenderData::ComputeFlexedVertex_StreamOffset<mstudiovertanim_wrinkle_t>( studiohdr_t *pStudioHdr, mstudioflex_t *pflex,
  1152. mstudiovertanim_wrinkle_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 );
  1153. // vectorized
  1154. void CCachedRenderData::ComputeFlexedVertex_StreamOffset_Optimized( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, mstudiovertanim_t *pvanim, int vertCount, float w1, float w2, float w3, float w4 )
  1155. {
  1156. #if PROFILE_THIS_FILE
  1157. CMiniProfilerGuard mpguard(&g_mp_morph);
  1158. #endif
  1159. #ifdef _X360
  1160. int nMorphPath = g_cv_morph_path.GetInt();
  1161. if(nMorphPath)
  1162. {
  1163. mstudiovertanim_t vertCountStruct;
  1164. vertCountStruct.index = vertCount;
  1165. /*for(uint32 i = 1; i< pflex->numverts; ++i)
  1166. if(pvanim[i-1].index > pvanim[i].index)
  1167. DebugBreak();*/
  1168. mstudiovertanim_t * pVertEnd;
  1169. {
  1170. #if PROFILE_THIS_FILE
  1171. CMiniProfilerGuard mpguard_lower_bound(&g_mp_morph_lower_bound);
  1172. #endif
  1173. pVertEnd = std::lower_bound(pvanim, pvanim + pflex->numverts, vertCountStruct, mstudiovertanim_t::CSortByIndex());
  1174. }
  1175. if(pvanim < pVertEnd)
  1176. {
  1177. union
  1178. {
  1179. fltx4 f4;
  1180. float f1[4];
  1181. } weights;
  1182. weights.f1[0] = w1;
  1183. weights.f1[1] = w2;
  1184. weights.f1[2] = w3;
  1185. weights.f1[3] = w4;
  1186. uint32 nCurrentTag = uint32(m_CurrentTag)<<16;
  1187. int nThinFlexVertexCount = m_ThinFlexVertexCount;
  1188. int32 *pFirstThinFlexIndex = (int32*)m_pFirstThinFlexIndex;
  1189. CachedPosNorm_t *pThinFlexVerts = m_pThinFlexVerts;
  1190. uint64 numVertsToProcess = pVertEnd - pvanim;
  1191. nMorphPath = MIN(7,nMorphPath);
  1192. /*static int maxVertsSaved = 0;
  1193. if(numVertsToProcess > maxVertsSaved)
  1194. {
  1195. maxVertsSaved = numVertsToProcess;
  1196. FileHandle_t fh = g_pFullFileSystem->Open( "vertices.bin", "wb" );
  1197. if(fh != FILESYSTEM_INVALID_HANDLE)
  1198. {
  1199. g_pFullFileSystem->Write(pvanim, sizeof(*pvanim) * numVertsToProcess, fh);
  1200. g_pFullFileSystem->Close(fh);
  1201. }
  1202. }*/
  1203. #ifdef _DEBUG
  1204. if(0 == g_cv_morph_debug.GetInt())
  1205. #endif
  1206. {
  1207. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1208. {
  1209. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1210. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1211. }
  1212. nThinFlexVertexCount = g_fn_ComputeFlexedVertex_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1213. }
  1214. #ifdef _DEBUG
  1215. else // Validation path inactive in release, since these static arrays consume 1MB
  1216. {
  1217. bool repeat = false;
  1218. static CachedPosNorm_t backupThinFlexVerts[MAXSTUDIOFLEXVERTS+1], checkThinFlexVerts[MAXSTUDIOFLEXVERTS+1];
  1219. static CacheIndex_t backupFirstThinFlexIndex[MAXSTUDIOVERTS+1],checkFirstThinFlexIndex[MAXSTUDIOVERTS+1];
  1220. int newThinFlexVertexCount ;
  1221. static int numRuns = 0;
  1222. ++numRuns;
  1223. memcpy(backupThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1224. memcpy(backupFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1225. do
  1226. {
  1227. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1228. {
  1229. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1230. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1231. }
  1232. newThinFlexVertexCount = g_fn_ComputeFlexedVertex_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1233. memcpy(checkThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1234. memcpy(checkFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1235. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1236. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1237. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1238. AlwaysAssert(m_ThinFlexVertexCount == newThinFlexVertexCount);
  1239. for(int i = 0; i < newThinFlexVertexCount; ++i)
  1240. AlwaysAssert(Diff(checkThinFlexVerts[i], m_pThinFlexVerts[i]) < 1e-5f);
  1241. int indexOffset = m_pFirstThinFlexIndex - m_pThinFlexIndex;
  1242. for(int i = 0; i < numVertsToProcess; ++i)
  1243. AlwaysAssert(*(int*)&checkFirstThinFlexIndex[indexOffset + pvanim[i].index] == *(int*)&m_pThinFlexIndex[indexOffset + pvanim[i].index]);
  1244. if(repeat)
  1245. {
  1246. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1247. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1248. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1249. }
  1250. }
  1251. while(repeat);
  1252. nThinFlexVertexCount = newThinFlexVertexCount;
  1253. }
  1254. #endif
  1255. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1256. }
  1257. }
  1258. else
  1259. #endif
  1260. {
  1261. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1262. }
  1263. }
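// The vertanim records of a flex are sorted by vertex index, so the function above builds a
// sentinel record with index = vertCount and uses std::lower_bound to find pVertEnd, the first
// record at or past vertCount; only [pvanim, pVertEnd) is handed to the SIMD kernels. A
// self-contained illustration of the same idea (<algorithm> is already included above;
// PlainVert_Sketch and its comparator are hypothetical stand-ins, not engine types):
struct PlainVert_Sketch { unsigned short index; };
struct SortPlainVertByIndex_Sketch
{
	bool operator()( const PlainVert_Sketch &a, const PlainVert_Sketch &b ) const { return a.index < b.index; }
};
static inline const PlainVert_Sketch *FindVertEnd_Sketch( const PlainVert_Sketch *pBegin, const PlainVert_Sketch *pEnd, int vertCount )
{
	PlainVert_Sketch sentinel;
	sentinel.index = (unsigned short)vertCount;
	return std::lower_bound( pBegin, pEnd, sentinel, SortPlainVertByIndex_Sketch() );
}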
  1264. void CCachedRenderData::ComputeFlexedVertexWrinkle_StreamOffset_Optimized( studiohdr_t *pStudioHdr, mstudioflex_t *pflex, mstudiovertanim_wrinkle_t *pvanim, int vertCount, float w1, float w2, float w3, float w4)
  1265. {
  1266. #if PROFILE_THIS_FILE
  1267. CMiniProfilerGuard mpguard(&g_mp_morph);
  1268. #endif
  1269. #ifdef _X360
  1270. int nMorphPath = g_cv_morph_path.GetInt();
  1271. if(nMorphPath)
  1272. {
  1273. mstudiovertanim_wrinkle_t vertCountStruct;
  1274. vertCountStruct.index = vertCount;
  1275. mstudiovertanim_wrinkle_t * pVertEnd;
  1276. {
  1277. #if PROFILE_THIS_FILE
  1278. CMiniProfilerGuard mpguard_lower_bound(&g_mp_morph_lower_bound);
  1279. #endif
  1280. pVertEnd = std::lower_bound(pvanim, pvanim + pflex->numverts, vertCountStruct, mstudiovertanim_wrinkle_t::CSortByIndex());
  1281. }
  1282. if(pvanim < pVertEnd)
  1283. {
  1284. union
  1285. {
  1286. fltx4 f4;
  1287. float f1[4];
  1288. } weights;
  1289. weights.f1[0] = w1;
  1290. weights.f1[1] = w2;
  1291. weights.f1[2] = w3;
  1292. weights.f1[3] = w4;
  1293. uint32 nCurrentTag = uint32(m_CurrentTag)<<16;
  1294. int nThinFlexVertexCount = m_ThinFlexVertexCount;
  1295. int32 *pFirstThinFlexIndex = (int32*)m_pFirstThinFlexIndex;
  1296. CachedPosNorm_t *pThinFlexVerts = m_pThinFlexVerts;
  1297. uint64 numVertsToProcess = pVertEnd - pvanim;
  1298. nMorphPath = MIN(7,nMorphPath);
  1299. #ifdef _DEBUG
  1300. if(0 == g_cv_morph_debug.GetInt())
  1301. #endif
  1302. {
  1303. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1304. {
  1305. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1306. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1307. }
  1308. nThinFlexVertexCount = g_fn_ComputeFlexedVertexWrinkle_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1309. }
  1310. #ifdef _DEBUG
  1311. else // Validation path inactive in release, since these static arrays consume 1MB
  1312. {
  1313. bool repeat = false;
  1314. static CachedPosNorm_t backupThinFlexVerts[MAXSTUDIOFLEXVERTS+1], checkThinFlexVerts[MAXSTUDIOFLEXVERTS+1];
  1315. static CacheIndex_t backupFirstThinFlexIndex[MAXSTUDIOVERTS+1],checkFirstThinFlexIndex[MAXSTUDIOVERTS+1];
  1316. int newThinFlexVertexCount ;
  1317. static int numRuns = 0;
  1318. ++numRuns;
  1319. memcpy(backupThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1320. memcpy(backupFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1321. do
  1322. {
  1323. for(uint32 i = 0; i < 2; ++i) // reset the first 2 positions here as it's required by the algorithm..
  1324. {
  1325. pThinFlexVerts[nThinFlexVertexCount+i].m_Position.InitZero();
  1326. pThinFlexVerts[nThinFlexVertexCount+i].m_Normal.InitZero();
  1327. }
  1328. newThinFlexVertexCount = g_fn_ComputeFlexedVertexWrinkle_StreamOffset[nMorphPath](nThinFlexVertexCount,pThinFlexVerts,pFirstThinFlexIndex,pvanim,nCurrentTag, numVertsToProcess, weights.f4);
  1329. memcpy(checkThinFlexVerts, m_pThinFlexVerts, sizeof(m_pThinFlexVerts));
  1330. memcpy(checkFirstThinFlexIndex, m_pThinFlexIndex, sizeof(m_pThinFlexIndex));
  1331. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1332. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1333. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1334. AlwaysAssert(m_ThinFlexVertexCount == newThinFlexVertexCount);
  1335. for(int i = 0; i < newThinFlexVertexCount; ++i)
  1336. AlwaysAssert(Diff(checkThinFlexVerts[i], m_pThinFlexVerts[i]) < 1e-5f);
  1337. int indexOffset = m_pFirstThinFlexIndex - m_pThinFlexIndex;
  1338. for(int i = 0; i < numVertsToProcess; ++i)
  1339. AlwaysAssert(*(int*)&checkFirstThinFlexIndex[indexOffset + pvanim[i].index] == *(int*)&m_pThinFlexIndex[indexOffset + pvanim[i].index]);
  1340. if(repeat)
  1341. {
  1342. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1343. memcpy(m_pThinFlexVerts, backupThinFlexVerts, sizeof(m_pThinFlexVerts));
  1344. memcpy(m_pThinFlexIndex, backupFirstThinFlexIndex, sizeof(m_pThinFlexIndex));
  1345. }
  1346. }
  1347. while(repeat);
  1348. nThinFlexVertexCount = newThinFlexVertexCount;
  1349. }
  1350. #endif
  1351. m_ThinFlexVertexCount = nThinFlexVertexCount;
  1352. }
  1353. }
  1354. else
  1355. #endif
  1356. {
  1357. ComputeFlexedVertex_StreamOffset( pStudioHdr, pflex, pvanim, vertCount, w1, w2, w3, w4);
  1358. }
  1359. }
  1360. #endif // PLATFORM_WINDOWS