Counter-Strike: Global Offensive Source Code


/************************************************************************************************************************************\
|* *|
|* Copyright 2012 NVIDIA Corporation. All rights reserved. *|
|* *|
|* NOTICE TO USER: *|
|* *|
|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *|
|* *|
|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *|
|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *|
|* Otherwise, you have no rights to use or access this software in any manner. *|
|* *|
|* If not covered by the applicable NVIDIA software license agreement: *|
|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *|
|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *|
|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *|
|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *|
|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *|
|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *|
|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *|
|* *|
|* U.S. Government End Users. *|
|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *|
|* consisting of "commercial computer software" and "commercial computer software documentation" *|
|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *|
|* Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *|
|* all U.S. Government End Users acquire the software with only those rights set forth herein. *|
|* *|
|* Any use of this software in individual and commercial software must include, *|
|* in the user documentation and internal comments to the code, *|
|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *|
|* *|
\************************************************************************************************************************************/
////////////////////////// NVIDIA SHADER EXTENSIONS /////////////////
// this file is to be #included in the app HLSL shader code to make
// use of nvidia shader extensions

#include "nvHLSLExtnsInternal.h"

//----------------------------------------------------------------------------//
//------------------------- Warp Shuffle Functions ---------------------------//
//----------------------------------------------------------------------------//

// all functions have variants with a width parameter which permits sub-division
// of the warp into segments - for example to exchange data between 4 groups of
// 8 lanes in a SIMD manner. If width is less than warpSize then each subsection
// of the warp behaves as a separate entity with a starting logical lane ID of 0.
// A thread may only exchange data with others in its own subsection. Width must
// have a value which is a power of 2 so that the warp can be subdivided equally;
// results are undefined if width is not a power of 2, or is a number greater
// than warpSize.

//
// simple variant of the SHFL instruction
// returns val from the specified lane
// optional width parameter must be a power of two and width <= 32
//
int NvShfl(int val, uint srcLane, int width = NV_WARP_SIZE)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = val;       // variable to be shuffled
    g_NvidiaExt[index].src0u.y = srcLane;   // source lane
    g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
    g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL;

    // result is returned as the return value of IncrementCounter on the fake UAV slot
    return g_NvidiaExt.IncrementCounter();
}

//
// Copy from a lane with lower ID relative to caller
//
int NvShflUp(int val, uint delta, int width = NV_WARP_SIZE)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = val;       // variable to be shuffled
    g_NvidiaExt[index].src0u.y = delta;     // relative lane offset
    g_NvidiaExt[index].src0u.z = (NV_WARP_SIZE - width) << 8;   // minIndex = maxIndex for shfl_up (src2[4:0] is expected to be 0)
    g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_UP;
    return g_NvidiaExt.IncrementCounter();
}

//
// Copy from a lane with higher ID relative to caller
//
int NvShflDown(int val, uint delta, int width = NV_WARP_SIZE)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = val;       // variable to be shuffled
    g_NvidiaExt[index].src0u.y = delta;     // relative lane offset
    g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
    g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_DOWN;
    return g_NvidiaExt.IncrementCounter();
}

//
// Copy from a lane based on bitwise XOR of own lane ID
//
int NvShflXor(int val, uint laneMask, int width = NV_WARP_SIZE)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = val;       // variable to be shuffled
    g_NvidiaExt[index].src0u.y = laneMask;  // laneMask to be XOR'ed with the current laneId to get the source lane id
    g_NvidiaExt[index].src0u.z = __NvGetShflMaskFromWidth(width);
    g_NvidiaExt[index].opcode = NV_EXTN_OP_SHFL_XOR;
    return g_NvidiaExt.IncrementCounter();
}
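
// The following is an illustrative usage sketch and is NOT part of the original
// NVIDIA header: a full-warp sum reduction built from NvShflXor (butterfly pattern).
// It assumes the extension UAV (g_NvidiaExt) has been set up by the application as
// required by nvHLSLExtnsInternal.h; the function name is hypothetical.
int NvExampleWarpSum(int laneValue)
{
    // after log2(NV_WARP_SIZE) xor-shuffle steps every lane holds the warp-wide sum
    [unroll]
    for (int offset = NV_WARP_SIZE / 2; offset > 0; offset >>= 1)
        laneValue += NvShflXor(laneValue, offset);
    return laneValue;
}
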
//----------------------------------------------------------------------------//
//----------------------------- Warp Vote Functions --------------------------//
//----------------------------------------------------------------------------//

// returns 0xFFFFFFFF if the predicate is true for any thread in the warp, returns 0 otherwise
uint NvAny(int predicate)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = predicate;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ANY;
    return g_NvidiaExt.IncrementCounter();
}

// returns 0xFFFFFFFF if the predicate is true for ALL threads in the warp, returns 0 otherwise
uint NvAll(int predicate)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = predicate;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_ALL;
    return g_NvidiaExt.IncrementCounter();
}

// returns a mask of all threads in the warp with bits set for threads that have the predicate true
uint NvBallot(int predicate)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = predicate;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_VOTE_BALLOT;
    return g_NvidiaExt.IncrementCounter();
}
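
// Illustrative usage sketch (not part of the original header): counting how many
// lanes in the warp satisfy a predicate by combining NvBallot with the standard
// countbits() intrinsic. The function name is hypothetical.
uint NvExampleCountVotes(int predicate)
{
    uint mask = NvBallot(predicate);    // one bit per lane whose predicate is true
    return countbits(mask);             // number of set bits = number of "yes" votes
}
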
//----------------------------------------------------------------------------//
//----------------------------- Utility Functions ----------------------------//
//----------------------------------------------------------------------------//

// returns the lane index of the current thread (thread index in warp)
int NvGetLaneId()
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].opcode = NV_EXTN_OP_GET_LANE_ID;
    return g_NvidiaExt.IncrementCounter();
}
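
// Illustrative usage sketch (not part of the original header): combining NvBallot
// and NvGetLaneId to elect a single "leader" lane, e.g. so that only one thread per
// warp performs a global atomic or a group-shared write. The name is hypothetical.
bool NvExampleIsWarpLeader()
{
    uint activeMask = NvBallot(1);                          // mask of lanes executing this call
    return (uint)NvGetLaneId() == firstbitlow(activeMask);  // lowest active lane is the leader
}
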
//----------------------------------------------------------------------------//
//----------------------------- FP16 Atomic Functions ------------------------//
//----------------------------------------------------------------------------//

// The functions below perform atomic operations on two consecutive fp16
// values in the given raw UAV.
// The uint parameter 'fp16x2Val' is treated as two packed fp16 values;
// byteAddress must be a multiple of 4.
// The returned value is the two fp16 values packed into a single uint.
uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, byteAddress, fp16x2Val, NV_EXTN_ATOM_MAX);
}

// versions of the above functions taking two fp32 values (internally converted to fp16 values)
uint NvInterlockedAddFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    return __NvAtomicOpFP16x2(uav, byteAddress, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
}
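
// Illustrative usage sketch (not part of the original header): accumulating a pair
// of fp16 weights into a raw UAV where each 4-byte slot holds two packed fp16 values.
// The caller passes its own RWByteAddressBuffer; the function name is hypothetical.
void NvExampleAccumulateFp16Pair(RWByteAddressBuffer uav, uint slot, float2 weights)
{
    // byte address must be a multiple of 4; 'slot' indexes 4-byte fp16x2 entries
    NvInterlockedAddFp16x2(uav, slot * 4, weights);
}
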
//----------------------------------------------------------------------------//

// The functions below perform an atomic operation on a R16G16_FLOAT UAV at the given address.
// The uint parameter 'fp16x2Val' is treated as two fp16 values.
// The returned value is the two fp16 values (.x and .y components) packed into a single uint.
// Warning: behaviour of this set of functions is undefined if the UAV is not
// of R16G16_FLOAT format (might result in an app crash or TDR).
uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
}

uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
}

uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
}

// versions taking two fp32 values (internally converted to fp16)
uint NvInterlockedAddFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWTexture1D<float2> uav, uint address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
}

uint NvInterlockedAddFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWTexture2D<float2> uav, uint2 address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
}

uint NvInterlockedAddFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_ADD);
}

uint NvInterlockedMinFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MIN);
}

uint NvInterlockedMaxFp16x2(RWTexture3D<float2> uav, uint3 address, float2 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x2Tofp16x2(val), NV_EXTN_ATOM_MAX);
}
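
// Illustrative usage sketch (not part of the original header): two equivalent ways
// of atomically adding (1.0, 2.0) to an R16G16_FLOAT texel - via a pre-packed fp16x2
// word built with the standard f32tof16() intrinsic, or via the float2 overload
// above. The function name is hypothetical.
void NvExampleAddFp16x2(RWTexture2D<float2> uav, uint2 pixel)
{
    // f32tof16 returns the fp16 bits in the low 16 bits of a uint
    uint packed = f32tof16(1.0f) | (f32tof16(2.0f) << 16);
    NvInterlockedAddFp16x2(uav, pixel, packed);

    // equivalent call using the fp32 overload (converted to fp16 internally):
    // NvInterlockedAddFp16x2(uav, pixel, float2(1.0f, 2.0f));
}
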
//----------------------------------------------------------------------------//

// The functions below perform an atomic operation on a R16G16B16A16_FLOAT UAV at the given address.
// The uint2 parameter 'fp16x2Val' is treated as four fp16 values,
// i.e., fp16x2Val.x holds uav.xy and fp16x2Val.y holds uav.zw.
// The returned value is the four fp16 values (.xyzw components) packed into a uint2.
// Warning: behaviour of this set of functions is undefined if the UAV is not
// of R16G16B16A16_FLOAT format (might result in an app crash or TDR).
uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
}

uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
}

uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_ADD);
}

uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MIN);
}

uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val)
{
    return __NvAtomicOpFP16x2(uav, address, fp16x2Val, NV_EXTN_ATOM_MAX);
}

// versions taking four fp32 values (internally converted to fp16)
uint2 NvInterlockedAddFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
}

uint2 NvInterlockedMinFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
}

uint2 NvInterlockedMaxFp16x4(RWTexture1D<float4> uav, uint address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
}

uint2 NvInterlockedAddFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
}

uint2 NvInterlockedMinFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
}

uint2 NvInterlockedMaxFp16x4(RWTexture2D<float4> uav, uint2 address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
}

uint2 NvInterlockedAddFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_ADD);
}

uint2 NvInterlockedMinFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MIN);
}

uint2 NvInterlockedMaxFp16x4(RWTexture3D<float4> uav, uint3 address, float4 val)
{
    return __NvAtomicOpFP16x2(uav, address, __fp32x4Tofp16x4(val), NV_EXTN_ATOM_MAX);
}
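
// Illustrative usage sketch (not part of the original header): packing a float4 into
// the uint2 layout expected by the fp16x4 variants above (x,y in the first uint,
// z,w in the second) and adding it to an R16G16B16A16_FLOAT texel. Name is hypothetical.
void NvExampleAddFp16x4(RWTexture2D<float4> uav, uint2 pixel, float4 v)
{
    uint2 packed;
    packed.x = f32tof16(v.x) | (f32tof16(v.y) << 16);
    packed.y = f32tof16(v.z) | (f32tof16(v.w) << 16);
    NvInterlockedAddFp16x4(uav, pixel, packed);
}
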
//----------------------------------------------------------------------------//
//----------------------------- FP32 Atomic Functions ------------------------//
//----------------------------------------------------------------------------//

// The function below performs an atomic add on the given raw UAV, treating the value as float.
// byteAddress must be a multiple of 4.
// The returned value is the value present in the memory location before the atomic add.
float NvInterlockedAddFp32(RWByteAddressBuffer uav, uint byteAddress, float val)
{
    return __NvAtomicAddFP32(uav, byteAddress, val);
}

//----------------------------------------------------------------------------//

// The functions below perform an atomic add on a R32_FLOAT UAV at the given address.
// The returned value is the value before performing the atomic add.
// Warning: behaviour of this set of functions is undefined if the UAV is not
// of R32_FLOAT format (might result in an app crash or TDR).
float NvInterlockedAddFp32(RWTexture1D<float> uav, uint address, float val)
{
    return __NvAtomicAddFP32(uav, address, val);
}

float NvInterlockedAddFp32(RWTexture2D<float> uav, uint2 address, float val)
{
    return __NvAtomicAddFP32(uav, address, val);
}

float NvInterlockedAddFp32(RWTexture3D<float> uav, uint3 address, float val)
{
    return __NvAtomicAddFP32(uav, address, val);
}
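
// Illustrative usage sketch (not part of the original header): a floating-point
// accumulation that the integer-only InterlockedAdd() intrinsic cannot express
// directly - every thread adds its luminance contribution to a single R32_FLOAT
// texel. The function name is hypothetical.
void NvExampleAccumulateLuminance(RWTexture2D<float> uav, float luminance)
{
    NvInterlockedAddFp32(uav, uint2(0, 0), luminance);
}
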
//----------------------------------------------------------------------------//
//---------------------- Typed UAV Load Functions ----------------------------//
//----------------------------------------------------------------------------//

// loads a value from a UAV of a 4-component resource of any unorm, snorm or float format
// (e.g., DXGI_FORMAT_R8G8B8A8_UNORM, R16G16B16A16_FLOAT or DXGI_FORMAT_R32G32B32A32_FLOAT)
// the loaded value is automatically converted to fp32 and returned
float4 NvLoadUavTyped(RWTexture1D<float4> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u);
}

float4 NvLoadUavTyped(RWTexture2D<float4> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u);
}

float4 NvLoadUavTyped(RWTexture3D<float4> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u);
}

// loads a value from a UAV of a 2-component resource of any unorm, snorm or float format
// the loaded value is automatically converted to fp32 and returned
float2 NvLoadUavTyped(RWTexture1D<float2> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u.xy);
}

float2 NvLoadUavTyped(RWTexture2D<float2> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u.xy);
}

float2 NvLoadUavTyped(RWTexture3D<float2> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u.xy);
}

// loads a value from a UAV of a single-component resource of any unorm, snorm or float format
// the loaded value is automatically converted to fp32 and returned
float NvLoadUavTyped(RWTexture1D<float> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float NvLoadUavTyped(RWTexture2D<float> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float NvLoadUavTyped(RWTexture3D<float> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}
// loads a value from a UAV of a 4-component resource of any uint format
// (e.g., DXGI_FORMAT_R8G8B8A8_UINT, DXGI_FORMAT_R32G32B32A32_UINT)
// the loaded value is automatically converted to uint32 and returned
uint4 NvLoadUavTyped(RWTexture1D<uint4> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u);
}

uint4 NvLoadUavTyped(RWTexture2D<uint4> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u);
}

uint4 NvLoadUavTyped(RWTexture3D<uint4> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u);
}

// loads a value from a UAV of a 2-component resource of any uint format
// the loaded value is automatically converted to uint32 and returned
uint2 NvLoadUavTyped(RWTexture1D<uint2> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u.xy);
}

uint2 NvLoadUavTyped(RWTexture2D<uint2> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u.xy);
}

uint2 NvLoadUavTyped(RWTexture3D<uint2> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u.xy);
}

// loads a value from a UAV of a single-component resource of any uint format
// the loaded value is automatically converted to uint32 and returned
uint NvLoadUavTyped(RWTexture1D<uint> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u.x);
}

uint NvLoadUavTyped(RWTexture2D<uint> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u.x);
}

uint NvLoadUavTyped(RWTexture3D<uint> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return (g_NvidiaExt[index].dst0u.x);
}
// loads a value from a UAV of a 4-component resource of any signed integer format
// (e.g., DXGI_FORMAT_R8G8B8A8_SINT, DXGI_FORMAT_R32G32B32A32_SINT)
// the loaded value is automatically converted to int32 and returned
int4 NvLoadUavTyped(RWTexture1D<int4> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u);
}

int4 NvLoadUavTyped(RWTexture2D<int4> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u);
}

int4 NvLoadUavTyped(RWTexture3D<int4> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u);
}

// loads a value from a UAV of a 2-component resource of any signed integer format
// the loaded value is automatically converted to int32 and returned
int2 NvLoadUavTyped(RWTexture1D<int2> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u.xy);
}

int2 NvLoadUavTyped(RWTexture2D<int2> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u.xy);
}

int2 NvLoadUavTyped(RWTexture3D<int2> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u.xy);
}

// loads a value from a UAV of a single-component resource of any signed integer format
// the loaded value is automatically converted to int32 and returned
int NvLoadUavTyped(RWTexture1D<int> uav, uint address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u.x);
}

int NvLoadUavTyped(RWTexture2D<int> uav, uint2 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u.x);
}

int NvLoadUavTyped(RWTexture3D<int> uav, uint3 address)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_TYPED_UAV_LOAD;
    return asint(g_NvidiaExt[index].dst0u.x);
}
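
// Illustrative usage sketch (not part of the original header): a read-modify-write
// on an R16G16B16A16_FLOAT (float4) UAV within a shader, which baseline D3D11 typed
// UAV loads do not permit for this format. The function name is hypothetical.
void NvExampleBlendInPlace(RWTexture2D<float4> uav, uint2 pixel, float4 color)
{
    float4 previous = NvLoadUavTyped(uav, pixel);   // read current contents via the extension
    uav[pixel] = lerp(previous, color, 0.5f);       // blend and write back
}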