Counter-Strike: Global Offensive Source Code

/************************************************************************************************************************************\
|* *|
|* Copyright 2012 NVIDIA Corporation. All rights reserved. *|
|* *|
|* NOTICE TO USER: *|
|* *|
|* This software is subject to NVIDIA ownership rights under U.S. and international Copyright laws. *|
|* *|
|* This software and the information contained herein are PROPRIETARY and CONFIDENTIAL to NVIDIA *|
|* and are being provided solely under the terms and conditions of an NVIDIA software license agreement. *|
|* Otherwise, you have no rights to use or access this software in any manner. *|
|* *|
|* If not covered by the applicable NVIDIA software license agreement: *|
|* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE. *|
|* IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. *|
|* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, *|
|* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. *|
|* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, *|
|* OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, *|
|* NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE CODE. *|
|* *|
|* U.S. Government End Users. *|
|* This software is a "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 1995), *|
|* consisting of "commercial computer software" and "commercial computer software documentation" *|
|* as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government only as a commercial end item. *|
|* Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), *|
|* all U.S. Government End Users acquire the software with only those rights set forth herein. *|
|* *|
|* Any use of this software in individual and commercial software must include, *|
|* in the user documentation and internal comments to the code, *|
|* the above Disclaimer (as applicable) and U.S. Government End Users Notice. *|
|* *|
\************************************************************************************************************************************/
////////////////////////// NVIDIA SHADER EXTENSIONS /////////////////

// internal functions
// Functions in this file are not expected to be called by apps directly

#include "nvShaderExtnEnums.h"

struct NvShaderExtnStruct
{
    uint   opcode;      // opcode
    uint   rid;         // resource ID
    uint   sid;         // sampler ID

    uint4  dst1u;       // destination operand 1 (for instructions that need extra destination operands)
    uint4  padding0[3]; // currently unused

    uint4  src0u;       // uint source operand 0
    uint4  src1u;       // uint source operand 1
    uint4  src2u;       // uint source operand 2
    uint4  dst0u;       // uint destination operand

    uint   markUavRef;  // the next store to this UAV is fake and is used only to identify the UAV slot
    float  padding1[28]; // pads the struct size to 256 bytes
};
// RW structured buffer for NVIDIA shader extensions.
// The application needs to define NV_SHADER_EXTN_SLOT as an unused slot, which should be
// set using the NvAPI_D3D11_SetNvShaderExtnSlot() call before creating the first shader that
// uses NVIDIA shader extensions. E.g., before including this file in the shader, define it as:
// #define NV_SHADER_EXTN_SLOT u7
// Note that other operations on this UAV will be ignored, so the application
// should bind a null resource.
RWStructuredBuffer<NvShaderExtnStruct> g_NvidiaExt : register( NV_SHADER_EXTN_SLOT );
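
// Illustrative setup sketch (not part of the original header): the define must
// be in place before this file is pulled in, and the slot number must match the
// one the host app registered via NvAPI_D3D11_SetNvShaderExtnSlot() before
// creating the shader. The slot u7 below is an arbitrary example; apps normally
// include the public wrapper header (nvHLSLExtns.h in NVIDIA's NVAPI SDK)
// rather than this internal file directly:
//
//   #define NV_SHADER_EXTN_SLOT u7 // any unused UAV slot, registered via NvAPI
//   #include "nvHLSLExtns.h"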
//----------------------------------------------------------------------------//
// The exposed SHFL instructions accept a mask parameter in src2.
// To compute the lane mask from the width of a segment:
// minLaneId : currentLaneId & src2[12:8]
// maxLaneId : minLaneId | (src2[4:0] & ~src2[12:8])
// where [minLaneId, maxLaneId] defines the segment where currentLaneId belongs.
// We always set src2[4:0] to 11111 (0x1F), and set src2[12:8] to (32 - width).
int __NvGetShflMaskFromWidth(uint width)
{
    return ((NV_WARP_SIZE - width) << 8) | 0x1F;
}
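
// Worked example (illustrative, not part of the original header):
// for width = 8 on a 32-lane warp, __NvGetShflMaskFromWidth(8) returns
// ((32 - 8) << 8) | 0x1F = 0x181F, i.e. src2[12:8] = 24 = 0b11000.
// A lane with currentLaneId = 13 (0b01101) then gets:
//   minLaneId = 13 & 0b11000 = 8
//   maxLaneId = 8 | (0b11111 & ~0b11000) = 8 | 0b00111 = 15
// so lane 13 shuffles within the 8-wide segment of lanes [8, 15].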
//----------------------------------------------------------------------------//
void __NvReferenceUAVForOp(RWByteAddressBuffer uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav.Store(index, 0);
}

void __NvReferenceUAVForOp(RWTexture1D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = float2(0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = float2(0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<float2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = float2(0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = float4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = float4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<float4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = float4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0.0f;
}

void __NvReferenceUAVForOp(RWTexture2D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0.0f;
}

void __NvReferenceUAVForOp(RWTexture3D<float> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0.0f;
}

void __NvReferenceUAVForOp(RWTexture1D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = uint2(0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = uint2(0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<uint2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = uint2(0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = uint4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = uint4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<uint4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = uint4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0;
}

void __NvReferenceUAVForOp(RWTexture2D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0;
}

void __NvReferenceUAVForOp(RWTexture3D<uint> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0;
}

void __NvReferenceUAVForOp(RWTexture1D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = int2(0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = int2(0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<int2> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = int2(0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = int4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture2D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = int4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture3D<int4> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = int4(0,0,0,0);
}

void __NvReferenceUAVForOp(RWTexture1D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[index] = 0;
}

void __NvReferenceUAVForOp(RWTexture2D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint2(index,index)] = 0;
}

void __NvReferenceUAVForOp(RWTexture3D<int> uav)
{
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].markUavRef = 1;
    uav[uint3(index,index,index)] = 0;
}
//----------------------------------------------------------------------------//
// ATOMIC op sub-opcodes
#define NV_EXTN_ATOM_ADD 3
#define NV_EXTN_ATOM_MAX 6
#define NV_EXTN_ATOM_MIN 7

//----------------------------------------------------------------------------//
// performs an atomic operation on two consecutive fp16 values in the given UAV
// the uint parameter 'fp16x2Val' is treated as two fp16 values
// the passed sub-opcode 'atomicOpType' should be an immediate constant
// byteAddress must be a multiple of 4
// the return value is the two fp16 values packed into a single uint
uint __NvAtomicOpFP16x2(RWByteAddressBuffer uav, uint byteAddress, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = byteAddress;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    return g_NvidiaExt[index].dst0u.x;
}
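
// Illustrative usage sketch (not part of the original header): atomically adds
// two fp32 values to a packed fp16 pair in a byte-address buffer and returns
// the previous pair. The function name is hypothetical; the packing is done
// inline with the f32tof16 intrinsic since the __fp32x2Tofp16x2 helper is only
// defined further down in this file.
float2 __NvExampleAtomicAddFP16x2(RWByteAddressBuffer uav, uint byteAddress, float2 val)
{
    uint packed = (f32tof16(val.y) << 16) | f32tof16(val.x);           // pack fp32x2 -> fp16x2
    uint old = __NvAtomicOpFP16x2(uav, byteAddress, packed, NV_EXTN_ATOM_ADD);
    return float2(f16tof32(old & 0xFFFF), f16tof32(old >> 16));        // unpack previous values
}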
//----------------------------------------------------------------------------//
// performs an atomic operation on a R16G16_FLOAT UAV at the given address
// the uint parameter 'fp16x2Val' is treated as two fp16 values
// the passed sub-opcode 'atomicOpType' should be an immediate constant
// the return value is the two fp16 values (.x and .y components) packed into a single uint
// Warning: behavior of this set of functions is undefined if the UAV is not
// of R16G16_FLOAT format (it might result in an app crash or TDR)
uint __NvAtomicOpFP16x2(RWTexture1D<float2> uav, uint address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    return g_NvidiaExt[index].dst0u.x;
}

uint __NvAtomicOpFP16x2(RWTexture2D<float2> uav, uint2 address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    return g_NvidiaExt[index].dst0u.x;
}

uint __NvAtomicOpFP16x2(RWTexture3D<float2> uav, uint3 address, uint fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.x = fp16x2Val;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    return g_NvidiaExt[index].dst0u.x;
}
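
// Illustrative usage sketch (not part of the original header, hypothetical
// name): atomic min of a float2 against a R16G16_FLOAT 2D UAV texel, returning
// the previous texel contents.
float2 __NvExampleAtomicMinRG16F(RWTexture2D<float2> uav, uint2 coord, float2 val)
{
    uint packed = (f32tof16(val.y) << 16) | f32tof16(val.x);  // pack fp32x2 -> fp16x2
    uint old = __NvAtomicOpFP16x2(uav, coord, packed, NV_EXTN_ATOM_MIN);
    return float2(f16tof32(old & 0xFFFF), f16tof32(old >> 16));
}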
//----------------------------------------------------------------------------//
// performs an atomic operation on a R16G16B16A16_FLOAT UAV at the given address
// the uint2 parameter 'fp16x2Val' is treated as four fp16 values
// i.e., fp16x2Val.x corresponds to uav.xy and fp16x2Val.y to uav.zw
// the passed sub-opcode 'atomicOpType' should be an immediate constant
// the return value is the four fp16 values (.xyzw components) packed into a uint2
// Warning: behavior of this set of functions is undefined if the UAV is not
// of R16G16B16A16_FLOAT format (it might result in an app crash or TDR)
uint2 __NvAtomicOpFP16x2(RWTexture1D<float4> uav, uint address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    // break it down into two fp16x2 atomic ops
    uint2 retVal;

    // first op has x-coordinate = x * 2
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address * 2;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second op has x-coordinate = x * 2 + 1
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address * 2 + 1;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}
uint2 __NvAtomicOpFP16x2(RWTexture2D<float4> uav, uint2 address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    // break it down into two fp16x2 atomic ops
    uint2 retVal;

    // first op has x-coordinate = x * 2
    uint2 addressTemp = uint2(address.x * 2, address.y);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second op has x-coordinate = x * 2 + 1
    addressTemp.x++;
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}

uint2 __NvAtomicOpFP16x2(RWTexture3D<float4> uav, uint3 address, uint2 fp16x2Val, uint atomicOpType)
{
    __NvReferenceUAVForOp(uav);
    // break it down into two fp16x2 atomic ops
    uint2 retVal;

    // first op has x-coordinate = x * 2
    uint3 addressTemp = uint3(address.x * 2, address.y, address.z);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.x;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.x = g_NvidiaExt[index].dst0u.x;

    // second op has x-coordinate = x * 2 + 1
    addressTemp.x++;
    index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = addressTemp;
    g_NvidiaExt[index].src1u.x = fp16x2Val.y;
    g_NvidiaExt[index].src2u.x = atomicOpType;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP16_ATOMIC;
    retVal.y = g_NvidiaExt[index].dst0u.x;

    return retVal;
}
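
// Illustrative usage sketch (not part of the original header, hypothetical
// name): max-combines a float4 into a R16G16B16A16_FLOAT 1D UAV texel via the
// two-op split above, returning the previous texel contents.
float4 __NvExampleAtomicMaxRGBA16F(RWTexture1D<float4> uav, uint addr, float4 val)
{
    // pack the four fp32 values into two fp16x2 uints (xy and zw pairs)
    uint2 packed = uint2((f32tof16(val.y) << 16) | f32tof16(val.x),
                         (f32tof16(val.w) << 16) | f32tof16(val.z));
    uint2 old = __NvAtomicOpFP16x2(uav, addr, packed, NV_EXTN_ATOM_MAX);
    // unpack the returned fp16 values back to fp32
    return float4(f16tof32(old.x & 0xFFFF), f16tof32(old.x >> 16),
                  f16tof32(old.y & 0xFFFF), f16tof32(old.y >> 16));
}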
uint __fp32x2Tofp16x2(float2 val)
{
    return (f32tof16(val.y) << 16) | f32tof16(val.x);
}

uint2 __fp32x4Tofp16x4(float4 val)
{
    return uint2((f32tof16(val.y) << 16) | f32tof16(val.x),
                 (f32tof16(val.w) << 16) | f32tof16(val.z));
}
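
// Illustrative counterparts (not part of the original header): unpack helpers
// that convert the packed fp16 results returned by the atomic ops back to fp32.
float2 __fp16x2Tofp32x2(uint val)
{
    return float2(f16tof32(val & 0xFFFF), f16tof32(val >> 16));
}

float4 __fp16x4Tofp32x4(uint2 val)
{
    return float4(f16tof32(val.x & 0xFFFF), f16tof32(val.x >> 16),
                  f16tof32(val.y & 0xFFFF), f16tof32(val.y >> 16));
}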
// FP32 atomic functions
// performs an atomic operation treating the UAV as float (fp32) values
// the sub-opcode is fixed to NV_EXTN_ATOM_ADD in the functions below
// byteAddress must be a multiple of 4
float __NvAtomicAddFP32(RWByteAddressBuffer uav, uint byteAddress, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = byteAddress;
    g_NvidiaExt[index].src1u.x = asuint(val); // passed as uint to make it more convenient for the driver to translate
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float __NvAtomicAddFP32(RWTexture1D<float> uav, uint address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.x = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float __NvAtomicAddFP32(RWTexture2D<float> uav, uint2 address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xy = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}

float __NvAtomicAddFP32(RWTexture3D<float> uav, uint3 address, float val)
{
    __NvReferenceUAVForOp(uav);
    uint index = g_NvidiaExt.IncrementCounter();
    g_NvidiaExt[index].src0u.xyz = address;
    g_NvidiaExt[index].src1u.x = asuint(val);
    g_NvidiaExt[index].src2u.x = NV_EXTN_ATOM_ADD;
    g_NvidiaExt[index].opcode = NV_EXTN_OP_FP32_ATOMIC;
    return asfloat(g_NvidiaExt[index].dst0u.x);
}
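
// Illustrative usage sketch (not part of the original header, hypothetical
// name): accumulates a weight into a float accumulation buffer. 'slot' indexes
// 4-byte elements, so the byte address passed on is slot * 4, satisfying the
// multiple-of-4 requirement; the pre-add value is returned.
float __NvExampleAccumulateFP32(RWByteAddressBuffer accum, uint slot, float weight)
{
    return __NvAtomicAddFP32(accum, slot * 4, weight);
}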