Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1451 lines
42 KiB

  1. //
  2. // Copyright (c) 1996-2000 Microsoft Corporation. All rights reserved.
  3. // Mmx.cpp
  4. // MMX Mix engines for Microsoft synth
  5. /*
  6. Variable usage.
  7. Variable register
  8. pfSamplePos eax
  9. pfPitch ebx
  10. dwI ecx
  11. dwIncDelta edx (edx is sometimes a temporary register)
  12. dwPosition1 esi
  13. dwPosition2 edi
  14. vfLVFract and vfRVFract (high resolution volumes) mm0
  15. vfRVolume, vfLVolume mm2
  16. mm4 - mm7 are temporary mmx registers.
  17. */
  18. // Notes about calculation.
  19. // Loop is unrolled once.
  20. // *1 Shifting the volume to a 15 bit value gets rid of shifts and simplifies the code.
  21. // This makes the packed multiply work better later, since the interpolated
  22. // wave value is kept as a 16 bit signed value. With PMULHW this gives 15 bit results,
  23. // which is the same as the original code.
  24. // *2 linear interpolation can be done very quickly with MMX by re-arranging the
  25. // way that the interpolation is done. Here is code in C that shows the difference.
  26. // Original C code
  27. //lM1 = ((pcWave[dwPosition1 + 1] - pcWave[dwPosition1]) * dwFract1) >> 12;
  28. //lM2 = ((pcWave[dwPosition2 + 1] - pcWave[dwPosition2]) * dwFract2) >> 12;
  29. //lM1 += pcWave[dwPosition1];
  30. //lM2 += pcWave[dwPosition2];
  31. // Equivalent C Code that can be done with a pmadd
  32. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  33. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  34. #include "common.h"
  35. #define STR_MODULENAME "DDKSynth.sys:MMX: "
  36. typedef unsigned __int64 QWORD;
  37. #pragma code_seg()
  38. /*****************************************************************************
  39. * CDigitalAudio::MixMono8X()
  40. *****************************************************************************
  41. * Implement a mono eight-bit mix.
  42. * Heavily optimized for MMX.
  *
  * Walks the 8-bit wave at m_pnWave starting from m_pfLastSample, linearly
  * interpolates between adjacent wave bytes, scales the result by the current
  * volume and adds it into pBuffer with signed 16-bit saturation.
  * The loop is unrolled once: each pass of mainloop produces two output
  * samples, and StoreOne handles the case where only one more can be written.
  *
  * pBuffer        - 16-bit buffer the voice is added into.
  * dwLength       - number of output samples requested.
  * dwDeltaPeriod  - reload value for dwIncDelta, the countdown to the next
  *                  volume/pitch ramp step.
  * vfDeltaVolume  - added to the high-resolution (<< 8) volume at each ramp
  *                  step; a negative result is replaced by zero.
  * pfDeltaPitch   - added to the high-resolution (<< 8) pitch at each ramp step.
  * pfSampleLength - wave position (12 fraction bits) at which the wave ends.
  * pfLoopLength   - subtracted from the position when the end is reached;
  *                  zero means there is no loop and mixing stops.
  *
  * Returns the number of samples actually written to pBuffer.
  * On exit m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample
  * are updated from the final register state.
  *
  * NOTE(review): dwIncDelta is decremented by 2 and tested for exactly zero,
  * so an odd dwDeltaPeriod would step over the ramp update - confirm that
  * callers always pass an even value.
  43. */
  44. DWORD CDigitalAudio::MixMono8X(short * pBuffer, DWORD dwLength,
  45. DWORD dwDeltaPeriod, VFRACT vfDeltaVolume,
  46. PFRACT pfDeltaPitch, PFRACT pfSampleLength,
  47. PFRACT pfLoopLength)
  48. {
  49. DWORD dwI,dwIncDelta = dwDeltaPeriod;
  50. char * pcWave = (char *) m_pnWave;
  51. PFRACT pfSamplePos = m_pfLastSample;
  52. VFRACT vfVolume = m_vfLastLVolume;
  53. PFRACT pfPitch = m_pfLastPitch;
  54. PFRACT pfPFract = pfPitch << 8;
  55. VFRACT vfVFract = vfVolume << 8; // Keep high res version around. (Not referenced by the asm below; mm0 holds this value.)
  56. QWORD dwFractMASK = 0x000000000FFF0FFF; // keeps the low 12 (fraction) bits of the two low words
  57. QWORD dwFractOne = 0x0000000010001000; // 0x1000 (1.0 with 12 fraction bits) in the two low words
  58. QWORD wordmask = 0x0000FFFF0000FFFF; // not referenced in this function
  59. QWORD vfDeltaLandRVolume;
  60. _asm{
  61. // vfLVFract and vfRVFract are in mm0
  62. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  63. //VFRACT vfRVFract = vfRVolume1 << 8;
  64. movd mm0, vfVolume
  65. movd mm7, vfVolume
  66. // vfDeltaVolume is duplicated into both dwords of mm1 so that it can be stored in vfDeltaLandRVolume
  67. movd mm1, vfDeltaVolume
  68. movd mm6, vfDeltaVolume
  69. punpckldq mm1, mm6
  70. // dwI = 0
  71. mov ecx, 0
  72. movq vfDeltaLandRVolume, mm1
  73. movq mm1, dwFractOne
  74. movq mm4, dwFractMASK
  75. mov eax, pfSamplePos
  76. punpckldq mm0, mm7
  77. mov ebx, pfPitch
  78. pslld mm0, 8
  79. mov edx, dwIncDelta
  80. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  81. // need to be set before first pass.
  82. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  83. psrld mm2, 5
  84. //for (dwI = 0; dwI < dwLength; )
  85. //{
  86. mainloop:
  87. cmp ecx, dwLength
  88. jae done
  89. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  90. jb NotPastEndOfSample1 //{
  91. cmp pfLoopLength, 0 //if (!pfLoopLength)
  92. je done // break;
  93. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  94. NotPastEndOfSample1: //}
  95. mov esi, eax // dwPosition1 = pfSamplePos;
  96. add eax, ebx // pfSamplePos += pfPitch;
  97. sub edx, 2 // dwIncDelta-=2; (two samples are produced per pass)
  98. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  99. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  100. // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
  101. // if it goes below zero.
  102. paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume;
  103. // vfVFract += vfDeltaVolume;
  104. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  105. mov edx, pfPFract // Temp = pfPFract;
  106. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  107. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  108. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  109. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  110. // TestRVol = vfRVFract & (~TestRVol);
  111. mov pfPFract, edx // pfPFract = Temp;
  112. movq mm2, mm5 // vfLVolume = TestLVol;
  113. // vfRVolume = TestRVol;
  114. shr edx, 8 // Temp = Temp >> 8;
  115. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  116. // vfRVolume = vfRVolume >> 5;
  117. mov ebx, edx // pfPitch = Temp;
  118. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  119. //}
  120. DontIncreaseValues1:
  121. movd mm6, esi // dwFract1 = dwPosition1;
  122. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  123. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  124. inc ecx //dwI++;
  125. // if (dwI >= dwLength) only the first sample of the pair can be stored.
  126. cmp ecx, dwLength
  127. jae StoreOne
  128. //if (pfSamplePos >= pfSampleLength)
  129. //{
  130. cmp eax, pfSampleLength
  131. jb NotPastEndOfSample2
  132. // Original if in C was not negated
  133. //if (!pfLoopLength)
  134. cmp pfLoopLength, 0
  135. //break; (the first sample of the pair is still stored at StoreOne)
  136. je StoreOne
  137. //else
  138. //pfSamplePos -= pfLoopLength;
  139. sub eax, pfLoopLength
  140. //}
  141. NotPastEndOfSample2:
  142. //shl esi, 1 // do not shift left since pcWave is array of chars
  143. mov edi, eax // dwPosition2 = pfSamplePos;
  144. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  145. movd mm7, eax // dwFract2 = pfSamplePos;
  146. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  147. punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
  148. // 0, 0, dwFract2, dwFract1
  149. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  150. movzx esi, word ptr[esi] // two bytes: pcWave[dwPosition1] and pcWave[dwPosition1 + 1]
  151. movd mm3, esi
  152. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  153. //shl edi, 1 //do not shift left since pcWave is array of chars
  154. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  155. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  156. mov esi, ecx // Temp = dWI;
  157. shl esi, 1 // Temp = Temp << 1; (byte offset into the buffer of shorts)
  158. movzx edi, word ptr[edi] // two bytes: pcWave[dwPosition2] and pcWave[dwPosition2 + 1]
  159. movd mm6, edi
  160. pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
  161. // low 4 bytes in mm3
  162. punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  163. add esi, pBuffer // esi = &pBuffer[dwI]; dwI was already incremented once, hence the -2 below
  164. punpcklbw mm7, mm3 // each wave byte becomes the high byte of a word (sample << 8); words in mm7:
  165. // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  166. pmaddwd mm7, mm5 // high dword = lM2 =
  167. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  168. // low dword = lM1 =
  169. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  170. movq mm3, mm2 // put left and right volume levels in mm3
  171. add eax, ebx //pfSamplePos += pfPitch;
  172. packssdw mm3, mm2 // words in mm3
  173. // vfVolume, vfVolume, vfVolume, vfVolume
  174. movd mm5, dword ptr[esi-2] // Load values from buffer
  175. inc ecx // dwI++;
  176. psrad mm7, 12 // shift back down to 16 bits.
  177. packssdw mm7, mm4 // only need one word in mono case.
  178. // low two words are lM2 and lM1; the upper words come from mm4 and are not stored
  179. // above multiplies and shifts are all done with this one pmul. Low two word are only
  180. // interest in mono case
  181. pmulhw mm3, mm7 // lLM1 *= vfVolume;
  182. // lLM2 *= vfVolume;
  183. paddsw mm5, mm3 // Add values to buffer with saturation
  184. movd dword ptr[esi-2], mm5 // Store values back into buffer.
  185. // }
  186. jmp mainloop
  187. // Need to write only one.
  188. //if (dwI < dwLength)
  189. //{
  190. StoreOne:
  191. #if 1
  192. // Linearly interpolate between points and store only one value.
  193. // combine dwFract Values.
  194. // Make mm7 zero for unpacking
  195. //shl esi, 1 // do not shift left since pcWave is array of chars
  196. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  197. pxor mm7, mm7
  198. //lLM1 = pcWave[dwPosition1]; (together with pcWave[dwPosition1 + 1])
  199. movzx esi, word ptr[esi]
  200. // Doing AND that was not done for dwFract1 and dwFract2
  201. pand mm6, mm4
  202. // words in MMX register after operation is complete.
  // Word 1 of mm6 still holds masked upper position bits rather than 0; it is only
  // ever multiplied by the zero upper words of mm7 below.
  203. psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
  204. punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
  205. // put values of pcWave into MMX registers. They are read into a regular register so
  206. // that the routine does not read past the end of the buffer otherwise, it could read
  207. // directly into the MMX registers.
  208. // words in MMX registers
  209. pxor mm7, mm7
  210. // low four bytes (mm4 is reused here; the fraction mask is no longer needed)
  211. movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  212. // 8 bytes after unpack
  213. punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
  214. // *2 pmadd efficent code.
  215. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  216. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  217. pmaddwd mm7, mm5// low dword = lM1 =
  218. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  219. psrad mm7, 12 // shift back down to 16 bits
  220. movq mm5, mm2 // move volume into mm5
  221. /*
  222. // Set lLM to be same as lM
  223. lLM1 = lM1;
  224. lLM1 *= vfLVolume1;
  225. lLM1 >>= 5; // Signal bumps up to 15 bits.
  226. lM1 *= vfRVolume1;
  227. lM1 >>= 5;
  228. // Set lLM to be same as lM
  229. lLM2 = lM2;
  230. lLM2 *= vfLVolume2;
  231. lLM2 >>= 5; // Signal bumps up to 15 bits.
  232. lM2 *= vfRVolume2;
  233. lM2 >>= 5;
  234. */
  235. // above multiplies and shifts are all done with this one pmul
  236. pmulhw mm5, mm7
  237. // calculate buffer location.
  238. mov edi, ecx
  239. shl edi, 1
  240. add edi, pBuffer
  241. movd edx, mm5
  242. //pBuffer[dwI] += (short) lM1; (dwI was already incremented, hence edi-2)
  243. add word ptr[edi-2], dx
  244. jno no_oflowr1
  245. //pBuffer[dwI] = 0x7fff; (mov leaves the flags from the add intact)
  246. mov word ptr[edi-2], 0x7fff
  247. js no_oflowr1 // wrapped sum is negative: positive overflow, keep 0x7fff
  248. //pBuffer[dwI] = (short) 0x8000;
  249. mov word ptr[edi-2], 0x8000
  250. no_oflowr1:
  251. //}
  252. #endif
  253. done:
  254. mov edx, this // get address of class object
  255. //m_vfLastLVolume = vfVolume;
  256. //m_vfLastRVolume = vfVolume;
  257. // need to shift volume back down to 12 bits before storing
  258. psrld mm2, 3
  259. movd [edx]this.m_vfLastLVolume, mm2
  260. movd [edx]this.m_vfLastRVolume, mm2
  261. //m_pfLastPitch = pfPitch;
  262. mov [edx]this.m_pfLastPitch, ebx
  263. //m_pfLastSample = pfSamplePos;
  264. mov [edx]this.m_pfLastSample, eax
  265. // put value back into dwI to be returned. This could just be passed back in eax I think.
  266. mov dwI, ecx
  267. emms
  268. } // ASM block
  269. return (dwI);
  270. }
  271. /*****************************************************************************
  272. * CDigitalAudio::Mix8X()
  273. *****************************************************************************
  274. * Implement a stereo eight-bit mix.
  275. * Heavily optimized for MMX.
  *
  * Walks the 8-bit wave at m_pnWave starting from m_pfLastSample, linearly
  * interpolates between adjacent wave bytes, scales each interpolated value by
  * the left and the right volume and adds the pair into pBuffer with signed
  * 16-bit saturation. Each output position occupies two consecutive shorts
  * (left-volume result first, right-volume result second).
  * The loop is unrolled once: each pass of mainloop produces two output
  * positions, and StoreOne handles the case where only one more can be written.
  *
  * pBuffer        - 16-bit buffer the voice is added into.
  * dwLength       - number of output positions requested; it is doubled on
  *                  entry so that dwI counts individual shorts.
  * dwDeltaPeriod  - reload value for dwIncDelta, the countdown to the next
  *                  volume/pitch ramp step.
  * vfDeltaLVolume - added to the high-resolution (<< 8) left volume at each
  *                  ramp step; a negative result is replaced by zero.
  * vfDeltaRVolume - same, for the right volume.
  * pfDeltaPitch   - added to the high-resolution (<< 8) pitch at each ramp step.
  * pfSampleLength - wave position (12 fraction bits) at which the wave ends.
  * pfLoopLength   - subtracted from the position when the end is reached;
  *                  zero means there is no loop and mixing stops.
  *
  * Returns the number of output positions written (dwI >> 1).
  * On exit m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample
  * are updated from the final register state.
  *
  * NOTE(review): dwIncDelta is decremented by 2 and tested for exactly zero,
  * so an odd dwDeltaPeriod would step over the ramp update - confirm that
  * callers always pass an even value.
  276. */
  277. DWORD CDigitalAudio::Mix8X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod,
  278. VFRACT vfDeltaLVolume, VFRACT vfDeltaRVolume,
  279. PFRACT pfDeltaPitch, PFRACT pfSampleLength, PFRACT pfLoopLength)
  280. {
  281. DWORD dwI;
  282. //DWORD dwPosition1, dwPosition2;
  283. //long lM1, lLM1;
  284. //long lM2, lLM2;
  285. DWORD dwIncDelta = dwDeltaPeriod;
  286. //VFRACT dwFract1, dwFract2;
  287. char * pcWave = (char *) m_pnWave;
  288. PFRACT pfSamplePos = m_pfLastSample;
  289. VFRACT vfLVolume = m_vfLastLVolume;
  290. VFRACT vfRVolume = m_vfLastRVolume;
  291. VFRACT vfLVolume2 = m_vfLastLVolume; // not referenced below
  292. VFRACT vfRVolume2 = m_vfLastRVolume; // not referenced below
  293. PFRACT pfPitch = m_pfLastPitch;
  294. PFRACT pfPFract = pfPitch << 8;
  295. dwLength <<= 1; // two shorts per output position
  296. QWORD dwFractMASK = 0x000000000FFF0FFF; // keeps the low 12 (fraction) bits of the two low words
  297. QWORD dwFractOne = 0x0000000010001000; // 0x1000 (1.0 with 12 fraction bits) in the two low words
  298. QWORD wordmask = 0x0000FFFF0000FFFF; // keeps the low word of each dword
  299. QWORD vfDeltaLandRVolume;
  300. _asm{
  301. // vfLVFract and vfRVFract are in mm0
  302. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  303. //VFRACT vfRVFract = vfRVolume1 << 8;
  304. movd mm0, vfLVolume
  305. movd mm7, vfRVolume
  306. // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
  307. movd mm1, vfDeltaLVolume
  308. movd mm6, vfDeltaRVolume
  309. punpckldq mm1, mm6
  310. // dwI = 0
  311. mov ecx, 0
  312. movq vfDeltaLandRVolume, mm1
  313. movq mm1, dwFractOne
  314. movq mm4, dwFractMASK
  315. mov eax, pfSamplePos
  316. punpckldq mm0, mm7 // low dword = left volume, high dword = right volume
  317. mov ebx, pfPitch
  318. pslld mm0, 8
  319. mov edx, dwIncDelta
  320. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  321. // need to be set before first pass.
  322. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  323. psrld mm2, 5
  324. //for (dwI = 0; dwI < dwLength; )
  325. //{
  326. mainloop:
  327. cmp ecx, dwLength
  328. jae done
  329. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  330. jb NotPastEndOfSample1 //{
  331. cmp pfLoopLength, 0 //if (!pfLoopLength)
  332. je done // break;
  333. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  334. NotPastEndOfSample1: //}
  335. mov esi, eax // dwPosition1 = pfSamplePos;
  336. add eax, ebx // pfSamplePos += pfPitch;
  337. sub edx, 2 // dwIncDelta-=2; (two output positions are produced per pass)
  338. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  339. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  340. // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
  341. // if it goes below zero.
  342. paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
  343. // vfRVFract += vfDeltaRVolume;
  344. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  345. mov edx, pfPFract // Temp = pfPFract;
  346. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  347. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  348. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  349. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  350. // TestRVol = vfRVFract & (~TestRVol);
  351. mov pfPFract, edx // pfPFract = Temp;
  352. movq mm2, mm5 // vfLVolume = TestLVol;
  353. // vfRVolume = TestRVol;
  354. shr edx, 8 // Temp = Temp >> 8;
  355. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  356. // vfRVolume = vfRVolume >> 5;
  357. mov ebx, edx // pfPitch = Temp;
  358. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  359. //}
  360. DontIncreaseValues1:
  361. movd mm6, esi // dwFract1 = dwPosition1;
  362. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  363. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  364. add ecx, 2 //dwI += 2;
  365. // if (dwI >= dwLength) only the first position of the pair can be stored.
  366. cmp ecx, dwLength
  367. jae StoreOne
  368. //if (pfSamplePos >= pfSampleLength)
  369. //{
  370. cmp eax, pfSampleLength
  371. jb NotPastEndOfSample2
  372. // Original if in C was not negated
  373. //if (!pfLoopLength)
  374. cmp pfLoopLength, 0
  375. //break; (the first position of the pair is still stored at StoreOne)
  376. je StoreOne
  377. //else
  378. //pfSamplePos -= pfLoopLength;
  379. sub eax, pfLoopLength
  380. //}
  381. NotPastEndOfSample2:
  382. //shl esi, 1 // do not shift left since pcWave is array of chars
  383. mov edi, eax // dwPosition2 = pfSamplePos;
  384. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  385. movd mm7, eax // dwFract2 = pfSamplePos;
  386. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  387. punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
  388. // 0, 0, dwFract2, dwFract1
  389. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  390. movzx esi, word ptr[esi] // two bytes: pcWave[dwPosition1] and pcWave[dwPosition1 + 1]
  391. movd mm3, esi
  392. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  393. //shl edi, 1 // do not shift left since pcWave is array of chars
  394. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  395. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  396. mov esi, ecx // Temp = dWI;
  397. shl esi, 1 // Temp = Temp << 1; (byte offset into the buffer of shorts)
  398. movzx edi, word ptr[edi] // two bytes: pcWave[dwPosition2] and pcWave[dwPosition2 + 1]
  399. movd mm6, edi
  400. pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
  401. // low 4 bytes in mm3
  402. punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  403. add esi, pBuffer // esi = &pBuffer[dwI]; dwI was already advanced by 2, hence the -4 below
  404. punpcklbw mm7, mm3 // bytes in mm7 (each wave byte becomes the high byte of a word)
  405. // pcWave[dwPos2+1], 0, pcWave[dwPos2], 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
  406. pmaddwd mm7, mm5 // high dword = lM2 =
  407. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  408. // low dword = lM1 =
  409. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  410. movq mm3, mm2 // put left and right volume levels in mm3
  411. add eax, ebx //pfSamplePos += pfPitch;
  412. packssdw mm3, mm2 // words in mm3
  413. // vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
  414. movq mm5, qword ptr[esi-4] // Load values from buffer
  415. add ecx, 2 // dwI += 2;
  416. psrad mm7, 12 // shift back down to 16 bits.
  417. pand mm7, wordmask // combine results to get ready to multiply by left and right
  418. movq mm6, mm7 // volume levels.
  419. pslld mm6, 16 // copy each low word into the high word of its dword
  420. por mm7, mm6 // words in mm7
  421. // lM2, lM2, lM1, lM1
  422. // above multiplies and shifts are all done with this one pmul
  423. pmulhw mm3, mm7 // lLM1 *= vfLVolume;
  424. // lM1 *= vfRVolume;
  425. // lLM2 *= vfLVolume;
  426. // lM2 *= vfRVolume;
  427. paddsw mm5, mm3 // Add values to buffer with saturation
  428. movq qword ptr[esi-4], mm5 // Store values back into buffer.
  429. // }
  430. jmp mainloop
  431. // Need to write only one.
  432. //if (dwI < dwLength)
  433. //{
  434. StoreOne:
  435. #if 1
  436. // Linearly interpolate between points and store only one value.
  437. // combine dwFract Values.
  438. // Make mm7 zero for unpacking
  439. //shl esi, 1 // do not shift left since pcWave is array of chars
  440. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  441. pxor mm7, mm7
  442. //lLM1 = pcWave[dwPosition1]; (together with pcWave[dwPosition1 + 1])
  443. movzx esi, word ptr[esi]
  444. // Doing AND that was not done for dwFract1 and dwFract2
  445. pand mm6, mm4
  446. // words in MMX register after operation is complete.
  // Word 1 of mm6 still holds masked upper position bits rather than 0; it is only
  // ever multiplied by the zero upper words of mm7 below.
  447. psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
  448. punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
  449. // put values of pcWave into MMX registers. They are read into a regular register so
  450. // that the routine does not read past the end of the buffer otherwise, it could read
  451. // directly into the MMX registers.
  452. pxor mm7, mm7
  453. // byte in MMX registers (mm4 is reused here; the fraction mask is no longer needed)
  454. movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  455. punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
  456. // *2 pmadd efficent code.
  457. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  458. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  459. pmaddwd mm7, mm5// low dword = lM1 =
  460. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  461. psrad mm7, 12 // shift back down to 16 bits
  462. pand mm7, wordmask // combine results to get ready to multiply by left and right
  463. movq mm6, mm7 // volume levels.
  464. pslld mm6, 16 // copy each low word into the high word of its dword
  465. por mm7, mm6 // words in mm7
  466. // lM2, lM2, lM1, lM1 (the high dword, lM2, is zero on this path)
  467. pxor mm6, mm6
  468. movq mm5, mm2 // move volume1 into mm5
  469. // use pack to get 4 volume values together for multiplication.
  470. packssdw mm5, mm6 // words in mm5
  471. // 0, 0, vfRVolume1, vfLVolume1
  472. /*
  473. // Set lLM to be same as lM
  474. lLM1 = lM1;
  475. lLM1 *= vfLVolume1;
  476. lLM1 >>= 5; // Signal bumps up to 15 bits.
  477. lM1 *= vfRVolume1;
  478. lM1 >>= 5;
  479. // Set lLM to be same as lM
  480. lLM2 = lM2;
  481. lLM2 *= vfLVolume2;
  482. lLM2 >>= 5; // Signal bumps up to 15 bits.
  483. lM2 *= vfRVolume2;
  484. lM2 >>= 5;
  485. */
  486. // above multiplies and shifts are all done with this one pmul
  487. pmulhw mm5, mm7
  488. // calculate buffer location.
  489. mov edi, ecx
  490. shl edi, 1
  491. add edi, pBuffer
  492. /*
  493. add word ptr[edi-4], si
  494. jno no_oflowl1
  495. // pBuffer[dwI] = 0x7fff;
  496. mov word ptr[edi-4], 0x7fff
  497. js no_oflowl1
  498. //pBuffer[dwI] = (short) 0x8000;
  499. mov word ptr[edi-4], 0x8000
  500. no_oflowl1:
  501. //pBuffer[dwI+1] += (short) lM1;
  502. add word ptr[edi-2], dx
  503. jno no_oflowr1
  504. //pBuffer[dwI+1] = 0x7fff;
  505. mov word ptr[edi-2], 0x7fff
  506. js no_oflowr1
  507. //pBuffer[dwI+1] = (short) 0x8000;
  508. mov word ptr[edi-2], 0x8000
  509. no_oflowr1:
  510. */
  // The scalar saturation code above is disabled; one saturating packed add
  // updates both shorts of the final output position instead.
  511. movd mm7, dword ptr[edi-4]
  512. paddsw mm7, mm5
  513. movd dword ptr[edi-4], mm7
  514. //}
  515. #endif
  516. done:
  517. mov edx, this // get address of class object
  518. //m_vfLastLVolume = vfLVolume;
  519. //m_vfLastRVolume = vfRVolume;
  520. // need to shift volume back down to 12 bits before storing
  521. psrld mm2, 3
  522. movd [edx]this.m_vfLastLVolume, mm2
  523. psrlq mm2, 32 // bring the right volume down into the low dword
  524. movd [edx]this.m_vfLastRVolume, mm2
  525. //m_pfLastPitch = pfPitch;
  526. mov [edx]this.m_pfLastPitch, ebx
  527. //m_pfLastSample = pfSamplePos;
  528. mov [edx]this.m_pfLastSample, eax
  529. // put value back into dwI to be returned. This could just be passed back in eax I think.
  530. mov dwI, ecx
  531. emms
  532. } // ASM block
  533. return (dwI >> 1); // dwI counts shorts; halve it to get output positions
  534. }
  535. /*****************************************************************************
  536. * CDigitalAudio::MixMono16X()
  537. *****************************************************************************
  538. * Implement a mono sixteen-bit mix.
  539. * Heavily optimized for MMX.
  *
  * Walks the 16-bit wave at m_pnWave starting from m_pfLastSample, linearly
  * interpolates between adjacent wave samples, scales the result by the
  * current volume and adds it into pBuffer with signed 16-bit saturation.
  * The loop is unrolled once: each pass of mainloop produces two output
  * samples, and StoreOne handles the case where only one more can be written.
  *
  * pBuffer        - 16-bit buffer the voice is added into.
  * dwLength       - number of output samples requested.
  * dwDeltaPeriod  - reload value for dwIncDelta, the countdown to the next
  *                  volume/pitch ramp step.
  * vfDeltaVolume  - added to the high-resolution (<< 8) volume at each ramp
  *                  step; a negative result is replaced by zero.
  * pfDeltaPitch   - added to the high-resolution (<< 8) pitch at each ramp step.
  * pfSampleLength - wave position (12 fraction bits) at which the wave ends.
  * pfLoopLength   - subtracted from the position when the end is reached;
  *                  zero means there is no loop and mixing stops.
  *
  * Returns the number of samples actually written to pBuffer.
  * On exit m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample
  * are updated from the final register state.
  *
  * NOTE(review): dwIncDelta is decremented by 2 and tested for exactly zero,
  * so an odd dwDeltaPeriod would step over the ramp update - confirm that
  * callers always pass an even value.
  540. */
  541. DWORD CDigitalAudio::MixMono16X(short * pBuffer, DWORD dwLength,
  542. DWORD dwDeltaPeriod,VFRACT vfDeltaVolume,
  543. PFRACT pfDeltaPitch,PFRACT pfSampleLength,
  544. PFRACT pfLoopLength)
  545. {
  546. DWORD dwI,dwIncDelta = dwDeltaPeriod;
  547. short * pcWave = (short*) m_pnWave;
  548. PFRACT pfSamplePos = m_pfLastSample;
  549. VFRACT vfVolume = m_vfLastLVolume;
  550. PFRACT pfPitch = m_pfLastPitch;
  551. PFRACT pfPFract = pfPitch << 8;
  552. VFRACT vfVFract = vfVolume << 8; // Keep high res version around. (Not referenced by the asm below; mm0 holds this value.)
  553. QWORD dwFractMASK = 0x000000000FFF0FFF; // keeps the low 12 (fraction) bits of the two low words
  554. QWORD dwFractOne = 0x0000000010001000; // 0x1000 (1.0 with 12 fraction bits) in the two low words
  555. QWORD wordmask = 0x0000FFFF0000FFFF; // not referenced in this function
  556. QWORD vfDeltaLandRVolume;
  557. _asm{
  558. // vfLVFract and vfRVFract are in mm0
  559. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  560. //VFRACT vfRVFract = vfRVolume1 << 8;
  561. movd mm0, vfVolume
  562. movd mm7, vfVolume
  563. // vfDeltaVolume is duplicated into both dwords of mm1 so that it can be stored in vfDeltaLandRVolume
  564. movd mm1, vfDeltaVolume
  565. movd mm6, vfDeltaVolume
  566. punpckldq mm1, mm6
  567. // dwI = 0
  568. mov ecx, 0
  569. movq vfDeltaLandRVolume, mm1
  570. movq mm1, dwFractOne
  571. movq mm4, dwFractMASK
  572. mov eax, pfSamplePos
  573. punpckldq mm0, mm7
  574. mov ebx, pfPitch
  575. pslld mm0, 8
  576. mov edx, dwIncDelta
  577. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  578. // need to be set before first pass.
  579. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  580. psrld mm2, 5
  581. //for (dwI = 0; dwI < dwLength; )
  582. //{
  583. mainloop:
  584. cmp ecx, dwLength
  585. jae done
  586. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  587. jb NotPastEndOfSample1 //{
  588. cmp pfLoopLength, 0 //if (!pfLoopLength)
  589. je done // break;
  590. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  591. NotPastEndOfSample1: //}
  592. mov esi, eax // dwPosition1 = pfSamplePos;
  593. add eax, ebx // pfSamplePos += pfPitch;
  594. sub edx, 2 // dwIncDelta-=2; (two samples are produced per pass)
  595. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  596. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  597. // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
  598. // if it goes below zero.
  599. paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume;
  600. // vfVFract += vfDeltaVolume;
  601. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  602. mov edx, pfPFract // Temp = pfPFract;
  603. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  604. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  605. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  606. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  607. // TestRVol = vfRVFract & (~TestRVol);
  608. mov pfPFract, edx // pfPFract = Temp;
  609. movq mm2, mm5 // vfLVolume = TestLVol;
  610. // vfRVolume = TestRVol;
  611. shr edx, 8 // Temp = Temp >> 8;
  612. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  613. // vfRVolume = vfRVolume >> 5;
  614. mov ebx, edx // pfPitch = Temp;
  615. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  616. //}
  617. DontIncreaseValues1:
  618. movd mm6, esi // dwFract1 = dwPosition1;
  619. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  620. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  621. inc ecx //dwI++;
  622. // if (dwI >= dwLength) only the first sample of the pair can be stored.
  623. cmp ecx, dwLength
  624. jae StoreOne
  625. //if (pfSamplePos >= pfSampleLength)
  626. //{
  627. cmp eax, pfSampleLength
  628. jb NotPastEndOfSample2
  629. // Original if in C was not negated
  630. //if (!pfLoopLength)
  631. cmp pfLoopLength, 0
  632. //break; (the first sample of the pair is still stored at StoreOne)
  633. je StoreOne
  634. //else
  635. //pfSamplePos -= pfLoopLength;
  636. sub eax, pfLoopLength
  637. //}
  638. NotPastEndOfSample2:
  639. shl esi, 1 // shift left since pcWave is array of shorts
  640. mov edi, eax // dwPosition2 = pfSamplePos;
  641. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  642. movd mm7, eax // dwFract2 = pfSamplePos;
  643. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  644. punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
  645. // 0, 0, dwFract2, dwFract1
  646. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  647. movd mm7, dword ptr[esi] // two shorts: pcWave[dwPosition1] and pcWave[dwPosition1 + 1]
  648. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  649. shl edi, 1 // shift left since pcWave is array of shorts
  650. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  651. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  652. mov esi, ecx // Temp = dWI;
  653. shl esi, 1 // Temp = Temp << 1; (byte offset into the buffer of shorts)
  654. movq mm3, mm2 // put left and right volume levels in mm3
  655. movd mm6, dword ptr[edi] // two shorts: pcWave[dwPosition2] and pcWave[dwPosition2 + 1]
  656. packssdw mm3, mm2 // words in mm3
  657. // vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
  658. add esi, pBuffer // esi = &pBuffer[dwI]; dwI was already incremented once, hence the -2 below
  659. punpckldq mm7, mm6 // words in mm7
  660. // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  661. pmaddwd mm7, mm5 // high dword = lM2 =
  662. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  663. // low dword = lM1 =
  664. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  665. add eax, ebx //pfSamplePos += pfPitch;
  666. movd mm5, dword ptr[esi-2] // Load values from buffer
  667. inc ecx // dwI++;
  668. psrad mm7, 12 // shift back down to 16 bits.
  669. packssdw mm7, mm4 // only need one word in mono case.
  670. // low two words are lM2 and lM1; the upper words come from mm4 and are not stored
  671. // above multiplies and shifts are all done with this one pmul. Low two word are only
  672. // interest in mono case
  673. pmulhw mm3, mm7 // lLM1 *= vfVolume;
  674. // lLM2 *= vfVolume;
  675. paddsw mm5, mm3 // Add values to buffer with saturation
  676. movd dword ptr[esi-2], mm5 // Store values back into buffer.
  677. // }
  678. jmp mainloop
  679. // Need to write only one.
  680. //if (dwI < dwLength)
  681. //{
  682. StoreOne:
  683. #if 1
  684. // Linearly interpolate between points and store only one value.
  685. // combine dwFract Values.
  686. // Make mm7 zero for unpacking
  687. shl esi, 1 // shift left since pcWave is array of shorts
  688. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  689. pxor mm7, mm7
  690. //lLM1 = pcWave[dwPosition1]; (together with pcWave[dwPosition1 + 1])
  691. mov esi, dword ptr[esi]
  692. // Doing AND that was not done for dwFract1 and dwFract2
  693. pand mm6, mm4
  694. // words in MMX register after operation is complete.
  // Word 1 of mm6 still holds masked upper position bits rather than 0; it is only
  // ever multiplied by the zero upper words of mm7 below.
  695. psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
  696. punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
  697. // put values of pcWave into MMX registers. They are read into a regular register so
  698. // that the routine does not read past the end of the buffer otherwise, it could read
  699. // directly into the MMX registers.
  700. // words in MMX registers
  701. movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  702. // *2 pmadd efficent code.
  703. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  704. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  705. pmaddwd mm7, mm5// low dword = lM1 =
  706. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  707. psrad mm7, 12 // shift back down to 16 bits
  708. movq mm5, mm2 // move volume into mm5
  709. /*
  710. // Set lLM to be same as lM
  711. lLM1 = lM1;
  712. lLM1 *= vfLVolume1;
  713. lLM1 >>= 5; // Signal bumps up to 15 bits.
  714. lM1 *= vfRVolume1;
  715. lM1 >>= 5;
  716. // Set lLM to be same as lM
  717. lLM2 = lM2;
  718. lLM2 *= vfLVolume2;
  719. lLM2 >>= 5; // Signal bumps up to 15 bits.
  720. lM2 *= vfRVolume2;
  721. lM2 >>= 5;
  722. */
  723. // above multiplies and shifts are all done with this one pmul
  724. pmulhw mm5, mm7
  725. // calculate buffer location.
  726. mov edi, ecx
  727. shl edi, 1
  728. add edi, pBuffer
  729. movd edx, mm5
  730. //pBuffer[dwI] += (short) lM1; (dwI was already incremented, hence edi-2)
  731. add word ptr[edi-2], dx
  732. jno no_oflowr1
  733. //pBuffer[dwI] = 0x7fff; (mov leaves the flags from the add intact)
  734. mov word ptr[edi-2], 0x7fff
  735. js no_oflowr1 // wrapped sum is negative: positive overflow, keep 0x7fff
  736. //pBuffer[dwI] = (short) 0x8000;
  737. mov word ptr[edi-2], 0x8000
  738. no_oflowr1:
  739. //}
  740. #endif
  741. done:
  742. mov edx, this // get address of class object
  743. //m_vfLastLVolume = vfVolume;
  744. //m_vfLastRVolume = vfVolume;
  745. // need to shift volume back down to 12 bits before storing
  746. psrld mm2, 3
  747. movd [edx]this.m_vfLastLVolume, mm2
  748. movd [edx]this.m_vfLastRVolume, mm2
  749. //m_pfLastPitch = pfPitch;
  750. mov [edx]this.m_pfLastPitch, ebx
  751. //m_pfLastSample = pfSamplePos;
  752. mov [edx]this.m_pfLastSample, eax
  753. // put value back into dwI to be returned. This could just be passed back in eax I think.
  754. mov dwI, ecx
  755. emms
  756. } // ASM block
  757. return (dwI);
  758. }
  759. /*****************************************************************************
  760. * CDigitalAudio::Mix16X()
  761. *****************************************************************************
  762. * Implement a stereo sixteen-bit mix.
  763. * Heavily optimized for MMX.
  *
  * Reads the 16-bit wave at m_pnWave, linearly interpolates it at the running
  * sample position, scales the result by the left and right volumes and adds
  * it (with signed saturation) into the interleaved left/right shorts of
  * pBuffer. The main loop is unrolled once, so each pass handles two output
  * samples; a trailing single sample is handled at StoreOne.
  *
  * pBuffer         - interleaved left/right 16-bit mix buffer that is added to.
  * dwLength        - number of left/right pairs to mix. It is doubled on entry
  *                   so that dwI indexes individual shorts in pBuffer.
  * dwDeltaPeriod   - start/reload value for dwIncDelta. dwIncDelta drops by 2
  *                   once per pass of the unrolled loop; when it reaches zero
  *                   the volume and pitch deltas below are applied.
  * vfDeltaLVolume,
  * vfDeltaRVolume  - added to the high res (<< 8) left/right volumes.
  * pfDeltaPitch    - added to the high res (<< 8) pitch.
  * pfSampleLength  - sample position at or beyond which looping/ending applies.
  * pfLoopLength    - subtracted from the position once it reaches
  *                   pfSampleLength; zero means stop mixing instead.
  *
  * Sample positions are fixed point: position >> 12 indexes pcWave and
  * position & 0xfff is the interpolation fraction.
  *
  * Returns dwI >> 1, i.e. the number of left/right pairs handled. On exit
  * m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample are
  * updated so that the next call continues where this one stopped.
  764. */
  765. DWORD CDigitalAudio::Mix16X(short * pBuffer, DWORD dwLength,
  766. DWORD dwDeltaPeriod, VFRACT vfDeltaLVolume,
  767. VFRACT vfDeltaRVolume,PFRACT pfDeltaPitch,
  768. PFRACT pfSampleLength,PFRACT pfLoopLength)
  769. {
  770. DWORD dwI,dwIncDelta = dwDeltaPeriod;
  771. //DWORD dwPosition1, dwPosition2;
  772. //long lM1, lLM1;
  773. //long lM2, lLM2;
  774. //VFRACT dwFract1, dwFract2;
  775. short * pcWave = (short *) m_pnWave;
  776. PFRACT pfSamplePos = m_pfLastSample;
  777. VFRACT vfLVolume = m_vfLastLVolume;
  778. VFRACT vfRVolume = m_vfLastRVolume;
  779. VFRACT vfLVolume2 = m_vfLastLVolume; // not referenced by the code below
  780. VFRACT vfRVolume2 = m_vfLastRVolume; // not referenced by the code below
  781. PFRACT pfPitch = m_pfLastPitch;
  782. PFRACT pfPFract = pfPitch << 8; // Keep high res version of the pitch around.
  783. dwLength <<= 1; // two shorts (left, right) per sample, dwI counts shorts
  784. QWORD dwFractMASK = 0x000000000FFF0FFF; // 0x0FFF in the two low words: keeps the 12 fraction bits
  785. QWORD dwFractOne = 0x0000000010001000; // 0x1000 (one in 12 bit fixed point) in the two low words
  786. QWORD wordmask = 0x0000FFFF0000FFFF; // keeps the low word of each dword
  787. QWORD vfDeltaLandRVolume; // left delta in the low dword, right delta in the high dword
  788. _asm{
  789. // vfLVFract and vfRVFract are in mm0
  790. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  791. //VFRACT vfRVFract = vfRVolume1 << 8;
  792. movd mm0, vfLVolume
  793. movd mm7, vfRVolume
  794. // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
  795. movd mm1, vfDeltaLVolume
  796. movd mm6, vfDeltaRVolume
  797. punpckldq mm1, mm6
  798. // dwI = 0
  799. mov ecx, 0
  800. movq vfDeltaLandRVolume, mm1
  801. movq mm1, dwFractOne
  802. movq mm4, dwFractMASK
  803. mov eax, pfSamplePos
  804. punpckldq mm0, mm7 // mm0 = vfRVolume : vfLVolume (high dword : low dword)
  805. mov ebx, pfPitch
  806. pslld mm0, 8 // high res (<< 8) running volumes
  807. mov edx, dwIncDelta
  808. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  809. // need to be set before first pass.
  810. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  811. psrld mm2, 5
  812. //for (dwI = 0; dwI < dwLength; )
  813. //{
  814. mainloop:
  815. cmp ecx, dwLength
  816. jae done
  817. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  818. jb NotPastEndOfSample1 //{
  819. cmp pfLoopLength, 0 //if (!pfLoopLength)
  820. je done // break;
  821. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  822. NotPastEndOfSample1: //}
  823. mov esi, eax // dwPosition1 = pfSamplePos;
  824. add eax, ebx // pfSamplePos += pfPitch;
  825. sub edx, 2 // dwIncDelta-=2;
  826. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  827. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  828. // for a bit. All the code that TestLVol and TestRVol is doing is zeroing out the volume
  829. // if it goes below zero.
  830. paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
  831. // vfRVFract += vfDeltaRVolume;
  832. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  833. mov edx, pfPFract // Temp = pfPFract;
  834. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  835. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  836. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  837. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  838. // TestRVol = vfRVFract & (~TestRVol);
  839. mov pfPFract, edx // pfPFract = Temp;
  840. movq mm2, mm5 // vfLVolume = TestLVol;
  841. // vfRVolume = TestRVol;
  842. shr edx, 8 // Temp = Temp >> 8;
  843. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  844. // vfRVolume = vfRVolume >> 5;
  845. mov ebx, edx // pfPitch = Temp;
  846. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  847. //}
  848. DontIncreaseValues1:
  849. movd mm6, esi // dwFract1 = dwPosition1;
  850. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  851. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  852. add ecx, 2 //dwI += 2;
  853. // if (dwI >= dwLength) only the first sample of this pair is left: go to StoreOne
  854. cmp ecx, dwLength
  855. jae StoreOne
  856. //if (pfSamplePos >= pfSampleLength)
  857. //{
  858. cmp eax, pfSampleLength
  859. jb NotPastEndOfSample2
  860. // Original if in C was not negated
  861. //if (!pfLoopLength)
  862. cmp pfLoopLength, 0
  863. //break; (the first sample of the pair is still written at StoreOne)
  864. je StoreOne
  865. //else
  866. //pfSamplePos -= pfLoopLength;
  867. sub eax, pfLoopLength
  868. //}
  869. NotPastEndOfSample2:
  870. shl esi, 1 // shift left since pcWave is array of shorts
  871. mov edi, eax // dwPosition2 = pfSamplePos;
  872. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  873. movd mm7, eax // dwFract2 = pfSamplePos;
  874. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  875. punpcklwd mm6, mm7 // combine dwFract Values. Low words in mm6 after unpack are
  876. // dwFract2, dwFract1; the pand below clears the two high words
  877. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  878. movd mm7, dword ptr[esi] //lLM1 = pcWave[dwPosition1]; (also loads pcWave[dwPosition1 + 1])
  879. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  880. shl edi, 1 // shift left since pcWave is array of shorts
  881. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  882. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  883. mov esi, ecx // Temp = dWI;
  884. shl esi, 1 // Temp = Temp << 1;
  885. movq mm3, mm2 // put left and right volume levels in mm3
  886. movd mm6, dword ptr[edi] //lLM2 = pcWave[dwPosition2]; (also loads pcWave[dwPosition2 + 1])
  887. packssdw mm3, mm2 // words in mm3
  888. // vfRVolume, vfLVolume, vfRVolume, vfLVolume (same volumes for both samples)
  889. add esi, pBuffer // esi = &pBuffer[dwI], two shorts past the first sample of this pair
  890. punpckldq mm7, mm6 // words in mm7
  891. // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  892. pmaddwd mm7, mm5 // high dword = lM2 =
  893. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  894. // low dword = lM1 =
  895. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  896. add eax, ebx //pfSamplePos += pfPitch;
  897. movq mm5, qword ptr[esi-4] // Load values from buffer (four shorts: L1, R1, L2, R2)
  898. add ecx, 2 // dwI += 2;
  899. psrad mm7, 12 // shift back down to 16 bits.
  900. pand mm7, wordmask // combine results to get ready to multiply by left and right
  901. movq mm6, mm7 // volume levels.
  902. pslld mm6, 16 //
  903. por mm7, mm6 // words in mm7
  904. // lM2, lM2, lM1, lM1
  905. // above multiplies and shifts are all done with this one pmul
  906. pmulhw mm3, mm7 // lLM1 *= vfLVolume;
  907. // lM1 *= vfRVolume;
  908. // lLM2 *= vfLVolume;
  909. // lM2 *= vfRVolume;
  910. paddsw mm5, mm3 // Add values to buffer with saturation
  911. movq qword ptr[esi-4], mm5 // Store values back into buffer.
  912. // }
  913. jmp mainloop
  914. // Need to write only one.
  915. //if (dwI < dwLength)
  916. //{
  917. StoreOne:
  918. #if 1
  919. // Linearly interpolate between points and store only one value.
  920. // combine dwFract Values.
  921. // Make mm7 zero for unpacking
  922. shl esi, 1 // shift left since pcWave is array of shorts
  923. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  924. pxor mm7, mm7
  925. //lLM1 = pcWave[dwPosition1]; (32 bit load also picks up pcWave[dwPosition1 + 1])
  926. mov esi, dword ptr[esi]
  927. // Doing AND that was not done for dwFract1 and dwFract2
  928. pand mm6, mm4
  929. // words in MMX register after operation is complete.
  930. // NOTE: mm6 was not unpacked on this path, so its second word holds masked upper
  931. // position bits rather than 0. That word pair is multiplied by the zero upper
  // half of mm7 in the pmaddwd below, so it does not affect the result.
  930. psubw mm5, mm6 // 0, 0, 0x1000 - (unused), 0x1000 - dwFract1
  931. punpcklwd mm5, mm6 // (unused), 0x1000 - (unused), dwFract1, 0x1000 - dwFract1
  932. // put values of pcWave into MMX registers. They are read into a regular register so
  933. // that the routine does not read past the end of the buffer otherwise, it could read
  934. // directly into the MMX registers.
  935. // words in MMX registers
  936. movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  937. // *2 pmadd efficient code.
  938. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  939. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  940. pmaddwd mm7, mm5// low dword = lM1 =
  941. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  942. psrad mm7, 12 // shift back down to 16 bits
  943. pand mm7, wordmask // combine results to get ready to multiply by left and right
  944. movq mm6, mm7 // volume levels.
  945. pslld mm6, 16 //
  946. por mm7, mm6 // words in mm7
  947. // 0, 0, lM1, lM1
  948. pxor mm6, mm6
  949. movq mm5, mm2 // move volume1 into mm5
  950. // use pack to get 4 volume values together for multiplication.
  951. packssdw mm5, mm6 // words in mm5
  952. // 0, 0, vfRVolume1, vfLVolume1
  953. /*
  954. // Set lLM to be same as lM
  955. lLM1 = lM1;
  956. lLM1 *= vfLVolume1;
  957. lLM1 >>= 5; // Signal bumps up to 15 bits.
  958. lM1 *= vfRVolume1;
  959. lM1 >>= 5;
  960. // Set lLM to be same as lM
  961. lLM2 = lM2;
  962. lLM2 *= vfLVolume2;
  963. lLM2 >>= 5; // Signal bumps up to 15 bits.
  964. lM2 *= vfRVolume2;
  965. lM2 >>= 5;
  966. */
  967. // above multiplies and shifts are all done with this one pmul
  968. pmulhw mm5, mm7
  969. // calculate buffer location.
  970. mov edi, ecx
  971. shl edi, 1
  972. add edi, pBuffer
  973. /*
  974. add word ptr[edi-4], si
  975. jno no_oflowl1
  976. // pBuffer[dwI] = 0x7fff;
  977. mov word ptr[edi-4], 0x7fff
  978. js no_oflowl1
  979. //pBuffer[dwI] = (short) 0x8000;
  980. mov word ptr[edi-4], 0x8000
  981. no_oflowl1:
  982. //pBuffer[dwI+1] += (short) lM1;
  983. add word ptr[edi-2], dx
  984. jno no_oflowr1
  985. //pBuffer[dwI+1] = 0x7fff;
  986. mov word ptr[edi-2], 0x7fff
  987. js no_oflowr1
  988. //pBuffer[dwI+1] = (short) 0x8000;
  989. mov word ptr[edi-2], 0x8000
  990. no_oflowr1:
  991. */
  // The scalar overflow handling above is replaced by a saturating add of the
  // left/right pair (two shorts) that precedes pBuffer[dwI].
  992. movd mm7, dword ptr[edi-4]
  993. paddsw mm7, mm5
  994. movd dword ptr[edi-4], mm7
  995. //}
  996. #endif
  997. done:
  998. mov edx, this // get address of class object
  999. //m_vfLastLVolume = vfLVolume;
  1000. //m_vfLastRVolume = vfRVolume;
  1001. // need to shift volume back down to 12 bits before storing
  1002. psrld mm2, 3
  1003. movd [edx]this.m_vfLastLVolume, mm2 // left volume is the low dword
  1004. psrlq mm2, 32 // bring the right volume down into the low dword
  1005. movd [edx]this.m_vfLastRVolume, mm2
  1006. //m_pfLastPitch = pfPitch;
  1007. mov [edx]this.m_pfLastPitch, ebx
  1008. //m_pfLastSample = pfSamplePos;
  1009. mov [edx]this.m_pfLastSample, eax
  1010. // put value back into dwI to be returned. This could just be passed back in eax I think.
  1011. mov dwI, ecx
  1012. emms
  1013. } // ASM block
  1014. return (dwI >> 1); // dwI counts shorts; halve it to get left/right pairs
  1015. }
  1016. /*****************************************************************************
  1017. * MMXDisabled()
  1018. *****************************************************************************
  1019. * Check the registry key to determine whether to ignore MMX.
  1020. */
  1021. static BOOL MMXDisabled()
  1022. {
  1023. ULONG ulValue;
  1024. if (!GetRegValueDword(
  1025. TEXT("Software\\Microsoft\\DirectMusic"),
  1026. TEXT("MMXDisabled"),
  1027. &ulValue))
  1028. {
  1029. return FALSE;
  1030. }
  1031. return (BOOL)ulValue;
  1032. }
  // Emits the raw opcode bytes 0x0F 0xA2 (the x86 CPUID instruction); for use inside an _asm block.
  1033. #define CPU_ID _asm _emit 0x0f _asm _emit 0xa2
  1034. /*****************************************************************************
  1035. * MultiMediaInstructionsSupported()
  1036. *****************************************************************************
  1037. * Returns whether this CPU supports MMX.
  1038. */
  1039. BOOL MultiMediaInstructionsSupported()
  1040. {
  1041. BOOL bMultiMediaInstructionsSupported;
  1042. if (!MMXDisabled())
  1043. {
  1044. _asm
  1045. {
  1046. pushfd // Store original EFLAGS on stack
  1047. pop eax // Get original EFLAGS in EAX
  1048. mov ecx, eax // Duplicate original EFLAGS in ECX for toggle check
  1049. xor eax, 0x00200000L // Flip ID bit in EFLAGS
  1050. push eax // Save new EFLAGS value on stack
  1051. popfd // Replace current EFLAGS value
  1052. pushfd // Store new EFLAGS on stack
  1053. pop eax // Get new EFLAGS in EAX
  1054. xor eax, ecx // Can we toggle ID bit?
  1055. jz Done // Jump if no, Processor is older than a Pentium so CPU_ID is not supported
  1056. mov eax, 1 // Set EAX to tell the CPUID instruction what to return
  1057. push ebx
  1058. CPU_ID // Get family/model/stepping/features
  1059. pop ebx
  1060. xor eax,eax // Assume failure
  1061. test edx, 0x00800000L // Check if mmx technology available
  1062. jz Done // Jump if no
  1063. // Tests passed, this machine supports MMX
  1064. inc eax // Set to success
  1065. Done:
  1066. mov bMultiMediaInstructionsSupported, eax
  1067. }
  1068. } else {
  1069. bMultiMediaInstructionsSupported = 0;
  1070. }
  1071. return (bMultiMediaInstructionsSupported);
  1072. }