Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1479 lines
42 KiB

  1. // Copyright (c) 1996-1999 Microsoft Corporation
  2. // Mmx.cpp
  3. // MMX Mix engines for Microsoft synth
  4. /*
  5. Variable usage.
  6. Variable register
  7. pfSamplePos eax
  8. pfPitch ebx
  9. dwI ecx
  10. dwIncDelta edx (edx is sometimes a temporary register)
  11. dwPosition1 esi
  12. dwPosition2 edi
  13. vfRVFract and vfLVFract (volumes << 8, high resolution) mm0
  14. vfRVolume, vfLVolume mm2
  15. mm4 - mm7 are temporary mmx registers.
  16. */
  17. // Notes about calculation.
  18. // Loop is unrolled once.
  19. // *1 shifting volume to 15 bit values to get rid of shifts and simplify code.
  20. // This makes the packed multiply work better later since I keep the interpolated
  21. // sound wave value as a 16 bit signed value. For a PMULHW, this results in 15 bit results
  22. // which is the same as the original code.
  23. // *2 linear interpolation can be done very quickly with MMX by re-arranging the
  24. // way that the interpolation is done. Here is code in C that shows the difference.
  25. // Original C code
  26. //lM1 = ((pcWave[dwPosition1 + 1] - pcWave[dwPosition1]) * dwFract1) >> 12;
  27. //lM2 = ((pcWave[dwPosition2 + 1] - pcWave[dwPosition2]) * dwFract2) >> 12;
  28. //lM1 += pcWave[dwPosition1];
  29. //lM2 += pcWave[dwPosition2];
  30. // Equivalent C Code that can be done with a pmadd
  31. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  32. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  33. #ifdef DMSYNTH_MINIPORT
  34. #include "common.h"
  35. #else
  36. #include "simple.h"
  37. #include <mmsystem.h>
  38. #include "synth.h"
  39. #include "debug.h"
  40. #endif
  41. typedef unsigned __int64 QWORD;
  42. #ifdef ORG_MONO_MIXER
  // MixMono8X: MMX mono mix engine for 8-bit wave data.
  // Walks the wave (m_pnWave viewed as an array of chars) starting at m_pfLastSample,
  // advancing by the current pitch for every output sample. Each output sample is the
  // linear interpolation of two adjacent wave bytes, scaled by the current volume and
  // added, with signed saturation, into pBuffer. The main loop is unrolled to produce
  // two output samples per pass; StoreOne produces a single trailing sample.
  //
  // Parameters:
  //   pBuffer        - buffer of 16-bit samples that the mixed output is added into.
  //   dwLength       - number of output samples to produce.
  //   dwDeltaPeriod  - reload value for the dwIncDelta countdown. The countdown drops by 2
  //                    on every loop pass and the volume/pitch deltas are applied only
  //                    when it reaches exactly zero.
  //   vfDeltaVolume  - added to the high resolution volume (volume << 8) on each update.
  //   vfLastVolume   - vfLastVolume[0] supplies the starting volume.
  //   pfDeltaPitch   - added to the high resolution pitch (pitch << 8) on each update.
  //   pfSampleLength - sample position at or beyond which the loop/stop check triggers.
  //   pfLoopLength   - subtracted from the position once pfSampleLength is reached; when
  //                    it is zero, mixing stops instead.
  // Sample positions carry 12 fractional bits: the wave index is (pos >> 12) and the
  // interpolation fraction is (pos & 0xfff).
  //
  // Returns the value of dwI on exit (the count of output samples handled). The final
  // pitch and sample position are written back to m_pfLastPitch and m_pfLastSample.
  43. DWORD CDigitalAudio::MixMono8X(short * pBuffer,
  44. DWORD dwLength,
  45. DWORD dwDeltaPeriod,
  46. VFRACT vfDeltaVolume,
  47. VFRACT vfLastVolume[],
  48. PFRACT pfDeltaPitch,
  49. PFRACT pfSampleLength,
  50. PFRACT pfLoopLength)
  51. {
  52. DWORD dwI;
  53. DWORD dwIncDelta = dwDeltaPeriod;
  54. char * pcWave = (char *) m_pnWave;
  55. PFRACT pfSamplePos = m_pfLastSample;
  56. VFRACT vfVolume = vfLastVolume[0];
  57. PFRACT pfPitch = m_pfLastPitch;
  58. PFRACT pfPFract = pfPitch << 8;
  59. VFRACT vfVFract = vfVolume << 8; // High res volume; not referenced by the asm below (mm0 holds it instead).
  // Per-word masks/constants for the two interpolation fractions held in the low two words.
  60. QWORD dwFractMASK = 0x000000000FFF0FFF;
  61. QWORD dwFractOne = 0x0000000010001000;
  62. QWORD wordmask = 0x0000FFFF0000FFFF; // Not referenced in this function.
  63. QWORD vfDeltaLandRVolume;
  64. _asm{
  65. // vfLVFract and vfRVFract are in mm0
  66. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  67. //VFRACT vfRVFract = vfRVolume1 << 8;
  // Mono: the same volume is loaded into both dwords of mm0.
  68. movd mm0, vfVolume
  69. movd mm7, vfVolume
  70. // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
  71. movd mm1, vfDeltaVolume
  72. movd mm6, vfDeltaVolume
  73. punpckldq mm1, mm6
  74. // dwI = 0
  75. mov ecx, 0
  76. movq vfDeltaLandRVolume, mm1
  77. movq mm1, dwFractOne
  78. movq mm4, dwFractMASK
  79. mov eax, pfSamplePos
  80. punpckldq mm0, mm7
  81. mov ebx, pfPitch
  82. pslld mm0, 8
  83. mov edx, dwIncDelta
  84. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  85. // need to be set before first pass.
  86. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  87. psrld mm2, 5
  88. //for (dwI = 0; dwI < dwLength; )
  89. //{
  90. mainloop:
  91. cmp ecx, dwLength
  92. jae done
  93. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  94. jb NotPastEndOfSample1 //{
  95. cmp pfLoopLength, 0 //if (!pfLoopLength)
  96. je done // break;
  97. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  98. NotPastEndOfSample1: //}
  99. mov esi, eax // dwPosition1 = pfSamplePos;
  100. add eax, ebx // pfSamplePos += pfPitch;
  101. sub edx, 2 // dwIncDelta-=2;
  102. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  103. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  104. // for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
  105. // if it goes below zero.
  106. paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume;
  107. // vfVFract += vfDeltaVolume;
  108. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  109. mov edx, pfPFract // Temp = pfPFract;
  110. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  111. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  112. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  113. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  114. // TestRVol = vfRVFract & (~TestRVol);
  115. mov pfPFract, edx // pfPFract = Temp;
  116. movq mm2, mm5 // vfLVolume = TestLVol;
  117. // vfRVolume = TestRVol;
  118. shr edx, 8 // Temp = Temp >> 8;
  119. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  120. // vfRVolume = vfRVolume >> 5;
  121. mov ebx, edx // pfPitch = Temp;
  122. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  123. //}
  124. DontIncreaseValues1:
  125. movd mm6, esi // dwFract1 = dwPosition1;
  126. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  127. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  128. inc ecx //dwI++;
  129. // if (dwI >= dwLength) only one sample is left to write: go to StoreOne
  130. cmp ecx, dwLength
  131. jae StoreOne
  132. //if (pfSamplePos >= pfSampleLength)
  133. //{
  134. cmp eax, pfSampleLength
  135. jb NotPastEndOfSample2
  136. // Original if in C was not negated
  137. //if (!pfLoopLength)
  138. cmp pfLoopLength, 0
  139. //break;
  140. je StoreOne
  141. //else
  142. //pfSamplePos -= pfLoopLength;
  143. sub eax, pfLoopLength
  144. //}
  145. NotPastEndOfSample2:
  146. //shl esi, 1 // do not shift left since pcWave is array of chars
  147. mov edi, eax // dwPosition2 = pfSamplePos;
  148. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  149. movd mm7, eax // dwFract2 = pfSamplePos;
  150. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  151. punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
  152. // 0, 0, dwFract2, dwFract1
  153. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  // The 16-bit load fetches pcWave[dwPosition1] (low byte) and pcWave[dwPosition1 + 1] (high byte).
  154. movzx esi, word ptr[esi] //lLM1 = pcWave[dwPosition1];
  155. movd mm3, esi
  156. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  157. //shl edi, 1 //do not shift left since pcWave is array of chars
  158. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  159. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  160. mov esi, ecx // Temp = dWI;
  161. shl esi, 1 // Temp = Temp << 1;
  162. movzx edi, word ptr[edi] //lLM2 = pcWave[dwPosition2];
  163. movd mm6, edi
  164. pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
  165. // low 4 bytes in mm3
  166. punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  167. add esi, pBuffer //
  // Interleaving with the zeroed mm7 puts each wave byte in the high byte of a word,
  // i.e. every 8-bit sample becomes (sample << 8) as a 16-bit value.
  168. punpcklbw mm7, mm3 // low four bytes bytes in
  169. // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  170. pmaddwd mm7, mm5 // high dword = lM2 =
  171. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  172. // low dword = lM1 =
  173. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  174. movq mm3, mm2 // put left and right volume levels in mm3
  175. add eax, ebx //pfSamplePos += pfPitch;
  176. packssdw mm3, mm2 // words in mm3
  177. // vfVolume, vfVolume, vfVolume, vfVolume
  // esi is pBuffer + 2 * dwI with dwI already incremented once, so [esi-2] addresses the
  // first of the two output samples written by this pass.
  178. movd mm5, dword ptr[esi-2] // Load values from buffer
  179. inc ecx // dwI++;
  180. psrad mm7, 12 // shift back down to 16 bits.
  181. packssdw mm7, mm4 // only need one word in mono case.
  182. // low word are lm2 and lm1
  183. // above multiplies and shifts are all done with this one pmul. Low two word are only
  184. // interest in mono case
  185. pmulhw mm3, mm7 // lLM1 *= vfVolume;
  186. // lLM2 *= vfVolume;
  187. paddsw mm5, mm3 // Add values to buffer with saturation
  188. movd dword ptr[esi-2], mm5 // Store values back into buffer.
  189. // }
  190. jmp mainloop
  191. // Need to write only one.
  192. //if (dwI < dwLength)
  193. //{
  194. StoreOne:
  195. #if 1
  196. // Linearly interpolate between points and store only one value.
  197. // combine dwFract Values.
  198. // Make mm7 zero for unpacking
  199. //shl esi, 1 // do not shift left since pcWave is array of chars
  200. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  201. pxor mm7, mm7
  202. //lLM1 = pcWave[dwPosition1];
  203. movzx esi, word ptr[esi]
  204. // Doing AND that was not done for dwFract1 and dwFract2
  // mm6 still holds the whole 32-bit position here, so after the AND its second word is
  // (position >> 16) & 0xfff rather than 0. That lane is multiplied by the zero words of
  // mm7 in the pmaddwd below, so only the low dword of the result is meaningful.
  205. pand mm6, mm4
  206. // words in MMX register after operation is complete.
  207. psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
  208. punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
  209. // put values of pcWave into MMX registers. They are read into a regular register so
  210. // that the routine does not read past the end of the buffer otherwise, it could read
  211. // directly into the MMX registers.
  212. // words in MMX registers
  213. pxor mm7, mm7
  214. // low four bytes
  215. movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  216. // 8 bytes after unpack
  217. punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
  218. // *2 pmadd efficent code.
  219. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  220. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  221. pmaddwd mm7, mm5// low dword = lM1 =
  222. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  223. psrad mm7, 12 // shift back down to 16 bits
  224. movq mm5, mm2 // move volume into mm5
  225. /*
  226. // Set lLM to be same as lM
  227. lLM1 = lM1;
  228. lLM1 *= vfLVolume1;
  229. lLM1 >>= 5; // Signal bumps up to 15 bits.
  230. lM1 *= vfRVolume1;
  231. lM1 >>= 5;
  232. // Set lLM to be same as lM
  233. lLM2 = lM2;
  234. lLM2 *= vfLVolume2;
  235. lLM2 >>= 5; // Signal bumps up to 15 bits.
  236. lM2 *= vfRVolume2;
  237. lM2 >>= 5;
  238. */
  239. // above multiplies and shifts are all done with this one pmul
  240. pmulhw mm5, mm7
  241. // calculate buffer location.
  242. mov edi, ecx
  243. shl edi, 1
  244. add edi, pBuffer
  245. movd edx, mm5
  // dwI was already incremented, so [edi-2] is the slot for this last sample.
  // The mov instructions below leave the flags from the add untouched, so js still
  // tests the sign of the wrapped sum: negative means positive overflow (keep 0x7fff),
  // otherwise the sum underflowed and 0x8000 is stored.
  246. //pBuffer[dwI-1] += (short) lM1;
  247. add word ptr[edi-2], dx
  248. jno no_oflowr1
  249. //pBuffer[dwI-1] = 0x7fff;
  250. mov word ptr[edi-2], 0x7fff
  251. js no_oflowr1
  252. //pBuffer[dwI-1] = (short) 0x8000;
  253. mov word ptr[edi-2], 0x8000
  254. no_oflowr1:
  255. //}
  256. #endif
  257. done:
  258. mov edx, this // get address of class object
  259. //vfLastVolume[0] = vfVolume;
  260. //vfLastVolume[1] = vfVolume;
  261. // need to shift volume back down to 12 bits before storing
  262. psrld mm2, 3
  263. #if 0
  264. movd [edx]this.m_vfLastVolume[0], mm2
  265. movd [edx]this.m_vfLastVolume[1], mm2
  266. #endif
  // NOTE(review): vfLastVolume is a pointer parameter. Presumably the inline assembler
  // resolves these operands to the parameter's own stack slot (plus an offset) rather
  // than to the caller's array - confirm where these two stores actually land.
  267. movd vfLastVolume[0], mm2
  268. movd vfLastVolume[1], mm2
  269. //m_pfLastPitch = pfPitch;
  270. mov [edx]this.m_pfLastPitch, ebx
  271. //m_pfLastSample = pfSamplePos;
  272. mov [edx]this.m_pfLastSample, eax
  273. // put value back into dwI to be returned. This could just be passed back in eax I think.
  274. mov dwI, ecx
  275. emms
  276. } // ASM block
  277. return (dwI);
  278. }
  279. #endif
  // Mix8X: MMX stereo mix engine for 8-bit wave data.
  // Walks the wave (m_pnWave viewed as an array of chars) starting at m_pfLastSample,
  // advancing by the current pitch for every output frame. Each interpolated wave value
  // is multiplied by the left and by the right volume, and the two products are added,
  // with signed saturation, to two consecutive shorts of pBuffer (left first, then right).
  // The main loop is unrolled to produce two frames (four shorts) per pass; StoreOne
  // produces a single trailing frame.
  //
  // Parameters:
  //   pBuffer        - buffer of interleaved left/right 16-bit samples that is added into.
  //   dwLength       - number of output frames; doubled on entry so that dwI counts shorts.
  //   dwDeltaPeriod  - reload value for the dwIncDelta countdown. The countdown drops by 2
  //                    on every loop pass and the volume/pitch deltas are applied only
  //                    when it reaches exactly zero.
  //   vfDeltaLVolume - added to the high resolution left volume (volume << 8) on each update.
  //   vfDeltaRVolume - added to the high resolution right volume (volume << 8) on each update.
  //   vfLastVolume   - [0] supplies the starting left volume, [1] the starting right volume.
  //   pfDeltaPitch   - added to the high resolution pitch (pitch << 8) on each update.
  //   pfSampleLength - sample position at or beyond which the loop/stop check triggers.
  //   pfLoopLength   - subtracted from the position once pfSampleLength is reached; when
  //                    it is zero, mixing stops instead.
  // Sample positions carry 12 fractional bits: the wave index is (pos >> 12) and the
  // interpolation fraction is (pos & 0xfff).
  //
  // Returns dwI >> 1, i.e. the short count converted back to frames. The final pitch and
  // sample position are written back to m_pfLastPitch and m_pfLastSample.
  280. DWORD CDigitalAudio::Mix8X(short * pBuffer,
  281. DWORD dwLength,
  282. DWORD dwDeltaPeriod,
  283. VFRACT vfDeltaLVolume,
  284. VFRACT vfDeltaRVolume,
  285. VFRACT vfLastVolume[],
  286. PFRACT pfDeltaPitch,
  287. PFRACT pfSampleLength,
  288. PFRACT pfLoopLength)
  289. {
  290. DWORD dwI;
  291. //DWORD dwPosition1, dwPosition2;
  292. //long lM1, lLM1;
  293. //long lM2, lLM2;
  294. DWORD dwIncDelta = dwDeltaPeriod;
  295. //VFRACT dwFract1, dwFract2;
  296. char * pcWave = (char *) m_pnWave;
  297. PFRACT pfSamplePos = m_pfLastSample;
  298. VFRACT vfLVolume = vfLastVolume[0];
  299. VFRACT vfRVolume = vfLastVolume[1];
  300. VFRACT vfLVolume2 = vfLastVolume[0]; // Not referenced by the asm below.
  301. VFRACT vfRVolume2 = vfLastVolume[1]; // Not referenced by the asm below.
  302. PFRACT pfPitch = m_pfLastPitch;
  303. PFRACT pfPFract = pfPitch << 8;
  304. dwLength <<= 1; // dwI advances by two shorts (left, right) per frame.
  // Per-word masks/constants for the two interpolation fractions held in the low two words.
  305. QWORD dwFractMASK = 0x000000000FFF0FFF;
  306. QWORD dwFractOne = 0x0000000010001000;
  // Keeps the low word of each dword; used to duplicate lM1/lM2 into both words of a dword.
  307. QWORD wordmask = 0x0000FFFF0000FFFF;
  308. QWORD vfDeltaLandRVolume;
  309. _asm{
  310. // vfLVFract and vfRVFract are in mm0
  311. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  312. //VFRACT vfRVFract = vfRVolume1 << 8;
  // Left volume ends up in the low dword of mm0, right volume in the high dword.
  313. movd mm0, vfLVolume
  314. movd mm7, vfRVolume
  315. // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
  316. movd mm1, vfDeltaLVolume
  317. movd mm6, vfDeltaRVolume
  318. punpckldq mm1, mm6
  319. // dwI = 0
  320. mov ecx, 0
  321. movq vfDeltaLandRVolume, mm1
  322. movq mm1, dwFractOne
  323. movq mm4, dwFractMASK
  324. mov eax, pfSamplePos
  325. punpckldq mm0, mm7
  326. mov ebx, pfPitch
  327. pslld mm0, 8
  328. mov edx, dwIncDelta
  329. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  330. // need to be set before first pass.
  331. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  332. psrld mm2, 5
  333. //for (dwI = 0; dwI < dwLength; )
  334. //{
  335. mainloop:
  336. cmp ecx, dwLength
  337. jae done
  338. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  339. jb NotPastEndOfSample1 //{
  340. cmp pfLoopLength, 0 //if (!pfLoopLength)
  341. je done // break;
  342. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  343. NotPastEndOfSample1: //}
  344. mov esi, eax // dwPosition1 = pfSamplePos;
  345. add eax, ebx // pfSamplePos += pfPitch;
  346. sub edx, 2 // dwIncDelta-=2;
  347. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  348. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  349. // for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
  350. // if it goes below zero.
  351. paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
  352. // vfRVFract += vfDeltaRVolume;
  353. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  354. mov edx, pfPFract // Temp = pfPFract;
  355. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  356. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  357. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  358. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  359. // TestRVol = vfRVFract & (~TestRVol);
  360. mov pfPFract, edx // pfPFract = Temp;
  361. movq mm2, mm5 // vfLVolume = TestLVol;
  362. // vfRVolume = TestRVol;
  363. shr edx, 8 // Temp = Temp >> 8;
  364. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  365. // vfRVolume = vfRVolume >> 5;
  366. mov ebx, edx // pfPitch = Temp;
  367. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  368. //}
  369. DontIncreaseValues1:
  370. movd mm6, esi // dwFract1 = dwPosition1;
  371. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  372. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  373. add ecx, 2 //dwI += 2;
  374. // if (dwI >= dwLength) only one frame is left to write: go to StoreOne
  375. cmp ecx, dwLength
  376. jae StoreOne
  377. //if (pfSamplePos >= pfSampleLength)
  378. //{
  379. cmp eax, pfSampleLength
  380. jb NotPastEndOfSample2
  381. // Original if in C was not negated
  382. //if (!pfLoopLength)
  383. cmp pfLoopLength, 0
  384. //break;
  385. je StoreOne
  386. //else
  387. //pfSamplePos -= pfLoopLength;
  388. sub eax, pfLoopLength
  389. //}
  390. NotPastEndOfSample2:
  391. //shl esi, 1 // do not shift left since pcWave is array of chars
  392. mov edi, eax // dwPosition2 = pfSamplePos;
  393. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  394. movd mm7, eax // dwFract2 = pfSamplePos;
  395. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  396. punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
  397. // 0, 0, dwFract2, dwFract1
  398. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  // The 16-bit load fetches pcWave[dwPosition1] (low byte) and pcWave[dwPosition1 + 1] (high byte).
  399. movzx esi, word ptr[esi] //lLM1 = pcWave[dwPosition1];
  400. movd mm3, esi
  401. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  402. //shl edi, 1 // do not shift left since pcWave is array of chars
  403. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  404. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  405. mov esi, ecx // Temp = dWI;
  406. shl esi, 1 // Temp = Temp << 1;
  407. movzx edi, word ptr[edi] //lLM2 = pcWave[dwPosition2];
  408. movd mm6, edi
  409. pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
  410. // low 4 bytes bytes in mm3
  411. punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  412. add esi, pBuffer //
  // Interleaving with the zeroed mm7 puts each wave byte in the high byte of a word,
  // i.e. every 8-bit sample becomes (sample << 8) as a 16-bit value.
  413. punpcklbw mm7, mm3 // bytes in mm7
  414. // pcWave[dwPos2+1], 0, pcWave[dwPos2], 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
  415. pmaddwd mm7, mm5 // high dword = lM2 =
  416. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  417. // low dword = lM1 =
  418. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  419. movq mm3, mm2 // put left and right volume levels in mm3
  420. add eax, ebx //pfSamplePos += pfPitch;
  421. packssdw mm3, mm2 // words in mm3
  422. // vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
  // esi is pBuffer + 2 * dwI with dwI already advanced by 2, so [esi-4] addresses the
  // first of the four shorts (two frames) written by this pass.
  423. movq mm5, qword ptr[esi-4] // Load values from buffer
  424. add ecx, 2 // dwI += 2;
  425. psrad mm7, 12 // shift back down to 16 bits.
  426. pand mm7, wordmask // combine results to get ready to multiply by left and right
  427. movq mm6, mm7 // volume levels.
  428. pslld mm6, 16 //
  429. por mm7, mm6 // words in mm7
  430. // lM2, lM2, lM1, lM1
  431. // above multiplies and shifts are all done with this one pmul
  432. pmulhw mm3, mm7 // lLM1 *= vfLVolume;
  433. // lM1 *= vfRVolume;
  434. // lLM2 *= vfLVolume;
  435. // lM2 *= vfRVolume;
  436. paddsw mm5, mm3 // Add values to buffer with saturation
  437. movq qword ptr[esi-4], mm5 // Store values back into buffer.
  438. // }
  439. jmp mainloop
  440. // Need to write only one.
  441. //if (dwI < dwLength)
  442. //{
  443. StoreOne:
  444. #if 1
  445. // Linearly interpolate between points and store only one value.
  446. // combine dwFract Values.
  447. // Make mm7 zero for unpacking
  448. //shl esi, 1 // do not shift left since pcWave is array of chars
  449. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  450. pxor mm7, mm7
  451. //lLM1 = pcWave[dwPosition1];
  452. movzx esi, word ptr[esi]
  453. // Doing AND that was not done for dwFract1 and dwFract2
  // mm6 still holds the whole 32-bit position here, so after the AND its second word is
  // (position >> 16) & 0xfff rather than 0. That lane is multiplied by the zero words of
  // mm7 in the pmaddwd below, so only the low dword of the result is meaningful.
  454. pand mm6, mm4
  455. // words in MMX register after operation is complete.
  456. psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
  457. punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
  458. // put values of pcWave into MMX registers. They are read into a regular register so
  459. // that the routine does not read past the end of the buffer otherwise, it could read
  460. // directly into the MMX registers.
  461. pxor mm7, mm7
  462. // byte in MMX registers
  463. movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  464. punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
  465. // *2 pmadd efficent code.
  466. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  467. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  468. pmaddwd mm7, mm5// low dword = lM1 =
  469. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  470. psrad mm7, 12 // shift back down to 16 bits
  471. pand mm7, wordmask // combine results to get ready to multiply by left and right
  472. movq mm6, mm7 // volume levels.
  473. pslld mm6, 16 //
  474. por mm7, mm6 // words in mm7
  475. // 0, 0, lM1, lM1
  476. pxor mm6, mm6
  477. movq mm5, mm2 // move volume1 into mm5
  478. // use pack to get 4 volume values together for multiplication.
  479. packssdw mm5, mm6 // words in mm5
  480. // 0, 0, vfRVolume1, vfLVolume1
  481. /*
  482. // Set lLM to be same as lM
  483. lLM1 = lM1;
  484. lLM1 *= vfLVolume1;
  485. lLM1 >>= 5; // Signal bumps up to 15 bits.
  486. lM1 *= vfRVolume1;
  487. lM1 >>= 5;
  488. // Set lLM to be same as lM
  489. lLM2 = lM2;
  490. lLM2 *= vfLVolume2;
  491. lLM2 >>= 5; // Signal bumps up to 15 bits.
  492. lM2 *= vfRVolume2;
  493. lM2 >>= 5;
  494. */
  495. // above multiplies and shifts are all done with this one pmul
  496. pmulhw mm5, mm7
  497. // calculate buffer location.
  498. mov edi, ecx
  499. shl edi, 1
  500. add edi, pBuffer
  501. /*
  502. add word ptr[edi-4], si
  503. jno no_oflowl1
  504. // pBuffer[dwI] = 0x7fff;
  505. mov word ptr[edi-4], 0x7fff
  506. js no_oflowl1
  507. //pBuffer[dwI] = (short) 0x8000;
  508. mov word ptr[edi-4], 0x8000
  509. no_oflowl1:
  510. //pBuffer[dwI+1] += (short) lM1;
  511. add word ptr[edi-2], dx
  512. jno no_oflowr1
  513. //pBuffer[dwI+1] = 0x7fff;
  514. mov word ptr[edi-2], 0x7fff
  515. js no_oflowr1
  516. //pBuffer[dwI+1] = (short) 0x8000;
  517. mov word ptr[edi-2], 0x8000
  518. no_oflowr1:
  519. */
  // dwI was already advanced by 2, so [edi-4] is the left/right pair for this last frame.
  // Both shorts are updated at once with a saturating packed add.
  520. movd mm7, dword ptr[edi-4]
  521. paddsw mm7, mm5
  522. movd dword ptr[edi-4], mm7
  523. //}
  524. #endif
  525. done:
  526. mov edx, this // get address of class object
  527. //vfLastVolume[0] = vfLVolume;
  528. //vfLastVolume[1] = vfRVolume;
  529. // need to shift volume back down to 12 bits before storing
  530. #if 0
  531. psrld mm2, 3
  532. movd [edx]this.m_vfLastVolume[0], mm2
  533. psrlq mm2, 32
  534. movd [edx]this.m_vfLastVolume[1], mm2
  535. #endif
  // NOTE(review): vfLastVolume is a pointer parameter. Presumably the inline assembler
  // resolves these operands to the parameter's own stack slot (plus an offset) rather
  // than to the caller's array - confirm where these two stores actually land.
  536. psrld mm2, 3
  537. movd vfLastVolume[0], mm2
  538. psrlq mm2, 32
  539. movd vfLastVolume[1], mm2
  540. //m_pfLastPitch = pfPitch;
  541. mov [edx]this.m_pfLastPitch, ebx
  542. //m_pfLastSample = pfSamplePos;
  543. mov [edx]this.m_pfLastSample, eax
  544. // put value back into dwI to be returned. This could just be passed back in eax I think.
  545. mov dwI, ecx
  546. emms
  547. } // ASM block
  // dwI counted shorts (two per frame); convert back to frames for the caller.
  548. return (dwI >> 1);
  549. }
  550. #ifdef ORG_MONO_MIXER
  // MixMono16X: MMX mono mix engine for 16-bit wave data.
  // Walks the wave (m_pnWave viewed as an array of shorts) starting at m_pfLastSample,
  // advancing by the current pitch for every output sample. Each output sample is the
  // linear interpolation of two adjacent wave words, scaled by the current volume and
  // added, with signed saturation, into pBuffer. The main loop is unrolled to produce
  // two output samples per pass; StoreOne produces a single trailing sample.
  //
  // Parameters:
  //   pBuffer        - buffer of 16-bit samples that the mixed output is added into.
  //   dwLength       - number of output samples to produce.
  //   dwDeltaPeriod  - reload value for the dwIncDelta countdown. The countdown drops by 2
  //                    on every loop pass and the volume/pitch deltas are applied only
  //                    when it reaches exactly zero.
  //   vfDeltaVolume  - added to the high resolution volume (volume << 8) on each update.
  //   vfLastVolume   - vfLastVolume[0] supplies the starting volume.
  //   pfDeltaPitch   - added to the high resolution pitch (pitch << 8) on each update.
  //   pfSampleLength - sample position at or beyond which the loop/stop check triggers.
  //   pfLoopLength   - subtracted from the position once pfSampleLength is reached; when
  //                    it is zero, mixing stops instead.
  // Sample positions carry 12 fractional bits: the wave index is (pos >> 12) and the
  // interpolation fraction is (pos & 0xfff).
  //
  // Returns the value of dwI on exit (the count of output samples handled). The final
  // pitch and sample position are written back to m_pfLastPitch and m_pfLastSample.
  551. DWORD CDigitalAudio::MixMono16X(short * pBuffer,
  552. DWORD dwLength,
  553. DWORD dwDeltaPeriod,
  554. VFRACT vfDeltaVolume,
  555. VFRACT vfLastVolume[],
  556. PFRACT pfDeltaPitch,
  557. PFRACT pfSampleLength,
  558. PFRACT pfLoopLength)
  559. {
  560. DWORD dwI;
  561. DWORD dwIncDelta = dwDeltaPeriod;
  562. short * pcWave = (short*) m_pnWave;
  563. PFRACT pfSamplePos = m_pfLastSample;
  564. VFRACT vfVolume = vfLastVolume[0];
  565. PFRACT pfPitch = m_pfLastPitch;
  566. PFRACT pfPFract = pfPitch << 8;
  567. VFRACT vfVFract = vfVolume << 8; // High res volume; not referenced by the asm below (mm0 holds it instead).
  // Per-word masks/constants for the two interpolation fractions held in the low two words.
  568. QWORD dwFractMASK = 0x000000000FFF0FFF;
  569. QWORD dwFractOne = 0x0000000010001000;
  570. QWORD wordmask = 0x0000FFFF0000FFFF; // Not referenced in this function.
  571. QWORD vfDeltaLandRVolume;
  572. _asm{
  573. // vfLVFract and vfRVFract are in mm0
  574. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  575. //VFRACT vfRVFract = vfRVolume1 << 8;
  // Mono: the same volume is loaded into both dwords of mm0.
  576. movd mm0, vfVolume
  577. movd mm7, vfVolume
  578. // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
  579. movd mm1, vfDeltaVolume
  580. movd mm6, vfDeltaVolume
  581. punpckldq mm1, mm6
  582. // dwI = 0
  583. mov ecx, 0
  584. movq vfDeltaLandRVolume, mm1
  585. movq mm1, dwFractOne
  586. movq mm4, dwFractMASK
  587. mov eax, pfSamplePos
  588. punpckldq mm0, mm7
  589. mov ebx, pfPitch
  590. pslld mm0, 8
  591. mov edx, dwIncDelta
  592. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  593. // need to be set before first pass.
  594. // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
  595. psrld mm2, 5
  596. //for (dwI = 0; dwI < dwLength; )
  597. //{
  598. mainloop:
  599. cmp ecx, dwLength
  600. jae done
  601. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  602. jb NotPastEndOfSample1 //{
  603. cmp pfLoopLength, 0 //if (!pfLoopLength)
  604. je done // break;
  605. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  606. NotPastEndOfSample1: //}
  607. mov esi, eax // dwPosition1 = pfSamplePos;
  608. add eax, ebx // pfSamplePos += pfPitch;
  609. sub edx, 2 // dwIncDelta-=2;
  610. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  611. // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
  612. // for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
  613. // if it goes below zero.
  614. paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume;
  615. // vfVFract += vfDeltaVolume;
  616. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  617. mov edx, pfPFract // Temp = pfPFract;
  618. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  619. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  620. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  621. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  622. // TestRVol = vfRVFract & (~TestRVol);
  623. mov pfPFract, edx // pfPFract = Temp;
  624. movq mm2, mm5 // vfLVolume = TestLVol;
  625. // vfRVolume = TestRVol;
  626. shr edx, 8 // Temp = Temp >> 8;
  627. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  628. // vfRVolume = vfRVolume >> 5;
  629. mov ebx, edx // pfPitch = Temp;
  630. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  631. //}
  632. DontIncreaseValues1:
  633. movd mm6, esi // dwFract1 = dwPosition1;
  634. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  635. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  636. inc ecx //dwI++;
  637. // if (dwI >= dwLength) only one sample is left to write: go to StoreOne
  638. cmp ecx, dwLength
  639. jae StoreOne
  640. //if (pfSamplePos >= pfSampleLength)
  641. //{
  642. cmp eax, pfSampleLength
  643. jb NotPastEndOfSample2
  644. // Original if in C was not negated
  645. //if (!pfLoopLength)
  646. cmp pfLoopLength, 0
  647. //break;
  648. je StoreOne
  649. //else
  650. //pfSamplePos -= pfLoopLength;
  651. sub eax, pfLoopLength
  652. //}
  653. NotPastEndOfSample2:
  654. shl esi, 1 // shift left since pcWave is array of shorts
  655. mov edi, eax // dwPosition2 = pfSamplePos;
  656. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  657. movd mm7, eax // dwFract2 = pfSamplePos;
  658. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  659. punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
  660. // 0, 0, dwFract2, dwFract1
  661. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  // The 32-bit load fetches pcWave[dwPosition1] (low word) and pcWave[dwPosition1 + 1] (high word).
  662. movd mm7, dword ptr[esi] //lLM1 = pcWave[dwPosition1];
  663. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  664. shl edi, 1 // shift left since pcWave is array of shorts
  665. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  666. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  667. mov esi, ecx // Temp = dWI;
  668. shl esi, 1 // Temp = Temp << 1;
  669. movq mm3, mm2 // put left and right volume levels in mm3
  670. movd mm6, dword ptr[edi] //lLM2 = pcWave[dwPosition2];
  671. packssdw mm3, mm2 // words in mm3
  672. // vfVolume, vfVolume, vfVolume, vfVolume
  673. add esi, pBuffer //
  674. punpckldq mm7, mm6 // words in mm7
  675. // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  676. pmaddwd mm7, mm5 // high dword = lM2 =
  677. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  678. // low dword = lM1 =
  679. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  680. add eax, ebx //pfSamplePos += pfPitch;
  // esi is pBuffer + 2 * dwI with dwI already incremented once, so [esi-2] addresses the
  // first of the two output samples written by this pass.
  681. movd mm5, dword ptr[esi-2] // Load values from buffer
  682. inc ecx // dwI++;
  683. psrad mm7, 12 // shift back down to 16 bits.
  684. packssdw mm7, mm4 // only need one word in mono case.
  685. // low word are lm2 and lm1
  686. // above multiplies and shifts are all done with this one pmul. Low two word are only
  687. // interest in mono case
  688. pmulhw mm3, mm7 // lLM1 *= vfVolume;
  689. // lLM2 *= vfVolume;
  690. paddsw mm5, mm3 // Add values to buffer with saturation
  691. movd dword ptr[esi-2], mm5 // Store values back into buffer.
  692. // }
  693. jmp mainloop
  694. // Need to write only one.
  695. //if (dwI < dwLength)
  696. //{
  697. StoreOne:
  698. #if 1
  699. // Linearly interpolate between points and store only one value.
  700. // combine dwFract Values.
  701. // Make mm7 zero for unpacking
  702. shl esi, 1 // shift left since pcWave is array of shorts
  703. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  704. pxor mm7, mm7
  705. //lLM1 = pcWave[dwPosition1];
  706. mov esi, dword ptr[esi]
  707. // Doing AND that was not done for dwFract1 and dwFract2
  // mm6 still holds the whole 32-bit position here, so after the AND its second word is
  // (position >> 16) & 0xfff rather than 0. That lane is multiplied by the zero upper
  // words of mm7 in the pmaddwd below, so only the low dword of the result is meaningful.
  708. pand mm6, mm4
  709. // words in MMX register after operation is complete.
  710. psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
  711. punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
  712. // put values of pcWave into MMX registers. They are read into a regular register so
  713. // that the routine does not read past the end of the buffer otherwise, it could read
  714. // directly into the MMX registers.
  715. // words in MMX registers
  716. movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  717. // *2 pmadd efficent code.
  718. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  719. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  720. pmaddwd mm7, mm5// low dword = lM1 =
  721. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  722. psrad mm7, 12 // shift back down to 16 bits
  723. movq mm5, mm2 // move volume into mm5
  724. /*
  725. // Set lLM to be same as lM
  726. lLM1 = lM1;
  727. lLM1 *= vfLVolume1;
  728. lLM1 >>= 5; // Signal bumps up to 15 bits.
  729. lM1 *= vfRVolume1;
  730. lM1 >>= 5;
  731. // Set lLM to be same as lM
  732. lLM2 = lM2;
  733. lLM2 *= vfLVolume2;
  734. lLM2 >>= 5; // Signal bumps up to 15 bits.
  735. lM2 *= vfRVolume2;
  736. lM2 >>= 5;
  737. */
  738. // above multiplies and shifts are all done with this one pmul
  739. pmulhw mm5, mm7
  740. // calculate buffer location.
  741. mov edi, ecx
  742. shl edi, 1
  743. add edi, pBuffer
  744. movd edx, mm5
  // dwI was already incremented, so [edi-2] is the slot for this last sample.
  // The mov instructions below leave the flags from the add untouched, so js still
  // tests the sign of the wrapped sum: negative means positive overflow (keep 0x7fff),
  // otherwise the sum underflowed and 0x8000 is stored.
  745. //pBuffer[dwI-1] += (short) lM1;
  746. add word ptr[edi-2], dx
  747. jno no_oflowr1
  748. //pBuffer[dwI-1] = 0x7fff;
  749. mov word ptr[edi-2], 0x7fff
  750. js no_oflowr1
  751. //pBuffer[dwI-1] = (short) 0x8000;
  752. mov word ptr[edi-2], 0x8000
  753. no_oflowr1:
  754. //}
  755. #endif
  756. done:
  757. mov edx, this // get address of class object
  758. //vfLastVolume[0] = vfVolume;
  759. //vfLastVolume[1] = vfVolume;
  760. // need to shift volume back down to 12 bits before storing
  761. psrld mm2, 3
  762. #if 0
  763. movd [edx]this.m_vfLastVolume[0], mm2
  764. movd [edx]this.m_vfLastVolume[1], mm2
  765. #endif
  // NOTE(review): vfLastVolume is a pointer parameter. Presumably the inline assembler
  // resolves these operands to the parameter's own stack slot (plus an offset) rather
  // than to the caller's array - confirm where these two stores actually land.
  766. movd vfLastVolume[0], mm2
  767. movd vfLastVolume[1], mm2
  768. //m_pfLastPitch = pfPitch;
  769. mov [edx]this.m_pfLastPitch, ebx
  770. //m_pfLastSample = pfSamplePos;
  771. mov [edx]this.m_pfLastSample, eax
  772. // put value back into dwI to be returned. This could just be passed back in eax I think.
  773. mov dwI, ecx
  774. emms
  775. } // ASM block
  776. return (dwI);
  777. }
  778. #endif
  // Mix16X
  // MMX mixer for 16-bit wave data (m_pnWave), using linear interpolation, that adds
  // into a 16-bit buffer holding one left/right pair of shorts per output sample.
  // Mixing resumes from m_pfLastSample / m_pfLastPitch and both are written back on exit.
  // Sample positions carry 12 fractional bits (see the shr 12 and the 0x0FFF mask below).
  // The main loop is unrolled once: each pass produces two output samples, with a
  // single-sample tail (StoreOne) when only one more sample can be produced.
  // Returns the number of output samples (left/right pairs) mixed; this may be less
  // than dwLength when the wave data ends and pfLoopLength is zero.
  779. DWORD CDigitalAudio::Mix16X(short * pBuffer, // destination; results are added in with signed saturation
  780. DWORD dwLength, // number of output samples (left/right pairs) requested
  781. DWORD dwDeltaPeriod, // reload value for the counter that schedules volume/pitch steps
  782. VFRACT vfDeltaLVolume, // added to the high-res left volume at each step
  783. VFRACT vfDeltaRVolume, // added to the high-res right volume at each step
  784. VFRACT vfLastVolume[], // [0] = starting left volume, [1] = starting right volume
  785. PFRACT pfDeltaPitch, // added to the high-res pitch (pitch << 8) at each step
  786. PFRACT pfSampleLength, // position at or beyond which the wave loops or ends
  787. PFRACT pfLoopLength) // subtracted from the position at the end; 0 means no loop
  788. {
  789. DWORD dwI;
  790. //DWORD dwPosition1, dwPosition2;
  791. //long lM1, lLM1;
  792. //long lM2, lLM2;
  793. DWORD dwIncDelta = dwDeltaPeriod;
  794. //VFRACT dwFract1, dwFract2;
  795. short * pcWave = (short *) m_pnWave;
  796. PFRACT pfSamplePos = m_pfLastSample;
  797. VFRACT vfLVolume = vfLastVolume[0];
  798. VFRACT vfRVolume = vfLastVolume[1];
  799. VFRACT vfLVolume2 = vfLastVolume[0]; // not referenced by the asm block below
  800. VFRACT vfRVolume2 = vfLastVolume[1]; // not referenced by the asm block below
  801. PFRACT pfPitch = m_pfLastPitch;
  802. PFRACT pfPFract = pfPitch << 8; // high-res pitch accumulator; pfPitch = pfPFract >> 8
  803. dwLength <<= 1; // from here on dwLength and dwI count shorts (two per output sample)
  804. QWORD dwFractMASK = 0x000000000FFF0FFF; // keeps the 12 fraction bits in the two low words
  805. QWORD dwFractOne = 0x0000000010001000; // 1.0 (0x1000) in the two low words
  806. QWORD wordmask = 0x0000FFFF0000FFFF; // keeps the low word of each dword
  807. QWORD vfDeltaLandRVolume; // low dword = vfDeltaLVolume, high dword = vfDeltaRVolume
  808. _asm{
  809. // vfLVFract and vfRVFract are in mm0
  810. //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
  811. //VFRACT vfRVFract = vfRVolume1 << 8;
  812. movd mm0, vfLVolume
  813. movd mm7, vfRVolume
  814. // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
  815. movd mm1, vfDeltaLVolume
  816. movd mm6, vfDeltaRVolume
  817. punpckldq mm1, mm6
  818. // dwI = 0
  819. mov ecx, 0
  820. movq vfDeltaLandRVolume, mm1
  821. movq mm1, dwFractOne // mm1 holds dwFractOne for the whole loop
  822. movq mm4, dwFractMASK // mm4 holds dwFractMASK for the whole loop
  823. mov eax, pfSamplePos
  824. punpckldq mm0, mm7 // mm0: low dword = left volume, high dword = right volume
  825. mov ebx, pfPitch
  826. pslld mm0, 8 // high res volumes (volume << 8)
  827. mov edx, dwIncDelta
  828. movq mm2, mm0 // vfLVolume and vfRVolume in mm2
  829. // need to be set before first pass.
  830. // *1 Net shift is << 3 (<< 8 then >> 5) so that volume is a 15 bit value instead of a 12 bit value
  831. psrld mm2, 5
  832. //for (dwI = 0; dwI < dwLength; )
  833. //{
  834. mainloop:
  835. cmp ecx, dwLength
  836. jae done
  837. cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
  838. jb NotPastEndOfSample1 //{
  839. cmp pfLoopLength, 0 //if (!pfLoopLength)
  840. je done // break;
  841. sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
  842. NotPastEndOfSample1: //}
  843. mov esi, eax // dwPosition1 = pfSamplePos;
  844. add eax, ebx // pfSamplePos += pfPitch;
  845. sub edx, 2 // dwIncDelta-=2; (done once per pass of the unrolled loop)
  846. jnz DontIncreaseValues1 //if (!dwIncDelta) {
  847. // Since edx was used for dwIncDelta and is now zero, it can serve as a temporary
  848. // for a bit. All that the TestLVol and TestRVol code does is zero out a volume
  849. // that has gone below zero.
  850. paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
  851. // vfRVFract += vfDeltaRVolume;
  852. pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
  853. mov edx, pfPFract // Temp = pfPFract;
  854. pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
  855. // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
  856. add edx, pfDeltaPitch // Temp += pfDeltaPitch;
  857. pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
  858. // TestRVol = vfRVFract & (~TestRVol);
  859. mov pfPFract, edx // pfPFract = Temp;
  860. movq mm2, mm5 // vfLVolume = TestLVol;
  861. // vfRVolume = TestRVol;
  862. shr edx, 8 // Temp = Temp >> 8; (logical shift)
  863. psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
  864. // vfRVolume = vfRVolume >> 5;
  865. mov ebx, edx // pfPitch = Temp;
  866. mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
  867. //}
  868. DontIncreaseValues1:
  869. movd mm6, esi // dwFract1 = dwPosition1; (whole position, masked later)
  870. movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
  871. shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
  872. add ecx, 2 //dwI += 2;
  873. // if (dwI >= dwLength) only the first sample fits: go store just that one.
  874. cmp ecx, dwLength
  875. jae StoreOne
  876. //if (pfSamplePos >= pfSampleLength)
  877. //{
  878. cmp eax, pfSampleLength
  879. jb NotPastEndOfSample2
  880. // Original if in C was not negated
  881. //if (!pfLoopLength)
  882. cmp pfLoopLength, 0
  883. //break; (after storing the first sample of this pass)
  884. je StoreOne
  885. //else
  886. //pfSamplePos -= pfLoopLength;
  887. sub eax, pfLoopLength
  888. //}
  889. NotPastEndOfSample2:
  890. shl esi, 1 // shift left since pcWave is array of shorts
  891. mov edi, eax // dwPosition2 = pfSamplePos;
  892. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  893. movd mm7, eax // dwFract2 = pfSamplePos;
  894. shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
  895. punpcklwd mm6, mm7 // combine dwFract Values. Low two words in mm6 after unpack are
  896. // dwFract2, dwFract1 (upper two words are cleared by the mask below)
  897. pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
  898. movd mm7, dword ptr[esi] // loads pcWave[dwPosition1] and pcWave[dwPosition1 + 1]
  899. psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
  900. shl edi, 1 // shift left since pcWave is array of shorts
  901. punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
  902. add edi, pcWave // Put address of pcWave[dwPosition2] in edi
  903. mov esi, ecx // Temp = dWI;
  904. shl esi, 1 // Temp = Temp << 1; (byte offset, buffer holds shorts)
  905. movq mm3, mm2 // put left and right volume levels in mm3
  906. movd mm6, dword ptr[edi] // loads pcWave[dwPosition2] and pcWave[dwPosition2 + 1]
  907. packssdw mm3, mm2 // words in mm3
  908. // vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
  909. add esi, pBuffer // esi = &pBuffer[dwI]; dwI was already advanced by 2
  910. punpckldq mm7, mm6 // words in mm7
  911. // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
  912. pmaddwd mm7, mm5 // high dword = lM2 =
  913. //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
  914. // low dword = lM1 =
  915. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  916. add eax, ebx //pfSamplePos += pfPitch;
  917. movq mm5, qword ptr[esi-4] // Load the four shorts (two left/right pairs) from buffer
  918. add ecx, 2 // dwI += 2;
  919. psrad mm7, 12 // shift back down to 16 bits.
  920. pand mm7, wordmask // combine results to get ready to multiply by left and right
  921. movq mm6, mm7 // volume levels.
  922. pslld mm6, 16 //
  923. por mm7, mm6 // words in mm7
  924. // lM2, lM2, lM1, lM1
  925. // above multiplies and shifts are all done with this one pmul
  926. pmulhw mm3, mm7 // lLM1 *= vfLVolume;
  927. // lM1 *= vfRVolume;
  928. // lLM2 *= vfLVolume;
  929. // lM2 *= vfRVolume;
  930. paddsw mm5, mm3 // Add values to buffer with saturation
  931. movq qword ptr[esi-4], mm5 // Store values back into buffer.
  932. // }
  933. jmp mainloop
  934. // Need to write only one.
  935. //if (dwI < dwLength)
  936. //{
  937. StoreOne:
  938. #if 1
  939. // Linearly interpolate between points and store only one value.
  940. // On entry: esi = dwPosition1 (already >> 12), mm6 = unmasked position of sample 1,
  941. // mm5 = dwFractOne, ecx = dwI already advanced past this sample.
  942. shl esi, 1 // shift left since pcWave is array of shorts
  943. add esi, pcWave // Put address of pcWave[dwPosition1] in esi
  944. pxor mm7, mm7
  945. // loads pcWave[dwPosition1] and pcWave[dwPosition1 + 1] into a regular register
  946. mov esi, dword ptr[esi]
  947. // Doing AND that was not done for dwFract1 and dwFract2
  948. pand mm6, mm4
  949. // words in MMX register after operation is complete.
  950. psubw mm5, mm6 // 0, 0, (unused), 0x1000 - dwFract1
  951. punpcklwd mm5, mm6 // (unused), (unused), dwFract1, 0x1000 - dwFract1
  952. // The lanes marked unused are derived from upper position bits; they are multiplied
  953. // by the zero upper words of mm7 below, so they do not affect the result.
  954. // The wave data is read via a regular register so that the routine does not read
  955. // past the end of the buffer; otherwise it could read directly into the MMX registers.
  956. movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
  957. // *2 pmadd efficent code.
  958. //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
  959. //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
  960. pmaddwd mm7, mm5// low dword = lM1 =
  961. //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
  962. psrad mm7, 12 // shift back down to 16 bits
  963. pand mm7, wordmask // combine results to get ready to multiply by left and right
  964. movq mm6, mm7 // volume levels.
  965. pslld mm6, 16 //
  966. por mm7, mm6 // words in mm7
  967. // 0, 0, lM1, lM1
  968. pxor mm6, mm6
  969. movq mm5, mm2 // move volume1 into mm5
  970. // use pack to get the volume values together for multiplication.
  971. packssdw mm5, mm6 // words in mm5
  972. // 0, 0, vfRVolume1, vfLVolume1
  973. /*
  974. // Set lLM to be same as lM
  975. lLM1 = lM1;
  976. lLM1 *= vfLVolume1;
  977. lLM1 >>= 5; // Signal bumps up to 15 bits.
  978. lM1 *= vfRVolume1;
  979. lM1 >>= 5;
  980. // Set lLM to be same as lM
  981. lLM2 = lM2;
  982. lLM2 *= vfLVolume2;
  983. lLM2 >>= 5; // Signal bumps up to 15 bits.
  984. lM2 *= vfRVolume2;
  985. lM2 >>= 5;
  986. */
  987. // above multiplies and shifts are all done with this one pmul
  988. pmulhw mm5, mm7
  989. // calculate buffer location.
  990. mov edi, ecx
  991. shl edi, 1
  992. add edi, pBuffer
  993. /*
  994. add word ptr[edi-4], si
  995. jno no_oflowl1
  996. // pBuffer[dwI] = 0x7fff;
  997. mov word ptr[edi-4], 0x7fff
  998. js no_oflowl1
  999. //pBuffer[dwI] = (short) 0x8000;
  1000. mov word ptr[edi-4], 0x8000
  1001. no_oflowl1:
  1002. //pBuffer[dwI+1] += (short) lM1;
  1003. add word ptr[edi-2], dx
  1004. jno no_oflowr1
  1005. //pBuffer[dwI+1] = 0x7fff;
  1006. mov word ptr[edi-2], 0x7fff
  1007. js no_oflowr1
  1008. //pBuffer[dwI+1] = (short) 0x8000;
  1009. mov word ptr[edi-2], 0x8000
  1010. no_oflowr1:
  1011. */
  1012. movd mm7, dword ptr[edi-4] // load the one left/right pair from the buffer
  1013. paddsw mm7, mm5 // add with signed saturation (replaces the scalar code above)
  1014. movd dword ptr[edi-4], mm7 // store the pair back
  1015. //}
  1016. #endif
  1017. done:
  1018. mov edx, this // get address of class object
  1019. //vfLastVolume[0] = vfLVolume;
  1020. //vfLastVolume[1] = vfRVolume;
  1021. // need to shift volume back down to 12 bits before storing
  1022. #if 0
  1023. psrld mm2, 3
  1024. movd [edx]this.vfLastVolume[0], mm2
  1025. psrlq mm2, 32
  1026. movd [edx]this.vfLastVolume[1], mm2
  1027. #endif
  // NOTE(review): vfLastVolume is a pointer parameter here, whereas the disabled block
  // above addressed it through the class object. In MSVC inline asm a C symbol names the
  // variable's own storage and [n] is an unscaled byte offset, so the two stores below may
  // be writing to the parameter slot rather than the caller's array -- confirm against
  // the caller before relying on vfLastVolume being updated.
  1028. psrld mm2, 3
  1029. movd vfLastVolume[0], mm2
  1030. psrlq mm2, 32 // bring the right volume down into the low dword
  1031. movd vfLastVolume[1], mm2
  1032. //m_pfLastPitch = pfPitch;
  1033. mov [edx]this.m_pfLastPitch, ebx
  1034. //m_pfLastSample = pfSamplePos;
  1035. mov [edx]this.m_pfLastSample, eax
  1036. // put value back into dwI to be returned. This could just be passed back in eax I think.
  1037. mov dwI, ecx
  1038. emms // leave MMX state before returning to C code
  1039. } // ASM block
  1040. return (dwI >> 1); // dwI counts shorts; convert back to output samples
  1041. }
  1042. static BOOL MMXDisabled()
  1043. {
  1044. ULONG ulValue = FALSE;
  1045. if (!GetRegValueDword(
  1046. TEXT("Software\\Microsoft\\DirectMusic"),
  1047. TEXT("MMXDisabled"),
  1048. &ulValue))
  1049. {
  1050. return FALSE;
  1051. }
  1052. return (BOOL)ulValue;
  1053. }
  1054. #define CPU_ID _asm _emit 0x0f _asm _emit 0xa2
  1055. BOOL MultiMediaInstructionsSupported()
  1056. {
  1057. static BOOL bMultiMediaInstructionsSupported = FALSE;
  1058. static BOOL bFlagNotSetYet = TRUE;
  1059. // No need to keep interogating the CPU after it has been checked the first time
  1060. if (bFlagNotSetYet)
  1061. {
  1062. bFlagNotSetYet = FALSE; // Don't repeat the check for each call
  1063. if (!MMXDisabled())
  1064. {
  1065. _asm
  1066. {
  1067. pushfd // Store original EFLAGS on stack
  1068. pop eax // Get original EFLAGS in EAX
  1069. mov ecx, eax // Duplicate original EFLAGS in ECX for toggle check
  1070. xor eax, 0x00200000L // Flip ID bit in EFLAGS
  1071. push eax // Save new EFLAGS value on stack
  1072. popfd // Replace current EFLAGS value
  1073. pushfd // Store new EFLAGS on stack
  1074. pop eax // Get new EFLAGS in EAX
  1075. xor eax, ecx // Can we toggle ID bit?
  1076. jz Done // Jump if no, Processor is older than a Pentium so CPU_ID is not supported
  1077. mov eax, 1 // Set EAX to tell the CPUID instruction what to return
  1078. push ebx
  1079. CPU_ID // Get family/model/stepping/features
  1080. pop ebx
  1081. test edx, 0x00800000L // Check if mmx technology available
  1082. jz Done // Jump if no
  1083. }
  1084. // Tests have passed, this machine supports the Intel MultiMedia Instruction Set!
  1085. bMultiMediaInstructionsSupported = TRUE;
  1086. Done:
  1087. NULL;
  1088. }
  1089. }
  1090. #if DBG
  1091. if ( bMultiMediaInstructionsSupported )
  1092. {
  1093. Trace(1,"MMX - Detected, Enabling MMX mixing\n\r");
  1094. }
  1095. else
  1096. {
  1097. Trace(1,"MMX - Not Detected\n\r");
  1098. }
  1099. #endif
  1100. return (bMultiMediaInstructionsSupported);
  1101. }