//
// Copyright (c) 1996-2000 Microsoft Corporation. All rights reserved.
// Mmx.cpp
// MMX Mix engines for Microsoft synth

/*  Variable usage (identical in all four mix routines below).

    Variable                                register
    pfSamplePos                             eax
    pfPitch                                 ebx
    dwI                                     ecx
    dwIncDelta                              edx (edx is sometimes a temporary register)
    dwPosition1                             esi
    dwPosition2                             edi

    vfLVFract, vfRVFract (volume << 8)      mm0
    dwFractOne                              mm1
    vfLVolume, vfRVolume (volume << 3)      mm2
    dwFractMASK                             mm4 (overwritten as a temporary in the
                                                 8-bit StoreOne tails)

    mm3, mm5 - mm7 are temporary mmx registers.
*/

// Notes about calculation.
// Loop is unrolled once.
// *1 Shifting volume to 15 bit values to get rid of shifts and simplify code.
//    This makes the packed multiply work better later since I keep the sound interpolated
//    wave value at 16 bit signed value. For a PMULHW, this results in 15 bit results
//    which is the same as the original code.
// *2 Linear interpolation can be done very quickly with MMX by re-arranging the
//    way that the interpolation is done. Here is code in C that shows the difference.
// Original C code
//lM1 = ((pcWave[dwPosition1 + 1] - pcWave[dwPosition1]) * dwFract1) >> 12;
//lM2 = ((pcWave[dwPosition2 + 1] - pcWave[dwPosition2]) * dwFract2) >> 12;
//lM1 += pcWave[dwPosition1];
//lM2 += pcWave[dwPosition2];
// Equivalent C Code that can be done with a pmadd
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
//
// NOTE(review): every sample fetch below reads pcWave[pos] AND pcWave[pos + 1], so the
// wave data presumably carries at least one valid sample past the last playable
// position - confirm against the code that fills m_pnWave.
//
// NOTE(review): the ramp counter is decremented with "sub edx, 2" and the volume/pitch
// update only runs when the result is exactly zero. If dwDeltaPeriod is odd the counter
// steps over zero and the ramp is not applied until the counter wraps. Confirm that
// callers always pass an even dwDeltaPeriod.

#include "common.h"

#define STR_MODULENAME "DDKSynth.sys:MMX: "

typedef unsigned __int64 QWORD;

#pragma code_seg()
/*****************************************************************************
 * CDigitalAudio::MixMono8X()
 *****************************************************************************
 * Implement a mono eight-bit mix.
 * Heavily optimized for MMX.
 *
 * Resamples the 8-bit wave at m_pnWave with linear interpolation, scales it by
 * the current volume and adds it (with signed saturation) into pBuffer.
 *
 *  pBuffer         Output buffer of shorts; one short per output sample.
 *  dwLength        Maximum number of output samples to produce.
 *  dwDeltaPeriod   Reload value for the ramp counter (see the note on
 *                  "sub edx, 2" in the file header).
 *  vfDeltaVolume   Added to (volume << 8) each time the ramp counter expires.
 *  pfDeltaPitch    Added to (pitch << 8) each time the ramp counter expires.
 *  pfSampleLength  Sample position (index << 12 | 12-bit fraction) at which
 *                  playback ends or loops.
 *  pfLoopLength    Subtracted from the position when it reaches
 *                  pfSampleLength; zero means stop mixing there.
 *
 * Returns the number of output samples written. On exit m_vfLastLVolume,
 * m_vfLastRVolume, m_pfLastPitch and m_pfLastSample are updated.
 */
DWORD CDigitalAudio::MixMono8X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod, VFRACT vfDeltaVolume, PFRACT pfDeltaPitch, PFRACT pfSampleLength, PFRACT pfLoopLength)
{
    DWORD dwI,dwIncDelta = dwDeltaPeriod;
    char * pcWave = (char *) m_pnWave;
    PFRACT pfSamplePos = m_pfLastSample;
    VFRACT vfVolume = m_vfLastLVolume;
    PFRACT pfPitch = m_pfLastPitch;
    PFRACT pfPFract = pfPitch << 8;
    VFRACT vfVFract = vfVolume << 8; // Keep high res version around.
                                     // NOTE(review): not referenced by the asm below (mm0 holds this value).
    QWORD dwFractMASK = 0x000000000FFF0FFF;   // 0xFFF in each of the two low words
    QWORD dwFractOne = 0x0000000010001000;    // 0x1000 (1.0 in 12-bit fraction) in each of the two low words
    QWORD wordmask = 0x0000FFFF0000FFFF;      // NOTE(review): not referenced in this (mono) routine.
    QWORD vfDeltaLandRVolume;                 // scratch: volume delta duplicated into both dwords

    _asm{

        // vfLVFract and vfRVFract are in mm0
        //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
        //VFRACT vfRVFract = vfRVolume1 << 8;
        movd mm0, vfVolume
        movd mm7, vfVolume

        // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
        // (mono: the same delta is used for both halves)
        movd mm1, vfDeltaVolume
        movd mm6, vfDeltaVolume
        punpckldq mm1, mm6

        // dwI = 0
        mov ecx, 0
        movq vfDeltaLandRVolume, mm1
        movq mm1, dwFractOne
        movq mm4, dwFractMASK

        mov eax, pfSamplePos
        punpckldq mm0, mm7
        mov ebx, pfPitch
        pslld mm0, 8
        mov edx, dwIncDelta
        movq mm2, mm0           // vfLVolume and vfRVolume in mm2
                                // need to be set before first pass.

        // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
        psrld mm2, 5

        //for (dwI = 0; dwI < dwLength; )
        //{
    mainloop:
        cmp ecx, dwLength
        jae done

        cmp eax, pfSampleLength     //if (pfSamplePos >= pfSampleLength)
        jb NotPastEndOfSample1      //{

        cmp pfLoopLength, 0         //if (!pfLoopLength)
        je done                     // break;

        sub eax, pfLoopLength       // else pfSamplePos -= pfLoopLength;

    NotPastEndOfSample1:            //}

        mov esi, eax                // dwPosition1 = pfSamplePos;
        add eax, ebx                // pfSamplePos += pfPitch;

        sub edx, 2                  // dwIncDelta-=2;
        jnz DontIncreaseValues1     //if (!dwIncDelta) {

        // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
        // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
        // if it goes below zero.
        paddd mm0, vfDeltaLandRVolume   // vfVFract += vfDeltaVolume; (both dwords)
        pxor mm5, mm5                   // TestLVol = 0; TestRVol = 0;

        mov edx, pfPFract               // Temp = pfPFract;
        pcmpgtd mm5, mm0                // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
                                        // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;

        add edx, pfDeltaPitch           // Temp += pfDeltaPitch;
        pandn mm5, mm0                  // TestLVol = vfLVFract & (~TestLVol);
                                        // TestRVol = vfRVFract & (~TestRVol);

        mov pfPFract, edx               // pfPFract = Temp;
        movq mm2, mm5                   // vfLVolume = TestLVol;
                                        // vfRVolume = TestRVol;

        shr edx, 8                      // Temp = Temp >> 8;
        psrld mm2, 5                    // vfLVolume = vfLVolume >> 5;
                                        // vfRVolume = vfRVolume >> 5;

        mov ebx, edx                    // pfPitch = Temp;
        mov edx, dwDeltaPeriod          //dwIncDelta = dwDeltaPeriod;
        //}
    DontIncreaseValues1:

        movd mm6, esi               // dwFract1 = dwPosition1;
        movq mm5, mm1               // words in mm5 = 0, 0, 0x1000, 0x1000
        shr esi, 12                 // dwPosition1 = dwPosition1 >> 12;

        inc ecx                     //dwI++;

        // if (dwI >= dwLength) only one sample is left: finish it in StoreOne
        cmp ecx, dwLength
        jae StoreOne

        //if (pfSamplePos >= pfSampleLength)
        //{
        cmp eax, pfSampleLength
        jb NotPastEndOfSample2

        // Original if in C was not negated
        //if (!pfLoopLength)
        cmp pfLoopLength, 0
        //break;
        je StoreOne
        //else
        //pfSamplePos -= pfLoopLength;
        sub eax, pfLoopLength
        //}
    NotPastEndOfSample2:

        //shl esi, 1                // do not shift left since pcWave is array of chars
        mov edi, eax                // dwPosition2 = pfSamplePos;
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        movd mm7, eax               // dwFract2 = pfSamplePos;

        shr edi, 12                 // dwPosition2 = dwPosition2 >> 12;
        punpcklwd mm6, mm7          // combine dwFract Values. After the pand below,
                                    // words in mm6 are 0, 0, dwFract2, dwFract1

        pand mm6, mm4               // dwFract2 &= 0xfff; dwFract1 &= 0xfff;

        movzx esi, word ptr[esi]    //lLM1 = pcWave[dwPosition1]; (and pcWave[dwPosition1 + 1])
        movd mm3, esi

        psubw mm5, mm6              // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1

        //shl edi, 1                // do not shift left since pcWave is array of chars
        punpcklwd mm5, mm6          // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1

        add edi, pcWave             // Put address of pcWave[dwPosition2] in edi
        mov esi, ecx                // Temp = dwI;

        shl esi, 1                  // Temp = Temp << 1;

        movzx edi, word ptr[edi]    //lLM2 = pcWave[dwPosition2]; (and pcWave[dwPosition2 + 1])
        movd mm6, edi

        pxor mm7, mm7               // zero out mm7 to make 8 bit into 16 bit

        // low 4 bytes in mm3
        punpcklwd mm3, mm6          // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]

        add esi, pBuffer            // esi = &pBuffer[dwI] (dwI already incremented once)
        punpcklbw mm7, mm3          // each byte lands in the high byte of a word (8 bit -> 16 bit):
                                    // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]

        pmaddwd mm7, mm5            // high dword = lM2 =
                                    //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
                                    // low dword = lM1 =
                                    //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        movq mm3, mm2               // put left and right volume levels in mm3

        add eax, ebx                //pfSamplePos += pfPitch;
        packssdw mm3, mm2           // words in mm3
                                    // vfVolume, vfVolume, vfVolume, vfVolume

        movd mm5, dword ptr[esi-2]  // Load two output samples from buffer

        inc ecx                     // dwI++;
        psrad mm7, 12               // shift back down to 16 bits.

        packssdw mm7, mm4           // only need one word per sample in mono case.
                                    // low two words are lM2 and lM1 (upper two words are unused)

        // above multiplies and shifts are all done with this one pmul. Low two words are the
        // only ones of interest in the mono case
        pmulhw mm3, mm7             // lLM1 *= vfVolume;
                                    // lLM2 *= vfVolume;

        paddsw mm5, mm3             // Add values to buffer with saturation
        movd dword ptr[esi-2], mm5  // Store values back into buffer.

        // }
        jmp mainloop

        // Need to write only one.
        //if (dwI < dwLength)
        //{
    StoreOne:
#if 1
        // Linearly interpolate between points and store only one value.
        // combine dwFract Values.

        // Make mm7 zero for unpacking

        //shl esi, 1                // do not shift left since pcWave is array of chars
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        pxor mm7, mm7

        //lLM1 = pcWave[dwPosition1];
        movzx esi, word ptr[esi]

        // Doing AND that was not done for dwFract1 and dwFract2
        pand mm6, mm4

        // words in MMX register after operation is complete (only the low dword matters:
        // the upper sample words in mm7 are zero, so they contribute nothing to pmaddwd).
        psubw mm5, mm6              // low word = 0x1000 - dwFract1
        punpcklwd mm5, mm6          // low dword = dwFract1, 0x1000 - dwFract1

        // put values of pcWave into MMX registers. They are read into a regular register so
        // that the routine does not read past the end of the buffer otherwise, it could read
        // directly into the MMX registers.

        // words in MMX registers
        pxor mm7, mm7
        // low four bytes
        movd mm4, esi               // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1] (mm4 no longer holds the mask)

        // 8 bytes after unpack
        punpcklbw mm7, mm4          // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0

        // *2 pmadd efficient code.
        //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
        //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;

        pmaddwd mm7, mm5// low dword = lM1 =
                        //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        psrad mm7, 12               // shift back down to 16 bits

        movq mm5, mm2               // move volume into mm5
        /*
        // Set lLM to be same as lM
        lLM1 = lM1;

        lLM1 *= vfLVolume1;
        lLM1 >>= 5;         // Signal bumps up to 15 bits.
        lM1 *= vfRVolume1;
        lM1 >>= 5;

        // Set lLM to be same as lM
        lLM2 = lM2;

        lLM2 *= vfLVolume2;
        lLM2 >>= 5;         // Signal bumps up to 15 bits.
        lM2 *= vfRVolume2;
        lM2 >>= 5;
        */
        // above multiplies and shifts are all done with this one pmul
        pmulhw mm5, mm7

        // calculate buffer location.
        mov edi, ecx
        shl edi, 1
        add edi, pBuffer

        movd edx, mm5

        //pBuffer[dwI] += (short) lM1;   (edi was computed after dwI++, hence the -2)
        add word ptr[edi-2], dx
        jno no_oflowr1
        //pBuffer[dwI] = 0x7fff;
        mov word ptr[edi-2], 0x7fff
        js no_oflowr1               // flags are still those of the add: wrapped-negative means positive overflow
        //pBuffer[dwI] = (short) 0x8000;
        mov word ptr[edi-2], 0x8000
    no_oflowr1:
        //}
#endif
    done:

        mov edx, this               // get address of class object

        //m_vfLastLVolume = vfVolume;
        //m_vfLastRVolume = vfVolume;
        // need to shift volume back down to 12 bits before storing
        // (the sub-step fraction kept in mm0 is not carried over)
        psrld mm2, 3
        movd [edx]this.m_vfLastLVolume, mm2
        movd [edx]this.m_vfLastRVolume, mm2
        //m_pfLastPitch = pfPitch;
        mov [edx]this.m_pfLastPitch, ebx
        //m_pfLastSample = pfSamplePos;
        mov [edx]this.m_pfLastSample, eax
        // put value back into dwI to be returned. This could just be passed back in eax I think.
        mov dwI, ecx
        emms
    } // ASM block
    return (dwI);
}

/*****************************************************************************
 * CDigitalAudio::Mix8X()
 *****************************************************************************
 * Implement a stereo eight-bit mix.
 * Heavily optimized for MMX.
 *
 * Resamples the 8-bit wave at m_pnWave with linear interpolation, scales it by
 * the left and right volumes and adds it (with signed saturation) into
 * pBuffer as interleaved left/right shorts.
 *
 *  pBuffer         Output buffer of interleaved shorts (left, right).
 *  dwLength        Maximum number of output frames; doubled internally so the
 *                  loop counter counts shorts.
 *  dwDeltaPeriod   Reload value for the ramp counter (see the note on
 *                  "sub edx, 2" in the file header).
 *  vfDeltaLVolume  Added to (left volume << 8) when the ramp counter expires.
 *  vfDeltaRVolume  Added to (right volume << 8) when the ramp counter expires.
 *  pfDeltaPitch    Added to (pitch << 8) when the ramp counter expires.
 *  pfSampleLength  Sample position (index << 12 | 12-bit fraction) at which
 *                  playback ends or loops.
 *  pfLoopLength    Subtracted from the position when it reaches
 *                  pfSampleLength; zero means stop mixing there.
 *
 * Returns the number of frames written (shorts written / 2). On exit
 * m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample are
 * updated.
 */
DWORD CDigitalAudio::Mix8X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod, VFRACT vfDeltaLVolume, VFRACT vfDeltaRVolume, PFRACT pfDeltaPitch, PFRACT pfSampleLength, PFRACT pfLoopLength)
{
    DWORD dwI;
    //DWORD dwPosition1, dwPosition2;
    //long lM1, lLM1;
    //long lM2, lLM2;
    DWORD dwIncDelta = dwDeltaPeriod;
    //VFRACT dwFract1, dwFract2;
    char * pcWave = (char *) m_pnWave;
    PFRACT pfSamplePos = m_pfLastSample;
    VFRACT vfLVolume = m_vfLastLVolume;
    VFRACT vfRVolume = m_vfLastRVolume;

    VFRACT vfLVolume2 = m_vfLastLVolume;    // NOTE(review): not referenced by the asm below.
    VFRACT vfRVolume2 = m_vfLastRVolume;    // NOTE(review): not referenced by the asm below.
    PFRACT pfPitch = m_pfLastPitch;
    PFRACT pfPFract = pfPitch << 8;
    dwLength <<= 1;                         // count shorts, two per stereo frame

    QWORD dwFractMASK = 0x000000000FFF0FFF; // 0xFFF in each of the two low words
    QWORD dwFractOne = 0x0000000010001000;  // 0x1000 (1.0 in 12-bit fraction) in each of the two low words
    QWORD wordmask = 0x0000FFFF0000FFFF;    // keeps the low word of each dword
    QWORD vfDeltaLandRVolume;               // scratch: low dword = left delta, high dword = right delta

    _asm{

        // vfLVFract and vfRVFract are in mm0
        //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
        //VFRACT vfRVFract = vfRVolume1 << 8;
        movd mm0, vfLVolume
        movd mm7, vfRVolume

        // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
        movd mm1, vfDeltaLVolume
        movd mm6, vfDeltaRVolume
        punpckldq mm1, mm6

        // dwI = 0
        mov ecx, 0
        movq vfDeltaLandRVolume, mm1
        movq mm1, dwFractOne
        movq mm4, dwFractMASK

        mov eax, pfSamplePos
        punpckldq mm0, mm7
        mov ebx, pfPitch
        pslld mm0, 8
        mov edx, dwIncDelta
        movq mm2, mm0           // vfLVolume and vfRVolume in mm2
                                // need to be set before first pass.

        // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
        psrld mm2, 5

        //for (dwI = 0; dwI < dwLength; )
        //{
    mainloop:
        cmp ecx, dwLength
        jae done

        cmp eax, pfSampleLength     //if (pfSamplePos >= pfSampleLength)
        jb NotPastEndOfSample1      //{

        cmp pfLoopLength, 0         //if (!pfLoopLength)
        je done                     // break;

        sub eax, pfLoopLength       // else pfSamplePos -= pfLoopLength;

    NotPastEndOfSample1:            //}

        mov esi, eax                // dwPosition1 = pfSamplePos;
        add eax, ebx                // pfSamplePos += pfPitch;

        sub edx, 2                  // dwIncDelta-=2;
        jnz DontIncreaseValues1     //if (!dwIncDelta) {

        // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
        // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
        // if it goes below zero.
        paddd mm0, vfDeltaLandRVolume   // vfLVFract += vfDeltaLVolume;
                                        // vfRVFract += vfDeltaRVolume;
        pxor mm5, mm5                   // TestLVol = 0; TestRVol = 0;

        mov edx, pfPFract               // Temp = pfPFract;
        pcmpgtd mm5, mm0                // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
                                        // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;

        add edx, pfDeltaPitch           // Temp += pfDeltaPitch;
        pandn mm5, mm0                  // TestLVol = vfLVFract & (~TestLVol);
                                        // TestRVol = vfRVFract & (~TestRVol);

        mov pfPFract, edx               // pfPFract = Temp;
        movq mm2, mm5                   // vfLVolume = TestLVol;
                                        // vfRVolume = TestRVol;

        shr edx, 8                      // Temp = Temp >> 8;
        psrld mm2, 5                    // vfLVolume = vfLVolume >> 5;
                                        // vfRVolume = vfRVolume >> 5;

        mov ebx, edx                    // pfPitch = Temp;
        mov edx, dwDeltaPeriod          //dwIncDelta = dwDeltaPeriod;
        //}
    DontIncreaseValues1:

        movd mm6, esi               // dwFract1 = dwPosition1;
        movq mm5, mm1               // words in mm5 = 0, 0, 0x1000, 0x1000
        shr esi, 12                 // dwPosition1 = dwPosition1 >> 12;

        add ecx, 2                  //dwI += 2;

        // if (dwI >= dwLength) only one frame is left: finish it in StoreOne
        cmp ecx, dwLength
        jae StoreOne

        //if (pfSamplePos >= pfSampleLength)
        //{
        cmp eax, pfSampleLength
        jb NotPastEndOfSample2

        // Original if in C was not negated
        //if (!pfLoopLength)
        cmp pfLoopLength, 0
        //break;
        je StoreOne
        //else
        //pfSamplePos -= pfLoopLength;
        sub eax, pfLoopLength
        //}
    NotPastEndOfSample2:

        //shl esi, 1                // do not shift left since pcWave is array of chars
        mov edi, eax                // dwPosition2 = pfSamplePos;
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        movd mm7, eax               // dwFract2 = pfSamplePos;

        shr edi, 12                 // dwPosition2 = dwPosition2 >> 12;
        punpcklwd mm6, mm7          // combine dwFract Values. After the pand below,
                                    // words in mm6 are 0, 0, dwFract2, dwFract1

        pand mm6, mm4               // dwFract2 &= 0xfff; dwFract1 &= 0xfff;

        movzx esi, word ptr[esi]    //lLM1 = pcWave[dwPosition1]; (and pcWave[dwPosition1 + 1])
        movd mm3, esi

        psubw mm5, mm6              // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1

        //shl edi, 1                // do not shift left since pcWave is array of chars
        punpcklwd mm5, mm6          // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1

        add edi, pcWave             // Put address of pcWave[dwPosition2] in edi
        mov esi, ecx                // Temp = dwI;

        shl esi, 1                  // Temp = Temp << 1;

        movzx edi, word ptr[edi]    //lLM2 = pcWave[dwPosition2]; (and pcWave[dwPosition2 + 1])
        movd mm6, edi

        pxor mm7, mm7               // zero out mm7 to make 8 bit into 16 bit

        // low 4 bytes in mm3
        punpcklwd mm3, mm6          // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]

        add esi, pBuffer            // esi = &pBuffer[dwI] (dwI already advanced by one frame)
        punpcklbw mm7, mm3          // bytes in mm7
                                    // pcWave[dwPos2+1], 0, pcWave[dwPos2], 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0

        pmaddwd mm7, mm5            // high dword = lM2 =
                                    //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
                                    // low dword = lM1 =
                                    //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        movq mm3, mm2               // put left and right volume levels in mm3

        add eax, ebx                //pfSamplePos += pfPitch;
        packssdw mm3, mm2           // words in mm3
                                    // vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1

        movq mm5, qword ptr[esi-4]  // Load two output frames from buffer

        add ecx, 2                  // dwI += 2;
        psrad mm7, 12               // shift back down to 16 bits.

        pand mm7, wordmask          // combine results to get ready to multiply by left and right
        movq mm6, mm7               // volume levels.
        pslld mm6, 16               //
        por mm7, mm6                // words in mm7
                                    // lM2, lM2, lM1, lM1

        // above multiplies and shifts are all done with this one pmul
        pmulhw mm3, mm7             // lLM1 *= vfLVolume;
                                    // lM1 *= vfRVolume;
                                    // lLM2 *= vfLVolume;
                                    // lM2 *= vfRVolume;

        paddsw mm5, mm3             // Add values to buffer with saturation
        movq qword ptr[esi-4], mm5  // Store values back into buffer.

        // }
        jmp mainloop

        // Need to write only one.
        //if (dwI < dwLength)
        //{
    StoreOne:
#if 1
        // Linearly interpolate between points and store only one value.
        // combine dwFract Values.

        // Make mm7 zero for unpacking

        //shl esi, 1                // do not shift left since pcWave is array of chars
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        pxor mm7, mm7

        //lLM1 = pcWave[dwPosition1];
        movzx esi, word ptr[esi]

        // Doing AND that was not done for dwFract1 and dwFract2
        pand mm6, mm4

        // words in MMX register after operation is complete (only the low dword matters:
        // the upper sample words in mm7 are zero, so they contribute nothing to pmaddwd).
        psubw mm5, mm6              // low word = 0x1000 - dwFract1
        punpcklwd mm5, mm6          // low dword = dwFract1, 0x1000 - dwFract1

        // put values of pcWave into MMX registers. They are read into a regular register so
        // that the routine does not read past the end of the buffer otherwise, it could read
        // directly into the MMX registers.

        pxor mm7, mm7
        // bytes in MMX registers
        movd mm4, esi               // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1] (mm4 no longer holds the mask)

        punpcklbw mm7, mm4          // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0

        // *2 pmadd efficient code.
        //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
        //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;

        pmaddwd mm7, mm5// low dword = lM1 =
                        //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        psrad mm7, 12               // shift back down to 16 bits

        pand mm7, wordmask          // combine results to get ready to multiply by left and right
        movq mm6, mm7               // volume levels.
        pslld mm6, 16               //
        por mm7, mm6                // words in mm7
                                    // 0, 0, lM1, lM1

        pxor mm6, mm6

        movq mm5, mm2               // move volume1 into mm5

        // use pack to get the volume values together for multiplication.
        packssdw mm5, mm6           // words in mm5
                                    // 0, 0, vfRVolume1, vfLVolume1
        /*
        // Set lLM to be same as lM
        lLM1 = lM1;

        lLM1 *= vfLVolume1;
        lLM1 >>= 5;         // Signal bumps up to 15 bits.
        lM1 *= vfRVolume1;
        lM1 >>= 5;

        // Set lLM to be same as lM
        lLM2 = lM2;

        lLM2 *= vfLVolume2;
        lLM2 >>= 5;         // Signal bumps up to 15 bits.
        lM2 *= vfRVolume2;
        lM2 >>= 5;
        */
        // above multiplies and shifts are all done with this one pmul
        pmulhw mm5, mm7

        // calculate buffer location.
        mov edi, ecx
        shl edi, 1
        add edi, pBuffer

        /*
        add word ptr[edi-4], si
        jno no_oflowl1
        // pBuffer[dwI] = 0x7fff;
        mov word ptr[edi-4], 0x7fff
        js no_oflowl1
        //pBuffer[dwI] = (short) 0x8000;
        mov word ptr[edi-4], 0x8000
    no_oflowl1:
        //pBuffer[dwI+1] += (short) lM1;
        add word ptr[edi-2], dx
        jno no_oflowr1
        //pBuffer[dwI+1] = 0x7fff;
        mov word ptr[edi-2], 0x7fff
        js no_oflowr1
        //pBuffer[dwI+1] = (short) 0x8000;
        mov word ptr[edi-2], 0x8000
    no_oflowr1:
        */
        // One frame (left, right): load, saturating add, store.
        movd mm7, dword ptr[edi-4]
        paddsw mm7, mm5
        movd dword ptr[edi-4], mm7
        //}
#endif
    done:

        mov edx, this               // get address of class object

        //m_vfLastLVolume = vfLVolume;
        //m_vfLastRVolume = vfRVolume;
        // need to shift volume back down to 12 bits before storing
        // (the sub-step fraction kept in mm0 is not carried over)
        psrld mm2, 3
        movd [edx]this.m_vfLastLVolume, mm2
        psrlq mm2, 32               // bring the right volume down into the low dword
        movd [edx]this.m_vfLastRVolume, mm2
        //m_pfLastPitch = pfPitch;
        mov [edx]this.m_pfLastPitch, ebx
        //m_pfLastSample = pfSamplePos;
        mov [edx]this.m_pfLastSample, eax
        // put value back into dwI to be returned. This could just be passed back in eax I think.
        mov dwI, ecx
        emms
    } // ASM block
    return (dwI >> 1);
}

/*****************************************************************************
 * CDigitalAudio::MixMono16X()
 *****************************************************************************
 * Implement a mono sixteen-bit mix.
 * Heavily optimized for MMX.
 *
 * Same structure as MixMono8X(), but the wave at m_pnWave is read as 16-bit
 * samples, so sample indices are doubled to form byte offsets and no
 * 8-to-16-bit widening is needed.
 *
 *  pBuffer         Output buffer of shorts; one short per output sample.
 *  dwLength        Maximum number of output samples to produce.
 *  dwDeltaPeriod   Reload value for the ramp counter (see the note on
 *                  "sub edx, 2" in the file header).
 *  vfDeltaVolume   Added to (volume << 8) each time the ramp counter expires.
 *  pfDeltaPitch    Added to (pitch << 8) each time the ramp counter expires.
 *  pfSampleLength  Sample position (index << 12 | 12-bit fraction) at which
 *                  playback ends or loops.
 *  pfLoopLength    Subtracted from the position when it reaches
 *                  pfSampleLength; zero means stop mixing there.
 *
 * Returns the number of output samples written. On exit m_vfLastLVolume,
 * m_vfLastRVolume, m_pfLastPitch and m_pfLastSample are updated.
 */
DWORD CDigitalAudio::MixMono16X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod,VFRACT vfDeltaVolume, PFRACT pfDeltaPitch,PFRACT pfSampleLength, PFRACT pfLoopLength)
{
    DWORD dwI,dwIncDelta = dwDeltaPeriod;
    short * pcWave = (short*) m_pnWave;
    PFRACT pfSamplePos = m_pfLastSample;
    VFRACT vfVolume = m_vfLastLVolume;
    PFRACT pfPitch = m_pfLastPitch;
    PFRACT pfPFract = pfPitch << 8;
    VFRACT vfVFract = vfVolume << 8; // Keep high res version around.
                                     // NOTE(review): not referenced by the asm below (mm0 holds this value).
    QWORD dwFractMASK = 0x000000000FFF0FFF;   // 0xFFF in each of the two low words
    QWORD dwFractOne = 0x0000000010001000;    // 0x1000 (1.0 in 12-bit fraction) in each of the two low words
    QWORD wordmask = 0x0000FFFF0000FFFF;      // NOTE(review): not referenced in this (mono) routine.
    QWORD vfDeltaLandRVolume;                 // scratch: volume delta duplicated into both dwords

    _asm{

        // vfLVFract and vfRVFract are in mm0
        //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
        //VFRACT vfRVFract = vfRVolume1 << 8;
        movd mm0, vfVolume
        movd mm7, vfVolume

        // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
        // (mono: the same delta is used for both halves)
        movd mm1, vfDeltaVolume
        movd mm6, vfDeltaVolume
        punpckldq mm1, mm6

        // dwI = 0
        mov ecx, 0
        movq vfDeltaLandRVolume, mm1
        movq mm1, dwFractOne
        movq mm4, dwFractMASK

        mov eax, pfSamplePos
        punpckldq mm0, mm7
        mov ebx, pfPitch
        pslld mm0, 8
        mov edx, dwIncDelta
        movq mm2, mm0           // vfLVolume and vfRVolume in mm2
                                // need to be set before first pass.

        // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
        psrld mm2, 5

        //for (dwI = 0; dwI < dwLength; )
        //{
    mainloop:
        cmp ecx, dwLength
        jae done

        cmp eax, pfSampleLength     //if (pfSamplePos >= pfSampleLength)
        jb NotPastEndOfSample1      //{

        cmp pfLoopLength, 0         //if (!pfLoopLength)
        je done                     // break;

        sub eax, pfLoopLength       // else pfSamplePos -= pfLoopLength;

    NotPastEndOfSample1:            //}

        mov esi, eax                // dwPosition1 = pfSamplePos;
        add eax, ebx                // pfSamplePos += pfPitch;

        sub edx, 2                  // dwIncDelta-=2;
        jnz DontIncreaseValues1     //if (!dwIncDelta) {

        // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
        // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
        // if it goes below zero.
        paddd mm0, vfDeltaLandRVolume   // vfVFract += vfDeltaVolume; (both dwords)
        pxor mm5, mm5                   // TestLVol = 0; TestRVol = 0;

        mov edx, pfPFract               // Temp = pfPFract;
        pcmpgtd mm5, mm0                // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
                                        // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;

        add edx, pfDeltaPitch           // Temp += pfDeltaPitch;
        pandn mm5, mm0                  // TestLVol = vfLVFract & (~TestLVol);
                                        // TestRVol = vfRVFract & (~TestRVol);

        mov pfPFract, edx               // pfPFract = Temp;
        movq mm2, mm5                   // vfLVolume = TestLVol;
                                        // vfRVolume = TestRVol;

        shr edx, 8                      // Temp = Temp >> 8;
        psrld mm2, 5                    // vfLVolume = vfLVolume >> 5;
                                        // vfRVolume = vfRVolume >> 5;

        mov ebx, edx                    // pfPitch = Temp;
        mov edx, dwDeltaPeriod          //dwIncDelta = dwDeltaPeriod;
        //}
    DontIncreaseValues1:

        movd mm6, esi               // dwFract1 = dwPosition1;
        movq mm5, mm1               // words in mm5 = 0, 0, 0x1000, 0x1000
        shr esi, 12                 // dwPosition1 = dwPosition1 >> 12;

        inc ecx                     //dwI++;

        // if (dwI >= dwLength) only one sample is left: finish it in StoreOne
        cmp ecx, dwLength
        jae StoreOne

        //if (pfSamplePos >= pfSampleLength)
        //{
        cmp eax, pfSampleLength
        jb NotPastEndOfSample2

        // Original if in C was not negated
        //if (!pfLoopLength)
        cmp pfLoopLength, 0
        //break;
        je StoreOne
        //else
        //pfSamplePos -= pfLoopLength;
        sub eax, pfLoopLength
        //}
    NotPastEndOfSample2:

        shl esi, 1                  // shift left since pcWave is array of shorts
        mov edi, eax                // dwPosition2 = pfSamplePos;
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        movd mm7, eax               // dwFract2 = pfSamplePos;

        shr edi, 12                 // dwPosition2 = dwPosition2 >> 12;
        punpcklwd mm6, mm7          // combine dwFract Values. After the pand below,
                                    // words in mm6 are 0, 0, dwFract2, dwFract1

        pand mm6, mm4               // dwFract2 &= 0xfff; dwFract1 &= 0xfff;

        movd mm7, dword ptr[esi]    //lLM1 = pcWave[dwPosition1]; (and pcWave[dwPosition1 + 1])

        psubw mm5, mm6              // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1

        shl edi, 1                  // shift left since pcWave is array of shorts
        punpcklwd mm5, mm6          // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1

        add edi, pcWave             // Put address of pcWave[dwPosition2] in edi
        mov esi, ecx                // Temp = dwI;

        shl esi, 1                  // Temp = Temp << 1;
        movq mm3, mm2               // put left and right volume levels in mm3

        movd mm6, dword ptr[edi]    //lLM2 = pcWave[dwPosition2]; (and pcWave[dwPosition2 + 1])
        packssdw mm3, mm2           // words in mm3
                                    // vfVolume, vfVolume, vfVolume, vfVolume

        add esi, pBuffer            // esi = &pBuffer[dwI] (dwI already incremented once)
        punpckldq mm7, mm6          // words in mm7
                                    // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]

        pmaddwd mm7, mm5            // high dword = lM2 =
                                    //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
                                    // low dword = lM1 =
                                    //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        add eax, ebx                //pfSamplePos += pfPitch;

        movd mm5, dword ptr[esi-2]  // Load two output samples from buffer

        inc ecx                     // dwI++;
        psrad mm7, 12               // shift back down to 16 bits.

        packssdw mm7, mm4           // only need one word per sample in mono case.
                                    // low two words are lM2 and lM1 (upper two words are unused)

        // above multiplies and shifts are all done with this one pmul. Low two words are the
        // only ones of interest in the mono case
        pmulhw mm3, mm7             // lLM1 *= vfVolume;
                                    // lLM2 *= vfVolume;

        paddsw mm5, mm3             // Add values to buffer with saturation
        movd dword ptr[esi-2], mm5  // Store values back into buffer.

        // }
        jmp mainloop

        // Need to write only one.
        //if (dwI < dwLength)
        //{
    StoreOne:
#if 1
        // Linearly interpolate between points and store only one value.
        // combine dwFract Values.

        // Make mm7 zero for unpacking

        shl esi, 1                  // shift left since pcWave is array of shorts
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        pxor mm7, mm7

        //lLM1 = pcWave[dwPosition1];
        mov esi, dword ptr[esi]

        // Doing AND that was not done for dwFract1 and dwFract2
        pand mm6, mm4

        // words in MMX register after operation is complete (only the low dword matters:
        // the upper sample words in mm7 are zero, so they contribute nothing to pmaddwd).
        psubw mm5, mm6              // low word = 0x1000 - dwFract1
        punpcklwd mm5, mm6          // low dword = dwFract1, 0x1000 - dwFract1

        // put values of pcWave into MMX registers. They are read into a regular register so
        // that the routine does not read past the end of the buffer otherwise, it could read
        // directly into the MMX registers.

        // words in MMX registers
        movd mm7, esi               // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]

        // *2 pmadd efficient code.
        //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
        //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;

        pmaddwd mm7, mm5// low dword = lM1 =
                        //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        psrad mm7, 12               // shift back down to 16 bits

        movq mm5, mm2               // move volume into mm5
        /*
        // Set lLM to be same as lM
        lLM1 = lM1;

        lLM1 *= vfLVolume1;
        lLM1 >>= 5;         // Signal bumps up to 15 bits.
        lM1 *= vfRVolume1;
        lM1 >>= 5;

        // Set lLM to be same as lM
        lLM2 = lM2;

        lLM2 *= vfLVolume2;
        lLM2 >>= 5;         // Signal bumps up to 15 bits.
        lM2 *= vfRVolume2;
        lM2 >>= 5;
        */
        // above multiplies and shifts are all done with this one pmul
        pmulhw mm5, mm7

        // calculate buffer location.
        mov edi, ecx
        shl edi, 1
        add edi, pBuffer

        movd edx, mm5

        //pBuffer[dwI] += (short) lM1;   (edi was computed after dwI++, hence the -2)
        add word ptr[edi-2], dx
        jno no_oflowr1
        //pBuffer[dwI] = 0x7fff;
        mov word ptr[edi-2], 0x7fff
        js no_oflowr1               // flags are still those of the add: wrapped-negative means positive overflow
        //pBuffer[dwI] = (short) 0x8000;
        mov word ptr[edi-2], 0x8000
    no_oflowr1:
        //}
#endif
    done:

        mov edx, this               // get address of class object

        //m_vfLastLVolume = vfVolume;
        //m_vfLastRVolume = vfVolume;
        // need to shift volume back down to 12 bits before storing
        // (the sub-step fraction kept in mm0 is not carried over)
        psrld mm2, 3
        movd [edx]this.m_vfLastLVolume, mm2
        movd [edx]this.m_vfLastRVolume, mm2
        //m_pfLastPitch = pfPitch;
        mov [edx]this.m_pfLastPitch, ebx
        //m_pfLastSample = pfSamplePos;
        mov [edx]this.m_pfLastSample, eax
        // put value back into dwI to be returned. This could just be passed back in eax I think.
        mov dwI, ecx
        emms
    } // ASM block
    return (dwI);
}

/*****************************************************************************
 * CDigitalAudio::Mix16X()
 *****************************************************************************
 * Implement a stereo sixteen-bit mix.
 * Heavily optimized for MMX.
 *
 * Same structure as Mix8X(), but the wave at m_pnWave is read as 16-bit
 * samples, so sample indices are doubled to form byte offsets and no
 * 8-to-16-bit widening is needed.
 *
 *  pBuffer         Output buffer of interleaved shorts (left, right).
 *  dwLength        Maximum number of output frames; doubled internally so the
 *                  loop counter counts shorts.
 *  dwDeltaPeriod   Reload value for the ramp counter (see the note on
 *                  "sub edx, 2" in the file header).
 *  vfDeltaLVolume  Added to (left volume << 8) when the ramp counter expires.
 *  vfDeltaRVolume  Added to (right volume << 8) when the ramp counter expires.
 *  pfDeltaPitch    Added to (pitch << 8) when the ramp counter expires.
 *  pfSampleLength  Sample position (index << 12 | 12-bit fraction) at which
 *                  playback ends or loops.
 *  pfLoopLength    Subtracted from the position when it reaches
 *                  pfSampleLength; zero means stop mixing there.
 *
 * Returns the number of frames written (shorts written / 2). On exit
 * m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample are
 * updated.
 */
DWORD CDigitalAudio::Mix16X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod, VFRACT vfDeltaLVolume, VFRACT vfDeltaRVolume,PFRACT pfDeltaPitch, PFRACT pfSampleLength,PFRACT pfLoopLength)
{
    DWORD dwI,dwIncDelta = dwDeltaPeriod;
    //DWORD dwPosition1, dwPosition2;
    //long lM1, lLM1;
    //long lM2, lLM2;
    //VFRACT dwFract1, dwFract2;
    short * pcWave = (short *) m_pnWave;
    PFRACT pfSamplePos = m_pfLastSample;
    VFRACT vfLVolume = m_vfLastLVolume;
    VFRACT vfRVolume = m_vfLastRVolume;

    VFRACT vfLVolume2 = m_vfLastLVolume;    // NOTE(review): not referenced by the asm below.
    VFRACT vfRVolume2 = m_vfLastRVolume;    // NOTE(review): not referenced by the asm below.
    PFRACT pfPitch = m_pfLastPitch;
    PFRACT pfPFract = pfPitch << 8;
    dwLength <<= 1;                         // count shorts, two per stereo frame

    QWORD dwFractMASK = 0x000000000FFF0FFF; // 0xFFF in each of the two low words
    QWORD dwFractOne = 0x0000000010001000;  // 0x1000 (1.0 in 12-bit fraction) in each of the two low words
    QWORD wordmask = 0x0000FFFF0000FFFF;    // keeps the low word of each dword
    QWORD vfDeltaLandRVolume;               // scratch: low dword = left delta, high dword = right delta

    _asm{

        // vfLVFract and vfRVFract are in mm0
        //VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
        //VFRACT vfRVFract = vfRVolume1 << 8;
        movd mm0, vfLVolume
        movd mm7, vfRVolume

        // vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
        movd mm1, vfDeltaLVolume
        movd mm6, vfDeltaRVolume
        punpckldq mm1, mm6

        // dwI = 0
        mov ecx, 0
        movq vfDeltaLandRVolume, mm1
        movq mm1, dwFractOne
        movq mm4, dwFractMASK

        mov eax, pfSamplePos
        punpckldq mm0, mm7
        mov ebx, pfPitch
        pslld mm0, 8
        mov edx, dwIncDelta
        movq mm2, mm0           // vfLVolume and vfRVolume in mm2
                                // need to be set before first pass.

        // *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
        psrld mm2, 5

        //for (dwI = 0; dwI < dwLength; )
        //{
    mainloop:
        cmp ecx, dwLength
        jae done

        cmp eax, pfSampleLength     //if (pfSamplePos >= pfSampleLength)
        jb NotPastEndOfSample1      //{

        cmp pfLoopLength, 0         //if (!pfLoopLength)
        je done                     // break;

        sub eax, pfLoopLength       // else pfSamplePos -= pfLoopLength;

    NotPastEndOfSample1:            //}

        mov esi, eax                // dwPosition1 = pfSamplePos;
        add eax, ebx                // pfSamplePos += pfPitch;

        sub edx, 2                  // dwIncDelta-=2;
        jnz DontIncreaseValues1     //if (!dwIncDelta) {

        // Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
        // for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
        // if it goes below zero.
        paddd mm0, vfDeltaLandRVolume   // vfLVFract += vfDeltaLVolume;
                                        // vfRVFract += vfDeltaRVolume;
        pxor mm5, mm5                   // TestLVol = 0; TestRVol = 0;

        mov edx, pfPFract               // Temp = pfPFract;
        pcmpgtd mm5, mm0                // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
                                        // if (TestRVol > vfRVFract) TestRVol = 0xffffffff;

        add edx, pfDeltaPitch           // Temp += pfDeltaPitch;
        pandn mm5, mm0                  // TestLVol = vfLVFract & (~TestLVol);
                                        // TestRVol = vfRVFract & (~TestRVol);

        mov pfPFract, edx               // pfPFract = Temp;
        movq mm2, mm5                   // vfLVolume = TestLVol;
                                        // vfRVolume = TestRVol;

        shr edx, 8                      // Temp = Temp >> 8;
        psrld mm2, 5                    // vfLVolume = vfLVolume >> 5;
                                        // vfRVolume = vfRVolume >> 5;

        mov ebx, edx                    // pfPitch = Temp;
        mov edx, dwDeltaPeriod          //dwIncDelta = dwDeltaPeriod;
        //}
    DontIncreaseValues1:

        movd mm6, esi               // dwFract1 = dwPosition1;
        movq mm5, mm1               // words in mm5 = 0, 0, 0x1000, 0x1000
        shr esi, 12                 // dwPosition1 = dwPosition1 >> 12;

        add ecx, 2                  //dwI += 2;

        // if (dwI >= dwLength) only one frame is left: finish it in StoreOne
        cmp ecx, dwLength
        jae StoreOne

        //if (pfSamplePos >= pfSampleLength)
        //{
        cmp eax, pfSampleLength
        jb NotPastEndOfSample2

        // Original if in C was not negated
        //if (!pfLoopLength)
        cmp pfLoopLength, 0
        //break;
        je StoreOne
        //else
        //pfSamplePos -= pfLoopLength;
        sub eax, pfLoopLength
        //}
    NotPastEndOfSample2:

        shl esi, 1                  // shift left since pcWave is array of shorts
        mov edi, eax                // dwPosition2 = pfSamplePos;
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        movd mm7, eax               // dwFract2 = pfSamplePos;

        shr edi, 12                 // dwPosition2 = dwPosition2 >> 12;
        punpcklwd mm6, mm7          // combine dwFract Values. After the pand below,
                                    // words in mm6 are 0, 0, dwFract2, dwFract1

        pand mm6, mm4               // dwFract2 &= 0xfff; dwFract1 &= 0xfff;

        movd mm7, dword ptr[esi]    //lLM1 = pcWave[dwPosition1]; (and pcWave[dwPosition1 + 1])

        psubw mm5, mm6              // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1

        shl edi, 1                  // shift left since pcWave is array of shorts
        punpcklwd mm5, mm6          // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1

        add edi, pcWave             // Put address of pcWave[dwPosition2] in edi
        mov esi, ecx                // Temp = dwI;

        shl esi, 1                  // Temp = Temp << 1;
        movq mm3, mm2               // put left and right volume levels in mm3

        movd mm6, dword ptr[edi]    //lLM2 = pcWave[dwPosition2]; (and pcWave[dwPosition2 + 1])
        packssdw mm3, mm2           // words in mm3
                                    // vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1

        add esi, pBuffer            // esi = &pBuffer[dwI] (dwI already advanced by one frame)
        punpckldq mm7, mm6          // words in mm7
                                    // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]

        pmaddwd mm7, mm5            // high dword = lM2 =
                                    //(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
                                    // low dword = lM1 =
                                    //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        add eax, ebx                //pfSamplePos += pfPitch;

        movq mm5, qword ptr[esi-4]  // Load two output frames from buffer

        add ecx, 2                  // dwI += 2;
        psrad mm7, 12               // shift back down to 16 bits.

        pand mm7, wordmask          // combine results to get ready to multiply by left and right
        movq mm6, mm7               // volume levels.
        pslld mm6, 16               //
        por mm7, mm6                // words in mm7
                                    // lM2, lM2, lM1, lM1

        // above multiplies and shifts are all done with this one pmul
        pmulhw mm3, mm7             // lLM1 *= vfLVolume;
                                    // lM1 *= vfRVolume;
                                    // lLM2 *= vfLVolume;
                                    // lM2 *= vfRVolume;

        paddsw mm5, mm3             // Add values to buffer with saturation
        movq qword ptr[esi-4], mm5  // Store values back into buffer.

        // }
        jmp mainloop

        // Need to write only one.
        //if (dwI < dwLength)
        //{
    StoreOne:
#if 1
        // Linearly interpolate between points and store only one value.
        // combine dwFract Values.

        // Make mm7 zero for unpacking

        shl esi, 1                  // shift left since pcWave is array of shorts
        add esi, pcWave             // Put address of pcWave[dwPosition1] in esi
        pxor mm7, mm7

        //lLM1 = pcWave[dwPosition1];
        mov esi, dword ptr[esi]

        // Doing AND that was not done for dwFract1 and dwFract2
        pand mm6, mm4

        // words in MMX register after operation is complete (only the low dword matters:
        // the upper sample words in mm7 are zero, so they contribute nothing to pmaddwd).
        psubw mm5, mm6              // low word = 0x1000 - dwFract1
        punpcklwd mm5, mm6          // low dword = dwFract1, 0x1000 - dwFract1

        // put values of pcWave into MMX registers. They are read into a regular register so
        // that the routine does not read past the end of the buffer otherwise, it could read
        // directly into the MMX registers.

        // words in MMX registers
        movd mm7, esi               // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]

        // *2 pmadd efficient code.
        //lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
        //lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;

        pmaddwd mm7, mm5// low dword = lM1 =
                        //(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))

        psrad mm7, 12               // shift back down to 16 bits

        pand mm7, wordmask          // combine results to get ready to multiply by left and right
        movq mm6, mm7               // volume levels.
        pslld mm6, 16               //
        por mm7, mm6                // words in mm7
                                    // 0, 0, lM1, lM1

        pxor mm6, mm6

        movq mm5, mm2               // move volume1 into mm5

        // use pack to get the volume values together for multiplication.
        packssdw mm5, mm6           // words in mm5
                                    // 0, 0, vfRVolume1, vfLVolume1
        /*
        // Set lLM to be same as lM
        lLM1 = lM1;

        lLM1 *= vfLVolume1;
        lLM1 >>= 5;         // Signal bumps up to 15 bits.
        lM1 *= vfRVolume1;
        lM1 >>= 5;

        // Set lLM to be same as lM
        lLM2 = lM2;

        lLM2 *= vfLVolume2;
        lLM2 >>= 5;         // Signal bumps up to 15 bits.
        lM2 *= vfRVolume2;
        lM2 >>= 5;
        */
        // above multiplies and shifts are all done with this one pmul
        pmulhw mm5, mm7

        // calculate buffer location.
        mov edi, ecx
        shl edi, 1
        add edi, pBuffer

        /*
        add word ptr[edi-4], si
        jno no_oflowl1
        // pBuffer[dwI] = 0x7fff;
        mov word ptr[edi-4], 0x7fff
        js no_oflowl1
        //pBuffer[dwI] = (short) 0x8000;
        mov word ptr[edi-4], 0x8000
    no_oflowl1:
        //pBuffer[dwI+1] += (short) lM1;
        add word ptr[edi-2], dx
        jno no_oflowr1
        //pBuffer[dwI+1] = 0x7fff;
        mov word ptr[edi-2], 0x7fff
        js no_oflowr1
        //pBuffer[dwI+1] = (short) 0x8000;
        mov word ptr[edi-2], 0x8000
    no_oflowr1:
        */
        // One frame (left, right): load, saturating add, store.
        movd mm7, dword ptr[edi-4]
        paddsw mm7, mm5
        movd dword ptr[edi-4], mm7
        //}
#endif
    done:

        mov edx, this               // get address of class object

        //m_vfLastLVolume = vfLVolume;
        //m_vfLastRVolume = vfRVolume;
        // need to shift volume back down to 12 bits before storing
        // (the sub-step fraction kept in mm0 is not carried over)
        psrld mm2, 3
        movd [edx]this.m_vfLastLVolume, mm2
        psrlq mm2, 32               // bring the right volume down into the low dword
        movd [edx]this.m_vfLastRVolume, mm2
        //m_pfLastPitch = pfPitch;
        mov [edx]this.m_pfLastPitch, ebx
        //m_pfLastSample = pfSamplePos;
        mov [edx]this.m_pfLastSample, eax
        // put value back into dwI to be returned. This could just be passed back in eax I think.
        mov dwI, ecx
        emms
    } // ASM block
    return (dwI >> 1);
}

/*****************************************************************************
 * MMXDisabled()
 *****************************************************************************
 * Check the registry key to determine whether to ignore MMX.
 *
 * Reads the DWORD value "MMXDisabled" under "Software\Microsoft\DirectMusic"
 * through GetRegValueDword(). Returns FALSE when that call reports failure;
 * otherwise returns the value that was read, cast to BOOL.
 */
static BOOL MMXDisabled()
{
    ULONG ulValue;

    if (!GetRegValueDword(
            TEXT("Software\\Microsoft\\DirectMusic"),
            TEXT("MMXDisabled"),
            &ulValue))
    {
        // Value could not be read: treat MMX as not disabled.
        return FALSE;
    }

    return (BOOL)ulValue;
}

// Emits the two opcode bytes 0x0F 0xA2 (the CPUID instruction) inline.
#define CPU_ID _asm _emit 0x0f _asm _emit 0xa2

/*****************************************************************************
 * MultiMediaInstructionsSupported()
 *****************************************************************************
 * Returns whether this CPU supports MMX.
*/
// Returns nonzero when MMX may be used, 0 otherwise.  The result is 0 when
// MMXDisabled() reports the registry override, when the CPU cannot toggle the
// EFLAGS ID bit (so CPUID cannot be issued), or when CPUID function 1 does not
// report the MMX feature bit.
BOOL MultiMediaInstructionsSupported()
{
    // The registry override is checked first; when it is set the CPU is
    // never probed.
    if (MMXDisabled())
    {
        return 0;
    }

    BOOL fMmxAvailable;

    _asm
    {
        // --- Step 1: is CPUID available? --------------------------------
        // Try to invert the ID flag (bit 21 of EFLAGS) and read it back.
        pushfd                      // current EFLAGS -> stack
        pop     eax                 // eax = current EFLAGS
        mov     ecx, eax            // ecx = reference copy for the comparison
        xor     eax, 0x00200000L    // invert the ID bit
        push    eax                 // modified value -> stack
        popfd                       // attempt to load it into EFLAGS
        pushfd                      // read back what the CPU actually kept
        pop     eax                 // eax = EFLAGS after the attempt
        xor     eax, ecx            // nonzero only if the ID bit really changed
        jz      ReportResult        // bit did not change: no CPUID, eax is already 0

        // --- Step 2: query the feature flags -----------------------------
        mov     eax, 1              // CPUID function 1: family/model/stepping/features
        push    ebx                 // keep the caller's ebx across CPUID
        CPU_ID                      // feature flags come back in edx
        pop     ebx

        // --- Step 3: look at the MMX feature bit -------------------------
        xor     eax, eax            // start from "not supported"
        test    edx, 0x00800000L    // bit 23 of the feature flags = MMX
        jz      ReportResult        // bit clear: leave eax at 0
        inc     eax                 // bit set: eax = 1

    ReportResult:
        mov     fMmxAvailable, eax  // hand the answer back to the C++ side
    }

    return (fMmxAvailable);
}