Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

284 lines
10 KiB

  1. ;////////////////////////////////////////////////////////////////////////////
  2. ;//
  3. ;// INTEL CORPORATION PROPRIETARY INFORMATION
  4. ;//
  5. ;// This software is supplied under the terms of a license
  6. ;// agreement or nondisclosure agreement with Intel Corporation
  7. ;// and may not be copied or disclosed except in accordance
  8. ;// with the terms of that agreement.
  9. ;//
  10. ;////////////////////////////////////////////////////////////////////////////
  11. ;//
  12. ;// $Header: R:\h26x\h26x\src\enc\e3msig.asv 1.2 04 Oct 1996 08:47:58 BNICKERS $
  13. ;//
  14. ;// $Log: R:\h26x\h26x\src\enc\e3msig.asv $
  15. ;//
  16. ;// Rev 1.2 04 Oct 1996 08:47:58 BNICKERS
  17. ;// Add EMV.
  18. ;//
  19. ;// Rev 1.1 08 Jul 1996 16:55:42 BNICKERS
  20. ;// Fix register initialization
  21. ;//
  22. ;// Rev 1.0 25 Jun 1996 14:24:54 BNICKERS
  23. ;// Initial revision.
  24. ;//
  25. ;////////////////////////////////////////////////////////////////////////////
  26. ;
  27. ; MMxMESignaturePrep (MMX Motion Estimation Signature Prep) -- This function
  28. ; pre-computes the signature inputs for the reference frame. It is
  29. ; used only by MMX ME (motion estimation), and only in AP mode.
  ; Assembler set-up (MASM).
  ; PROLOGUE:None -- no automatic procedure prologue is generated; the
  ; procedure in this file pushes its own registers and reads its arguments
  ; relative to esp.
  30. OPTION PROLOGUE:None
  ; The epilogue is produced by the ReturnAndRelieveEpilogueMacro macro. That
  ; macro is not defined in this file -- presumably it comes from one of the
  ; include files below (TODO confirm).
  31. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
  ; M510 selects MASM 5.10 compatibility behaviour.
  32. OPTION M510
  ; CASEMAP:NONE makes identifiers case-sensitive.
  33. OPTION CASEMAP:NONE
  ; Include files are not visible here. Presumably iammx.inc supplies the MMX
  ; instruction definitions and e3inst.inc the encoder constants such as PITCH
  ; (TODO confirm).
  34. include iammx.inc
  35. include e3inst.inc
  ; .xlist / .list turn the assembly listing off and back on, so the contents
  ; of memmodel.inc do not appear in the listing.
  36. .xlist
  37. include memmodel.inc
  38. .list
  39. ;=============================================================================
  40. .CODE
  ; All segment registers are assumed to address the FLAT group.
  41. ASSUME cs : FLAT
  42. ASSUME ds : FLAT
  43. ASSUME es : FLAT
  44. ASSUME fs : FLAT
  45. ASSUME gs : FLAT
  46. ASSUME ss : FLAT
  ; ---------------------------------------------------------------------------
  ; MMxMESignaturePrep (APrev, ASig, AFrmWd, AFrmHt)
  ;
  ; Pre-computes signature sums from the reference frame into the signature
  ; frame, using MMX.
  ;
  ; Arguments (declared "proc C"; the body does not use the declared names but
  ; reads the values from the stack through the esp-relative equates below):
  ;   APrev  -- base address of the previous (reference) frame; read via esi.
  ;   ASig   -- base address of the frame of signature sums; written via edi.
  ;   AFrmWd -- frame width. (AFrmWd+32)>>4 is the number of 16-pel macroblock
  ;             columns processed per row (one extra off each edge).
  ;   AFrmHt -- frame height. AFrmHt+16 lines are processed by the main loop,
  ;             4 lines per outer iteration.
  ;
  ; Structure:
  ;   1. Set-up: load arguments, derive the row skip distance (edx) and the
  ;      macroblock count (cl), build the MMX constants.
  ;   2. First loop (first @@): reads only line 0 of the reference frame and
  ;      stores values into the signature rows at edi-PITCH*16 .. edi.
  ;   3. Next4LinesRefQuickSig: for each set of 4 reference lines, the inner
  ;      loop (second @@) stores 4-line even-column sums at edi+PITCH*4,
  ;      updates the running sums at edi-PITCH*12 and stores odd-column sums at
  ;      edi+PITCH*2.
  ;
  ; Registers: esi, edi, ebp and ebx are saved on entry and restored on exit.
  ; eax, ecx, edx and mm0-mm7 are overwritten. emms is executed before return.
  ; No value is loaded into eax as a result.
  ;
  ; NOTE(review): PITCH, movdf and rturn are not defined in this file. The
  ; existing comments describe movdf as a DWORD store; rturn presumably invokes
  ; the epilogue macro named in OPTION EPILOGUE -- confirm in the include files.
  ; NOTE(review): only the low 8 bits of the macroblock count are kept (al/cl),
  ; and the inner loops are dec/jne loops with no zero check, so a count that
  ; is 0 modulo 256 would iterate 256 times -- presumably the caller guarantees
  ; a sane width; the address arithmetic also presumes a multiple of 16.
  ; NOTE(review): the outer loop exits only when ebx reaches exactly zero, so
  ; AFrmHt+16 must be a positive multiple of 4 -- presumably guaranteed by the
  ; caller.
  ; ---------------------------------------------------------------------------
  47. MMxMESignaturePrep proc C APrev: DWORD,
  48. ASig: DWORD,
  49. AFrmWd: DWORD,
  50. AFrmHt: DWORD
  ; RegStoSize is the number of bytes taken by the four registers pushed below
  ; (4 pushes * 4 bytes). The extra +4 in the first argument offset skips the
  ; return address.
  51. RegStoSize = 16
  52. ; Arguments:
  53. PreviousFrameBaseAddress = RegStoSize + 4
  54. SignatureFrameBaseAddress = RegStoSize + 8
  55. FrameWidth = RegStoSize + 12
  56. FrameHeight = RegStoSize + 16
  ; EndOfArgList is not referenced anywhere in the visible code.
  57. EndOfArgList = RegStoSize + 20
  ; Save the registers this routine modifies and must preserve; they are
  ; popped in reverse order before the return.
  58. push esi
  59. push edi
  60. push ebp
  61. push ebx
  62. ; ebp -- PITCH
  63. ; esi -- Cursor over reference frame.
  64. ; edi -- Cursor over frame of signature sums.
  65. ; edx -- Skip distance.
  66. ; ebx -- Outer loop counter.
  67. ; cl -- Initial value for inner loop counter.
  68. ; al -- Inner loop counter.
  69. ; ch -- Scratch.
  70. ; ah -- Scratch.
  ; Load the four arguments. No frame pointer is set up, so every argument is
  ; addressed relative to esp.
  71. mov esi,[esp+PreviousFrameBaseAddress]
  72. mov edi,[esp+SignatureFrameBaseAddress]
  73. mov ebx,[esp+FrameHeight]
  74. mov eax,[esp+FrameWidth]
  ; edx = PITCH*4 - 32 - FrameWidth. An inner loop advances a cursor by
  ; FrameWidth+32 bytes per row of macroblocks, so adding edx afterwards moves
  ; the cursor a net PITCH*4 bytes, i.e. down 4 lines.
  75. mov edx,PITCH*4-32
  76. mov ebp,PITCH
  77. sub edx,eax ; Distance from end of one row to start of next.
  78. add eax,32 ; Add the macroblocks off left and right edges.
  79. shr eax,4 ; Number of macroblocks in row.
  80. sub esi,16 ; Start at macroblock off left edge.
  81. mov cl,al ; To re-init inner loop counter.
  82. sub edi,16 ; Start at macroblock off left edge.
  ; Zero mm5, mm6 and mm7 (used below as accumulators for the odd-column sums)
  ; and build the constants: mm0 = 0x00FF in every word (even-byte mask),
  ; mm4 = 0x0001 in every word (so pmaddwd adds adjacent word pairs).
  ; The first 16 pels of line 0 are loaded here, ahead of the loop; the loop
  ; reloads mm2/mm3 and rebuilds mm0/mm1 at its bottom for the next iteration.
  83. pxor mm5,mm5
  84. pcmpeqb mm0,mm0
  85. pcmpeqb mm4,mm4
  86. psrlw mm0,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
  87. pxor mm6,mm6
  88. psrlw mm4,15 ; W:<0x0001 0x0001 0x0001 0x0001>
  89. movq mm2,[esi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
  90. movq mm1,mm0 ; W:< 00FF 00FF 00FF 00FF>
  91. movq mm3,[esi+8]
  92. pand mm0,mm2 ; W:<P06 P04 P02 P00>
  93. pxor mm7,mm7
  ; First loop: one iteration per 16-pel macroblock column of reference line 0.
  ; al counts the columns down to zero. Only [esi] and [esi+8] are read from
  ; the reference frame; results go to signature rows edi-PITCH*16 .. edi.
  ; NOTE(review): the byte loads into ah and ch in this loop are never consumed
  ; by later code. They look like the cache-line touches that the second loop
  ; labels "Initiate cache line load" -- confirm.
  94. @@:
  95. pand mm1,mm3
  96. psllw mm0,2 ; W:<P06*4 P04*4 P02*4 P00*4>
  97. mov ah,[edi-PITCH*12]
  ; NOTE(review): each word here is (odd pel << 8) | even pel, so a right shift
  ; by 7 yields odd*2 plus bit 7 of the even pel, not exactly odd*2 as the
  ; comment below states. Confirm whether that extra bit is intended/tolerated.
  98. psrlw mm2,7 ; W:<P07*2 P05*2 P03*2 P01*2>
  99. movq [edi-PITCH*12],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
  100. pmaddwd mm2,mm4 ; D:<(P07+P05)*2 (P03+P01)*2>
  101. mov ch,[edi-PITCH*8+16]
  102. mov ah,[edi-PITCH*4]
  103. movq [edi-PITCH*8],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
  104. psllw mm1,2
  105. mov ch,[edi+16]
  106. psrlw mm3,7 ; W:<P07*2 P05*2 P03*2 P01*2>
  107. movq [edi-PITCH*4],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
  108. pmaddwd mm3,mm4
  109. movq [edi],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
  ; A 64-bit shift is safe on the packed words here: each word is at most
  ; 255*4, so shifting left by 2 cannot carry into the neighbouring word.
  110. psllq mm0,2 ; W:<P06*16 P04*16 P02*16 P00*16>
  111. mov ah,[edi-PITCH*10-16]
  112. mov ch,[edi-PITCH*16]
  113. movq [edi-PITCH*16],mm0 ; Save W:<P06*16 P04*16 P02*16 P00*16>
  114. packssdw mm2,mm2 ; [0:31] W:<(P07+P05)*2 (P03+P01)*2>
  115. movq [edi-PITCH*12+8],mm1
  116. punpcklwd mm2,mm2 ; W:<(P07+P05)*2 (P07+P05)*2 (P03+P01)*2 ...>
  117. movq [edi-PITCH*8+8],mm1
  118. psubw mm2,mm5 ; Subtract sum of pels 15, 13, 11, and 9 to left.
  119. movq [edi-PITCH*4+8],mm1
  120. paddw mm7,mm2 ; Low DWORD: W:<sum(P0*)*2 sum(P0*)*2>, where
  121. ; ; "*" is odd columns from -11 thru +3.
  122. movq [edi+8],mm1
  123. paddw mm5,mm2 ; Save W:<(P27+P37+P25+P35) (P07+P17+P05+P15)...>
  124. mov ah,[edi-PITCH*14-32]
  125. mov ah,[edi-PITCH*6-32]
  126. mov ch,[edi-PITCH*2-16]
  ; The same odd-column sum is stored into four signature rows
  ; (-PITCH*14, -PITCH*10, -PITCH*6, -PITCH*2).
  127. movdf [edi-PITCH*14-12],mm7; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
  128. movdf [edi-PITCH*10-12],mm7; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
  129. psrlq mm2,32 ; Position 7, 5, and negative of 9, 11 to left.
  130. movdf [edi-PITCH*6-12],mm7 ; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
  131. paddw mm2,mm7 ; Low DWORD: W:<sum(P0*)*2 sum(P0*)*2>, where
  132. ; ; "*" is odd columns from -7 thru +7.
  133. movdf [edi-PITCH*2-12],mm7 ; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
  ; Second 8 pels of the macroblock: same sequence as above, using mm3 for the
  ; pels and mm6 in the role mm5 played for the first 8.
  134. packssdw mm3,mm3
  135. movdf [edi-PITCH*10-8],mm2
  136. punpcklwd mm3,mm3
  137. movdf [edi-PITCH*6-8],mm2
  138. psubw mm3,mm6
  139. movdf [edi-PITCH*2-8],mm2
  140. paddw mm2,mm3
  141. add esi,16 ; Advance input cursor.
  ; The flags set by this dec are consumed by the "jne @b" at the bottom of the
  ; loop. The MMX instructions and lea in between do not modify EFLAGS; this
  ; also assumes movdf expands to a plain MMX store (TODO confirm).
  142. dec al
  143. movdf [edi-PITCH*14-4],mm2
  144. movdf [edi-PITCH*10-4],mm2
  145. paddw mm6,mm3
  146. movdf [edi-PITCH*6-4],mm2
  147. psrlq mm3,32
  148. movdf [edi-PITCH*2-4],mm2
  149. paddw mm3,mm2
  ; Load the next macroblock's 16 pels and rebuild the 0x00FF masks in mm0/mm1
  ; for the next iteration, interleaved with the remaining stores.
  150. movq mm2,[esi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
  151. movq mm7,mm3
  152. movq mm3,[esi+8]
  153. psllq mm1,2
  154. movdf [edi-PITCH*10],mm7
  155. pcmpeqb mm0,mm0
  156. movq [edi-PITCH*16+8],mm1
  157. psrlw mm0,8
  158. movdf [edi-PITCH*6],mm7
  159. movq mm1,mm0
  160. movdf [edi-PITCH*2],mm7
  161. pand mm0,mm2
  162. lea edi,[edi+16] ; Advance output cursor.
  163. jne @b
  ; edx - PITCH*4 equals -(FrameWidth+32), which exactly undoes the advance
  ; made by the loop above, so both cursors return to where they started.
  164. lea esi,[esi+edx-PITCH*4] ; Get back to start of line 0.
  165. lea edi,[edi+edx-PITCH*4] ; Get back to start of line 0.
  166. pxor mm7,mm7
  167. add ebx,16 ; Do 4 extra sets of 4 lines at bottom.
  168. mov al,cl
  ; Outer loop: one iteration per set of 4 reference lines. ebx holds the
  ; number of lines still to process and is reduced by 4 at the bottom.
  169. Next4LinesRefQuickSig:
  ; Per-row re-initialisation: clear the odd-column accumulators mm5/mm6,
  ; rebuild the constants, and pre-load the sum of lines 2 and 3 for the first
  ; macroblock of the row.
  ; NOTE(review): paddb is a wrapping byte add, so each two-line sum keeps only
  ; its low 8 bits -- confirm this is the intended signature arithmetic.
  170. pxor mm5,mm5
  171. pcmpeqb mm0,mm0
  172. movq mm3,[esi+ebp*2] ; B:<P27 P26 P25 P24 P23 P22 P21 P20>
  173. psrlw mm0,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
  174. paddb mm3,[esi+PITCH*3] ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
  175. pcmpeqb mm4,mm4
  176. pxor mm6,mm6
  177. psrlw mm4,15 ; W:<0x0001 0x0001 0x0001 0x0001>
  ; Inner loop: one iteration per 16-pel macroblock column; al counts down.
  ; The first half handles pels 0-7 of the macroblock, the second half pels
  ; 8-15 (the +8 offsets), and the tail pre-loads lines 2+3 of the next
  ; macroblock into mm3.
  178. @@:
  179. movq mm2,[esi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
  180. movq mm1,mm3 ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
  181. paddb mm2,[esi+ebp*1] ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
  182. psrlw mm3,8 ; W:<P27+P37 P25+P35 P23+P33 P21+P31>
  183. pmaddwd mm3,mm4 ; D:<P27+P37+P25+P35 P23+P33+P21+P31>
  184. pand mm1,mm0 ; W:<P26+P36 P24+P34 P22+P32 P20+P30>
  185. pand mm0,mm2 ; W:<P06+P16 P04+P14 P02+P12 P00+P10>
  186. psrlw mm2,8 ; W:<P07+P17 P05+P15 P03+P13 P01+P11>
  187. pmaddwd mm2,mm4 ; D:<P07+P17+P05+P15 P03+P13+P01+P11>
  188. paddw mm1,mm0 ; W:<(P06+P16+P26+P36) (P04+P14+P24+P34) ...>
  189. mov ah,[edi+ebp*2-16] ; Initiate cache line load.
  190. pslld mm3,16 ; D:<(P27+P37+P25+P35)<<16 (P23+P33+P21+P31)<<16>
  ; The 4-line even-column sum is stored 4 rows below the current signature
  ; row. It is then combined with the values already held at -PITCH*16 and
  ; -PITCH*12, and the result is written back to -PITCH*12.
  191. movq [edi+ebp*4],mm1 ; Save W:<(P06+P16+P26+P36) ...>
  192. pcmpeqb mm0,mm0
  193. paddw mm1,[edi-PITCH*16]; W:<Sum(P*6) Sum(P*4) Sum(P*2) Sum(P*0)>, where
  194. ; ; "*" is the 20 lines P-16 thru P3
  195. por mm2,mm3 ; W:<(P27+P37+P25+P35) (P07+P17+P05+P15)
  196. ; ; (P23+P33+P21+P31) (P03+P13+P01+P11)>
  197. psubw mm1,[edi-PITCH*12]; W:<Sum(P*6) Sum(P*4) Sum(P*2) Sum(P*0)>, where
  198. ; ; "*" is the 16 lines P-12 thru P3
  199. psubw mm2,mm5 ; Subtract sum of pels 15, 13, 11, and 9 to left.
  200. movq mm3,[esi+ebp*2+8]
  201. paddw mm7,mm2 ; Low DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)> where
  202. ; ; "*" is odd columns from -11 thru +3.
  203. movq [edi-PITCH*12],mm1; Save W:<P*6 P*4 P*2 P*0> where * is 16 rows.
  204. paddw mm5,mm2 ; Save W:<(P27+P37+P25+P35) (P07+P17+P05+P15)...>
  205. movdf [edi+ebp*2-12],mm7; Save DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)>
  206. psrlq mm2,32 ; Position 7, 5, and negative of 9, 11 to left.
  207. paddb mm3,[esi+PITCH*3+8]
  208. paddw mm7,mm2 ; Low DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)> where
  209. ; ; "*" is odd columns from -7 thru +7.
  210. movq mm2,[esi+8]
  211. psrlw mm0,8
  212. movdf [edi+ebp*2-8],mm7 ; Save DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)>
  ; Second half: the same sequence for pels 8-15 of the macroblock, with mm6
  ; in the role mm5 played above.
  213. movq mm1,mm3
  214. paddb mm2,[esi+ebp*1+8]
  215. psrlw mm3,8
  216. pmaddwd mm3,mm4
  217. pand mm1,mm0
  218. pand mm0,mm2
  219. psrlw mm2,8
  220. pmaddwd mm2,mm4
  221. paddw mm1,mm0
  222. mov ch,[edi+ebp*4+16] ; Initiate cache line load.
  223. pslld mm3,16
  224. movq [edi+ebp*4+8],mm1
  225. pcmpeqb mm0,mm0
  226. paddw mm1,[edi-PITCH*16+8]
  227. por mm2,mm3
  228. psubw mm1,[edi-PITCH*12+8]
  229. psubw mm2,mm6
  ; Pre-load lines 2+3 of the next macroblock (offset +16) for the next
  ; iteration; the add to esi has not happened yet at this point.
  230. movq mm3,[esi+ebp*2+16]
  231. paddw mm7,mm2
  232. movq [edi-PITCH*12+8],mm1
  233. paddw mm6,mm2
  234. movdf [edi+ebp*2-4],mm7
  235. psrlq mm2,32
  236. paddb mm3,[esi+PITCH*3+16]
  237. paddw mm7,mm2
  238. add esi,16 ; Advance input cursor.
  ; Flags from this dec are consumed by "jne @b" below; the instructions in
  ; between (movdf store, psrlw, lea) are not expected to modify EFLAGS.
  239. dec al
  240. movdf [edi+ebp*2],mm7
  241. psrlw mm0,8
  242. lea edi,[edi+16] ; Advance output cursor.
  243. jne @b
  ; End of one row of macroblocks: adding edx to the FrameWidth+32 bytes just
  ; traversed moves both cursors to the start of the next set of 4 lines.
  ; al is reloaded from cl, mm7 is cleared, and the jne below tests the result
  ; of "sub ebx,4" (pxor does not modify EFLAGS).
  244. add esi,edx
  245. add edi,edx
  246. mov al,cl
  247. sub ebx,4
  248. pxor mm7,mm7
  249. jne Next4LinesRefQuickSig
  ; Leave MMX state (emms) before returning, then restore the saved registers
  ; in reverse push order.
  250. emms
  251. pop ebx
  252. pop ebp
  253. pop edi
  254. pop esi
  ; rturn is not defined in this file -- presumably a return macro tied to the
  ; OPTION EPILOGUE setting (TODO confirm).
  255. rturn
  256. MMxMESignaturePrep endp
  257. END