Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

686 lines
31 KiB

  1. ;--------------------------------------------------------------------------
  2. ; INTEL Corporation Proprietary Information
  3. ;
  4. ; This listing is supplied under the terms of a license
  5. ; agreement with INTEL Corporation and may not be copied
  6. ; nor disclosed except in accordance with the terms of
  7. ; that agreement.
  8. ;
  9. ; Copyright (c) 1996 Intel Corporation.
  10. ; All Rights Reserved.
  11. ;
  12. ;--------------------------------------------------------------------------
  13. ;--------------------------------------------------------------------------
  14. ;
  15. ; $Author: SCDAY $
  16. ; $Date: 31 Oct 1996 09:00:56 $
  17. ; $Archive: S:\h26x\src\dec\d3mbkadd.asv $
  18. ; $Header: S:\h26x\src\dec\d3mbkadd.asv 1.8 31 Oct 1996 09:00:56 SCDAY $
  19. ; $Log: S:\h26x\src\dec\d3mbkadd.asv $
  20. ;//
  21. ;// Rev 1.8 31 Oct 1996 09:00:56 SCDAY
  22. ;// Raj added IFDEF H261 MMX_BlockAddSpecial and MMX_BlockCopySpecial
  23. ;//
  24. ;// Rev 1.7 09 Jul 1996 16:50:42 AGUPTA2
  25. ;// DC value for INTRA blocks is added back in ClipAndMove routine.
  26. ;// Cleaned-up code.
  27. ;//
  28. ;// Rev 1.6 04 Apr 1996 13:42:58 AGUPTA2
  29. ;// Removed a store stall from MMX_BlockAdd
  30. ;//
  31. ;// Rev 1.5 03 Apr 1996 17:42:30 AGUPTA2
  32. ;// Added MMX version of BlockCopy routine.
  33. ;//
  34. ;// Rev 1.4 03 Apr 1996 11:08:22 RMCKENZX
  35. ;// Added clearing of IDCT output. Cleaned comments.
  36. ;//
  37. ;// Rev 1.3 22 Mar 1996 15:43:30 AGUPTA2
  38. ;// Fixed fastcall bug: return from rtns with more than 2 params.
  39. ;//
  40. ;// Rev 1.2 14 Mar 1996 17:15:14 AGUPTA2
  41. ;//
  42. ;// Included Bob's MMX_ClipAndMove rtn. This rtn works on INTRA output.
  43. ;//
  44. ;// Rev 1.1 27 Feb 1996 16:48:52 RMCKENZX
  45. ;// Added rounding of IDCT output.
  46. ;
  47. ;--------------------------------------------------------------------------
  48. ;==========================================================================
  49. ;
  50. ; d3mbkadd.asm
  51. ;
  52. ; Routines:
  53. ; MMX_BlockAdd, MMX_ClipAndMove, MMX_BlockCopy
  54. ; MMX_BlockAddSpecial, MMX_BlockCopySpecial (assembled only when H261 is defined)
  55. ;
  56. ; Prototypes in d3mblk.h:
  57. ; extern "C" {
  58. ; void __fastcall MMX_BlockAdd(
  59. ; U32 uResidual, // pointer to IDCT output
  60. ; U32 uRefBlock, // pointer to predicted values
  61. ; U32 uDstBlock); // pointer to destination
  62. ;
  63. ; void __fastcall MMX_ClipAndMove(
  64. ; U32 uResidual, // pointer to IDCT output
  65. ; U32 uDstBlock, // pointer to destination
  66. ; U32 ScaledDC); // scaled DC
  67. ; }
  68. ;
  69. ;==========================================================================
  70. ;--------------------------------------------------------------------------
  71. ;
  72. ; MMX_BlockAdd
  73. ;
  74. ; Description:
  75. ; This routine performs block addition of the IDCT output with the
  76. ; predicted value to find the final value. The IDCT values are converted
  77. ; to integers then added to the prediction. The result of the addition is
  78. ; then clipped to 0...255. The routine is called with the __fastcall option,
  79. ; with the first two parameters in ecx and edx and the third on the stack.
  80. ;
  81. ; The routine clears the IDCT output after reading it.
  82. ; Parameters:
  83. ; ecx = uSrc1 pointer to IDCT output. Values are signed, 16 bit values with
  84. ; 6 fractional bits. They are not clipped to -256 ... +255.
  85. ; They are packed into a qword aligned 8x8 array of words.
  86. ;
  87. ; edx = uSrc2 pointer to prediction values. Values are unsigned, 8-bit
  88. ; values. They are read as 8 rows of 8 (possibly unaligned) bytes with
  89. ; a PITCH of 384 in between rows.
  90. ; esp+4 = uDst pointer to output values. Values will be unsigned, 8-bit
  91. ; values. They will be written into a qword aligned 8x8 array
  92. ; of bytes with a PITCH of 384 in between rows.
  93. ;
  94. ;--------------------------------------------------------------------------
  95. .586 ; assemble for the Pentium (586) instruction set
  96. .MODEL FLAT ; 32-bit flat memory model
  97. OPTION CASEMAP:NONE ; identifiers are case sensitive
  98. OPTION PROLOGUE:None ; no assembler-generated PROC prologue
  99. OPTION EPILOGUE:None ; no assembler-generated PROC epilogue (each PROC issues its own ret)
  100. .xlist
  101. include iammx.inc
  102. .list
  103. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
  104. MMXCODE1 ENDS
  105. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
  106. MMXDATA1 ENDS
  107. MMXDATA1 SEGMENT
  108. ALIGN 8 ; qword-align the constant below so it can be read as one 8-byte operand
  109. MMX_Round32 DWORD 000200020H, 000200020H ; four words of 0020h (32) = one half in 6-fraction-bit fixed point, added before psraw by 6
  110. MMXDATA1 ENDS
  111. MMXCODE1 SEGMENT
  112. ALIGN 4 ; dword-align the entry point that follows
  113. @MMX_BlockAdd@12 PROC
  114. ; Parameters: pSrc1 (ecx) = IDCT output, pSrc2 (edx) = prediction, pDst (eax) = destination loaded from [esp+4]; PITCH = row stride in bytes
  115. pSrc1 EQU ecx
  116. pSrc2 EQU edx
  117. pDst EQU eax
  118. PITCH EQU 384
  119. ; Register use: mm6 = rounding constant, mm7 = zero, mm0-mm3 = working data for the two rows in flight.
  120. ; This loop is 2-folded and fully unrolled. 2-folded means that
  121. ; it works on 2 results per "pass" (8-pixel line). Fully unrolled means that
  122. ; it doesn't really loop at all -- all 8 "passes" are placed
  123. ; in succession.
  124. ;
  125. ; The result which each instruction is working on is identified
  126. ; by a number as the first item in the comment field.
  127. ; No EMMS is executed in this routine, so the MMX state is still active on return.
  128. movq mm6, [MMX_Round32] ; rounding for IDCT output
  129. ;
  130. movq mm3, [ecx+8] ; 1 - last 4 words of In1
  131. pxor mm7, mm7 ; zero for PUNPCK and clearing.
  132. movq mm1, [ecx] ; 1 - first 4 words of In1
  133. ;
  134. movq [ecx+8], mm7 ; 1 - zero last 4 words of In1
  135. paddw mm3, mm6 ; 1 - add in rounding
  136. movq [ecx], mm7 ; 1 - zero first 4 words of In1
  137. paddw mm1, mm6 ; 1 - add in rounding
  138. mov eax, [esp+4] ; destination pointer (third argument, passed on the stack)
  139. psraw mm3, 6 ; 1 - convert to int
  140. movq mm2, [edx] ; 1 - 8 bytes of In2
  141. psraw mm1, 6 ; 1 - convert to int
  142. ; pass 1
  143. movq mm0, mm2 ; 1 - second copy of In2
  144. punpckhbw mm2, mm7 ; 1 - last 4 bytes of In2
  145. paddw mm2, mm3 ; 1 - sum last 4 bytes
  146. punpcklbw mm0, mm7 ; 1 - first 4 bytes of In2
  147. movq mm3, [ecx+24] ; 2 - last 4 words of In1
  148. paddw mm0, mm1 ; 1 - sum first 4 bytes
  149. movq mm1, [ecx+16] ; 2 - first 4 words of In1
  150. packuswb mm0, mm2 ; 1 - combine & clip sum
  151. movq [ecx+24], mm7 ; 2 - zero last 4 words of In1
  152. paddw mm3, mm6 ; 2 - add in rounding
  153. movq [ecx+16], mm7 ; 2 - zero first 4 words of In1
  154. paddw mm1, mm6 ; 2 - add in rounding
  155. movq mm2, [edx+PITCH] ; 2 - 8 bytes of In2
  156. psraw mm3, 6 ; 2 - convert to int
  157. movq [eax], mm0 ; 1 - store result
  158. psraw mm1, 6 ; 2 - convert to int
  159. ; pass 2
  160. movq mm0, mm2 ; 2 - second copy of In2
  161. punpckhbw mm2, mm7 ; 2 - last 4 bytes of In2
  162. paddw mm2, mm3 ; 2 - sum last 4 bytes
  163. punpcklbw mm0, mm7 ; 2 - first 4 bytes of In2
  164. movq mm3, [ecx+40] ; 3 - last 4 words of In1
  165. paddw mm0, mm1 ; 2 - sum first 4 bytes
  166. movq mm1, [ecx+32] ; 3 - first 4 words of In1
  167. packuswb mm0, mm2 ; 2 - combine & clip sum
  168. movq [ecx+40], mm7 ; 3 - zero last 4 words of In1
  169. paddw mm3, mm6 ; 3 - add in rounding
  170. movq [ecx+32], mm7 ; 3 - zero first 4 words of In1
  171. paddw mm1, mm6 ; 3 - add in rounding
  172. movq mm2, [edx+2*PITCH] ; 3 - 8 bytes of In2
  173. psraw mm3, 6 ; 3 - convert to int
  174. movq [eax+PITCH], mm0 ; 2 - store result
  175. psraw mm1, 6 ; 3 - convert to int
  176. ; pass 3
  177. movq mm0, mm2 ; 3 - second copy of In2
  178. punpckhbw mm2, mm7 ; 3 - last 4 bytes of In2
  179. paddw mm2, mm3 ; 3 - sum last 4 bytes
  180. punpcklbw mm0, mm7 ; 3 - first 4 bytes of In2
  181. movq mm3, [ecx+56] ; 4 - last 4 words of In1
  182. paddw mm0, mm1 ; 3 - sum first 4 bytes
  183. movq mm1, [ecx+48] ; 4 - first 4 words of In1
  184. packuswb mm0, mm2 ; 3 - combine & clip sum
  185. movq [ecx+56], mm7 ; 4 - zero last 4 words of In1
  186. paddw mm3, mm6 ; 4 - add in rounding
  187. movq [ecx+48], mm7 ; 4 - zero first 4 words of In1
  188. paddw mm1, mm6 ; 4 - add in rounding
  189. movq mm2, [edx+3*PITCH] ; 4 - 8 bytes of In2
  190. psraw mm3, 6 ; 4 - convert to int
  191. movq [eax+2*PITCH], mm0 ; 3 - store result
  192. psraw mm1, 6 ; 4 - convert to int
  193. ; pass 4
  194. movq mm0, mm2 ; 4 - second copy of In2
  195. punpckhbw mm2, mm7 ; 4 - last 4 bytes of In2
  196. paddw mm2, mm3 ; 4 - sum last 4 bytes
  197. punpcklbw mm0, mm7 ; 4 - first 4 bytes of In2
  198. movq mm3, [ecx+72] ; 5 - last 4 words of In1
  199. paddw mm0, mm1 ; 4 - sum first 4 bytes
  200. movq mm1, [ecx+64] ; 5 - first 4 words of In1
  201. packuswb mm0, mm2 ; 4 - combine & clip sum
  202. movq [ecx+72], mm7 ; 5 - zero last 4 words of In1
  203. paddw mm3, mm6 ; 5 - add in rounding
  204. movq [ecx+64], mm7 ; 5 - zero first 4 words of In1
  205. paddw mm1, mm6 ; 5 - add in rounding
  206. movq mm2, [edx+4*PITCH] ; 5 - 8 bytes of In2
  207. psraw mm3, 6 ; 5 - convert to int
  208. movq [eax+3*PITCH], mm0 ; 4 - store result
  209. psraw mm1, 6 ; 5 - convert to int
  210. ; pass 5
  211. movq mm0, mm2 ; 5 - second copy of In2
  212. punpckhbw mm2, mm7 ; 5 - last 4 bytes of In2
  213. paddw mm2, mm3 ; 5 - sum last 4 bytes
  214. punpcklbw mm0, mm7 ; 5 - first 4 bytes of In2
  215. movq mm3, [ecx+88] ; 6 - last 4 words of In1
  216. paddw mm0, mm1 ; 5 - sum first 4 bytes
  217. movq mm1, [ecx+80] ; 6 - first 4 words of In1
  218. packuswb mm0, mm2 ; 5 - combine & clip sum
  219. movq [ecx+88], mm7 ; 6 - zero last 4 words of In1
  220. paddw mm3, mm6 ; 6 - add in rounding
  221. movq [ecx+80], mm7 ; 6 - zero first 4 words of In1
  222. paddw mm1, mm6 ; 6 - add in rounding
  223. movq mm2, [edx+5*PITCH] ; 6 - 8 bytes of In2
  224. psraw mm3, 6 ; 6 - convert to int
  225. movq [eax+4*PITCH], mm0 ; 5 - store result
  226. psraw mm1, 6 ; 6 - convert to int
  227. ; pass 6
  228. movq mm0, mm2 ; 6 - second copy of In2
  229. punpckhbw mm2, mm7 ; 6 - last 4 bytes of In2
  230. paddw mm2, mm3 ; 6 - sum last 4 bytes
  231. punpcklbw mm0, mm7 ; 6 - first 4 bytes of In2
  232. movq mm3, [ecx+104] ; 7 - last 4 words of In1
  233. paddw mm0, mm1 ; 6 - sum first 4 bytes
  234. movq mm1, [ecx+96] ; 7 - first 4 words of In1
  235. packuswb mm0, mm2 ; 6 - combine & clip sum
  236. movq [ecx+104], mm7 ; 7 - zero last 4 words of In1
  237. paddw mm3, mm6 ; 7 - add in rounding
  238. movq [ecx+96], mm7 ; 7 - zero first 4 words of In1
  239. paddw mm1, mm6 ; 7 - add in rounding
  240. movq mm2, [edx+6*PITCH] ; 7 - 8 bytes of In2
  241. psraw mm3, 6 ; 7 - convert to int
  242. movq [eax+5*PITCH], mm0 ; 6 - store result
  243. psraw mm1, 6 ; 7 - convert to int
  244. ; pass 7
  245. movq mm0, mm2 ; 7 - second copy of In2
  246. punpckhbw mm2, mm7 ; 7 - last 4 bytes of In2
  247. paddw mm2, mm3 ; 7 - sum last 4 bytes
  248. punpcklbw mm0, mm7 ; 7 - first 4 bytes of In2
  249. movq mm3, [ecx+120] ; 8 - last 4 words of In1
  250. paddw mm0, mm1 ; 7 - sum first 4 bytes
  251. movq mm1, [ecx+112] ; 8 - first 4 words of In1
  252. packuswb mm0, mm2 ; 7 - combine & clip sum
  253. movq [ecx+120], mm7 ; 8 - zero last 4 words of In1
  254. paddw mm3, mm6 ; 8 - add in rounding
  255. movq [ecx+112], mm7 ; 8 - zero first 4 words of In1
  256. paddw mm1, mm6 ; 8 - add in rounding
  257. movq mm2, [edx+7*PITCH] ; 8 - 8 bytes of In2
  258. psraw mm3, 6 ; 8 - convert to int
  259. movq [eax+6*PITCH], mm0 ; 7 - store result
  260. psraw mm1, 6 ; 8 - convert to int
  261. ;
  262. ; pass 8
  263. ; wrap up: row 8 is already loaded, rounded and shifted; only the add, pack and store remain
  264. ;
  265. movq mm0, mm2 ; 8 - second copy of In2
  266. punpckhbw mm2, mm7 ; 8 - last 4 bytes of In2
  267. paddw mm2, mm3 ; 8 - sum last 4 bytes
  268. punpcklbw mm0, mm7 ; 8 - first 4 bytes of In2
  269. paddw mm0, mm1 ; 8 - sum first 4 bytes
  270. ;
  271. packuswb mm0, mm2 ; 8 - combine & clip sum
  272. ;
  273. movq [eax+7*PITCH], mm0 ; 8 - store result
  274. ret 4 ; return and pop the one stack argument (4 bytes)
  275. @MMX_BlockAdd@12 ENDP
  276. IFDEF H261
  277. ALIGN 4
  278. @MMX_BlockAddSpecial@12 PROC
  279. ; Parameters: pSrc1 (ecx) = IDCT output, pSrc2 (edx) = prediction, pDst (eax) = destination loaded from [esp+4]; PITCH = destination row stride in bytes
  280. pSrc1 EQU ecx
  281. pSrc2 EQU edx
  282. pDst EQU eax
  283. PITCH EQU 384
  284. ; Same sequence as MMX_BlockAdd, except that the In2 rows are read 8 bytes apart ([edx+8*n]) instead of PITCH bytes apart.
  285. ; This loop is 2-folded and fully unrolled. 2-folded means that
  286. ; it works on 2 results per "pass" (8-pixel line). Fully unrolled means that
  287. ; it doesn't really loop at all -- all 8 "passes" are placed
  288. ; in succession.
  289. ;
  290. ; The result which each instruction is working on is identified
  291. ; by a number as the first item in the comment field.
  292. ; No EMMS is executed in this routine, so the MMX state is still active on return.
  293. movq mm6, [MMX_Round32] ; rounding for IDCT output
  294. ;
  295. movq mm3, [ecx+8] ; 1 - last 4 words of In1
  296. pxor mm7, mm7 ; zero for PUNPCK and clearing.
  297. movq mm1, [ecx] ; 1 - first 4 words of In1
  298. ;
  299. movq [ecx+8], mm7 ; 1 - zero last 4 words of In1
  300. paddw mm3, mm6 ; 1 - add in rounding
  301. movq [ecx], mm7 ; 1 - zero first 4 words of In1
  302. paddw mm1, mm6 ; 1 - add in rounding
  303. mov eax, [esp+4] ; destination pointer (third argument, passed on the stack)
  304. psraw mm3, 6 ; 1 - convert to int
  305. movq mm2, [edx] ; 1 - 8 bytes of In2
  306. psraw mm1, 6 ; 1 - convert to int
  307. ; pass 1
  308. movq mm0, mm2 ; 1 - second copy of In2
  309. punpckhbw mm2, mm7 ; 1 - last 4 bytes of In2
  310. paddw mm2, mm3 ; 1 - sum last 4 bytes
  311. punpcklbw mm0, mm7 ; 1 - first 4 bytes of In2
  312. movq mm3, [ecx+24] ; 2 - last 4 words of In1
  313. paddw mm0, mm1 ; 1 - sum first 4 bytes
  314. movq mm1, [ecx+16] ; 2 - first 4 words of In1
  315. packuswb mm0, mm2 ; 1 - combine & clip sum
  316. movq [ecx+24], mm7 ; 2 - zero last 4 words of In1
  317. paddw mm3, mm6 ; 2 - add in rounding
  318. movq [ecx+16], mm7 ; 2 - zero first 4 words of In1
  319. paddw mm1, mm6 ; 2 - add in rounding
  320. movq mm2, [edx+8] ; 2 - 8 bytes of In2 (rows of In2 are 8 bytes apart)
  321. psraw mm3, 6 ; 2 - convert to int
  322. movq [eax], mm0 ; 1 - store result
  323. psraw mm1, 6 ; 2 - convert to int
  324. ; pass 2
  325. movq mm0, mm2 ; 2 - second copy of In2
  326. punpckhbw mm2, mm7 ; 2 - last 4 bytes of In2
  327. paddw mm2, mm3 ; 2 - sum last 4 bytes
  328. punpcklbw mm0, mm7 ; 2 - first 4 bytes of In2
  329. movq mm3, [ecx+40] ; 3 - last 4 words of In1
  330. paddw mm0, mm1 ; 2 - sum first 4 bytes
  331. movq mm1, [ecx+32] ; 3 - first 4 words of In1
  332. packuswb mm0, mm2 ; 2 - combine & clip sum
  333. movq [ecx+40], mm7 ; 3 - zero last 4 words of In1
  334. paddw mm3, mm6 ; 3 - add in rounding
  335. movq [ecx+32], mm7 ; 3 - zero first 4 words of In1
  336. paddw mm1, mm6 ; 3 - add in rounding
  337. movq mm2, [edx+2*8] ; 3 - 8 bytes of In2
  338. psraw mm3, 6 ; 3 - convert to int
  339. movq [eax+PITCH], mm0 ; 2 - store result
  340. psraw mm1, 6 ; 3 - convert to int
  341. ; pass 3
  342. movq mm0, mm2 ; 3 - second copy of In2
  343. punpckhbw mm2, mm7 ; 3 - last 4 bytes of In2
  344. paddw mm2, mm3 ; 3 - sum last 4 bytes
  345. punpcklbw mm0, mm7 ; 3 - first 4 bytes of In2
  346. movq mm3, [ecx+56] ; 4 - last 4 words of In1
  347. paddw mm0, mm1 ; 3 - sum first 4 bytes
  348. movq mm1, [ecx+48] ; 4 - first 4 words of In1
  349. packuswb mm0, mm2 ; 3 - combine & clip sum
  350. movq [ecx+56], mm7 ; 4 - zero last 4 words of In1
  351. paddw mm3, mm6 ; 4 - add in rounding
  352. movq [ecx+48], mm7 ; 4 - zero first 4 words of In1
  353. paddw mm1, mm6 ; 4 - add in rounding
  354. movq mm2, [edx+3*8] ; 4 - 8 bytes of In2
  355. psraw mm3, 6 ; 4 - convert to int
  356. movq [eax+2*PITCH], mm0 ; 3 - store result
  357. psraw mm1, 6 ; 4 - convert to int
  358. ; pass 4
  359. movq mm0, mm2 ; 4 - second copy of In2
  360. punpckhbw mm2, mm7 ; 4 - last 4 bytes of In2
  361. paddw mm2, mm3 ; 4 - sum last 4 bytes
  362. punpcklbw mm0, mm7 ; 4 - first 4 bytes of In2
  363. movq mm3, [ecx+72] ; 5 - last 4 words of In1
  364. paddw mm0, mm1 ; 4 - sum first 4 bytes
  365. movq mm1, [ecx+64] ; 5 - first 4 words of In1
  366. packuswb mm0, mm2 ; 4 - combine & clip sum
  367. movq [ecx+72], mm7 ; 5 - zero last 4 words of In1
  368. paddw mm3, mm6 ; 5 - add in rounding
  369. movq [ecx+64], mm7 ; 5 - zero first 4 words of In1
  370. paddw mm1, mm6 ; 5 - add in rounding
  371. movq mm2, [edx+4*8] ; 5 - 8 bytes of In2
  372. psraw mm3, 6 ; 5 - convert to int
  373. movq [eax+3*PITCH], mm0 ; 4 - store result
  374. psraw mm1, 6 ; 5 - convert to int
  375. ; pass 5
  376. movq mm0, mm2 ; 5 - second copy of In2
  377. punpckhbw mm2, mm7 ; 5 - last 4 bytes of In2
  378. paddw mm2, mm3 ; 5 - sum last 4 bytes
  379. punpcklbw mm0, mm7 ; 5 - first 4 bytes of In2
  380. movq mm3, [ecx+88] ; 6 - last 4 words of In1
  381. paddw mm0, mm1 ; 5 - sum first 4 bytes
  382. movq mm1, [ecx+80] ; 6 - first 4 words of In1
  383. packuswb mm0, mm2 ; 5 - combine & clip sum
  384. movq [ecx+88], mm7 ; 6 - zero last 4 words of In1
  385. paddw mm3, mm6 ; 6 - add in rounding
  386. movq [ecx+80], mm7 ; 6 - zero first 4 words of In1
  387. paddw mm1, mm6 ; 6 - add in rounding
  388. movq mm2, [edx+5*8] ; 6 - 8 bytes of In2
  389. psraw mm3, 6 ; 6 - convert to int
  390. movq [eax+4*PITCH], mm0 ; 5 - store result
  391. psraw mm1, 6 ; 6 - convert to int
  392. ; pass 6
  393. movq mm0, mm2 ; 6 - second copy of In2
  394. punpckhbw mm2, mm7 ; 6 - last 4 bytes of In2
  395. paddw mm2, mm3 ; 6 - sum last 4 bytes
  396. punpcklbw mm0, mm7 ; 6 - first 4 bytes of In2
  397. movq mm3, [ecx+104] ; 7 - last 4 words of In1
  398. paddw mm0, mm1 ; 6 - sum first 4 bytes
  399. movq mm1, [ecx+96] ; 7 - first 4 words of In1
  400. packuswb mm0, mm2 ; 6 - combine & clip sum
  401. movq [ecx+104], mm7 ; 7 - zero last 4 words of In1
  402. paddw mm3, mm6 ; 7 - add in rounding
  403. movq [ecx+96], mm7 ; 7 - zero first 4 words of In1
  404. paddw mm1, mm6 ; 7 - add in rounding
  405. movq mm2, [edx+6*8] ; 7 - 8 bytes of In2
  406. psraw mm3, 6 ; 7 - convert to int
  407. movq [eax+5*PITCH], mm0 ; 6 - store result
  408. psraw mm1, 6 ; 7 - convert to int
  409. ; pass 7
  410. movq mm0, mm2 ; 7 - second copy of In2
  411. punpckhbw mm2, mm7 ; 7 - last 4 bytes of In2
  412. paddw mm2, mm3 ; 7 - sum last 4 bytes
  413. punpcklbw mm0, mm7 ; 7 - first 4 bytes of In2
  414. movq mm3, [ecx+120] ; 8 - last 4 words of In1
  415. paddw mm0, mm1 ; 7 - sum first 4 bytes
  416. movq mm1, [ecx+112] ; 8 - first 4 words of In1
  417. packuswb mm0, mm2 ; 7 - combine & clip sum
  418. movq [ecx+120], mm7 ; 8 - zero last 4 words of In1
  419. paddw mm3, mm6 ; 8 - add in rounding
  420. movq [ecx+112], mm7 ; 8 - zero first 4 words of In1
  421. paddw mm1, mm6 ; 8 - add in rounding
  422. movq mm2, [edx+7*8] ; 8 - 8 bytes of In2
  423. psraw mm3, 6 ; 8 - convert to int
  424. movq [eax+6*PITCH], mm0 ; 7 - store result
  425. psraw mm1, 6 ; 8 - convert to int
  426. ;
  427. ; pass 8
  428. ; wrap up: row 8 is already loaded, rounded and shifted; only the add, pack and store remain
  429. ;
  430. movq mm0, mm2 ; 8 - second copy of In2
  431. punpckhbw mm2, mm7 ; 8 - last 4 bytes of In2
  432. paddw mm2, mm3 ; 8 - sum last 4 bytes
  433. punpcklbw mm0, mm7 ; 8 - first 4 bytes of In2
  434. paddw mm0, mm1 ; 8 - sum first 4 bytes
  435. ;
  436. packuswb mm0, mm2 ; 8 - combine & clip sum
  437. ;
  438. movq [eax+7*PITCH], mm0 ; 8 - store result
  439. ret 4 ; return and pop the one stack argument (4 bytes)
  440. @MMX_BlockAddSpecial@12 ENDP
  441. ENDIF
  442. ;----------------------------------------------------------------------------
  443. ;
  444. ; MMX_ClipAndMove
  445. ;
  446. ; Description:
  447. ; This routine takes the MMx IDCT output, adds the scaled DC, converts
  448. ; (with round) to integer, and clips to 0...255. Routine is called with the
  449. ; __fastcall option, with the first two parameters in ecx and edx and the third on the stack.
  450. ;
  451. ; The routine clears the IDCT output after reading it.
  452. ;
  453. ; MMx version.
  454. ;
  455. ; Parameters:
  456. ; ecx = uSrc1 pointer to IDCT output. Values are signed, 16 bit values
  457. ; with 6 fractional bits. They are not clipped to -256 ...
  458. ; +255. They are packed into a qword aligned 8x8 array
  459. ; of words.
  460. ;
  461. ; edx = uDst pointer to output values. Values will be unsigned, 8-bit
  462. ; values. They will be written into a qword aligned 8x8 array
  463. ; of bytes with a PITCH of 384 in between rows.
  464. ; esp + 4 = Scaled DC value with 7 fraction bits
  465. ;----------------------------------------------------------------------------
  466. ALIGN 4
  467. @MMX_ClipAndMove@12 PROC
  468. ; Parameters: pSrc1 (ecx) = IDCT output, pDst (edx) = destination, ScaledDC = third argument at [esp+4]
  469. pSrc1 EQU ecx
  470. pDst EQU edx
  471. ScaledDC EQU DWORD PTR [esp + 4]
  472. ; Register use: mm6 = zero, mm7 = rounding + DC in every word, eax = loop counter. No EMMS is executed here.
  473. ; preamble: replicate ScaledDC into all 4 words of mm7, fold in rounding, start row 1
  474. ; NOTE(review): the shift/OR replication below only gives 4 identical words if the upper 16 bits of ScaledDC are zero - confirm with callers.
  475. movd mm0, ScaledDC ; Scaled DC value
  476. pxor mm6, mm6 ; zero
  477. movq mm1, mm0 ; keep a copy for the OR below
  478. psllq mm0, 16 ; move DC up into word 1
  479. movq mm2, [ecx] ; 3: fetch first 4 words
  480. por mm0, mm1 ; lower 2 WORDS have ScaledDC
  481. movq mm7, mm0
  482. psllq mm0, 32 ; move the word pair up into words 2 and 3
  483. por mm7, mm0 ; all 4 WORDS have ScaledDC
  484. mov eax, 3 ; loop control: 3 iterations, 2 rows stored per iteration
  485. movq mm3, [ecx+8] ; 3: fetch last 4 words
  486. psrlw mm7, 1 ; DC with 6 bits of fraction (logical shift of the 7-fraction-bit input)
  487. paddw mm7, [MMX_Round32] ; rounding+DC for IDCT output
  488. ;
  489. movq [ecx], mm6 ; 3: zero first 4 words
  490. paddw mm2, mm7 ; 3: add in round
  491. movq [ecx+8], mm6 ; 3: zero last 4 words
  492. paddw mm3, mm7 ; 3: add in round
  493. psraw mm2, 6 ; 3: convert to integer
  494. ;
  495. ;
  496. ; main loop:
  497. ; This loop is 3-folded and 2-unrolled. 3-folded means that it
  498. ; works on 3 different results per iteration. 2-unrolled that
  499. ; it produces 2 results per iteration.
  500. ;
  501. ; The result which each instruction works on is identified by a
  502. ; number (1:, 2:, or 3:) at the start of the comment field. These
  503. ; identify 3 stages as follows:
  504. ;
  505. ; Stage Description
  506. ; ----- -----------
  507. ; 1 Convert the last 4 words of a line to integer, pack together
  508. ; into 8 bytes, and write the result.
  509. ; 2 Do all processing for the next line: load and clear 8 words,
  510. ; add in round, convert to integer, pack to bytes, and write
  511. ; the result.
  512. ; 3 Load and zero all 8 words of a line, add in round,
  513. ; and convert the first 4 of them to integers. (Processing
  514. ; of this stage is completed as stage 1 of the next pass.)
  515. ;
  516. MainLoop:
  517. movq mm0, [ecx+16] ; 2: fetch first 4 words
  518. psraw mm3, 6 ; 1: convert to integer
  519. movq mm1, [ecx+24] ; 2: fetch last 4 words
  520. packuswb mm2, mm3 ; 1: pack and clip
  521. movq [ecx+16], mm6 ; 2: zero first 4 words
  522. paddw mm0, mm7 ; 2: add in round
  523. movq [ecx+24], mm6 ; 2: zero last 4 words
  524. paddw mm1, mm7 ; 2: add in round
  525. movq [edx], mm2 ; 1: store result
  526. psraw mm0, 6 ; 2: convert to integer
  527. movq mm2, [ecx+32] ; 3: fetch first 4 words
  528. psraw mm1, 6 ; 2: convert to integer
  529. movq mm3, [ecx+40] ; 3: fetch last 4 words
  530. packuswb mm0, mm1 ; 2: pack and clip
  531. movq [ecx+32], mm6 ; 3: zero first 4 words
  532. paddw mm2, mm7 ; 3: add in round
  533. movq [ecx+40], mm6 ; 3: zero last 4 words
  534. paddw mm3, mm7 ; 3: add in round
  535. movq [edx+PITCH], mm0 ; 2: store result
  536. psraw mm2, 6 ; 3: convert to integer
  537. add ecx, 32 ; increment source pointer (2 rows of 8 words)
  538. add edx, 2*PITCH ; increment destination pointer (2 rows)
  539. dec eax ; decrement loop control
  540. jne MainLoop ; repeat three times
  541. ;
  542. ; postamble: finish the row left in flight by the loop, then load, convert and store the last row
  543. ;
  544. movq mm0, [ecx+16] ; 2: fetch first 4 words
  545. psraw mm3, 6 ; 1: convert to integer
  546. movq mm1, [ecx+24] ; 2: fetch last 4 words
  547. packuswb mm2, mm3 ; 1: pack and clip
  548. paddw mm0, mm7 ; 2: add in round
  549. paddw mm1, mm7 ; 2: add in round
  550. movq [edx], mm2 ; 1: store result
  551. psraw mm0, 6 ; 2: convert to integer
  552. movq [ecx+16], mm6 ; 2: zero first 4 words
  553. psraw mm1, 6 ; 2: convert to integer
  554. movq [ecx+24], mm6 ; 2: zero last 4 words
  555. packuswb mm0, mm1 ; 2: pack and clip
  556. movq [edx+PITCH], mm0 ; 2: store result
  557. ret 4 ; return and pop the one stack argument (4 bytes)
  558. @MMX_ClipAndMove@12 ENDP
  559. ;----------------------------------------------------------------------------
  560. ;
  561. ; MMX_BlockCopy
  562. ; Copy an 8x8 block of bytes in chunks of 4 rows as suggested in MMX guide.
  563. ; Parameters:
  564. ; ecx = Pointer to output values
  565. ;
  566. ; edx = Pointer to input values
  567. ;----------------------------------------------------------------------------
  568. ALIGN 4
  569. @MMX_BlockCopy@8 PROC
  570. ; Parameters: pDst (ecx) = destination, pSrc (edx) = source; rows are PITCH bytes apart in both
  571. pDst EQU ecx
  572. pSrc EQU edx
  573. movq mm0, [pSrc] ; load row 0
  574. ; Rows 0-3 are loaded and stored first, then rows 4-7. mm0-mm7 are overwritten and no EMMS is executed here.
  575. movq mm1, [pSrc + PITCH] ; load row 1
  576. ;
  577. movq mm2, [pSrc + PITCH*2] ; load row 2
  578. ;
  579. movq mm3, [pSrc + PITCH*3] ; load row 3
  580. ;
  581. movq [pDst], mm0 ; store row 0
  582. ;
  583. movq [pDst + PITCH], mm1 ; store row 1
  584. ;
  585. movq [pDst + PITCH*2], mm2 ; store row 2
  586. ;
  587. movq [pDst + PITCH*3], mm3 ; store row 3
  588. ;
  589. movq mm4, [pSrc + PITCH*4] ; load row 4
  590. ;
  591. movq mm5, [pSrc + PITCH*5] ; load row 5
  592. ;
  593. movq mm6, [pSrc + PITCH*6] ; load row 6
  594. ;
  595. movq mm7, [pSrc + PITCH*7] ; load row 7
  596. ;
  597. movq [pDst + PITCH*4], mm4 ; store row 4
  598. ;
  599. movq [pDst + PITCH*5], mm5 ; store row 5
  600. ;
  601. movq [pDst + PITCH*6], mm6 ; store row 6
  602. ;
  603. movq [pDst + PITCH*7], mm7 ; store row 7
  604. ;
  605. ret ; both arguments are in registers, nothing to pop
  606. @MMX_BlockCopy@8 ENDP
  607. IFDEF H261
  608. ;----------------------------------------------------------------------------
  609. ;
  610. ; MMX_BlockCopySpecial
  611. ; Copy an 8x8 block of bytes in chunks of 4 rows as suggested in MMX guide.
  612. ; Parameters:
  613. ; ecx = Pointer to output values (rows are PITCH bytes apart)
  614. ;
  615. ; edx = Pointer to input values (rows are PITCH8 = 8 bytes apart)
  616. ;----------------------------------------------------------------------------
  617. ALIGN 4
  618. @MMX_BlockCopySpecial@8 PROC
  619. ; Parameters: pDst (ecx) = destination, pSrc (edx) = source; PITCH8 = source row stride in bytes
  620. pDst EQU ecx
  621. pSrc EQU edx
  622. PITCH8 EQU 8
  623. movq mm0, [pSrc] ; load row 0
  624. ; Rows 0-3 are loaded and stored first, then rows 4-7. mm0-mm7 are overwritten and no EMMS is executed here.
  625. movq mm1, [pSrc + PITCH8] ; load row 1
  626. ;
  627. movq mm2, [pSrc + PITCH8*2] ; load row 2
  628. ;
  629. movq mm3, [pSrc + PITCH8*3] ; load row 3
  630. ;
  631. movq [pDst], mm0 ; store row 0
  632. ;
  633. movq [pDst + PITCH], mm1 ; store row 1
  634. ;
  635. movq [pDst + PITCH*2], mm2 ; store row 2
  636. ;
  637. movq [pDst + PITCH*3], mm3 ; store row 3
  638. ;
  639. movq mm4, [pSrc + PITCH8*4] ; load row 4
  640. ;
  641. movq mm5, [pSrc + PITCH8*5] ; load row 5
  642. ;
  643. movq mm6, [pSrc + PITCH8*6] ; load row 6
  644. ;
  645. movq mm7, [pSrc + PITCH8*7] ; load row 7
  646. ;
  647. movq [pDst + PITCH*4], mm4 ; store row 4
  648. ;
  649. movq [pDst + PITCH*5], mm5 ; store row 5
  650. ;
  651. movq [pDst + PITCH*6], mm6 ; store row 6
  652. ;
  653. movq [pDst + PITCH*7], mm7 ; store row 7
  654. ;
  655. ret ; both arguments are in registers, nothing to pop
  656. @MMX_BlockCopySpecial@8 ENDP
  657. ENDIF
  658. MMXCODE1 ENDS
  659. END