Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1574 lines
33 KiB

  1. ;***************************************************************************/
  2. ;*
  3. ;* INTEL Corporation Proprietary Information
  4. ;*
  5. ;*
  6. ;* Copyright (c) 1996 Intel Corporation.
  7. ;* All rights reserved.
  8. ;*
  9. ;***************************************************************************/
  10. ; AUTHOR: Kumar Balasubramanian
  11. ;***************************************************************************/
  12. ;; MMX version of the "integer fast mode" within IJG decompressor code.
  13. .nolist
  14. include iammx.inc ; IAMMX Emulator Macros
  15. MMWORD TEXTEQU <DWORD>
  16. .list
  17. .586
  18. .model flat
  19. _DATA SEGMENT PARA PUBLIC USE32 'DATA' ; static constants and scratch storage shared by PackMulW and _idct8x8aan
  20. x0005000200010001 DQ 0005000200010001h ; 16-bit lanes, high to low: 5, 2, 1, 1
  21. x0040000000000000 DQ 0040000000000000h ; only the top 16-bit lane is set (0040h)
  22. x5a825a825a825a82 DW 016ah, 0, 016ah, 0 ; 362 = 1.414 * 256; the label spells 5a82h (23170), which is not the stored value
  23. x539f539f539f539f DW 0fd63h, 0, 0fd63h, 0 ; -669 as a signed word = -2.613 * 256; the label spells 539fh (21407)
  24. x4546454645464546 DW 0115h, 0, 0115h, 0 ; 277 = 1.082 * 256; the label spells 4546h (17734)
  25. x61f861f861f861f8 DW 01d9h, 0, 01d9h, 0 ; 473 = 1.847 * 256; the label spells 61f8h (25080)
  26. const_mask DQ 03ff03ff03ff03ffh ; 03ffh in each 16-bit lane
  27. const_zero DQ 0 ; all-zero operand that PackMulW interleaves its input words with
  28. scratch1 DQ 0 ; PackMulW operand: four input words on entry, four product words on exit
  29. scratch3 DQ 0 ; PackMulW multiplier: callers copy one of the (factor, 0) tables above into it
  30. scratch5 DQ 0 ; spill slot the callers use to keep mm0 across PackMulW
  31. scratch7 DQ 0 ; spill slot the callers use to keep mm1 across PackMulW
  32. ; for debug only
  33. x0 DQ 0
  34. preSC DW 8 DUP (16384) ; 8 rows of 8 words, every entry 16384 (4000h)
  35. DW 8 DUP (16384)
  36. DW 8 DUP (16384)
  37. DW 8 DUP (16384)
  38. DW 8 DUP (16384)
  39. DW 8 DUP (16384)
  40. DW 8 DUP (16384)
  41. DW 8 DUP (16384)
  42. _DATA ENDS
  43. _TEXT SEGMENT PARA PUBLIC USE32 'CODE'
  44. PackMulW MACRO ; scratch1[i] = low word of ((signed scratch1[i] * scratch3 factor) >> 8), i = 0..3; clobbers mm0, mm1
  45. movq mm0, mmword ptr scratch1 ; mm0 = the four input words w0..w3
  46. movq mm1, mm0 ; second copy of the input, used for the upper pair
  47. punpcklwd mm0, mmword ptr const_zero ; mm0 = w0, 0, w1, 0
  48. punpckhwd mm1, mmword ptr const_zero ; mm1 = w2, 0, w3, 0
  49. pmaddwd mm0, mmword ptr scratch3 ; two 32-bit products: w0*f + 0*0, w1*f + 0*0
  50. pmaddwd mm1, mmword ptr scratch3 ; two 32-bit products: w2*f + 0*0, w3*f + 0*0
  51. psrad mm0, 8 ; drop the 8 fraction bits of the factor, keeping the sign
  52. psrad mm1, 8
  53. movq mmword ptr scratch1, mm1 ; park products 2 and 3 in memory for the unpack steps below
  54. movq mm1, mm0 ; mm1 = copy of products 0 and 1
  55. punpcklwd mm0, mmword ptr scratch1 ; mm0 = p0.lo, p2.lo, p0.hi, p2.hi
  56. punpckhwd mm1, mmword ptr scratch1 ; mm1 = p1.lo, p3.lo, p1.hi, p3.hi
  57. punpcklwd mm0, mm1 ; mm0 = p0.lo, p1.lo, p2.lo, p3.lo
  58. movq mmword ptr scratch1, mm0 ; result goes back to scratch1 (and is still in mm0)
  59. ENDM
  60. COMMENT ^
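  61. void idct8x8aan (int16 *coef, int16 *temp, int16 *quant, output_buf, output_col, range);
  62. The body reads six stack arguments, at [esp+32] through [esp+52]; this comment originally showed one argument, (int16 *src_result).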
  63. ^
  64. public _idct8x8aan
  65. _idct8x8aan proc USES eax ebx ecx edx esi edi ebp
  66. mov ebx, DWORD PTR [esp+32] ; source coeff
  67. mov esi, DWORD PTR [esp+36] ; temp results
  68. mov edi, DWORD PTR [esp+40] ; quant factors
  69. ;slot
  70. ; column 0: even part
  71. ; use V4, V12, V0, V8 to produce V22..V25
  72. ;slot
  73. movq mm0, mmword ptr [ebx+8*12] ; V12
  74. pmullw mm0, mmword ptr [edi+8*12]
  75. ;slot
  76. movq mm1, mmword ptr [ebx+8*4] ; V4
  77. pmullw mm1, mmword ptr [edi+8*4]
  78. ;slot
  79. movq mm3, mmword ptr [ebx+8*0] ; V0
  80. pmullw mm3, mmword ptr [edi+8*0]
  81. ;slot
  82. movq mm2, mm1 ; duplicate V4
  83. movq mm5, mmword ptr [ebx+8*8] ; V8
  84. pmullw mm5, mmword ptr [edi+8*8]
  85. psubw mm1, mm0 ; V16
  86. movq mmword ptr scratch1, mm1
  87. movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  88. movq mmword ptr scratch3, mm1
  89. movq mmword ptr scratch5, mm0
  90. PackMulW
  91. movq mm1, mmword ptr scratch1
  92. movq mm0, mmword ptr scratch5
  93. paddw mm2, mm0 ; V17
  94. movq mm0, mm2 ; duplicate V17
  95. movq mm4, mm3 ; duplicate V0
  96. paddw mm3, mm5 ; V19
  97. psubw mm4, mm5 ; V20 ;mm5 free
  98. movq mm6, mm3 ; duplicate t74=t81
  99. psubw mm1, mm0 ; V21 ; mm0 free
  100. paddw mm3, mm2 ; V22
  101. movq mm5, mm1 ; duplicate V21
  102. paddw mm1, mm4 ; V23
  103. movq mmword ptr [esi+8*4], mm3 ; V22
  104. psubw mm4, mm5 ; V24; mm5 free
  105. movq mmword ptr [esi+8*12], mm1 ; V23
  106. psubw mm6, mm2 ; V25; mm2 free
  107. movq mmword ptr [esi+8*0], mm4 ; V24
  108. ;slot
  109. movq mm7, mmword ptr [ebx+8*10] ; V10
  110. pmullw mm7, mmword ptr [edi+8*10]
  111. ;slot
  112. movq mm0, mmword ptr [ebx+8*6] ; V6
  113. pmullw mm0, mmword ptr [edi+8*6]
  114. ;slot
  115. movq mm3, mm7 ; duplicate V10
  116. movq mm5, mmword ptr [ebx+8*2] ; V2
  117. pmullw mm5, mmword ptr [edi+8*2]
  118. ;slot
  119. psubw mm7, mm0 ; V26
  120. movq mm4, mmword ptr [ebx+8*14] ; V14
  121. pmullw mm4, mmword ptr [edi+8*14]
  122. paddw mm3, mm0 ; V29 ; free mm0
  123. movq mm1, mm7 ; duplicate V26
  124. movq mmword ptr scratch1, mm7
  125. movq mm7, mmword ptr x539f539f539f539f ; 23170 ->V18
  126. movq mmword ptr scratch3, mm7
  127. movq mmword ptr scratch5, mm0
  128. movq mmword ptr scratch7, mm1
  129. PackMulW
  130. movq mm7, mmword ptr scratch1
  131. movq mm0, mmword ptr scratch5
  132. movq mm1, mmword ptr scratch7
  133. movq mm0, mm5 ; duplicate V2
  134. paddw mm5, mm4 ; V27
  135. psubw mm0, mm4 ; V28 ; free mm4
  136. movq mm2, mm0 ; duplicate V28
  137. movq mmword ptr scratch1, mm0
  138. movq mm0, mmword ptr x4546454645464546 ; 23170 ->V18
  139. movq mmword ptr scratch3, mm0
  140. movq mmword ptr scratch7, mm1
  141. PackMulW
  142. movq mm0, mmword ptr scratch1
  143. movq mm1, mmword ptr scratch7
  144. movq mm4, mm5 ; duplicate t90=t93
  145. paddw mm1, mm2 ; V32 ; free mm2
  146. movq mmword ptr scratch1, mm1
  147. movq mm1, mmword ptr x61f861f861f861f8 ; 23170 ->V18
  148. movq mmword ptr scratch3, mm1
  149. movq mmword ptr scratch5, mm0
  150. PackMulW
  151. movq mm1, mmword ptr scratch1
  152. movq mm0, mmword ptr scratch5
  153. paddw mm5, mm3 ; V31
  154. psubw mm4, mm3 ; V30 ; free mm3
  155. movq mmword ptr scratch1, mm4
  156. movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  157. movq mmword ptr scratch3, mm4
  158. movq mmword ptr scratch5, mm0
  159. movq mmword ptr scratch7, mm1
  160. PackMulW
  161. movq mm4, mmword ptr scratch1
  162. movq mm0, mmword ptr scratch5
  163. movq mm1, mmword ptr scratch7
  164. psubw mm0, mm1 ; V38
  165. paddw mm1, mm7 ; V37 ; free mm7
  166. movq mm3, mm6 ; duplicate V25
  167. ;move from the next block
  168. movq mm7, mmword ptr [esi+8*4] ; V22
  169. psubw mm1, mm5 ; V39 (mm5 still needed for next block)
  170. ;move from the next block
  171. movq mm2, mmword ptr [esi+8*12] ; V23
  172. psubw mm4, mm1 ; V40
  173. paddw mm0, mm4 ; V41; free mm0
  174. ; column 0: output butterfly
  175. psubw mm6, mm0 ; tm6
  176. paddw mm3, mm0 ; tm8; free mm1
  177. movq mm0, mm1 ; line added by Kumar
  178. movq mm1, mm7 ; duplicate V22
  179. paddw mm7, mm5 ; tm0
  180. movq mmword ptr [esi+8*8], mm3 ; tm8; free mm3
  181. psubw mm1, mm5 ; tm14; free mm5
  182. movq mmword ptr [esi+8*6], mm6 ; tm6; free mm6
  183. movq mm3, mm2 ; duplicate t117=t125
  184. movq mm6, mmword ptr [esi+8*0] ; V24
  185. paddw mm2, mm0 ; tm2
  186. movq mmword ptr [esi+8*0], mm7 ; tm0; free mm7
  187. psubw mm3, mm0 ; tm12; free mm0
  188. movq mmword ptr [esi+8*14], mm1 ; tm14; free mm1
  189. movq mmword ptr [esi+8*2], mm2 ; tm2; free mm2
  190. movq mm0, mm6 ; duplicate t119=t123
  191. movq mmword ptr [esi+8*12], mm3 ; tm12; free mm3
  192. paddw mm6, mm4 ; tm4
  193. psubw mm0, mm4 ; tm10; free mm4
  194. movq mm1, mmword ptr [ebx+8*5] ; V5
  195. pmullw mm1, mmword ptr [edi+8*5]
  196. movq mmword ptr [esi+8*4], mm6 ; tm4; free mm6
  197. movq mmword ptr [esi+8*10], mm0 ; tm10; free mm0
  198. ; column 1: even part
  199. ; use V5, V13, V1, V9 to produce V56..V59
  200. movq mm7, mmword ptr [ebx+8*13] ; V13
  201. pmullw mm7, mmword ptr [edi+8*13]
  202. movq mm2, mm1 ; duplicate t128=t130
  203. movq mm3, mmword ptr [ebx+8*1] ; V1
  204. pmullw mm3, mmword ptr [edi+8*1]
  205. psubw mm1, mm7 ; V50
  206. movq mm5, mmword ptr [ebx+8*9] ; V9
  207. pmullw mm5, mmword ptr [edi+8*9]
  208. paddw mm2, mm7 ; V51
  209. movq mmword ptr scratch1, mm1
  210. movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  211. movq mmword ptr scratch3, mm1
  212. movq mmword ptr scratch5, mm0
  213. PackMulW
  214. movq mm1, mmword ptr scratch1
  215. movq mm0, mmword ptr scratch5
  216. movq mm6, mm2 ; duplicate V51
  217. movq mm4, mm3 ; duplicate V1
  218. paddw mm3, mm5 ; V53
  219. psubw mm4, mm5 ; V54 ;mm5 free
  220. movq mm7, mm3 ; duplicate V53
  221. psubw mm1, mm6 ; V55 ; mm6 free
  222. paddw mm3, mm2 ; V56
  223. movq mm5, mm4 ; duplicate t140=t142
  224. paddw mm4, mm1 ; V57
  225. movq mmword ptr [esi+8*5], mm3 ; V56
  226. psubw mm5, mm1 ; V58; mm1 free
  227. movq mmword ptr [esi+8*13], mm4 ; V57
  228. psubw mm7, mm2 ; V59; mm2 free
  229. movq mmword ptr [esi+8*9], mm5 ; V58
  230. movq mm0, mmword ptr [ebx+8*11] ; V11
  231. pmullw mm0, mmword ptr [edi+8*11]
  232. movq mm6, mmword ptr [ebx+8*7] ; V7
  233. pmullw mm6, mmword ptr [edi+8*7]
  234. movq mm3, mm0 ; duplicate V11
  235. movq mm4, mmword ptr [ebx+8*15] ; V15
  236. pmullw mm4, mmword ptr [edi+8*15]
  237. movq mm5, mmword ptr [ebx+8*3] ; V3
  238. pmullw mm5, mmword ptr [edi+8*3]
  239. paddw mm0, mm6 ; V63
  240. ; note that V15 computation has a correction step:
  241. ; this is a 'magic' constant that rebiases the results to be closer to the expected result
  242. ; this magic constant can be refined to reduce the error even more
  243. ; by doing the correction step in a later stage when the number is actually multiplied by 16
  244. psubw mm3, mm6 ; V60 ; free mm6
  245. movq mm1, mm3 ; duplicate V60
  246. movq mmword ptr scratch1, mm1
  247. movq mm1, mmword ptr x539f539f539f539f ; 23170 ->V18
  248. movq mmword ptr scratch3, mm1
  249. movq mmword ptr scratch5, mm0
  250. PackMulW
  251. movq mm1, mmword ptr scratch1
  252. movq mm0, mmword ptr scratch5
  253. movq mm6, mm5 ; duplicate V3
  254. paddw mm5, mm4 ; V61
  255. psubw mm6, mm4 ; V62 ; free mm4
  256. movq mm4, mm5 ; duplicate V61
  257. paddw mm5, mm0 ; V65 -> result
  258. psubw mm4, mm0 ; V64 ; free mm0
  259. movq mmword ptr scratch1, mm4
  260. movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  261. movq mmword ptr scratch3, mm4
  262. movq mmword ptr scratch5, mm0
  263. movq mmword ptr scratch7, mm1
  264. PackMulW
  265. movq mm4, mmword ptr scratch1
  266. movq mm0, mmword ptr scratch5
  267. movq mm1, mmword ptr scratch7
  268. paddw mm3, mm6 ; V66
  269. movq mm2, mm5 ; duplicate V65
  270. movq mmword ptr scratch1, mm3
  271. movq mm3, mmword ptr x61f861f861f861f8 ; 23170 ->V18
  272. movq mmword ptr scratch3, mm3
  273. movq mmword ptr scratch5, mm0
  274. movq mmword ptr scratch7, mm1
  275. PackMulW
  276. movq mm3, mmword ptr scratch1
  277. movq mm0, mmword ptr scratch5
  278. movq mm1, mmword ptr scratch7
  279. movq mmword ptr scratch1, mm6
  280. movq mm6, mmword ptr x4546454645464546 ; 23170 ->V18
  281. movq mmword ptr scratch3, mm6
  282. movq mmword ptr scratch5, mm0
  283. movq mmword ptr scratch7, mm1
  284. PackMulW
  285. movq mm6, mmword ptr scratch1
  286. movq mm0, mmword ptr scratch5
  287. movq mm1, mmword ptr scratch7
  288. movq mm0, mmword ptr [esi+8*5] ; V56
  289. psubw mm6, mm3 ; V72
  290. paddw mm3, mm1 ; V71 ; free mm1
  291. psubw mm3, mm2 ; V73 ; free mm2
  292. psubw mm4, mm3 ; V74
  293. ;moved from next block
  294. movq mm1, mm0 ; duplicate t177=t188
  295. paddw mm6, mm4 ; V75
  296. ;moved from next block
  297. paddw mm0, mm5 ; tm1
  298. ;location
  299. ; 5 - V56
  300. ; 13 - V57
  301. ; 9 - V58
  302. ; X - V59, mm7
  303. ; X - V65, mm5
  304. ; X - V73, mm6
  305. ; X - V74, mm4
  306. ; X - V75, mm3
  307. ; free mm0, mm1 & mm2
  308. ;move above
  309. movq mm2, mmword ptr [esi+8*13] ; V57
  310. psubw mm1, mm5 ; tm15; free mm5
  311. movq mmword ptr [esi+8*1], mm0 ; tm1; free mm0
  312. ;save the store as used directly in the transpose
  313. ;movq mmword ptr [esi+8*15], mm1 ; tm15; free mm1
  314. movq mm5, mm7 ; duplicate t182=t184
  315. psubw mm7, mm6 ; tm7
  316. paddw mm5, mm6 ; tm9; free mm3
  317. ;slot
  318. movq mm6, mm3
  319. movq mm0, mmword ptr [esi+8*9] ; V58
  320. movq mm3, mm2 ; duplicate V57
  321. movq mmword ptr [esi+8*7], mm7 ; tm7; free mm7
  322. psubw mm3, mm6 ; tm13
  323. paddw mm2, mm6 ; tm3 ; free mm6
  324. movq mm6, mm0 ; duplicate V58
  325. movq mmword ptr [esi+8*3], mm2 ; tm3; free mm2
  326. paddw mm0, mm4 ; tm5
  327. psubw mm6, mm4 ; tm11; free mm4
  328. movq mmword ptr [esi+8*5], mm0 ; tm5; free mm0
  329. ; transpose the bottom right quadrant(4X4) of the matrix
  330. ; --------- ---------
  331. ; | M1 | M2 | | M1'| M3'|
  332. ; --------- --> ---------
  333. ; | M3 | M4 | | M2'| M4'|
  334. ; --------- ---------
  335. movq mm0, mm5 ; copy w4---0,1,3,5,6
  336. punpcklwd mm5, mm6 ;
  337. punpckhwd mm0, mm6 ;---0,1,3,5,6
  338. movq mm6, mmword ptr [esi+8*0] ;get w0 of top left quadrant
  339. movq mm2, mm3 ;---0,1,2,3,5,6
  340. punpcklwd mm3, mm1 ;
  341. movq mm7, mmword ptr [esi+8*2] ;get w1 of top left quadrant
  342. punpckhwd mm2, mm1 ;---0,2,3,5,6,7
  343. movq mm4, mm5 ;---0,2,3,4,5,6,7
  344. punpckldq mm5, mm3 ; transposed w4
  345. movq mmword ptr [esi+8*9], mm5 ; store w4
  346. punpckhdq mm4, mm3 ; transposed w5---0,2,4,6,7
  347. movq mm3, mm0 ;---0,2,3,4,6,7
  348. punpckldq mm0, mm2 ; transposed w6
  349. movq mmword ptr [esi+8*11], mm4 ; store w5
  350. punpckhdq mm3, mm2 ; transposed w7---0,3,6,7
  351. movq mmword ptr [esi+8*13], mm0 ; store w6---3,5,6,7
  352. movq mm5, mm6 ; copy w0
  353. movq mmword ptr [esi+8*15], mm3 ; store w7---5,6,7
  354. punpcklwd mm6, mm7
  355. ; transpose the top left quadrant(4X4) of the matrix
  356. punpckhwd mm5, mm7 ;---5,6,7
  357. movq mm7, mmword ptr [esi+8*4] ; get w2 of TL quadrant
  358. movq mm4, mmword ptr [esi+8*6] ; get w3 of TL quadrant
  359. movq mm3, mm7 ; copy w2---3,4,5,6,7
  360. movq mm2, mm6
  361. punpcklwd mm7, mm4 ;---2,3,4,5,6,7
  362. punpckhwd mm3, mm4 ;---2,3,4,5,6,7
  363. movq mm4, mm5 ;
  364. movq mm1, mm5
  365. punpckldq mm6, mm7 ;---1,2,3,4,5,6,7
  366. movq mmword ptr [esi+8*0], mm6 ; store w0 of TL quadrant
  367. punpckhdq mm2, mm7 ;---1,2,3,4,5,6,7
  368. movq mmword ptr [esi+8*2], mm2 ; store w1 of TL quadrant
  369. punpckldq mm5, mm3 ;---1,2,3,4,5,6,7
  370. movq mmword ptr [esi+8*4], mm5 ; store w2 of TL quadrant
  371. punpckhdq mm1, mm3 ;---1,2,3,4,5,6,7
  372. movq mmword ptr [esi+8*6], mm1 ; store w3 of TL quadrant
  373. ; transpose the top right quadrant(4X4) of the matrix
  374. movq mm0, mmword ptr [esi+8*1] ;---0
  375. movq mm1, mmword ptr [esi+8*3] ;---0,1,2
  376. movq mm2, mm0
  377. movq mm3, mmword ptr [esi+8*5]
  378. punpcklwd mm0, mm1 ;---0,1,2,3
  379. punpckhwd mm2, mm1
  380. movq mm1, mmword ptr [esi+8*7] ;---0,1,2,3
  381. movq mm4, mm3
  382. punpcklwd mm3, mm1 ;---0,1,2,3,4
  383. punpckhwd mm4, mm1 ;---0,1,2,3,4
  384. movq mm1, mm0
  385. movq mm5, mm2
  386. punpckldq mm0, mm3 ;---0,1,2,3,4,5
  387. punpckhdq mm1, mm3 ;---0,1,2,3,4,5
  388. movq mm3, mmword ptr [esi+8*8]
  389. movq mmword ptr [esi+8*8], mm0
  390. punpckldq mm2, mm4 ;---1,2,3,4,5
  391. punpckhdq mm5, mm4 ;---1,2,3,4,5
  392. movq mm4, mmword ptr [esi+8*10]
  393. ; transpose the bottom left quadrant(4X4) of the matrix
  394. ; Also store w1,w2,w3 of top right quadrant into
  395. ; w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
  396. ; of BL is already done.
  397. movq mmword ptr [esi+8*10], mm1
  398. movq mm1, mm3 ;---1,2,3,4,5
  399. movq mm0, mmword ptr [esi+8*12]
  400. punpcklwd mm3, mm4 ;---0,1,2,3,4,5
  401. punpckhwd mm1, mm4 ;---0,1,2,3,4,5
  402. movq mm4, mmword ptr [esi+8*14]
  403. movq mmword ptr [esi+8*12], mm2
  404. movq mm2, mm0
  405. movq mmword ptr [esi+8*14], mm5
  406. punpcklwd mm0, mm4 ;---0,1,2,3,4
  407. punpckhwd mm2, mm4 ;---0,1,2,3,4
  408. movq mm4, mm3
  409. movq mm5, mm1
  410. punpckldq mm3, mm0 ;---0,1,2,3,4,5
  411. movq mmword ptr [esi+8*1], mm3
  412. punpckhdq mm4, mm0 ;---1,2,4,5
  413. movq mmword ptr [esi+8*3], mm4
  414. punpckldq mm1, mm2 ;---1,2,5
  415. movq mmword ptr [esi+8*5], mm1
  416. punpckhdq mm5, mm2 ;---5
  417. movq mmword ptr [esi+8*7], mm5
  418. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  419. ;;;;;;;;; 1D IDCT of the rows ;;;;;;;;;;
  420. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  421. mov esi, DWORD PTR [esp+36] ; source
  422. ;slot
  423. ; column 0: even part
  424. ; use V4, V12, V0, V8 to produce V22..V25
  425. movq mm0, mmword ptr [esi+8*12] ; V12
  426. movq mm1, mmword ptr [esi+8*4] ; V4
  427. movq mm3, mmword ptr [esi+8*0] ; V0
  428. movq mm2, mm1 ; duplicate V4
  429. movq mm5, mmword ptr [esi+8*8] ; V8
  430. psubw mm1, mm0 ; V16
  431. movq mmword ptr scratch1, mm1
  432. movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  433. movq mmword ptr scratch3, mm1
  434. movq mmword ptr scratch5, mm0
  435. PackMulW
  436. movq mm1, mmword ptr scratch1
  437. movq mm0, mmword ptr scratch5
  438. paddw mm2, mm0 ; V17
  439. movq mm0, mm2 ; duplicate V17
  440. movq mm4, mm3 ; duplicate V0
  441. paddw mm3, mm5 ; V19
  442. psubw mm4, mm5 ; V20 ;mm5 free
  443. ;moved from the block below
  444. movq mm6, mm3 ; duplicate t74=t81
  445. psubw mm1, mm0 ; V21 ; mm0 free
  446. paddw mm3, mm2 ; V22
  447. movq mm5, mm1 ; duplicate V21
  448. paddw mm1, mm4 ; V23
  449. movq mmword ptr [esi+8*4], mm3 ; V22
  450. psubw mm4, mm5 ; V24; mm5 free
  451. movq mmword ptr [esi+8*12], mm1 ; V23
  452. psubw mm6, mm2 ; V25; mm2 free
  453. movq mmword ptr [esi+8*0], mm4 ; V24
  454. ; keep mm6 alive all along the next block
  455. ; column 0: odd part
  456. ; use V2, V6, V10, V14 to produce V31, V39, V40, V41
  457. movq mm7, mmword ptr [esi+8*10] ; V10
  458. movq mm0, mmword ptr [esi+8*6] ; V6
  459. movq mm3, mm7 ; duplicate V10
  460. movq mm5, mmword ptr [esi+8*2] ; V2
  461. psubw mm7, mm0 ; V26
  462. movq mm4, mmword ptr [esi+8*14] ; V14
  463. paddw mm3, mm0 ; V29 ; free mm0
  464. movq mm1, mm7 ; duplicate V26
  465. movq mmword ptr scratch1, mm7
  466. movq mm7, mmword ptr x539f539f539f539f ; 23170 ->V18
  467. movq mmword ptr scratch3, mm7
  468. movq mmword ptr scratch5, mm0
  469. movq mmword ptr scratch7, mm1
  470. PackMulW
  471. movq mm7, mmword ptr scratch1
  472. movq mm0, mmword ptr scratch5
  473. movq mm1, mmword ptr scratch7
  474. movq mm0, mm5 ; duplicate V2
  475. paddw mm5, mm4 ; V27
  476. psubw mm0, mm4 ; V28 ; free mm4
  477. movq mm2, mm0 ; duplicate V28
  478. movq mmword ptr scratch1, mm0
  479. movq mm0, mmword ptr x4546454645464546 ; 23170 ->V18
  480. movq mmword ptr scratch3, mm0
  481. movq mmword ptr scratch7, mm1
  482. PackMulW
  483. movq mm0, mmword ptr scratch1
  484. movq mm1, mmword ptr scratch7
  485. movq mm4, mm5 ; duplicate t90=t93
  486. paddw mm1, mm2 ; V32 ; free mm2
  487. movq mmword ptr scratch1, mm1
  488. movq mm1, mmword ptr x61f861f861f861f8 ; 23170 ->V18
  489. movq mmword ptr scratch3, mm1
  490. movq mmword ptr scratch5, mm0
  491. PackMulW
  492. movq mm1, mmword ptr scratch1
  493. movq mm0, mmword ptr scratch5
  494. paddw mm5, mm3 ; V31
  495. psubw mm4, mm3 ; V30 ; free mm3
  496. movq mmword ptr scratch1, mm4
  497. movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  498. movq mmword ptr scratch3, mm4
  499. movq mmword ptr scratch5, mm0
  500. movq mmword ptr scratch7, mm1
  501. PackMulW
  502. movq mm4, mmword ptr scratch1
  503. movq mm0, mmword ptr scratch5
  504. movq mm1, mmword ptr scratch7
  505. psubw mm0, mm1 ; V38
  506. paddw mm1, mm7 ; V37 ; free mm7
  507. ;move from the next block
  508. movq mm3, mm6 ; duplicate V25
  509. ;move from the next block
  510. movq mm7, mmword ptr [esi+8*4] ; V22
  511. psubw mm1, mm5 ; V39 (mm5 still needed for next block)
  512. ;move from the next block
  513. movq mm2, mmword ptr [esi+8*12] ; V23
  514. psubw mm4, mm1 ; V40
  515. paddw mm0, mm4 ; V41; free mm0
  516. ;move from the next block
  517. ; column 0: output butterfly
  518. ;move above
  519. psubw mm6, mm0 ; tm6
  520. paddw mm3, mm0 ; tm8; free mm1
  521. movq mm0, mm1 ; line added by Kumar
  522. movq mm1, mm7 ; duplicate V22
  523. paddw mm7, mm5 ; tm0
  524. movq mmword ptr [esi+8*8], mm3 ; tm8; free mm3
  525. psubw mm1, mm5 ; tm14; free mm5
  526. movq mmword ptr [esi+8*6], mm6 ; tm6; free mm6
  527. movq mm3, mm2 ; duplicate t117=t125
  528. movq mm6, mmword ptr [esi+8*0] ; V24
  529. paddw mm2, mm0 ; tm2
  530. movq mmword ptr [esi+8*0], mm7 ; tm0; free mm7
  531. psubw mm3, mm0 ; tm12; free mm0
  532. movq mmword ptr [esi+8*14], mm1 ; tm14; free mm1
  533. movq mmword ptr [esi+8*2], mm2 ; tm2; free mm2
  534. movq mm0, mm6 ; duplicate t119=t123
  535. movq mmword ptr [esi+8*12], mm3 ; tm12; free mm3
  536. paddw mm6, mm4 ; tm4
  537. ;moved from next block
  538. psubw mm0, mm4 ; tm10; free mm4
  539. ;moved from next block
  540. movq mm1, mmword ptr [esi+8*5] ; V5
  541. movq mmword ptr [esi+8*4], mm6 ; tm4; free mm6
  542. movq mmword ptr [esi+8*10], mm0 ; tm10; free mm0
  543. ; column 1: even part
  544. ; use V5, V13, V1, V9 to produce V56..V59
  545. ;moved to prev block
  546. movq mm7, mmword ptr [esi+8*13] ; V13
  547. movq mm2, mm1 ; duplicate t128=t130
  548. movq mm3, mmword ptr [esi+8*1] ; V1
  549. psubw mm1, mm7 ; V50
  550. movq mm5, mmword ptr [esi+8*9] ; V9
  551. paddw mm2, mm7 ; V51
  552. movq mmword ptr scratch1, mm1
  553. movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  554. movq mmword ptr scratch3, mm1
  555. movq mmword ptr scratch5, mm0
  556. PackMulW
  557. movq mm1, mmword ptr scratch1
  558. movq mm0, mmword ptr scratch5
  559. movq mm6, mm2 ; duplicate V51
  560. movq mm4, mm3 ; duplicate V1
  561. paddw mm3, mm5 ; V53
  562. psubw mm4, mm5 ; V54 ;mm5 free
  563. movq mm7, mm3 ; duplicate V53
  564. ;moved from next block
  565. psubw mm1, mm6 ; V55 ; mm6 free
  566. paddw mm3, mm2 ; V56
  567. movq mm5, mm4 ; duplicate t140=t142
  568. paddw mm4, mm1 ; V57
  569. movq mmword ptr [esi+8*5], mm3 ; V56
  570. psubw mm5, mm1 ; V58; mm1 free
  571. movq mmword ptr [esi+8*13], mm4 ; V57
  572. psubw mm7, mm2 ; V59; mm2 free
  573. movq mmword ptr [esi+8*9], mm5 ; V58
  574. ; keep mm7 alive all along the next block
  575. movq mm0, mmword ptr [esi+8*11] ; V11
  576. movq mm6, mmword ptr [esi+8*7] ; V7
  577. movq mm3, mm0 ; duplicate V11
  578. movq mm4, mmword ptr [esi+8*15] ; V15
  579. movq mm5, mmword ptr [esi+8*3] ; V3
  580. paddw mm0, mm6 ; V63
  581. ; note that V15 computation has a correction step:
  582. ; this is a 'magic' constant that rebiases the results to be closer to the expected result
  583. ; this magic constant can be refined to reduce the error even more
  584. ; by doing the correction step in a later stage when the number is actually multiplied by 16
  585. psubw mm3, mm6 ; V60 ; free mm6
  586. movq mm1, mm3 ; duplicate V60
  587. movq mmword ptr scratch1, mm1
  588. movq mm1, mmword ptr x539f539f539f539f ; 23170 ->V18
  589. movq mmword ptr scratch3, mm1
  590. movq mmword ptr scratch5, mm0
  591. PackMulW
  592. movq mm1, mmword ptr scratch1
  593. movq mm0, mmword ptr scratch5
  594. movq mm6, mm5 ; duplicate V3
  595. paddw mm5, mm4 ; V61
  596. psubw mm6, mm4 ; V62 ; free mm4
  597. movq mm4, mm5 ; duplicate V61
  598. paddw mm5, mm0 ; V65 -> result
  599. psubw mm4, mm0 ; V64 ; free mm0
  600. movq mmword ptr scratch1, mm4
  601. movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
  602. movq mmword ptr scratch3, mm4
  603. movq mmword ptr scratch5, mm0
  604. movq mmword ptr scratch7, mm1
  605. PackMulW
  606. movq mm4, mmword ptr scratch1
  607. movq mm0, mmword ptr scratch5
  608. movq mm1, mmword ptr scratch7
  609. paddw mm3, mm6 ; V66
  610. movq mm2, mm5 ; duplicate V65
  611. movq mmword ptr scratch1, mm3
  612. movq mm3, mmword ptr x61f861f861f861f8 ; 23170 ->V18
  613. movq mmword ptr scratch3, mm3
  614. movq mmword ptr scratch5, mm0
  615. movq mmword ptr scratch7, mm1
  616. PackMulW
  617. movq mm3, mmword ptr scratch1
  618. movq mm0, mmword ptr scratch5
  619. movq mm1, mmword ptr scratch7
  620. movq mmword ptr scratch1, mm6
  621. movq mm6, mmword ptr x4546454645464546 ; 23170 ->V18
  622. movq mmword ptr scratch3, mm6
  623. movq mmword ptr scratch5, mm0
  624. movq mmword ptr scratch7, mm1
  625. PackMulW
  626. movq mm6, mmword ptr scratch1
  627. movq mm0, mmword ptr scratch5
  628. movq mm1, mmword ptr scratch7
  629. ;moved from next block
  630. movq mm0, mmword ptr [esi+8*5] ; V56
  631. psubw mm6, mm3 ; V72
  632. paddw mm3, mm1 ; V71 ; free mm1
  633. psubw mm3, mm2 ; V73 ; free mm2
  634. psubw mm4, mm3 ; V74
  635. ;moved from next block
  636. movq mm1, mm0 ; duplicate t177=t188
  637. paddw mm6, mm4 ; V75
  638. ;moved from next block
  639. paddw mm0, mm5 ; tm1
  640. ;location
  641. ; 5 - V56
  642. ; 13 - V57
  643. ; 9 - V58
  644. ; X - V59, mm7
  645. ; X - V65, mm5
  646. ; X - V73, mm6
  647. ; X - V74, mm4
  648. ; X - V75, mm3
  649. ; free mm0, mm1 & mm2
  650. ;move above
  651. movq mm2, mmword ptr [esi+8*13] ; V57
  652. psubw mm1, mm5 ; tm15; free mm5
  653. movq mmword ptr [esi+8*1], mm0 ; tm1; free mm0
  654. ;save the store as used directly in the transpose
  655. movq mm5, mm7 ; duplicate t182=t184
  656. psubw mm7, mm6 ; tm7
  657. paddw mm5, mm6 ; tm9; free mm3
  658. movq mm6, mm3
  659. movq mm0, mmword ptr [esi+8*9] ; V58
  660. movq mm3, mm2 ; duplicate V57
  661. movq mmword ptr [esi+8*7], mm7 ; tm7; free mm7
  662. psubw mm3, mm6 ; tm13
  663. paddw mm2, mm6 ; tm3 ; free mm6
  664. movq mm6, mm0 ; duplicate V58
  665. movq mmword ptr [esi+8*3], mm2 ; tm3; free mm2
  666. paddw mm0, mm4 ; tm5
  667. psubw mm6, mm4 ; tm11; free mm4
  668. movq mmword ptr [esi+8*5], mm0 ; tm5; free mm0
  669. ; Final results to be stored after the transpose
  670. ; transpose the bottom right quadrant(4X4) of the matrix
  671. ; --------- ---------
  672. ; | M1 | M2 | | M1'| M3'|
  673. ; --------- --> ---------
  674. ; | M3 | M4 | | M2'| M4'|
  675. ; --------- ---------
  676. ;
  677. ; get the pointer to array "range"
  678. mov edi, [esp+52]
  679. ; calculate the destination address
  680. mov ebp, [esp+44] ; get output_buf[4]
  681. mov ebx, [ebp+20]
  682. mov ecx, [ebp+24]
  683. mov edx, [ebp+28]
  684. mov ebp, [ebp+16]
  685. add ebp, [esp+48] ; add to output_col
  686. add ebx, [esp+48] ; add to output_col
  687. add ecx, [esp+48] ; add to output_col
  688. add edx, [esp+48] ; add to output_col
  689. movq mm0, mm5 ; copy w4---0,1,3,5,6
  690. punpcklwd mm5, mm6 ;
  691. punpckhwd mm0, mm6 ;---0,1,3,5,6
  692. movq mm6, mmword ptr [esi+8*0] ;get w0 of top left quadrant
  693. movq mm2, mm3 ;---0,1,2,3,5,6
  694. punpcklwd mm3, mm1 ;
  695. movq mm7, mmword ptr [esi+8*2] ;get w1 of top left quadrant
  696. punpckhwd mm2, mm1 ;---0,2,3,5,6,7
  697. movq mm4, mm5 ;---0,2,3,4,5,6,7
  698. punpckldq mm5, mm3 ; transposed w4
  699. psrlw mm5, 5
  700. movd eax, mm5
  701. and eax, 03ffh
  702. mov al, byte ptr [edi][eax]
  703. mov byte ptr [ebp+4], al
  704. psrlq mm5, 16
  705. movd eax, mm5
  706. and eax, 03ffh
  707. mov al, byte ptr [edi][eax]
  708. mov byte ptr [ebp+5], al
  709. psrlq mm5, 16
  710. movd eax, mm5
  711. and eax, 03ffh
  712. mov al, byte ptr [edi][eax]
  713. mov byte ptr [ebp+6], al
  714. psrlq mm5, 16
  715. movd eax, mm5
  716. and eax, 03ffh
  717. mov al, byte ptr [edi][eax]
  718. mov byte ptr [ebp+7], al
  719. punpckhdq mm4, mm3 ; transposed w5---0,2,4,6,7
  720. movq mm3, mm0 ;---0,2,3,4,6,7
  721. punpckldq mm0, mm2 ; transposed w6
  722. psrlw mm4, 5
  723. movd eax, mm4
  724. and eax, 03ffh
  725. mov al, byte ptr [edi][eax]
  726. mov byte ptr [ebx+4], al
  727. psrlq mm4, 16
  728. movd eax, mm4
  729. and eax, 03ffh
  730. mov al, byte ptr [edi][eax]
  731. mov byte ptr [ebx+5], al
  732. psrlq mm4, 16
  733. movd eax, mm4
  734. and eax, 03ffh
  735. mov al, byte ptr [edi][eax]
  736. mov byte ptr [ebx+6], al
  737. psrlq mm4, 16
  738. movd eax, mm4
  739. and eax, 03ffh
  740. mov al, byte ptr [edi][eax]
  741. mov byte ptr [ebx+7], al
  742. punpckhdq mm3, mm2 ; transposed w7---0,3,6,7
  743. psrlw mm0, 5
  744. movd eax, mm0
  745. and eax, 03ffh
  746. mov al, byte ptr [edi][eax]
  747. mov byte ptr [ecx+4], al
  748. psrlq mm0, 16
  749. movd eax, mm0
  750. and eax, 03ffh
  751. mov al, byte ptr [edi][eax]
  752. mov byte ptr [ecx+5], al
  753. psrlq mm0, 16
  754. movd eax, mm0
  755. and eax, 03ffh
  756. mov al, byte ptr [edi][eax]
  757. mov byte ptr [ecx+6], al
  758. psrlq mm0, 16
  759. movd eax, mm0
  760. and eax, 03ffh
  761. mov al, byte ptr [edi][eax]
  762. mov byte ptr [ecx+7], al
  763. movq mm5, mm6 ; copy w0
  764. psrlw mm3, 5
  765. movd eax, mm3
  766. and eax, 03ffh
  767. mov al, byte ptr [edi][eax]
  768. mov byte ptr [edx+4], al
  769. psrlq mm3, 16
  770. movd eax, mm3
  771. and eax, 03ffh
  772. mov al, byte ptr [edi][eax]
  773. mov byte ptr [edx+5], al
  774. psrlq mm3, 16
  775. movd eax, mm3
  776. and eax, 03ffh
  777. mov al, byte ptr [edi][eax]
  778. mov byte ptr [edx+6], al
  779. psrlq mm3, 16
  780. movd eax, mm3
  781. and eax, 03ffh
  782. mov al, byte ptr [edi][eax]
  783. mov byte ptr [edx+7], al
  784. punpcklwd mm6, mm7
  785. ; transpose the top left quadrant(4X4) of the matrix
  786. ; calculate the destination address
  787. mov ebp, [esp+44] ; get output_buf[0]
  788. mov ebx, [ebp+4]
  789. mov ecx, [ebp+8]
  790. mov edx, [ebp+12]
  791. mov ebp, [ebp+0]
  792. add ebp, [esp+48] ; add to output_col
  793. add ebx, [esp+48] ; add to output_col
  794. add ecx, [esp+48] ; add to output_col
  795. add edx, [esp+48] ; add to output_col
  796. punpckhwd mm5, mm7 ;---5,6,7
  797. movq mm7, mmword ptr [esi+8*4] ; get w2 of TL quadrant
  798. movq mm4, mmword ptr [esi+8*6] ; get w3 of TL quadrant
  799. movq mm3, mm7 ; copy w2---3,4,5,6,7
  800. movq mm2, mm6
  801. punpcklwd mm7, mm4 ;---2,3,4,5,6,7
  802. punpckhwd mm3, mm4 ;---2,3,4,5,6,7
  803. movq mm4, mm5 ;
  804. movq mm1, mm5
  805. punpckldq mm6, mm7 ;---1,2,3,4,5,6,7
  806. psrlw mm6, 5
  807. movd eax, mm6
  808. and eax, 03ffh
  809. mov al, byte ptr [edi][eax]
  810. mov byte ptr [ebp], al
  811. psrlq mm6, 16
  812. movd eax, mm6
  813. and eax, 03ffh
  814. mov al, byte ptr [edi][eax]
  815. mov byte ptr [ebp+1], al
  816. psrlq mm6, 16
  817. movd eax, mm6
  818. and eax, 03ffh
  819. mov al, byte ptr [edi][eax]
  820. mov byte ptr [ebp+2], al
  821. psrlq mm6, 16
  822. movd eax, mm6
  823. and eax, 03ffh
  824. mov al, byte ptr [edi][eax]
  825. mov byte ptr [ebp+3], al
  826. punpckhdq mm2, mm7 ;---1,2,3,4,5,6,7
  827. psrlw mm2, 5
  828. movd eax, mm2
  829. and eax, 03ffh
  830. mov al, byte ptr [edi][eax]
  831. mov byte ptr [ebx], al
  832. psrlq mm2, 16
  833. movd eax, mm2
  834. and eax, 03ffh
  835. mov al, byte ptr [edi][eax]
  836. mov byte ptr [ebx+1], al
  837. psrlq mm2, 16
  838. movd eax, mm2
  839. and eax, 03ffh
  840. mov al, byte ptr [edi][eax]
  841. mov byte ptr [ebx+2], al
  842. psrlq mm2, 16
  843. movd eax, mm2
  844. and eax, 03ffh
  845. mov al, byte ptr [edi][eax]
  846. mov byte ptr [ebx+3], al
  847. punpckldq mm5, mm3 ;---1,2,3,4,5,6,7
  848. psrlw mm5, 5
  849. movd eax, mm5
  850. and eax, 03ffh
  851. mov al, byte ptr [edi][eax]
  852. mov byte ptr [ecx], al
  853. psrlq mm5, 16
  854. movd eax, mm5
  855. and eax, 03ffh
  856. mov al, byte ptr [edi][eax]
  857. mov byte ptr [ecx+1], al
  858. psrlq mm5, 16
  859. movd eax, mm5
  860. and eax, 03ffh
  861. mov al, byte ptr [edi][eax]
  862. mov byte ptr [ecx+2], al
  863. psrlq mm5, 16
  864. movd eax, mm5
  865. and eax, 03ffh
  866. mov al, byte ptr [edi][eax]
  867. mov byte ptr [ecx+3], al
  868. punpckhdq mm1, mm3 ;---1,2,3,4,5,6,7
  869. psrlw mm1, 5
  870. movd eax, mm1
  871. and eax, 03ffh
  872. mov al, byte ptr [edi][eax]
  873. mov byte ptr [edx], al
  874. psrlq mm1, 16
  875. movd eax, mm1
  876. and eax, 03ffh
  877. mov al, byte ptr [edi][eax]
  878. mov byte ptr [edx+1], al
  879. psrlq mm1, 16
  880. movd eax, mm1
  881. and eax, 03ffh
  882. mov al, byte ptr [edi][eax]
  883. mov byte ptr [edx+2], al
  884. psrlq mm1, 16
  885. movd eax, mm1
  886. and eax, 03ffh
  887. mov al, byte ptr [edi][eax]
  888. mov byte ptr [edx+3], al
  889. ; transpose the top right quadrant(4X4) of the matrix
  890. ; calculate the destination address for **bottom left quadrant
  891. mov ebp, [esp+44] ; get output_buf[4]
  892. mov ebx, [ebp+20]
  893. mov ecx, [ebp+24]
  894. mov edx, [ebp+28]
  895. mov ebp, [ebp+16]
  896. add ebp, [esp+48] ; add to output_col
  897. add ebx, [esp+48] ; add to output_col
  898. add ecx, [esp+48] ; add to output_col
  899. add edx, [esp+48] ; add to output_col
  900. movq mm0, mmword ptr [esi+8*1] ;---0
  901. movq mm1, mmword ptr [esi+8*3] ;---0,1,2
  902. movq mm2, mm0
  903. movq mm3, mmword ptr [esi+8*5]
  904. punpcklwd mm0, mm1 ;---0,1,2,3
  905. punpckhwd mm2, mm1
  906. movq mm1, mmword ptr [esi+8*7] ;---0,1,2,3
  907. movq mm4, mm3
  908. punpcklwd mm3, mm1 ;---0,1,2,3,4
  909. punpckhwd mm4, mm1 ;---0,1,2,3,4
  910. movq mm1, mm0
  911. movq mm5, mm2
  912. punpckldq mm0, mm3 ;---0,1,2,3,4,5
  913. punpckhdq mm1, mm3 ;---0,1,2,3,4,5
  914. movq mm3, mmword ptr [esi+8*8]
  915. psrlw mm0, 5
  916. movd eax, mm0
  917. and eax, 03ffh
  918. mov al, byte ptr [edi][eax]
  919. mov byte ptr [ebp], al
  920. psrlq mm0, 16
  921. movd eax, mm0
  922. and eax, 03ffh
  923. mov al, byte ptr [edi][eax]
  924. mov byte ptr [ebp+1], al
  925. psrlq mm0, 16
  926. movd eax, mm0
  927. and eax, 03ffh
  928. mov al, byte ptr [edi][eax]
  929. mov byte ptr [ebp+2], al
  930. psrlq mm0, 16
  931. movd eax, mm0
  932. and eax, 03ffh
  933. mov al, byte ptr [edi][eax]
  934. mov byte ptr [ebp+3], al
  935. punpckldq mm2, mm4 ;---1,2,3,4,5
  936. punpckhdq mm5, mm4 ;---1,2,3,4,5
  937. movq mm4, mmword ptr [esi+8*10]
  938. ; transpose the bottom left quadrant(4X4) of the matrix
  939. ; Also store w1,w2,w3 of top right quadrant into
  940. ; w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
  941. ; of BL is already done.
  942. psrlw mm1, 5
  943. movd eax, mm1
  944. and eax, 03ffh
  945. mov al, byte ptr [edi][eax]
  946. mov byte ptr [ebx], al
  947. psrlq mm1, 16
  948. movd eax, mm1
  949. and eax, 03ffh
  950. mov al, byte ptr [edi][eax]
  951. mov byte ptr [ebx+1], al
  952. psrlq mm1, 16
  953. movd eax, mm1
  954. and eax, 03ffh
  955. mov al, byte ptr [edi][eax]
  956. mov byte ptr [ebx+2], al
  957. psrlq mm1, 16
  958. movd eax, mm1
  959. and eax, 03ffh
  960. mov al, byte ptr [edi][eax]
  961. mov byte ptr [ebx+3], al
  962. movq mm1, mm3 ;---1,2,3,4,5
  963. movq mm0, mmword ptr [esi+8*12]
  964. punpcklwd mm3, mm4 ;---0,1,2,3,4,5
  965. punpckhwd mm1, mm4 ;---0,1,2,3,4,5
  966. movq mm4, mmword ptr [esi+8*14]
  967. psrlw mm2, 5
  968. movd eax, mm2
  969. and eax, 03ffh
  970. mov al, byte ptr [edi][eax]
  971. mov byte ptr [ecx], al
  972. psrlq mm2, 16
  973. movd eax, mm2
  974. and eax, 03ffh
  975. mov al, byte ptr [edi][eax]
  976. mov byte ptr [ecx+1], al
  977. psrlq mm2, 16
  978. movd eax, mm2
  979. and eax, 03ffh
  980. mov al, byte ptr [edi][eax]
  981. mov byte ptr [ecx+2], al
  982. psrlq mm2, 16
  983. movd eax, mm2
  984. and eax, 03ffh
  985. mov al, byte ptr [edi][eax]
  986. mov byte ptr [ecx+3], al
  987. movq mm2, mm0
  988. psrlw mm5, 5
  989. movd eax, mm5
  990. and eax, 03ffh
  991. mov al, byte ptr [edi][eax]
  992. mov byte ptr [edx], al
  993. psrlq mm5, 16
  994. movd eax, mm5
  995. and eax, 03ffh
  996. mov al, byte ptr [edi][eax]
  997. mov byte ptr [edx+1], al
  998. psrlq mm5, 16
  999. movd eax, mm5
  1000. and eax, 03ffh
  1001. mov al, byte ptr [edi][eax]
  1002. mov byte ptr [edx+2], al
  1003. psrlq mm5, 16
  1004. movd eax, mm5
  1005. and eax, 03ffh
  1006. mov al, byte ptr [edi][eax]
  1007. mov byte ptr [edx+3], al
  1008. punpcklwd mm0, mm4 ;---0,1,2,3,4
  1009. punpckhwd mm2, mm4 ;---0,1,2,3,4
  1010. movq mm4, mm3
  1011. movq mm5, mm1
  1012. punpckldq mm3, mm0 ;---0,1,2,3,4,5
  1013. ; calculate the destination address for **top right quadrant
  1014. mov ebp, [esp+44] ; get output_buf[0]
  1015. mov ebx, [ebp+4]
  1016. mov ecx, [ebp+8]
  1017. mov edx, [ebp+12]
  1018. mov ebp, [ebp+0]
  1019. add ebp, [esp+48] ; add to output_col
  1020. add ebx, [esp+48] ; add to output_col
  1021. add ecx, [esp+48] ; add to output_col
  1022. add edx, [esp+48] ; add to output_col
  1023. psrlw mm3, 5
  1024. movd eax, mm3
  1025. and eax, 03ffh
  1026. mov al, byte ptr [edi][eax]
  1027. mov byte ptr [ebp+4], al
  1028. psrlq mm3, 16
  1029. movd eax, mm3
  1030. and eax, 03ffh
  1031. mov al, byte ptr [edi][eax]
  1032. mov byte ptr [ebp+5], al
  1033. psrlq mm3, 16
  1034. movd eax, mm3
  1035. and eax, 03ffh
  1036. mov al, byte ptr [edi][eax]
  1037. mov byte ptr [ebp+6], al
  1038. psrlq mm3, 16
  1039. movd eax, mm3
  1040. and eax, 03ffh
  1041. mov al, byte ptr [edi][eax]
  1042. mov byte ptr [ebp+7], al
  1043. punpckhdq mm4, mm0 ;---1,2,4,5
  1044. psrlw mm4, 5
  1045. movd eax, mm4
  1046. and eax, 03ffh
  1047. mov al, byte ptr [edi][eax]
  1048. mov byte ptr [ebx+4], al
  1049. psrlq mm4, 16
  1050. movd eax, mm4
  1051. and eax, 03ffh
  1052. mov al, byte ptr [edi][eax]
  1053. mov byte ptr [ebx+5], al
  1054. psrlq mm4, 16
  1055. movd eax, mm4
  1056. and eax, 03ffh
  1057. mov al, byte ptr [edi][eax]
  1058. mov byte ptr [ebx+6], al
  1059. psrlq mm4, 16
  1060. movd eax, mm4
  1061. and eax, 03ffh
  1062. mov al, byte ptr [edi][eax]
  1063. mov byte ptr [ebx+7], al
  1064. punpckldq mm1, mm2 ;---1,2,5
  1065. psrlw mm1, 5
  1066. movd eax, mm1
  1067. and eax, 03ffh
  1068. mov al, byte ptr [edi][eax]
  1069. mov byte ptr [ecx+4], al
  1070. psrlq mm1, 16
  1071. movd eax, mm1
  1072. and eax, 03ffh
  1073. mov al, byte ptr [edi][eax]
  1074. mov byte ptr [ecx+5], al
  1075. psrlq mm1, 16
  1076. movd eax, mm1
  1077. and eax, 03ffh
  1078. mov al, byte ptr [edi][eax]
  1079. mov byte ptr [ecx+6], al
  1080. psrlq mm1, 16
  1081. movd eax, mm1
  1082. and eax, 03ffh
  1083. mov al, byte ptr [edi][eax]
  1084. mov byte ptr [ecx+7], al
  1085. punpckhdq mm5, mm2 ;---5
  1086. psrlw mm5, 5
  1087. movd eax, mm5
  1088. and eax, 03ffh
  1089. mov al, byte ptr [edi][eax]
  1090. mov byte ptr [edx+4], al
  1091. psrlq mm5, 16
  1092. movd eax, mm5
  1093. and eax, 03ffh
  1094. mov al, byte ptr [edi][eax]
  1095. mov byte ptr [edx+5], al
  1096. psrlq mm5, 16
  1097. movd eax, mm5
  1098. and eax, 03ffh
  1099. mov al, byte ptr [edi][eax]
  1100. mov byte ptr [edx+6], al
  1101. psrlq mm5, 16
  1102. movd eax, mm5
  1103. and eax, 03ffh
  1104. mov al, byte ptr [edi][eax]
  1105. mov byte ptr [edx+7], al
  1106. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1107. emms
  1108. ret
  1109. _idct8x8aan ENDP
  1110. _TEXT ENDS
  1111. END