Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

832 lines
38 KiB

  1. ;/* *************************************************************************
  2. ;** INTEL Corporation Proprietary Information
  3. ;**
  4. ;** This listing is supplied under the terms of a license
  5. ;** agreement with INTEL Corporation and may not be copied
  6. ;** nor disclosed except in accordance with the terms of
  7. ;** that agreement.
  8. ;**
  9. ;** Copyright (c) 1995 Intel Corporation.
  10. ;** Copyright (c) 1996 Intel Corporation.
  11. ;** All Rights Reserved.
  12. ;**
  13. ;** *************************************************************************
  14. ;*/
  15. ;/* *************************************************************************
  16. ;** $Header: S:\h26x\src\dec\dxmidct.asv 1.5 09 Jul 1996 16:51:26 AGUPTA2 $
  17. ;** $Log: S:\h26x\src\dec\dxmidct.asv $
  18. ;//
  19. ;// Rev 1.5 09 Jul 1996 16:51:26 AGUPTA2
  20. ;// IDCT now expects actual number of coeffs.
  21. ;//
  22. ;// Rev 1.4 08 Jul 1996 11:42:50 AGUPTA2
  23. ;// Fixed the accuracy problem where a shift was in the wrong place.
  24. ;//
  25. ;// Rev 1.3 30 May 1996 12:25:02 AGUPTA2
  26. ;// Fixed the overflow problem in computing u0-u3 in first four columns.
  27. ;//
  28. ;// Rev 1.2 09 Apr 1996 09:42:08 agupta2
  29. ;// Code to clear IDCT buffer moved to MMX_BlockCopy and MMX_BlockMove.
  30. ;//
  31. ;// Rev 1.1 22 Mar 1996 10:17:26 agupta2
  32. ;// Initial revision of MMX version of IDCT.
  33. ;//
  34. ;// Rev 1.0 14 Mar 1996 14:38:02 AGUPTA2
  35. ;// Initial revision.
  36. ;** *************************************************************************
  37. ;*/
  ; ---------------------------------------------------------------------------
  ; Assembler setup (MASM, 32-bit flat model).
  ; ---------------------------------------------------------------------------
  .586
  .model flat

  ; The procedure in this file writes its own entry and exit sequences, so the
  ; assembler must not generate any around PROC / RET.
  OPTION PROLOGUE:None, EPILOGUE:None

  ; MMX instruction support comes from this include; its contents are kept out
  ; of the listing.
  .xlist
  include iammx.inc
  .list

  ; Declare both segments once, with their full attributes, before they are
  ; reopened further down by name only.
  MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
  MMXCODE1 ENDS

  MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
  MMXDATA1 ENDS
  MMXDATA1 SEGMENT
;
; Fixed-point multiplier tables for the IDCT.
;
; Every BETA factor is stored with CONSTBITS fraction bits:
;
;       BETAn = ROUND(value * 2^CONSTBITS)      e.g. 1.414213562 * 2^13 = 02D41H
;
;       BETA1 = 1.414213562
;       BETA2 = 2.613125930     (stored negated, as NEGBETA2)
;       BETA3 = 1.414213562     (same storage as BETA1)
;       BETA4 = 1.082392200
;       BETA5 = 0.765366865
;
; Each table is one 8-byte-aligned quadword made of two identical DWORDs.
; The 16-bit factor sits in the HIGH word of each DWORD and the low word is 0:
; the code unpacks data words into the high word of a zeroed register and then
; uses PMADDWD, so each DWORD result is simply data * BETA.
;
CONSTBITS = 13

        ALIGN   8
BETA1   LABEL   DWORD
BETA3   LABEL   DWORD
        DWORD   2 DUP (02D410000H)

        ALIGN   8
NEGBETA2 LABEL  DWORD
        DWORD   2 DUP (0AC610000H)

        ALIGN   8
BETA4   LABEL   DWORD
        DWORD   2 DUP (022A30000H)

        ALIGN   8
BETA5   LABEL   DWORD
        DWORD   2 DUP (0187E0000H)

;
; Rounding terms added to the 32-bit products before the arithmetic right
; shift: half of the divisor for a shift by CONSTBITS+1 and by CONSTBITS.
;
        ALIGN   8
CONSTBITS_P_1_RND LABEL DWORD
        DWORD   2 DUP (1 SHL CONSTBITS)         ; = 02000H

        ALIGN   8
CONSTBITS_RND LABEL DWORD
        DWORD   2 DUP (1 SHL (CONSTBITS - 1))   ; = 01000H

;
; The value 1 in each of the four packed words.
;
        ALIGN   8
ONE     LABEL   DWORD
        DWORD   2 DUP (000010001H)
  MMXDATA1 ENDS
  MMXCODE1 SEGMENT

;------------------------------------------------------------------------------
; Line offsets used by the row passes, relative to edi = pBuf+64.
;
; Lines 0..3 are the first 8 bytes of buffer rows 0..3 and lines 4..7 are the
; second 8 bytes of those same rows.  The column passes leave every 4x4 tile
; of the block transposed in place, so "line n" read this way holds element n
; of four consecutive rows (one row per packed word).
;------------------------------------------------------------------------------
RLINE0 = 0  - 64
RLINE1 = 16 - 64
RLINE2 = 32 - 64
RLINE3 = 48 - 64
RLINE4 = 8  - 64
RLINE5 = 24 - 64
RLINE6 = 40 - 64
RLINE7 = 56 - 64

;------------------------------------------------------------------------------
; IDCT_ROW_PASS RowOfs
;
; One 1-D IDCT pass over four rows of the block, in place.
;
;   RowOfs  byte displacement added to every RLINEn offset:
;           0 for buffer rows 0..3, 64 for buffer rows 4..7.
;
; Expects edi = pBuf+64 and the Tu7 / Tv5 spill slots to be addressable.
; Uses mm0..mm7.  The pass finishes with two 4x4 word transposes: T1 writes
; v0..v3 to lines 0..3 and T2 writes v4..v7 to lines 4..7.
;
; Scaling in this pass: s4/s5/s6 are descaled by CONSTBITS+1, r7 is halved
; with rounding (ONE), and -u4, u5, u6, u7 are then doubled (psllw 1) before
; the final butterflies.
;
; The instruction order interleaves independent operations and is kept
; exactly as in the original hand-scheduled code.
;------------------------------------------------------------------------------
IDCT_ROW_PASS MACRO RowOfs:REQ
        pxor    mm4, mm4
        movq    mm0, [edi+RLINE5+RowOfs]
        pxor    mm5, mm5
        movq    mm1, [edi+RLINE1+RowOfs]
        pxor    mm2, mm2
        psubw   mm0, [edi+RLINE3+RowOfs]        ;q4=r4
        pxor    mm3, mm3
        psubw   mm1, [edi+RLINE7+RowOfs]        ;q6=r6
        punpcklwd mm4, mm0
        pmaddwd mm4, NEGBETA2
        punpckhwd mm5, mm0
        pmaddwd mm5, NEGBETA2
        psubw   mm0, mm1                        ;r4-r6
        punpcklwd mm2, mm0
        pxor    mm6, mm6
        pmaddwd mm2, BETA5
        punpckhwd mm3, mm0
        pmaddwd mm3, BETA5
        punpcklwd mm6, mm1
        pmaddwd mm6, BETA4
        pxor    mm7, mm7
        punpckhwd mm7, mm1
        paddd   mm4, mm2                        ;s4l
        pmaddwd mm7, BETA4
        paddd   mm5, mm3                        ;s4h
        paddd   mm4, CONSTBITS_P_1_RND          ;s4l rounded
        psubd   mm6, mm2                        ;s6l
        paddd   mm5, CONSTBITS_P_1_RND          ;s4h rounded
        psrad   mm4, CONSTBITS+1                ;s4l rounded descaled
        psubd   mm7, mm3                        ;s6h
        psrad   mm5, CONSTBITS+1                ;s4h rounded descaled
        paddd   mm6, CONSTBITS_P_1_RND          ;s6l rounded
        packssdw mm4, mm5                       ;s4
        paddd   mm7, CONSTBITS_P_1_RND          ;s6h rounded
        psrad   mm6, CONSTBITS+1                ;s6l rounded descaled
        movq    mm0, [edi+RLINE1+RowOfs]
        psrad   mm7, CONSTBITS+1                ;s6h rounded descaled
        ;mm0=q5 mm4=s4
        ;mm2=q7 mm6=s6
        paddw   mm0, [edi+RLINE7+RowOfs]        ;q5
        packssdw mm6, mm7                       ;s6
        movq    mm2, [edi+RLINE3+RowOfs]
        pxor    mm5, mm5
        paddw   mm2, [edi+RLINE5+RowOfs]        ;q7
        movq    mm7, mm0                        ;q5
        psubw   mm0, mm2                        ;r5=q5-q7
        paddw   mm7, mm2                        ;r7=q5+q7
        punpcklwd mm5, mm0
        pxor    mm3, mm3
        pmaddwd mm5, BETA3                      ;s5l
        punpckhwd mm3, mm0
        pmaddwd mm3, BETA3                      ;s5h
        ;TODO
        paddw   mm7, ONE                        ;+1 in every word, rounds the >>1 below
        ;TODO
        movq    mm0, [edi+RLINE2+RowOfs]
        psraw   mm7, 1                          ;s7
        paddd   mm5, CONSTBITS_P_1_RND          ;s5l rounded
        psubw   mm6, mm7                        ;u6
        paddd   mm3, CONSTBITS_P_1_RND          ;s5h rounded
        psrad   mm5, CONSTBITS+1                ;s5l rounded descaled
        psubw   mm0, [edi+RLINE6+RowOfs]        ;r2
        psrad   mm3, CONSTBITS+1                ;s5h rounded descaled
        packssdw mm5, mm3                       ;s5
        pxor    mm1, mm1
        ;mm0=r2 mm4=s4
        ;mm1    mm5=u5
        ;mm2    mm6=u6
        ;mm3    mm7=Tu7
        psllw   mm7, 1                          ;u7<<1
        movq    Tu7, mm7                        ;Save u7<<1
        pxor    mm7, mm7
        movq    mm2, [edi+RLINE0+RowOfs]
        punpcklwd mm1, mm0
        pmaddwd mm1, BETA1                      ;s2l
        punpckhwd mm7, mm0
        pmaddwd mm7, BETA1                      ;s2h
        psubw   mm5, mm6                        ;u5
        movq    mm0, [edi+RLINE2+RowOfs]
        paddw   mm4, mm5                        ;-u4
        ;mm4=-u4 mm5=u5
        ;mm6=u6  mm7=
        paddd   mm1, CONSTBITS_RND              ;s2l rounded
        paddd   mm7, CONSTBITS_RND              ;s2h rounded
        psrad   mm1, CONSTBITS                  ;s2l rounded descaled
        paddw   mm0, [edi+RLINE6+RowOfs]        ;r3=s3=t3
        psrad   mm7, CONSTBITS                  ;s2h rounded descaled
        movq    mm3, mm2
        packssdw mm1, mm7                       ;s2
        psubw   mm2, [edi+RLINE4+RowOfs]        ;t1
        psubw   mm1, mm0                        ;t2=s2-s3
        paddw   mm3, [edi+RLINE4+RowOfs]        ;t0
        movq    mm7, mm0                        ;t3
        paddw   mm0, mm3                        ;u0=t3+t0
        psubw   mm3, mm7                        ;u3=t0-t3
        ;TODO
        movq    mm7, mm1                        ;t2
        paddw   mm1, mm2                        ;u1=t2+t1
        psubw   mm2, mm7                        ;u2=t1-t2
        ;mm0=u0 mm4=-u4
        ;mm1=u1 mm5=u5
        ;mm2=u2 mm6=u6
        ;mm3=u3 mm7=avail.
        psllw   mm4, 1                          ;-u4<<1
        movq    mm7, mm3
        psubw   mm3, mm4                        ;v3=u3-(-u4<<1)
        paddw   mm4, mm7                        ;v4=(-u4<<1)+u3
        psllw   mm5, 1                          ;u5<<1
        movq    mm7, mm2
        psubw   mm2, mm5                        ;v5=u2-(u5<<1)
        paddw   mm5, mm7                        ;v2=(u5<<1)+u2
        psllw   mm6, 1                          ;u6<<1
        movq    mm7, mm1
        psubw   mm1, mm6                        ;v6=u1-(u6<<1)
        paddw   mm6, mm7                        ;v1=(u6<<1)+u1
        movq    Tv5, mm2                        ;Save v5
        movq    mm7, mm0
        movq    mm2, mm5                        ;T1
        punpckhwd mm5, mm3                      ;T1(c,d)
        paddw   mm7, Tu7                        ;v0=u0+(u7<<1)
        ;v0=mm7 v4=mm4
        ;v1=mm6 v5=Tv5 (to mm2 later)
        ;v2=mm5 v6=mm1
        ;v3=mm3 v7=mm0 (later)
        punpcklwd mm2, mm3                      ;T1(c,d);mm3 free
        movq    mm3, mm7                        ;T1(a,b)
        punpckhwd mm7, mm6                      ;T1(a,b)
        punpcklwd mm3, mm6                      ;T1(a,b);mm6 free
        movq    mm6, mm7                        ;T1
        psubw   mm0, Tu7                        ;v7=u0-(u7<<1)
        punpckldq mm7, mm5                      ;T1
        punpckhdq mm6, mm5                      ;T1;mm5 free
        movq    mm5, mm3                        ;T1
        movq    [edi+RLINE2+RowOfs], mm7        ;T1
        punpckldq mm3, mm2                      ;T1
        movq    [edi+RLINE3+RowOfs], mm6        ;T1
        punpckhdq mm5, mm2                      ;T1
        movq    [edi+RLINE0+RowOfs], mm3        ;T1
        movq    mm6, mm1                        ;T2(c,d)
        movq    [edi+RLINE1+RowOfs], mm5        ;T1
        punpckhwd mm1, mm0                      ;T2(c,d)
        movq    mm2, Tv5
        punpcklwd mm6, mm0                      ;T2(c,d);mm0 free
        movq    mm7, mm4                        ;T2(a,b)
        punpckhwd mm4, mm2                      ;T2(a,b)
        punpcklwd mm7, mm2                      ;T2(a,b);mm2 free
        movq    mm2, mm4                        ;T2
        punpckldq mm4, mm1                      ;T2
        punpckhdq mm2, mm1                      ;T2
        movq    mm1, mm7                        ;T2
        movq    [edi+RLINE6+RowOfs], mm4        ;T2
        punpckhdq mm1, mm6                      ;T2
        movq    [edi+RLINE7+RowOfs], mm2        ;T2
        punpckldq mm7, mm6                      ;T2
        movq    [edi+RLINE5+RowOfs], mm1        ;T2
        movq    [edi+RLINE4+RowOfs], mm7        ;T2
ENDM

;------------------------------------------------------------------------------
; @MMX_DecodeBlock_IDCT@12
;
; Scatters a list of (coefficient, index) pairs into an 8x8 block of WORDs and
; then runs a 2-D inverse DCT on that block in place, using MMX.
;
; If called from a C function, this routine must be declared as:
;       extern "C" void _fastcall MMX_DecodeBlock_IDCT(...)
; Two arguments arrive in registers and one DWORD on the stack; the routine
; removes the stack argument itself ("ret 4").
;
; In:
;   ecx       pIQ_INDEX   Pointer to an array of 8-byte entries:
;                           +0  DWORD  inverse quantized and scaled coefficient
;                           +4  DWORD  index of that coefficient in the block
;   edx       CountCoeff  Number of entries, documented as <= 64.
;                         0 is accepted: nothing is scattered and the IDCT
;                         runs on the buffer contents as they are.
;   [esp+4]   pBuf        Output area: 8x8 matrix of WORD values (16 bytes per
;                         row); the original header states 6 fraction bits.
;
; Out:
;   pBuf holds the IDCT result.  There is no return value.
;
; Clobbers: eax, edx, flags, mm0..mm7.  esi and edi are saved and restored;
;           ecx is only read.
;           No EMMS is executed here, so the MMX state is still active on
;           return.
;
; Stack:    24-byte local frame, forced to 8-byte alignment:
;             Tu7       [esp+0]   qword spill slot
;             Tv5       [esp+8]   qword spill slot
;             StashESP  [esp+16]  esp as it was before the frame was carved
;
; Notes:
;   * Entries are consumed from last to first and only the low 16 bits of each
;     coefficient are stored.
;   * The index is used directly as a WORD index into pBuf; it is not range
;     checked here.
;   * NOTE(review): elements of pBuf that receive no coefficient are never
;     written by the scatter step, so the caller presumably supplies a cleared
;     buffer - confirm against the callers.
;
; Algorithm:
;   Scaled IDCT credited to Arai, Agui, and Nakajima (AAN).
;     1) decode_coeff      scatter the coefficients into pBuf
;     2) cols_0_3/cols_4_7 1-D pass over columns 0..3 and 4..7, four columns
;                          per MMX register; each result tile is stored
;                          transposed
;     3) rows_0_3/rows_4_7 1-D pass over rows 0..3 and 4..7; the closing
;                          transposes write the block back in row order
;
;   In every multiply the data words are unpacked into the HIGH word of a
;   zeroed register, matching the constant tables (factor in the high word,
;   0 in the low word), so PMADDWD gives data * BETA in each DWORD.
;------------------------------------------------------------------------------
@MMX_DecodeBlock_IDCT@12 PROC

LocalFrameSize  =       24
Tu7             textequ <[esp+0]>
Tv5             textequ <[esp+8]>
StashESP        textequ <[esp+16]>

        push    esi
        push    edi
        mov     edi, esp                        ;esp after the two pushes
        sub     esp, LocalFrameSize
        and     esp, 0FFFFFFF8H                 ;Align at 8-byte boundary
        pxor    mm0, mm0
        mov     StashESP, edi                   ;restored at IDCT_Done
        mov     edi, DWORD PTR [edi+12]         ;pBuf (2 pushes + return address)
        add     edi, 64                         ;pBuf+64
        xor     eax, eax

        ; The loop below tests its counter only after the first iteration, so
        ; a zero count would read in front of the array and then wrap edx.
        test    edx, edx
        jz      IDCT_Start

;
; Decode coefficients and place them in the output buffer
;   ecx: pIQ_INDEX
;   edx: entries still to process (always > 0 at the loop top)
;   edi: pBuf+64
;   eax, esi: scratch
;
decode_coeff:
        mov     esi, [ecx+edx*8-4]              ;Index
        mov     eax, [ecx+edx*8-8]              ;Inverse quantized scaled coeff
        mov     WORD PTR [edi+esi*2-64], ax     ;pBuf[Index] = low word of coeff
        dec     edx
        jnz     decode_coeff

IDCT_Start:

;
; Column pass, columns 0..3.  CLINEn is the offset of buffer row n from edi.
;
; Scaling in this pass: s4/s5/s6 are descaled by CONSTBITS+1 and q5, q7,
; t0..t3 are halved (psraw 1) before the butterflies.  This is presumably the
; Rev 1.3 overflow fix for u0-u3 mentioned in the file header.
;
cols_0_3:
CLINE0 = 0  - 64
CLINE1 = 16 - 64
CLINE2 = 32 - 64
CLINE3 = 48 - 64
CLINE4 = 64 - 64
CLINE5 = 80 - 64
CLINE6 = 96 - 64
CLINE7 = 112- 64
        pxor    mm4, mm4
        movq    mm0, [edi+CLINE5]
        pxor    mm5, mm5
        movq    mm1, [edi+CLINE1]
        pxor    mm2, mm2
        psubw   mm0, [edi+CLINE3]               ;q4=r4
        pxor    mm3, mm3
        psubw   mm1, [edi+CLINE7]               ;q6=r6
        punpcklwd mm4, mm0
        pmaddwd mm4, NEGBETA2
        punpckhwd mm5, mm0
        pmaddwd mm5, NEGBETA2
        psubw   mm0, mm1                        ;r4-r6
        punpcklwd mm2, mm0
        pxor    mm6, mm6
        pmaddwd mm2, BETA5
        punpckhwd mm3, mm0
        pmaddwd mm3, BETA5
        punpcklwd mm6, mm1
        pmaddwd mm6, BETA4
        pxor    mm7, mm7
        punpckhwd mm7, mm1
        paddd   mm4, mm2                        ;s4l
        pmaddwd mm7, BETA4
        paddd   mm5, mm3                        ;s4h
        paddd   mm4, CONSTBITS_P_1_RND          ;s4l rounded
        psubd   mm6, mm2                        ;s6l
        paddd   mm5, CONSTBITS_P_1_RND          ;s4h rounded
        psrad   mm4, CONSTBITS+1                ;s4l rounded descaled
        psubd   mm7, mm3                        ;s6h
        psrad   mm5, CONSTBITS+1                ;s4h rounded descaled
        paddd   mm6, CONSTBITS_P_1_RND          ;s6l rounded
        packssdw mm4, mm5                       ;s4
        paddd   mm7, CONSTBITS_P_1_RND          ;s6h rounded
        psrad   mm6, CONSTBITS+1                ;s6l rounded descaled
        movq    mm0, [edi+CLINE1]
        psrad   mm7, CONSTBITS+1                ;s6h rounded descaled
        ;mm0=q5 mm4=s4
        ;mm2=q7 mm6=s6
        paddw   mm0, [edi+CLINE7]               ;q5
        packssdw mm6, mm7                       ;s6
        movq    mm2, [edi+CLINE3]
        pxor    mm5, mm5
        paddw   mm2, [edi+CLINE5]               ;q7
        movq    mm7, mm0                        ;q5
        psubw   mm0, mm2                        ;r5=q5-q7
        psraw   mm7, 1                          ;q5>>1
        punpcklwd mm5, mm0
        pxor    mm3, mm3
        pmaddwd mm5, BETA3                      ;s5l
        punpckhwd mm3, mm0
        pmaddwd mm3, BETA3                      ;s5h
        psraw   mm2, 1                          ;q7>>1
        movq    mm0, [edi+CLINE2]
        paddw   mm7, mm2                        ;r7=s7=u7
        paddd   mm5, CONSTBITS_P_1_RND          ;s5l rounded
        psubw   mm6, mm7                        ;u6
        paddd   mm3, CONSTBITS_P_1_RND          ;s5h rounded
        psrad   mm5, CONSTBITS+1                ;s5l rounded descaled
        psubw   mm0, [edi+CLINE6]               ;r2
        psrad   mm3, CONSTBITS+1                ;s5h rounded descaled
        packssdw mm5, mm3                       ;s5
        pxor    mm1, mm1
        ;mm0=r2 mm4=s4
        ;mm1    mm5=u5
        ;mm2    mm6=u6
        ;mm3    mm7=Tu7
        movq    Tu7, mm7                        ;Save u7
        pxor    mm7, mm7
        movq    mm2, [edi+CLINE0]
        punpcklwd mm1, mm0
        pmaddwd mm1, BETA1                      ;s2l
        punpckhwd mm7, mm0
        pmaddwd mm7, BETA1                      ;s2h
        psubw   mm5, mm6                        ;u5
        movq    mm0, [edi+CLINE2]
        paddw   mm4, mm5                        ;-u4
        ;mm4=-u4 mm5=u5
        ;mm6=u6  mm7=u7
        paddd   mm1, CONSTBITS_RND              ;s2l rounded
        paddd   mm7, CONSTBITS_RND              ;s2h rounded
        psrad   mm1, CONSTBITS                  ;s2l rounded descaled
        paddw   mm0, [edi+CLINE6]               ;r3=s3=t3
        psrad   mm7, CONSTBITS                  ;s2h rounded descaled
        movq    mm3, mm2
        packssdw mm1, mm7                       ;s2
        psubw   mm2, [edi+CLINE4]               ;t1
        psubw   mm1, mm0                        ;t2=s2-s3
        psraw   mm0, 1                          ;t3>>1
        psraw   mm2, 1                          ;t1>>1
        psraw   mm1, 1                          ;t2>>1
        paddw   mm3, [edi+CLINE4]               ;t0
        movq    mm7, mm0                        ;t3>>1 copy
        psraw   mm3, 1                          ;t0>>1
        paddw   mm0, mm3                        ;u0=t3+t0
        psubw   mm3, mm7                        ;u3=t0-t3
        movq    mm7, mm1                        ;t2
        paddw   mm1, mm2                        ;u1=t2+t1
        psubw   mm2, mm7                        ;u2=t1-t2
        ;mm0=u0 mm4=-u4
        ;mm1=u1 mm5=u5
        ;mm2=u2 mm6=u6
        ;mm3=u3 mm7=avail.
        movq    mm7, mm3                        ;u3>>1
        psubw   mm3, mm4                        ;v3=u3-(-u4)
        paddw   mm4, mm7                        ;v4=-u4+u3
        movq    mm7, mm2                        ;u2>>1
        psubw   mm2, mm5                        ;v5=u2-u5
        paddw   mm5, mm7                        ;v2=u5+u2
        movq    mm7, mm1                        ;u1>>1
        psubw   mm1, mm6                        ;v6=u1-u6
        paddw   mm6, mm7                        ;v1=u6+u1
        movq    Tv5, mm2                        ;Save v5
        movq    mm7, mm0
        movq    mm2, mm5                        ;T1
        punpckhwd mm5, mm3                      ;T1(c,d)
        paddw   mm7, Tu7                        ;v0
        ;v0=mm7 v4=mm4
        ;v1=mm6 v5=Tv5 (to mm2 later)
        ;v2=mm5 v6=mm1
        ;v3=mm3 v7=mm0 (later)
        punpcklwd mm2, mm3                      ;T1(c,d);mm3 free
        movq    mm3, mm7                        ;T1(a,b)
        punpckhwd mm7, mm6                      ;T1(a,b)
        punpcklwd mm3, mm6                      ;T1(a,b);mm6 free
        movq    mm6, mm7                        ;T1
        psubw   mm0, Tu7                        ;v7
        punpckldq mm7, mm5                      ;T1
        punpckhdq mm6, mm5                      ;T1;mm5 free
        movq    mm5, mm3                        ;T1
        movq    [edi+CLINE2], mm7               ;T1
        punpckldq mm3, mm2                      ;T1
        movq    [edi+CLINE3], mm6               ;T1
        punpckhdq mm5, mm2                      ;T1
        movq    [edi+CLINE0], mm3               ;T1
        movq    mm6, mm1                        ;T2(c,d)
        movq    [edi+CLINE1], mm5               ;T1
        punpckhwd mm1, mm0                      ;T2(c,d)
        movq    mm2, Tv5
        punpcklwd mm6, mm0                      ;T2(c,d);mm0 free
        movq    mm7, mm4                        ;T2(a,b)
        punpckhwd mm4, mm2                      ;T2(a,b)
        punpcklwd mm7, mm2                      ;T2(a,b);mm2 free
        movq    mm2, mm4                        ;T2
        punpckldq mm4, mm1                      ;T2
        punpckhdq mm2, mm1                      ;T2
        movq    mm1, mm7                        ;T2
        movq    [edi+CLINE6], mm4               ;T2
        punpckhdq mm1, mm6                      ;T2
        movq    [edi+CLINE7], mm2               ;T2
        punpckldq mm7, mm6                      ;T2
        movq    [edi+CLINE5], mm1               ;T2
        movq    [edi+CLINE4], mm7               ;T2

;
; Column pass, columns 4..7: same flow with 8 added to every CLINE offset.
;
; Scaling differs from cols_0_3: products are descaled by CONSTBITS, nothing
; is halved on the way in, and each of v0..v7 is halved (psraw 1) at the end.
;
cols_4_7:
        pxor    mm4, mm4
        movq    mm0, [edi+CLINE5+8]
        pxor    mm5, mm5
        movq    mm1, [edi+CLINE1+8]
        pxor    mm2, mm2
        psubw   mm0, [edi+CLINE3+8]             ;q4=r4
        pxor    mm3, mm3
        psubw   mm1, [edi+CLINE7+8]             ;q6=r6
        punpcklwd mm4, mm0
        pmaddwd mm4, NEGBETA2
        punpckhwd mm5, mm0
        pmaddwd mm5, NEGBETA2
        psubw   mm0, mm1                        ;r4-r6
        punpcklwd mm2, mm0
        pxor    mm6, mm6
        pmaddwd mm2, BETA5
        punpckhwd mm3, mm0
        pmaddwd mm3, BETA5
        punpcklwd mm6, mm1
        pmaddwd mm6, BETA4
        pxor    mm7, mm7
        punpckhwd mm7, mm1
        paddd   mm4, mm2                        ;s4l
        pmaddwd mm7, BETA4
        paddd   mm5, mm3                        ;s4h
        paddd   mm4, CONSTBITS_RND              ;s4l rounded
        psubd   mm6, mm2                        ;s6l
        paddd   mm5, CONSTBITS_RND              ;s4h rounded
        psrad   mm4, CONSTBITS                  ;s4l rounded descaled
        psubd   mm7, mm3                        ;s6h
        psrad   mm5, CONSTBITS                  ;s4h rounded descaled
        paddd   mm6, CONSTBITS_RND              ;s6l rounded
        packssdw mm4, mm5                       ;s4
        paddd   mm7, CONSTBITS_RND              ;s6h rounded
        psrad   mm6, CONSTBITS                  ;s6l rounded descaled
        movq    mm0, [edi+CLINE1+8]
        psrad   mm7, CONSTBITS                  ;s6h rounded descaled
        ;mm0=q5 mm4=s4
        ;mm2=q7 mm6=s6
        paddw   mm0, [edi+CLINE7+8]             ;q5
        packssdw mm6, mm7                       ;s6
        movq    mm2, [edi+CLINE3+8]
        pxor    mm5, mm5
        paddw   mm2, [edi+CLINE5+8]             ;q7
        movq    mm7, mm0                        ;q5
        psubw   mm0, mm2                        ;r5=q5-q7
        ;TODO
        punpcklwd mm5, mm0
        pxor    mm3, mm3
        pmaddwd mm5, BETA3                      ;s5l
        punpckhwd mm3, mm0
        pmaddwd mm3, BETA3                      ;s5h
        ;TODO
        movq    mm0, [edi+CLINE2+8]
        paddw   mm7, mm2                        ;r7=s7=u7
        paddd   mm5, CONSTBITS_RND              ;s5l rounded
        psubw   mm6, mm7                        ;u6
        paddd   mm3, CONSTBITS_RND              ;s5h rounded
        psrad   mm5, CONSTBITS                  ;s5l rounded descaled
        psubw   mm0, [edi+CLINE6+8]             ;r2
        psrad   mm3, CONSTBITS                  ;s5h rounded descaled
        packssdw mm5, mm3                       ;s5
        pxor    mm1, mm1
        ;mm0=r2 mm4=s4
        ;mm1    mm5=u5
        ;mm2    mm6=u6
        ;mm3    mm7=Tu7
        movq    Tu7, mm7                        ;Save u7
        pxor    mm7, mm7
        movq    mm2, [edi+CLINE0+8]
        punpcklwd mm1, mm0
        pmaddwd mm1, BETA1                      ;s2l
        punpckhwd mm7, mm0
        pmaddwd mm7, BETA1                      ;s2h
        psubw   mm5, mm6                        ;u5
        movq    mm0, [edi+CLINE2+8]
        paddw   mm4, mm5                        ;-u4
        ;mm4=-u4 mm5=u5
        ;mm6=u6  mm7=u7
        paddd   mm1, CONSTBITS_RND              ;s2l rounded
        paddd   mm7, CONSTBITS_RND              ;s2h rounded
        psrad   mm1, CONSTBITS                  ;s2l rounded descaled
        paddw   mm0, [edi+CLINE6+8]             ;r3=s3=t3
        psrad   mm7, CONSTBITS                  ;s2h rounded descaled
        movq    mm3, mm2
        packssdw mm1, mm7                       ;s2
        psubw   mm2, [edi+CLINE4+8]             ;t1
        psubw   mm1, mm0                        ;t2=s2-s3
        paddw   mm3, [edi+CLINE4+8]             ;t0
        movq    mm7, mm0                        ;t3
        paddw   mm0, mm3                        ;u0=t3+t0
        psubw   mm3, mm7                        ;u3=t0-t3
        movq    mm7, mm1                        ;t2
        paddw   mm1, mm2                        ;u1=t2+t1
        psubw   mm2, mm7                        ;u2=t1-t2
        ;mm0=u0 mm4=-u4
        ;mm1=u1 mm5=u5
        ;mm2=u2 mm6=u6
        ;mm3=u3 mm7=avail.
        movq    mm7, mm3
        psubw   mm3, mm4                        ;u3-(-u4)
        paddw   mm4, mm7                        ;-u4+u3
        psraw   mm3, 1                          ;v3
        movq    mm7, mm2
        psraw   mm4, 1                          ;v4
        psubw   mm2, mm5                        ;u2-u5
        psraw   mm2, 1                          ;v5
        paddw   mm5, mm7                        ;u5+u2
        psraw   mm5, 1                          ;v2
        movq    mm7, mm1
        psubw   mm1, mm6                        ;u1-u6
        paddw   mm6, mm7                        ;u6+u1
        movq    Tv5, mm2                        ;Save v5
        psraw   mm1, 1                          ;v6
        psraw   mm6, 1                          ;v1
        movq    mm7, mm0
        movq    mm2, mm5                        ;T1
        punpckhwd mm5, mm3                      ;T1(c,d)
        paddw   mm7, Tu7
        ;TODO
        psraw   mm7, 1                          ;v0
        ;TODO
        ;v0=mm7 v4=mm4
        ;v1=mm6 v5=Tv5 (to mm2 later)
        ;v2=mm5 v6=mm1
        ;v3=mm3 v7=mm0 (later)
        punpcklwd mm2, mm3                      ;T1(c,d);mm3 free
        movq    mm3, mm7                        ;T1(a,b)
        punpckhwd mm7, mm6                      ;T1(a,b)
        punpcklwd mm3, mm6                      ;T1(a,b);mm6 free
        movq    mm6, mm7                        ;T1
        psubw   mm0, Tu7
        punpckldq mm7, mm5                      ;T1
        psraw   mm0, 1                          ;v7
        ;TODO
        punpckhdq mm6, mm5                      ;T1;mm5 free
        movq    mm5, mm3                        ;T1
        movq    [edi+CLINE2+8], mm7             ;T1
        punpckldq mm3, mm2                      ;T1
        movq    [edi+CLINE3+8], mm6             ;T1
        punpckhdq mm5, mm2                      ;T1
        movq    [edi+CLINE0+8], mm3             ;T1
        movq    mm6, mm1                        ;T2(c,d)
        movq    [edi+CLINE1+8], mm5             ;T1
        punpckhwd mm1, mm0                      ;T2(c,d)
        movq    mm2, Tv5
        punpcklwd mm6, mm0                      ;T2(c,d);mm0 free
        movq    mm7, mm4                        ;T2(a,b)
        punpckhwd mm4, mm2                      ;T2(a,b)
        punpcklwd mm7, mm2                      ;T2(a,b);mm2 free
        movq    mm2, mm4                        ;T2
        punpckldq mm4, mm1                      ;T2
        punpckhdq mm2, mm1                      ;T2
        movq    mm1, mm7                        ;T2
        movq    [edi+CLINE6+8], mm4             ;T2
        punpckhdq mm1, mm6                      ;T2
        movq    [edi+CLINE7+8], mm2             ;T2
        punpckldq mm7, mm6                      ;T2
        movq    [edi+CLINE5+8], mm1             ;T2
        movq    [edi+CLINE4+8], mm7             ;T2

;
; Row passes.  Both halves run the same instruction sequence; only the
; displacement added to the RLINE offsets differs.
;
rows_0_3:
        IDCT_ROW_PASS 0                         ;buffer rows 0..3

rows_4_7:
        IDCT_ROW_PASS 64                        ;buffer rows 4..7

IDCT_Done:
        mov     esp, StashESP                   ;drop the aligned local frame
        pop     edi
        pop     esi
        ret     4                               ;pop the pBuf stack argument
@MMX_DecodeBlock_IDCT@12 endp
  MMXCODE1 ENDS
END