Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

770 lines
36 KiB

  1. ;/* *************************************************************************
  2. ;** INTEL Corporation Proprietary Information
  3. ;**
  4. ;** This listing is supplied under the terms of a license
  5. ;** agreement with INTEL Corporation and may not be copied
  6. ;** nor disclosed except in accordance with the terms of
  7. ;** that agreement.
  8. ;**
  9. ;** Copyright (c) 1995 Intel Corporation.
  10. ;** All Rights Reserved.
  11. ;**
  12. ;** *************************************************************************
  13. ;*/
  14. ;////////////////////////////////////////////////////////////////////////////
  15. ;//
  16. ;// $Header: R:\h26x\h26x\src\enc\exmfdct.asv 1.3 22 Jul 1996 15:23:20 BNICKERS $
  17. ;// $Log: R:\h26x\h26x\src\enc\exmfdct.asv $
  18. ;//
  19. ;// Rev 1.3 22 Jul 1996 15:23:20 BNICKERS
  20. ;// Reduce code size. Implement H261 spatial filter.
  21. ;//
  22. ;// Rev 1.2 02 May 1996 12:00:54 BNICKERS
  23. ;// Initial integration of B Frame ME, MMX version.
  24. ;//
  25. ;// Rev 1.1 15 Mar 1996 15:52:44 BECHOLS
  26. ;//
  27. ;// Completed monolithic - Brian
  28. ;//
  29. ;// Rev 1.0 22 Feb 1996 20:04:46 BECHOLS
  30. ;// Initial revision.
  31. ;//
  32. ;//
  33. ;////////////////////////////////////////////////////////////////////////////
  34. ;
  35. ; exmfdct -- This function performs a Forward Discrete Cosine Transform for
  36. ; H263, on a stream of macroblocks comprised of 8*8 blocks of pels or pel
  37. ; differences. It is tightly coupled with its caller, the frame differencing
  38. ; code, and its callee, the Quantization/Run-length-encoding code.
  39. ;
  40. .xlist
  41. include memmodel.inc
  42. include e3inst.inc ; Encoder instance data
  43. include e3mbad.inc ; MacroBlock Action Descriptor struct layout
  44. include exEDTQ.inc ; Data structures for motion -E-stimation, frame -D-iff,
  45. ; Forward DCT -T-ransform, and -Q-uant/RLE.
  46. include iammx.inc ; MMx instructions
  47. .list
  48. .CODE EDTQ
  49. EXTERN MMxQuantRLE:NEAR
  50. ;ASSUME cs : FLAT
  51. ;ASSUME ds : FLAT
  52. ;ASSUME es : FLAT
  53. ;ASSUME fs : FLAT
  54. ;ASSUME gs : FLAT
  55. ;ASSUME ss : FLAT
  56. PUBLIC MMxDoForwardDCT
  57. PUBLIC MMxDoForwardDCTx
  58. PUBLIC MMxDoForwardDCTy
  59. MMxDoForwardDCTx: ; Entry 1: store mm1 to PelDiffsLine7, then fall through into entry 2.
  60. movq PelDiffsLine7,mm1 ; Write mm1 to PelDiffsLine7 (operand is defined outside this file).
  61. MMxDoForwardDCTy: ; Entry 2: transform the PelDiffs buffer.
  62. mov ebp,16 ; ebp = pitch of the input block; 16 is used for PelDiffs.
  63. lea esi,PelDiffs ; esi = address of the block to transform.
  64. MMxDoForwardDCT: ; Entry 3: caller has already loaded esi (block address) and ebp (pitch).
  65. StackOffset TEXTEQU <8> ; NOTE(review): presumably consumed by operand macros from the includes (PelDiffs, Coeffs, ...) -- confirm.
  66. ; ++ ========================================================================
  67. ; The Butterfly macro performs a 4x8 symmetrical butterfly on half of an
  68. ; 8x8 block of memory. Given rows r0 to r7 the Butterfly gives the following
  69. ; results. q0 = r0+r7, q7 = r0-r7
  70. ; q1 = r1+r6, q6 = r1-r6
  71. ; q2 = r2+r5, q5 = r2-r5
  72. ; q3 = r3+r4, q4 = r3-r4
  73. ; This code has been optimized, but still gives up three half clocks. The
  74. ; butterflies are numbered 10 -> 16, 20 -> 26, 30 -> 36, and 40 -> 46.
  ;
  ; Reads 4 bytes from each of the 8 input lines addressed off esi and writes
  ; q0..q7 as 4 words each to [edi+k*8*2], k = 0..7. The line addressing
  ; assumes ebp = pitch, eax = 3*pitch, ebx = 5*pitch, ecx = 7*pitch, as the
  ; setup code of MMxDoForwardDCT establishes. mm4 must hold the word
  ; multiplier used for sign extension on entry; it is overwritten at step 40.
  ; punpcklbw places each input byte in the HIGH byte of a word (the low byte
  ; is whatever the register held before); pmulhw by mm4 or psraw by 8 then
  ; discards the low byte while sign extending.
  ; No invocation of this macro appears in this file; the same sequence,
  ; without the stores, is inlined at RepeatFirstTransform.
  75. ; -- ========================================================================
  76. Butterfly1 MACRO
  77. punpcklbw mm7,[esi] ;10 -- Fetch line 0 of input.
  78. punpcklbw mm0,[esi+ecx*1] ;11 -- Fetch line 7 of input.
  79. pmulhw mm7,mm4 ;12 -- Sign extend the 4 pels or pel diffs.
  80. punpcklbw mm6,[esi+ebp*1] ; 20 -- Fetch line 1 of input.
  81. pmulhw mm0,mm4 ;13 -- Sign extend the 4 pels or pel diffs.
  82. punpcklbw mm1,[esi+eax*2] ; 21 -- Fetch line 6 of input.
  83. pmulhw mm6,mm4 ; 22 -- Sign extend line 1.
  84. punpcklbw mm5,[esi+ebp*2] ; 30 -- Fetch line 2 of input.
  85. pmulhw mm1,mm4 ; 23 -- Sign extend line 6.
  86. punpcklbw mm2,[esi+ebx*1] ; 31 -- Fetch line 5 of input.
  87. psubw mm7,mm0 ;14 -- Line0 - Line7
  88. punpcklbw mm4,[esi+eax*1] ; 40 -- Fetch line 3 of input; overwrites the multiplier in mm4.
  89. paddw mm0,mm0 ;15 -- 2 * Line7
  90. punpcklbw mm3,[esi+ebp*4] ; 41 -- Fetch line 4 of input.
  91. paddw mm0,mm7 ;16 -- Line0 + Line7
  92. psraw mm5,8 ; 32 -- Sign extend line 2 (shift used because mm4 is no longer the multiplier).
  93. psubw mm6,mm1 ; 24 -- Line1 - Line6
  94. psraw mm2,8 ; 33 -- Sign extend line 5.
  95. paddw mm1,mm1 ; 25 -- 2 * Line6
  96. psraw mm4,8 ; 42 -- Sign extend line 3.
  97. paddw mm1,mm6 ; 26 -- Line1 + Line6
  98. psraw mm3,8 ; 43 -- Sign extend line 4.
  99. psubw mm5,mm2 ; 34 -- Line2 - Line5
  100. movq [edi+7*8*2],mm7 ;17 -- Save Line0 - Line7
  101. psubw mm4,mm3 ; 44 -- Line3 - Line4
  102. movq [edi+0*8*2],mm0 ;18 -- Save Line0 + Line7
  103. paddw mm2,mm2 ; 35 -- 2 * Line5
  104. movq [edi+6*8*2],mm6 ; 27 -- Save Line1 - Line6
  105. paddw mm3,mm3 ; 45 -- 2 * Line4
  106. movq [edi+1*8*2],mm1 ; 28 -- Save Line1 + Line6
  107. paddw mm2,mm5 ; 36 -- Line2 + Line5
  108. movq [edi+5*8*2],mm5 ; 37 -- Save Line2 - Line5
  109. paddw mm3,mm4 ; 46 -- Line3 + Line4
  110. movq [edi+2*8*2],mm2 ; 38 -- Save Line2 + Line5
  111. movq [edi+4*8*2],mm4 ; 47 -- Save Line3 - Line4
  112. movq [edi+3*8*2],mm3 ; 48 -- Save Line3 + Line4
  113. ENDM
  ; Butterfly2 performs the same row butterfly as Butterfly1 (rows k and 7-k
  ; are replaced by their sum and difference, k = 0..3), but in place on the
  ; 4-word rows already stored at [edi+k*8*2]. The sum goes back to row k and
  ; the difference to row 7-k. All of mm0-mm7 are used as scratch.
  ; The steps are numbered 10 -> 16, 20 -> 26, 30 -> 36, and 40 -> 46.
  ; No invocation of this macro appears in this file.
  114. Butterfly2 MACRO
  115. movq mm0,[edi+0*8*2] ;10 -- Fetch row 0.
  116. movq mm1,[edi+7*8*2] ;11 -- Fetch row 7.
  117. movq mm2,mm0 ;12 -- Copy row 0.
  118. movq mm3,[edi+1*8*2] ; 20 -- Fetch row 1.
  119. paddw mm0,mm1 ;13 -- Row0 + Row7
  120. movq mm4,[edi+6*8*2] ; 21 -- Fetch row 6.
  121. psubw mm2,mm1 ;14 -- Row0 - Row7
  122. movq [edi+0*8*2],mm0 ;15 -- Save Row0 + Row7
  123. movq [edi+7*8*2],mm2 ;16 -- Save Row0 - Row7
  124. movq mm5,mm3 ; 22 -- Copy row 1.
  125. movq mm6,[edi+2*8*2] ; 30 -- Fetch row 2.
  126. paddw mm3,mm4 ; 23 -- Row1 + Row6
  127. movq mm7,[edi+5*8*2] ; 31 -- Fetch row 5.
  128. psubw mm5,mm4 ; 24 -- Row1 - Row6
  129. movq [edi+1*8*2],mm3 ; 25 -- Save Row1 + Row6
  130. movq mm0,mm6 ; 32 -- Copy row 2.
  131. movq [edi+6*8*2],mm5 ; 26 -- Save Row1 - Row6
  132. paddw mm6,mm7 ; 33 -- Row2 + Row5
  133. movq mm1,[edi+3*8*2] ; 40 -- Fetch row 3.
  134. psubw mm0,mm7 ; 34 -- Row2 - Row5
  135. movq mm2,[edi+4*8*2] ; 41 -- Fetch row 4.
  136. movq mm3,mm1 ; 42 -- Copy row 3.
  137. movq [edi+2*8*2],mm6 ; 35 -- Save Row2 + Row5
  138. paddw mm1,mm2 ; 43 -- Row3 + Row4
  139. movq [edi+5*8*2],mm0 ; 36 -- Save Row2 - Row5
  140. psubw mm3,mm2 ; 44 -- Row3 - Row4
  141. movq [edi+3*8*2],mm1 ; 45 -- Save Row3 + Row4
  142. movq [edi+4*8*2],mm3 ; 46 -- Save Row3 - Row4
  143. ENDM
  144. ; ++ ========================================================================
  145. ; The StageOne macro performs a 4x4 Butterfly on rows q0 to q3 such that:
  146. ; p0 = q0+q3, p3 = q0-q3
  147. ; p1 = q1+q2, p2 = q1-q2
  148. ; A scaled butterfly on rows q5 and q6 yields the following equations.
  149. ; p5 = C4*(q6-q5), p6 = C4*(q6+q5)
  150. ; This has been optimized, but gives up four half clocks. The two simple
  151. ; butterflies are numbered 10 -> 16 and 30 -> 36.
  152. ; The scaled butterfly is numbered 20 -> 2c.
  ;
  ; Works in place on the 4-word rows at [edi+k*8*2]; rows 4 and 7 are not
  ; touched. Scaling is done as: shift left 2, pmulhw by the constant (keeps
  ; the high 16 bits of the product), shift right 1 -- i.e. the product is
  ; shifted right by 15 overall. Uses mm0-mm6 as scratch.
  ; No invocation of this macro appears in this file.
  153. ; -- ========================================================================
  154. StageOne MACRO
  155. movq mm4,[edi+0*8*2] ;10 -- Fetch q0.
  156. movq mm5,[edi+3*8*2] ;11 -- Fetch q3.
  157. movq mm6,mm4 ;12 -- Copy q0.
  158. movq mm0,[edi+6*8*2] ; 20 -- Fetch q6.
  159. paddw mm4,mm5 ;13 -- p0 = q0 + q3
  160. movq mm1,[edi+5*8*2] ; 21 -- Fetch q5.
  161. psubw mm6,mm5 ;14 -- p3 = q0 - q3
  162. movq [edi+0*8*2],mm4 ;15 -- Save p0
  163. movq mm2,mm0 ; 22 -- Copy q6.
  164. movq [edi+3*8*2],mm6 ;16 -- Save p3
  165. paddw mm2,mm1 ; 23 -- q6 + q5
  166. psubw mm0,mm1 ; 24 -- q6 - q5
  167. movq mm3,[edi+1*8*2] ; 30 -- Fetch q1.
  168. psllw mm0,2 ; 25 -- scale
  169. movq mm4,[edi+2*8*2] ; 31 -- Fetch q2.
  170. psllw mm2,2 ; 26 -- scale
  171. pmulhw mm0,PD C4 ; 27 -- C4*(q6-q5) scaled
  172. movq mm5,mm3 ; 32 -- Copy q1.
  173. pmulhw mm2,PD C4 ; 28 -- C4*(q6+q5) scaled
  174. paddw mm3,mm4 ; 33 -- p1 = q1 + q2
  175. psubw mm5,mm4 ; 34 -- p2 = q1 - q2
  176. movq [edi+1*8*2],mm3 ; 35 -- Save p1
  177. psraw mm0,1 ; 29 -- p5 = C4*(q6-q5)
  178. movq [edi+2*8*2],mm5 ; 36 -- Save p2
  179. psraw mm2,1 ; 2a -- p6 = C4*(q6+q5)
  180. movq [edi+5*8*2],mm0 ; 2b -- Save p5
  181. movq [edi+6*8*2],mm2 ; 2c -- Save p6
  182. ENDM
  183. ; ++ ========================================================================
  184. ; The StageTwo macro performs two simple butterflies on rows p4,p5 and
  185. ; p6,p7 such that:
  186. ; n4 = p4+p5, n5 = p4-p5
  187. ; n6 = p7-p6, n7 = p7+p6
  188. ; They are numbered 20 -> 26 and 40 -> 46.
  189. ;
  190. ; It also performs a scaled butterfly on rows p0,p1 such that:
  191. ; n0 = C4*(p0+p1), n1 = C4*(p0-p1)
  192. ; These are numbered 10 -> 1c.
  193. ;
  194. ; Finally, it performs a butterfly on the scaled rows p2,p3 such that:
  195. ; n2 = C2*p3+C6*p2, n3 = C6*p3-C2*p2
  196. ; This is numbered 30 -> 3f.
  197. ;
  198. ; This macro has been optimized, but gives up four half clocks.
  ;
  ; Works in place on the 4-word rows at [edi+k*8*2]: row k receives n(k).
  ; Uses mm0-mm7 as scratch. No invocation of this macro appears in this file.
  199. ; -- ========================================================================
  200. StageTwo MACRO
  201. movq mm1,[edi+3*8*2] ; 30 -- Fetch p3.
  202. movq mm2,[edi+2*8*2] ; 31 -- Fetch p2.
  203. psllw mm1,2 ; 32 -- scale p3
  204. movq mm5,[edi+4*8*2] ; 20 -- Fetch p4.
  205. psllw mm2,2 ; 33 -- scale p2
  206. movq mm6,[edi+5*8*2] ; 21 -- Fetch p5.
  207. movq mm3,mm1 ; 34 -- Copy scaled p3
  208. pmulhw mm1,PD C2 ; 36 -- C2*p3 scaled
  209. movq mm4,mm2 ; 35 -- Copy scaled p2
  210. pmulhw mm2,PD C6 ; 37 -- C6*p2 scaled
  211. movq mm7,mm5 ; 22 -- Copy p4.
  212. pmulhw mm3,PD C6 ; 38 -- C6*p3 scaled
  213. paddw mm5,mm6 ; 23 -- n4 = p4 + p5
  214. pmulhw mm4,PD C2 ; 39 -- C2*p2 scaled
  215. psubw mm7,mm6 ; 24 -- n5 = p4 - p5
  216. movq [edi+4*8*2],mm5 ; 25 -- Save n4
  217. paddw mm1,mm2 ; 3a -- C2*p3 + C6*p2 scaled
  218. movq [edi+5*8*2],mm7 ; 26 -- Save n5
  219. psraw mm1,1 ; 3c -- n2 = C2*p3 + C6*p2
  220. movq mm6,[edi+0*8*2] ;10 -- Fetch p0.
  221. psubw mm3,mm4 ; 3b -- C6*p3 - C2*p2 scaled
  222. movq mm0,[edi+1*8*2] ;11 -- Fetch p1.
  223. psraw mm3,1 ; 3d -- n3 = C6*p3 - C2*p2
  224. movq [edi+2*8*2],mm1 ; 3e -- Save n2
  225. movq mm7,mm6 ;12 -- Copy p0.
  226. movq [edi+3*8*2],mm3 ; 3f -- Save n3
  227. paddw mm6,mm0 ;13 -- p0 + p1
  228. movq mm3,[edi+7*8*2] ; 40 -- Fetch p7.
  229. psubw mm7,mm0 ;14 -- p0 - p1
  230. movq mm5,[edi+6*8*2] ; 41 -- Fetch p6.
  231. psllw mm6,2 ;15 -- scale
  232. psllw mm7,2 ;16 -- scale
  233. pmulhw mm6,PD C4 ;17 -- C4*(p0+p1) scaled
  234. movq mm4,mm3 ; 42 -- Copy p7.
  235. pmulhw mm7,PD C4 ;18 -- C4*(p0-p1) scaled
  236. paddw mm3,mm5 ; 43 -- n7 = p7 + p6
  237. psubw mm4,mm5 ; 44 -- n6 = p7 - p6
  238. movq [edi+7*8*2],mm3 ; 45 -- Save n7
  239. psraw mm6,1 ;19 -- n0 = C4*(p0+p1)
  240. movq [edi+6*8*2],mm4 ; 46 -- Save n6
  241. psraw mm7,1 ;1a -- n1 = C4*(p0-p1)
  242. movq [edi+0*8*2],mm6 ;1b -- Save n0
  243. movq [edi+1*8*2],mm7 ;1c -- Save n1
  244. ENDM
  245. ; ++ ========================================================================
  246. ; The StageThree macro performs a butterfly on the scaled rows n4,n7 and
  247. ; n5,n6 such that:
  248. ; m4 = C7*n4+C1*n7, m7 = C7*n7-C1*n4
  249. ; m5 = C5*n6+C3*n5, m6 = C3*n6-C5*n5
  250. ; Steps 10 -> 1f determine m4,m7 and 20 -> 2f determine m5,m6.
  251. ; The outputs m0-m7 are put into reverse binary order as follows:
  252. ; 0 = 000 -> 000 = 0
  253. ; 1 = 001 -> 100 = 4
  254. ; 2 = 010 -> 010 = 2
  255. ; 3 = 011 -> 110 = 6
  256. ; 4 = 100 -> 001 = 1
  257. ; 5 = 101 -> 101 = 5
  258. ; 6 = 110 -> 011 = 3
  259. ; 7 = 111 -> 111 = 7
  260. ;
  261. ; This macro has been optimized, but I had to give up 10 half clocks.
  ;
  ; Works in place on the 4-word rows at [edi+k*8*2]. Rows 0 and 2 are left
  ; where they are; rows 1 and 3 are moved to rows 4 and 6 (the unnumbered
  ; steps) after the old contents of rows 4 and 6 have been loaded.
  ; Uses mm0-mm7 as scratch. No invocation of this macro appears in this file.
  262. ; -- ========================================================================
  263. StageThree MACRO
  264. movq mm0,[edi+7*8*2] ;10 -- Fetch n7.
  265. movq mm4,[edi+6*8*2] ; 20 -- Fetch n6.
  266. movq mm1,[edi+4*8*2] ;11 -- Fetch n4.
  267. psllw mm0,2 ;12 -- scale n7
  268. movq mm5,[edi+5*8*2] ; 21 -- Fetch n5.
  269. psllw mm4,2 ; 22 -- scale n6
  270. movq mm3,[edi+1*8*2] ; Fetch row 1 for relocation to row 4.
  271. psllw mm1,2 ;13 -- scale n4
  272. movq mm7,[edi+3*8*2] ; Fetch row 3 for relocation to row 6.
  273. psllw mm5,2 ; 23 -- scale n5
  274. movq [edi+4*8*2],mm3 ; Old row 1 -> row 4 (n4 was already loaded into mm1).
  275. movq mm2,mm0 ;14 -- Copy scaled n7
  276. movq [edi+6*8*2],mm7 ; Old row 3 -> row 6 (n6 was already loaded into mm4).
  277. movq mm6,mm4 ; 24 -- Copy scaled n6
  278. pmulhw mm0,PD C1 ;16 -- C1*n7 scaled
  279. movq mm3,mm1 ;15 -- Copy scaled n4
  280. pmulhw mm1,PD C7 ;17 -- C7*n4 scaled
  281. pmulhw mm2,PD C7 ;18 -- C7*n7 scaled
  282. pmulhw mm3,PD C1 ;19 -- C1*n4 scaled
  283. movq mm7,mm5 ; 25 -- Copy scaled n5
  284. pmulhw mm4,PD C5 ; 26 -- C5*n6 scaled
  285. paddw mm0,mm1 ;1a -- C1*n7 + C7*n4 scaled
  286. pmulhw mm5,PD C3 ; 27 -- C3*n5 scaled
  287. psubw mm2,mm3 ;1b -- C7*n7 - C1*n4 scaled
  288. pmulhw mm6,PD C3 ; 28 -- C3*n6 scaled
  289. pmulhw mm7,PD C5 ; 29 -- C5*n5 scaled
  290. psraw mm0,1 ;1c -- m4 = C7*n4 + C1*n7
  291. psraw mm2,1 ;1d -- m7 = C7*n7 - C1*n4
  292. paddw mm4,mm5 ; 2a -- C5*n6 + C3*n5 scaled
  293. movq [edi+1*8*2],mm0 ;1e -- Save m4 in row 1
  294. psubw mm6,mm7 ; 2b -- C3*n6 - C5*n5 scaled
  295. movq [edi+7*8*2],mm2 ;1f -- Save m7 in row 7
  296. psraw mm4,1 ; 2c -- m5 = C5*n6 + C3*n5
  297. psraw mm6,1 ; 2d -- m6 = C3*n6 - C5*n5
  298. movq [edi+5*8*2],mm4 ; 2e -- Save m5 in row 5
  299. movq [edi+3*8*2],mm6 ; 2f -- Save m6 in row 3
  300. ENDM
  301. OPTION NOM510 ; Turn off MASM 5.10 compatibility mode for the code that follows.
  302. ;============================================================================
  303. ; This section does the Forward Discrete Cosine Transform. It performs a
  304. ; DCT on an 8*8 block of pels or pel differences.
  305. ;
  306. ; Upon input:
  307. ;
  308. ; esi -- Address of block of pels or pel differences on which to perform FDCT.
  309. ; ebp -- Pitch of block (8, 16, or 384).
  310. ; edx -- Reserved.
  311. ;
  312. ; After setup:
  313. ;
  314. ; esi -- Address of block of pels or pel differences on which to perform FDCT.
  315. ; ebp -- Pitch of block (8, 16, or 384). Reset to PITCH just before the jump to Quant RLE.
  316. ; edx -- Reserved.
  317. ; edi -- Address at which to place intermediate and final coefficients.
  318. ; eax -- Pitch times 3
  319. ; ebx -- Pitch times 5
  320. ; ecx -- Pitch times 7
  321. ; mm4 -- 4 words of 256.
  322. ; mm0:mm7 -- Scratch (mm4 is reloaded with the constant at the end of each first-pass iteration).
  323. lea edi,Coeffs ; edi = coefficient buffer (rows of 8 words, 8*2 bytes apart).
  324. lea eax,[ebp+ebp*2] ; eax = 3 * pitch
  325. movq mm4,PD C0100010001000100 ; mm4 = sign-extension multiplier (4 words of 0100h, going by the symbol name).
  326. lea ebx,[ebp+ebp*4] ; ebx = 5 * pitch
  327. lea ecx,[eax+ebp*4] ; ecx = 3 * pitch + 4 * pitch = 7 * pitch
  328. RepeatFirstTransform: ; First 1-D pass: combines the 8 input lines, 4 columns (4 input bytes per line) per iteration.
  329. ; ++ ========================================================================
  330. ; The Butterfly performs a 4x8 symmetrical butterfly on half of an
  331. ; 8x8 block of memory. Given rows r0 to r7 the Butterfly gives the following
  332. ; results. q0 = r0+r7, q7 = r0-r7
  333. ; q1 = r1+r6, q6 = r1-r6
  334. ; q2 = r2+r5, q5 = r2-r5
  335. ; q3 = r3+r4, q4 = r3-r4
  336. ; This code has been optimized, but still gives up three half clocks. The
  337. ; butterflies are numbered 10 -> 16, 20 -> 26, 30 -> 36, and 40 -> 46.
  ; punpcklbw places each input byte in the high byte of a word; the pmulhw
  ; by mm4 (or psraw by 8) then sign extends it. Results stay in registers:
  ; mm0..mm3 = q0..q3, mm4..mm7 = q4..q7.
  338. ; -- ========================================================================
  339. punpcklbw mm7,[esi] ;10 -- Fetch line 0 of input.
  340. punpcklbw mm0,[esi+ecx*1] ;11 -- Fetch line 7 of input.
  341. pmulhw mm7,mm4 ;12 -- Sign extend the 4 pels or pel diffs.
  342. punpcklbw mm6,[esi+ebp*1] ; 20 -- Fetch line 1 of input.
  343. pmulhw mm0,mm4 ;13 -- Sign extend the 4 pels or pel diffs.
  344. punpcklbw mm1,[esi+eax*2] ; 21 -- Fetch line 6 of input.
  345. pmulhw mm6,mm4 ; 22 -- Sign extend line 1.
  346. punpcklbw mm5,[esi+ebp*2] ; 30 -- Fetch line 2 of input.
  347. pmulhw mm1,mm4 ; 23 -- Sign extend line 6.
  348. punpcklbw mm2,[esi+ebx*1] ; 31 -- Fetch line 5 of input.
  349. psubw mm7,mm0 ;14 -- Line0 - Line7
  350. punpcklbw mm4,[esi+eax*1] ; 40 -- Fetch line 3 of input; overwrites the multiplier in mm4.
  351. paddw mm0,mm0 ;15 -- 2 * Line7
  352. punpcklbw mm3,[esi+ebp*4] ; 41 -- Fetch line 4 of input.
  353. paddw mm0,mm7 ;16 -- Line0 + Line7
  354. psraw mm5,8 ; 32 -- Sign extend line 2 (shift used because mm4 is no longer the multiplier).
  355. psubw mm6,mm1 ; 24 -- Line1 - Line6
  356. psraw mm2,8 ; 33 -- Sign extend line 5.
  357. paddw mm1,mm1 ; 25 -- 2 * Line6
  358. psraw mm4,8 ; 42 -- Sign extend line 3.
  359. paddw mm1,mm6 ; 26 -- Line1 + Line6
  360. psraw mm3,8 ; 43 -- Sign extend line 4.
  361. psubw mm5,mm2 ; 34 -- Line2 - Line5
  362. psubw mm4,mm3 ; 44 -- Line3 - Line4
  363. paddw mm2,mm2 ; 35 -- 2 * Line5
  364. paddw mm3,mm3 ; 45 -- 2 * Line4
  365. paddw mm2,mm5 ; 36 -- Line2 + Line5
  366. paddw mm3,mm4 ; 46 -- Line3 + Line4
  367. ; ++ ========================================================================
  368. ; The StageOne performs a 4x4 Butterfly on rows q0 to q3 such that:
  369. ; p0 = q0+q3, p3 = q0-q3
  370. ; p1 = q1+q2, p2 = q1-q2
  371. ; A scaled butterfly on rows q5 and q6 yields the following equations.
  372. ; p5 = C4*(q6-q5), p6 = C4*(q6+q5)
  373. ; This has been optimized, but gives up four half clocks. The two simple
  374. ; butterflies are numbered 10 -> 16 and 30 -> 36.
  375. ; The scaled butterfly is numbered 20 -> 2c.
  376. ; -- ========================================================================
  377. psubw mm1,mm2 ; 30 -- p2 = q1 - q2
  378. psubw mm6,mm5 ; 20 -- q6 - q5
  379. paddw mm5,mm5 ; 21 -- 2q5
  380. paddw mm5,mm6 ; 22 -- q6 + q5
  381. psllw mm6,2 ; 23 -- scale
  382. pmulhw mm6,PD C4 ; 24 -- C4*(q6-q5) scaled
  383. psllw mm5,2 ; 23 -- scale
  384. pmulhw mm5,PD C4 ; 24 -- C4*(q6+q5) scaled
  385. psubw mm0,mm3 ; 10 -- p3 = q0 - q3
  386. paddw mm3,mm3 ; 11 -- 2q3
  387. paddw mm2,mm2 ; 31 -- 2q2
  388. paddw mm3,mm0 ; 12 -- p0 = q0 + q3
  389. psraw mm6,1 ; 25 -- p5 = C4*(q6-q5)
  390. paddw mm2,mm1 ; 32 -- p1 = q1 + q2
  391. psraw mm5,1 ; 26 -- p6 = C4*(q6+q5)
  392. ; ++ ========================================================================
  393. ; The StageTwo performs two simple butterflies on rows p4,p5 and
  394. ; p6,p7 such that:
  395. ; n4 = p4+p5, n5 = p4-p5
  396. ; n6 = p7-p6, n7 = p7+p6
  397. ; They are numbered 20 -> 26 and 40 -> 46.
  398. ;
  399. ; It also performs a scaled butterfly on rows p0,p1 such that:
  400. ; n0 = C4*(p0+p1), n1 = C4*(p0-p1)
  401. ; These are numbered 10 -> 1c.
  402. ;
  403. ; Finally, it performs a butterfly on the scaled rows p2,p3 such that:
  404. ; n2 = C2*p3+C6*p2, n3 = C6*p3-C2*p2
  405. ; This is numbered 30 -> 3f.
  406. ; -- ========================================================================
  407. psubw mm3,mm2 ; 10 -- p0 - p1
  408. paddw mm2,mm2 ; 11 -- 2p1
  409. paddw mm2,mm3 ; 12 -- p0 + p1
  410. psllw mm3,2 ; 13 -- scale
  411. pmulhw mm3,PD C4 ; 14 -- C4*(p0-p1)
  412. psllw mm2,2 ; 15 -- scale
  413. pmulhw mm2,PD C4 ; 16 -- C4*(p0+p1)
  414. psllw mm0,2 ; 30 -- scale p3
  415. psubw mm4,mm6 ; 20 -- n5 = p4 - p5
  416. psllw mm1,2 ; 31 -- scale p2
  417. psubw mm7,mm5 ; 40 -- n6 = p7 - p6
  418. psraw mm3,1 ; 17 -- n1 = C4*(p0-p1)
  419. paddw mm6,mm6 ; 21 -- 2p5
  420. psraw mm2,1 ; 18 -- n0 = C4*(p0+p1)
  421. movq [edi+4*8*2],mm3 ; 19 -- Save n1 in row 4 (stage 3 output ordering)
  422. movq mm3,mm0 ; 32 -- Copy scaled p3
  423. movq [edi+0*8*2],mm2 ; 1a -- Save n0 in row 0 (stage 3 output ordering)
  424. movq mm2,mm1 ; 33 -- Copy scaled p2
  425. pmulhw mm0,PD C2 ; 34 -- C2*p3 scaled
  426. paddw mm5,mm5 ; 41 -- 2p6
  427. pmulhw mm1,PD C6 ; 35 -- C6*p2 scaled
  428. paddw mm6,mm4 ; 22 -- n4 = p4 + p5
  429. pmulhw mm3,PD C6 ; 36 -- C6*p3 scaled
  430. paddw mm5,mm7 ; 42 -- n7 = p7 + p6
  431. pmulhw mm2,PD C2 ; 37 -- C2*p2 scaled
  432. psllw mm5,2 ; 10 -- scale n7 (stage 3)
  433. paddw mm0,mm1 ; 38 -- C2*p3 + C6*p2 scaled
  434. psllw mm7,2 ; 20 -- scale n6 (stage 3)
  435. movq mm1,mm5 ; 11 -- copy scaled n7 (stage 3)
  436. psraw mm0,1 ; 39 -- n2 = C2*p3 + C6*p2
  437. pmulhw mm5,PD C1 ; 12 -- C1*n7 scaled (stage 3)
  438. psllw mm6,2 ; 13 -- scale n4 (stage 3)
  439. movq [edi+2*8*2],mm0 ; 3c -- Save n2 in row 2 (stage 3 output ordering)
  440. psubw mm3,mm2 ; 3a -- C6*p3 - C2*p2 scaled
  441. ; ++ ========================================================================
  442. ; The StageThree performs a butterfly on the scaled rows n4,n7 and
  443. ; n5,n6 such that:
  444. ; m4 = C7*n4+C1*n7, m7 = C7*n7-C1*n4
  445. ; m5 = C5*n6+C3*n5, m6 = C3*n6-C5*n5
  446. ; Steps 10 -> 1f determine m4,m7 and 20 -> 2f determine m5,m6.
  447. ; The outputs m0-m7 are put into reverse binary order as follows:
  448. ; 0 = 000 -> 000 = 0
  449. ; 1 = 001 -> 100 = 4
  450. ; 2 = 010 -> 010 = 2
  451. ; 3 = 011 -> 110 = 6
  452. ; 4 = 100 -> 001 = 1
  453. ; 5 = 101 -> 101 = 5
  454. ; 6 = 110 -> 011 = 3
  455. ; 7 = 111 -> 111 = 7
  456. ; -- ========================================================================
  457. pmulhw mm1,PD C7 ; 14 -- C7*n7 scaled
  458. movq mm0,mm6 ; 15 -- copy scaled n4
  459. pmulhw mm6,PD C7 ; 16 -- C7*n4 scaled
  460. psraw mm3,1 ; 3b -- n3 = C6*p3 - C2*p2
  461. pmulhw mm0,PD C1 ; 17 -- C1*n4 scaled
  462. movq mm2,mm7 ; 21 -- copy scaled n6
  463. movq [edi+6*8*2],mm3 ; 3d -- Save n3 in row 6
  464. psllw mm4,2 ; 22 -- scale n5
  465. pmulhw mm7,PD C5 ; 23 -- C5*n6 scaled
  466. movq mm3,mm4 ; 24 -- copy scaled n5
  467. pmulhw mm4,PD C3 ; 25 -- C3*n5 scaled
  468. paddw mm5,mm6 ; 18 -- C7*n4+C1*n7 scaled
  469. pmulhw mm2,PD C3 ; 26 -- C3*n6 scaled
  470. psubw mm1,mm0 ; 19 -- C7*n7-C1*n4 scaled
  471. pmulhw mm3,PD C5 ; 27 -- C5*n5 scaled
  472. psraw mm5,1 ; 1a -- m4 = C7*n4+C1*n7
  473. paddw mm7,mm4 ; 28 -- C5*n6+C3*n5 scaled
  474. psraw mm1,1 ; 1b -- m7 = C7*n7-C1*n4
  475. movq [edi+1*8*2],mm5 ; 1c -- Save m4 in row 1
  476. psraw mm7,1 ; 29 -- m5 = C5*n6+C3*n5
  477. movq [edi+7*8*2],mm1 ; 1d -- Save m7 in row 7
  478. psubw mm2,mm3 ; 2a -- C3*n6-C5*n5 scaled
  479. movq [edi+5*8*2],mm7 ; 2b -- Save m5 in row 5
  480. psraw mm2,1 ; 2c -- m6 = C3*n6-C5*n5
  481. movq mm4,PD C0100010001000100 ; Prepare for next iteration: restore the sign-extension multiplier clobbered at step 40.
  482. ;
  483. movq [edi+3*8*2],mm2 ; 2d -- Save m6 in row 3
  484. ;
  485. add edi,8 ; Advance output by 4 words (next 4 columns of every row).
  486. add esi,4 ; Advance input by 4 bytes (next 4 columns of every line).
  487. test esi,4 ; Bit 2 of esi set => only the first half has been done.
  488. ; The exit test yields exactly two iterations only if the block address has bit 2 clear on entry.
  489. jne RepeatFirstTransform
  490. sub edi,16 ; Rewind edi to the start of the coefficient buffer (two iterations of +8).
  491. mov esi,2 ; esi now serves as the iteration counter for the second pass.
  492. ; ++ ========================================================================
  493. ; The Transpose performs four 4x4 transpositions as described in the
  494. ; MMx User's Guide. This transposes the 8x8 matrix (reflects it about its
  495. ; main diagonal).
  496. ; This routine is more expensive than I had hoped. I need to revisit this.
  ;
  ; Each row r of the 8x8 word matrix occupies 16 bytes: columns 0-3 at
  ; [edi+r*8*2] and columns 4-7 at [edi+r*8*2+8].
  ; Steps 1x: rows 0-3, columns 0-3 -- transposed in place.
  ; Steps 2x: rows 4-7, columns 0-3 -- transposed into rows 0-3, columns 4-7.
  ; Steps 3x: rows 0-3, columns 4-7 -- each loaded before a step 2x store
  ; overwrites it, then transposed into rows 4-7, columns 0-3.
  ; Steps 4x: rows 4-7, columns 4-7 -- transposed in place.
  ; Register contents are shown as <word3 word2 word1 word0>, with Crc meaning
  ; row r, column c.
  497. ; -- ========================================================================
  498. movq mm0,[edi+0*8*2] ;10 <C03 C02 C01 C00>
  499. ;
  500. movq mm1,[edi+1*8*2] ;11 <C13 C12 C11 C10>
  501. movq mm4,mm0 ;12 <C03 C02 C01 C00>
  502. movq mm2,[edi+2*8*2] ;13 <C23 C22 C21 C20>
  503. punpckhwd mm0,mm1 ;14 <C13 C03 C12 C02>
  504. movq mm3,[edi+3*8*2] ;15 <C33 C32 C31 C30>
  505. punpcklwd mm4,mm1 ;16 <C11 C01 C10 C00>
  506. movq mm6,mm2 ;17 <C23 C22 C21 C20>
  507. punpckhwd mm2,mm3 ;18 <C33 C23 C32 C22>
  508. movq mm1,mm0 ;19 <C13 C03 C12 C02>
  509. punpckldq mm0,mm2 ;1a <C32 C22 C12 C02>
  510. movq mm7,[edi+4*8*2] ; 20 -- Fetch row 4, columns 0-3.
  511. punpcklwd mm6,mm3 ;1b <C31 C21 C30 C20>
  512. movq [edi+2*8*2],mm0 ;1c <C32 C22 C12 C02> saved
  513. punpckhdq mm1,mm2 ;1d <C33 C23 C13 C03>
  514. movq mm5,mm4 ;1e <C11 C01 C10 C00>
  515. punpckldq mm4,mm6 ;1f <C30 C20 C10 C00>
  516. movq [edi+3*8*2],mm1 ;1g <C33 C23 C13 C03> saved
  517. punpckhdq mm5,mm6 ;1h <C31 C21 C11 C01>
  518. movq mm3,[edi+5*8*2] ; 21 -- Fetch row 5, columns 0-3.
  519. movq mm0,mm7 ; 22 -- Copy row 4.
  520. movq mm2,[edi+6*8*2] ; 23 -- Fetch row 6, columns 0-3.
  521. punpckhwd mm7,mm3 ; 24 -- Interleave high words of rows 4 and 5.
  522. movq mm1,[edi+7*8*2] ; 25 -- Fetch row 7, columns 0-3.
  523. punpcklwd mm0,mm3 ; 26 -- Interleave low words of rows 4 and 5.
  524. movq [edi+0*8*2],mm4 ;1i <C30 C20 C10 C00> saved
  525. movq mm6,mm2 ; 27 -- Copy row 6.
  526. movq [edi+1*8*2],mm5 ;1j <C31 C21 C11 C01> saved
  527. punpckhwd mm2,mm1 ; 28 -- Interleave high words of rows 6 and 7.
  528. movq mm3,mm7 ; 29 -- Copy high-word interleave of rows 4 and 5.
  529. punpckldq mm7,mm2 ; 2a -- Column 2 of rows 4-7.
  530. movq mm4,[edi+0*8*2+8] ; 30 -- Fetch row 0, columns 4-7 (before step 2i overwrites it).
  531. punpcklwd mm6,mm1 ; 2b -- Interleave low words of rows 6 and 7.
  532. movq mm1,[edi+2*8*2+8] ; 33 -- Fetch row 2, columns 4-7 (before step 2c overwrites it).
  533. punpckhdq mm3,mm2 ; 2d -- Column 3 of rows 4-7.
  534. movq [edi+2*8*2+8],mm7 ; 2c -- Save as row 2, columns 4-7.
  535. movq mm5,mm0 ; 2e -- Copy low-word interleave of rows 4 and 5.
  536. movq mm7,[edi+1*8*2+8] ; 31 -- Fetch row 1, columns 4-7 (before step 2j overwrites it).
  537. punpckldq mm0,mm6 ; 2f -- Column 0 of rows 4-7.
  538. movq mm2,[edi+3*8*2+8] ; 35 -- Fetch row 3, columns 4-7 (before step 2g overwrites it).
  539. punpckhdq mm5,mm6 ; 2h -- Column 1 of rows 4-7.
  540. movq [edi+3*8*2+8],mm3 ; 2g -- Save as row 3, columns 4-7.
  541. movq mm6,mm4 ; 32 -- Copy row 0, columns 4-7.
  542. movq [edi+0*8*2+8],mm0 ; 2i -- Save as row 0, columns 4-7.
  543. punpckhwd mm4,mm7 ; 34 -- Interleave high words of rows 0 and 1.
  544. movq [edi+1*8*2+8],mm5 ; 2j -- Save as row 1, columns 4-7.
  545. punpcklwd mm6,mm7 ; 36 -- Interleave low words of rows 0 and 1.
  546. movq mm3,mm1 ; 37 -- Copy row 2, columns 4-7.
  547. punpckhwd mm1,mm2 ; 38 -- Interleave high words of rows 2 and 3.
  548. movq mm7,mm4 ; 39 -- Copy high-word interleave of rows 0 and 1.
  549. punpckldq mm4,mm1 ; 3a -- Column 6 of rows 0-3.
  550. movq mm0,[edi+4*8*2+8] ; 40 -- Fetch row 4, columns 4-7.
  551. punpcklwd mm3,mm2 ; 3b -- Interleave low words of rows 2 and 3.
  552. movq [edi+6*8*2],mm4 ; 3c -- Save as row 6, columns 0-3.
  553. punpckhdq mm7,mm1 ; 3d -- Column 7 of rows 0-3.
  554. movq mm5,mm6 ; 3e -- Copy low-word interleave of rows 0 and 1.
  555. punpckldq mm6,mm3 ; 3f -- Column 4 of rows 0-3.
  556. movq [edi+7*8*2],mm7 ; 3g -- Save as row 7, columns 0-3.
  557. punpckhdq mm5,mm3 ; 3h -- Column 5 of rows 0-3.
  558. movq mm2,[edi+5*8*2+8] ; 41 -- Fetch row 5, columns 4-7.
  559. movq mm4,mm0 ; 42 -- Copy row 4, columns 4-7.
  560. movq mm1,[edi+6*8*2+8] ; 43 -- Fetch row 6, columns 4-7.
  561. punpckhwd mm0,mm2 ; 44 -- Interleave high words of rows 4 and 5.
  562. movq mm7,[edi+7*8*2+8] ; 45 -- Fetch row 7, columns 4-7.
  563. punpcklwd mm4,mm2 ; 46 -- Interleave low words of rows 4 and 5.
  564. movq [edi+4*8*2],mm6 ; 3i -- Save as row 4, columns 0-3.
  565. movq mm3,mm1 ; 47 -- Copy row 6, columns 4-7.
  566. movq [edi+5*8*2],mm5 ; 3j -- Save as row 5, columns 0-3.
  567. punpckhwd mm1,mm7 ; 48 -- Interleave high words of rows 6 and 7.
  568. movq mm2,mm0 ; 49 -- Copy high-word interleave of rows 4 and 5.
  569. punpckldq mm0,mm1 ; 4a -- Column 6 of rows 4-7.
  570. punpcklwd mm3,mm7 ; 4b -- Interleave low words of rows 6 and 7.
  571. ;
  572. movq [edi+6*8*2+8],mm0 ; 4c -- Save as row 6, columns 4-7.
  573. punpckhdq mm2,mm1 ; 4d -- Column 7 of rows 4-7.
  574. movq mm6,mm4 ; 4e -- Copy low-word interleave of rows 4 and 5.
  575. punpckldq mm4,mm3 ; 4f -- Column 4 of rows 4-7.
  576. movq [edi+7*8*2+8],mm2 ; 4g -- Save as row 7, columns 4-7.
  577. punpckhdq mm6,mm3 ; 4h -- Column 5 of rows 4-7.
  578. movq [edi+4*8*2+8],mm4 ; 4i -- Save as row 4, columns 4-7.
  579. ;
  580. movq [edi+5*8*2+8],mm6 ; 4j -- Save as row 5, columns 4-7.
  581. ;
  582. RepeatSecondTransform: ; Second 1-D pass over the transposed coefficients, 4 words of each row per iteration; esi counts iterations.
  583. ; ++ ========================================================================
  584. ; The Butterfly performs a 4x8 symmetrical butterfly on half of an
  585. ; 8x8 block of memory. Given rows r0 to r7 the Butterfly gives the following
  586. ; results. q0 = r0+r7, q7 = r0-r7
  587. ; q1 = r1+r6, q6 = r1-r6
  588. ; q2 = r2+r5, q5 = r2-r5
  589. ; q3 = r3+r4, q4 = r3-r4
  590. ; This code has been optimized, but still gives up three half clocks. The
  591. ; butterflies are numbered 10 -> 16, 20 -> 26, 30 -> 36, and 40 -> 46.
  ; The input here is already 16-bit words at [edi+k*8*2], so no unpacking or
  ; sign extension is needed. The first two Stage 1 subtractions are
  ; interleaved with the butterfly.
  592. ; -- ========================================================================
  593. movq mm7,[edi] ;10 -- Fetch line 0 of input.
  594. movq mm0,[edi+7*8*2] ;11 -- Fetch line 7 of input.
  595. movq mm6,[edi+1*8*2] ; 20 -- Fetch line 1 of input.
  596. psubw mm7,mm0 ;14 -- Line0 - Line7
  597. movq mm1,[edi+6*8*2] ; 21 -- Fetch line 6 of input.
  598. paddw mm0,mm0 ;15 -- 2 * Line7
  599. movq mm5,[edi+2*8*2] ; 30 -- Fetch line 2 of input.
  600. paddw mm0,mm7 ;16 -- Line0 + Line7
  601. movq mm2,[edi+5*8*2] ; 31 -- Fetch line 5 of input.
  602. psubw mm6,mm1 ; 24 -- Line1 - Line6
  603. paddw mm1,[edi+1*8*2] ; 26 -- Line6 + Line1
  604. psubw mm5,mm2 ; 34 -- Line2 - Line5
  605. movq mm4,[edi+3*8*2] ; 40 -- Fetch line 3 of input.
  606. movq mm3,[edi+4*8*2] ; 41 -- Fetch line 4 of input.
  607. psubw mm6,mm5 ; 20 -- q6 - q5 (Stage 1)
  608. paddw mm2,[edi+2*8*2] ; 36 -- Line5 + Line2
  609. psubw mm4,mm3 ; 44 -- Line3 - Line4
  610. paddw mm3,[edi+3*8*2] ; 46 -- Line4 + Line3
  611. psubw mm1,mm2 ; 30 -- p2 = q1 - q2 (Stage 1)
  612. ; ++ ========================================================================
  613. ; The StageOne performs a 4x4 Butterfly on rows q0 to q3 such that:
  614. ; p0 = q0+q3, p3 = q0-q3
  615. ; p1 = q1+q2, p2 = q1-q2
  616. ; A scaled butterfly on rows q5 and q6 yields the following equations.
  617. ; p5 = C4*(q6-q5), p6 = C4*(q6+q5)
  618. ; This has been optimized, but gives up four half clocks. The two simple
  619. ; butterflies are numbered 10 -> 16 and 30 -> 36.
  620. ; The scaled butterfly is numbered 20 -> 2c.
  621. ; -- ========================================================================
  622. paddw mm5,mm5 ; 21 -- 2q5
  623. paddw mm5,mm6 ; 22 -- q6 + q5
  624. psllw mm6,2 ; 23 -- scale
  625. pmulhw mm6,PD C4 ; 24 -- C4*(q6-q5) scaled
  626. psllw mm5,2 ; 23 -- scale
  627. pmulhw mm5,PD C4 ; 24 -- C4*(q6+q5) scaled
  628. psubw mm0,mm3 ; 10 -- p3 = q0 - q3
  629. paddw mm3,mm3 ; 11 -- 2q3
  630. paddw mm2,mm2 ; 31 -- 2q2
  631. paddw mm3,mm0 ; 12 -- p0 = q0 + q3
  632. psraw mm6,1 ; 25 -- p5 = C4*(q6-q5)
  633. paddw mm2,mm1 ; 32 -- p1 = q1 + q2
  634. psraw mm5,1 ; 26 -- p6 = C4*(q6+q5)
  635. ; ++ ========================================================================
  636. ; The StageTwo performs two simple butterflies on rows p4,p5 and
  637. ; p6,p7 such that:
  638. ; n4 = p4+p5, n5 = p4-p5
  639. ; n6 = p7-p6, n7 = p7+p6
  640. ; They are numbered 20 -> 26 and 40 -> 46.
  641. ;
  642. ; It also performs a scaled butterfly on rows p0,p1 such that:
  643. ; n0 = C4*(p0+p1), n1 = C4*(p0-p1)
  644. ; These are numbered 10 -> 1c.
  645. ;
  646. ; Finally, it performs a butterfly on the scaled rows p2,p3 such that:
  647. ; n2 = C2*p3+C6*p2, n3 = C6*p3-C2*p2
  648. ; This is numbered 30 -> 3f.
  649. ; -- ========================================================================
  650. psubw mm3,mm2 ; 10 -- p0 - p1
  651. paddw mm2,mm2 ; 11 -- 2p1
  652. paddw mm2,mm3 ; 12 -- p0 + p1
  653. psllw mm3,2 ; 13 -- scale
  654. pmulhw mm3,PD C4 ; 14 -- C4*(p0-p1)
  655. psllw mm2,2 ; 15 -- scale
  656. pmulhw mm2,PD C4 ; 16 -- C4*(p0+p1)
  657. psllw mm0,2 ; 30 -- scale p3
  658. psubw mm4,mm6 ; 20 -- n5 = p4 - p5
  659. psllw mm1,2 ; 31 -- scale p2
  660. psubw mm7,mm5 ; 40 -- n6 = p7 - p6
  661. psraw mm3,1 ; 17 -- n1 = C4*(p0-p1)
  662. paddw mm6,mm6 ; 21 -- 2p5
  663. psraw mm2,1 ; 18 -- n0 = C4*(p0+p1)
  664. movq [edi+4*8*2],mm3 ; 19 -- Save n1 in row 4 (stage 3 output ordering)
  665. movq mm3,mm0 ; 32 -- Copy scaled p3
  666. movq [edi+0*8*2],mm2 ; 1a -- Save n0 in row 0 (stage 3 output ordering)
  667. movq mm2,mm1 ; 33 -- Copy scaled p2
  668. pmulhw mm0,PD C2 ; 34 -- C2*p3 scaled
  669. paddw mm5,mm5 ; 41 -- 2p6
  670. pmulhw mm1,PD C6 ; 35 -- C6*p2 scaled
  671. paddw mm6,mm4 ; 22 -- n4 = p4 + p5
  672. pmulhw mm3,PD C6 ; 36 -- C6*p3 scaled
  673. paddw mm5,mm7 ; 42 -- n7 = p7 + p6
  674. pmulhw mm2,PD C2 ; 37 -- C2*p2 scaled
  675. psllw mm5,2 ; 10 -- scale n7 (stage 3)
  676. paddw mm0,mm1 ; 38 -- C2*p3 + C6*p2 scaled
  677. psllw mm7,2 ; 20 -- scale n6 (stage 3)
  678. movq mm1,mm5 ; 11 -- copy scaled n7 (stage 3)
  679. psraw mm0,1 ; 39 -- n2 = C2*p3 + C6*p2
  680. pmulhw mm5,PD C1 ; 12 -- C1*n7 scaled (stage 3)
  681. psllw mm6,2 ; 13 -- scale n4 (stage 3)
  682. movq [edi+2*8*2],mm0 ; 3c -- Save n2 in row 2 (stage 3 output ordering)
  683. psubw mm3,mm2 ; 3a -- C6*p3 - C2*p2 scaled
  684. ; ++ ========================================================================
  685. ; The StageThree performs a butterfly on the scaled rows n4,n7 and
  686. ; n5,n6 such that:
  687. ; m4 = C7*n4+C1*n7, m7 = C7*n7-C1*n4
  688. ; m5 = C5*n6+C3*n5, m6 = C3*n6-C5*n5
  689. ; Steps 10 -> 1f determine m4,m7 and 20 -> 2f determine m5,m6.
  690. ; The outputs m0-m7 are put into reverse binary order as follows:
  691. ; 0 = 000 -> 000 = 0
  692. ; 1 = 001 -> 100 = 4
  693. ; 2 = 010 -> 010 = 2
  694. ; 3 = 011 -> 110 = 6
  695. ; 4 = 100 -> 001 = 1
  696. ; 5 = 101 -> 101 = 5
  697. ; 6 = 110 -> 011 = 3
  698. ; 7 = 111 -> 111 = 7
  699. ; -- ========================================================================
  700. pmulhw mm1,PD C7 ; 14 -- C7*n7 scaled
  701. movq mm0,mm6 ; 15 -- copy scaled n4
  702. pmulhw mm6,PD C7 ; 16 -- C7*n4 scaled
  703. psraw mm3,1 ; 3b -- n3 = C6*p3 - C2*p2
  704. pmulhw mm0,PD C1 ; 17 -- C1*n4 scaled
  705. movq mm2,mm7 ; 21 -- copy scaled n6
  706. movq [edi+6*8*2],mm3 ; 3d -- Save n3 in row 6
  707. psllw mm4,2 ; 22 -- scale n5
  708. pmulhw mm7,PD C5 ; 23 -- C5*n6 scaled
  709. movq mm3,mm4 ; 24 -- copy scaled n5
  710. pmulhw mm4,PD C3 ; 25 -- C3*n5 scaled
  711. paddw mm5,mm6 ; 18 -- C7*n4+C1*n7 scaled
  712. pmulhw mm2,PD C3 ; 26 -- C3*n6 scaled
  713. psubw mm1,mm0 ; 19 -- C7*n7-C1*n4 scaled
  714. pmulhw mm3,PD C5 ; 27 -- C5*n5 scaled
  715. psraw mm5,1 ; 1a -- m4 = C7*n4+C1*n7
  716. paddw mm7,mm4 ; 28 -- C5*n6+C3*n5 scaled
  717. psraw mm1,1 ; 1b -- m7 = C7*n7-C1*n4
  718. movq [edi+1*8*2],mm5 ; 1c -- Save m4 in row 1
  719. psraw mm7,1 ; 29 -- m5 = C5*n6+C3*n5
  720. movq [edi+7*8*2],mm1 ; 1d -- Save m7 in row 7
  721. psubw mm2,mm3 ; 2a -- C3*n6-C5*n5 scaled
  722. movq [edi+5*8*2],mm7 ; 2b -- Save m5 in row 5
  723. psraw mm2,1 ; 2c -- m6 = C3*n6-C5*n5
  724. dec esi ; Count down the iterations; ZF from here is what the jne below tests.
  725. movq [edi+3*8*2],mm2 ; 2d -- Save m6 in row 3 (movq leaves the flags alone)
  726. ;
  727. lea edi,[edi+8] ; Advance to the next 4 words of each row; lea, not add, so the flags from dec survive.
  728. jne RepeatSecondTransform
  729. mov ebp,PITCH ; Set ebp to PITCH (defined outside this file) before handing off.
  730. jmp MMxQuantRLE ; Tail-jump to the external Quant/RLE routine; control does not return here.
  731. END