Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

530 lines
24 KiB

  1. ;/* *************************************************************************
  2. ;** INTEL Corporation Proprietary Information
  3. ;**
  4. ;** This listing is supplied under the terms of a license
  5. ;** agreement with INTEL Corporation and may not be copied
  6. ;** nor disclosed except in accordance with the terms of
  7. ;** that agreement.
  8. ;**
  9. ;** Copyright (c) 1996 Intel Corporation.
  10. ;** All Rights Reserved.
  11. ;**
  12. ;** *************************************************************************
  13. ;*/
  14. ;/* *************************************************************************
  15. ;** $Header: S:\h26x\src\dec\d3mmc.asv 1.1 14 Mar 1996 14:34:54 AGUPTA2 $
  16. ;** $Log: S:\h26x\src\dec\d3mmc.asv $
  17. ;//
  18. ;// Rev 1.1 14 Mar 1996 14:34:54 AGUPTA2
  19. ;//
  20. ;// Added alignment directives.
  21. ;//
  22. ;// Rev 1.0 14 Mar 1996 14:32:58 AGUPTA2
  23. ;// Initial revision.
  24. ;** *************************************************************************
  25. ;*/
  26. .586 ; enable Pentium (586) instructions
  27. .model flat ; flat 32-bit memory model
  28. OPTION PROLOGUE:None ; PROC generates no automatic prologue
  29. OPTION EPILOGUE:None ; PROC generates no automatic epilogue
  30. .xlist ; listing off while the include file is processed
  31. include iammx.inc
  32. .list ; listing back on
  33. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE' ; declare the code segment's attributes
  34. MMXCODE1 ENDS
  35. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA' ; declare the data segment's attributes
  36. MMXDATA1 ENDS
  37. MMXDATA1 SEGMENT ; 8-byte constants that the routines below load with movq
  38. ALIGN 8
  39. C0101010101010101H DD 001010101H, 001010101H ; selects bit 0 of every byte
  40. CfefefefefefefefeH DD 0fefefefeH, 0fefefefeH ; clears bit 0 of every byte
  41. CfcfcfcfcfcfcfcfcH DD 0fcfcfcfcH, 0fcfcfcfcH ; clears bits 1-0 of every byte
  42. C0303030303030303H DD 003030303H, 003030303H ; selects bits 1-0 of every byte
  43. TWO DD 002020202H, 002020202H ; 2 in every byte; rounding term used by Half_Half
  44. MMXDATA1 ENDS
  45. PITCH TEXTEQU <384> ; row pitch in bytes, used for both source and destination
  46. MMXCODE1 SEGMENT
  47. ; @MMX_Interpolate_Int_Half
  48. ; This routine computes interpolated pels shown by 'x' for an 8x8 block
  49. ; of pels. 'x' is computed by the formula (A+B+1)/2. The input and output
  50. ; pitch is assumed to be 384 (PITCH).
  51. ; A . . . . . . .
  52. ; x x x x x x x x
  53. ; B . . . . . . .
  54. ; The basic instruction sequence, which forms (A>>1) + (B>>1) + ((A|B)&1), is:
  55. ; movq V0, A
  56. ; movq V2, B
  57. ; movq V1, V0
  58. ; por V1, V2
  59. ; pand V1, 0x0101010101010101
  60. ; pand V0, 0xfefefefefefefefe
  61. ; psrlq V0, 1
  62. ; pand V2, 0xfefefefefefefefe
  63. ; psrlq V2, 1
  64. ; paddb V0, V1
  65. ; paddb V0, V2
  66. ; movq dest, V0
  67. ; The instruction sequence for line 0 is 12 instructions. The instruction
  68. ; sequence for line 1 should be 12 instructions but is not because some of
  69. ; the values needed for line 1 have already been computed for line 0.
  70. ; (The shifted copy of each source row is shared by two adjacent output lines.)
  71. ; Registers used for lines 0-7 are:
  72. ; line 0: mm0, mm1, mm2
  73. ; line 1: mm2, mm3, mm4
  74. ; line 2: mm4, mm5, mm0
  75. ; line 3: mm0, mm1, mm2
  76. ; line 4: mm2, mm3, mm4
  77. ; line 5: mm4, mm5, mm0
  78. ; line 6: mm0, mm1, mm2
  79. ; line 7: mm2, mm3, mm4
  80. ; Constants 0x0101010101010101 and 0xfefefefefefefefe are in mm6 and mm7,
  81. ; respectively. All of mm0-mm7 are overwritten; no EMMS is executed here.
  82. ; Parameters:
  83. ; ecx = source block, edx = destination block; i.e. it uses the fastcall
  84. ; calling convention. Source rows 0-8 are read; output rows 0-7 are written.
  85. ; (I am not aware of a way to declare a MASM function of type __fastcall.)
  86. ; Performance:
  87. ; 41 cycles ignoring unaligned memory accesses
  88. ; 68 cycles if all loads are unaligned (41+9*3); stores should always be
  89. ; aligned.
  90. ALIGN 4
  91. @MMX_Interpolate_Int_Half@8 PROC
  92. EXTRACTLOWBIT TEXTEQU <mm6> ; alias: mm6 holds the 0x01 byte mask
  93. CLEARLOWBIT TEXTEQU <mm7> ; alias: mm7 holds the 0xfe byte mask
  94. movq mm0, [ecx] ;0 mm0 = source row 0
  95. ;
  96. movq mm2, [ecx+PITCH] ;0 mm2 = source row 1
  97. movq mm1, mm0 ;0 mm1 = copy of row 0
  98. movq mm6, C0101010101010101H ; load EXTRACTLOWBIT mask
  99. movq mm3, mm2 ;1 mm3 = copy of row 1
  100. movq mm7, CfefefefefefefefeH ; load CLEARLOWBIT mask
  101. por mm1, mm2 ;0 mm1 = row 0 | row 1
  102. pand mm0, CLEARLOWBIT ;0 clear bit 0 of each byte so the 64-bit shift cannot leak into the neighbouring byte
  103. pand mm2, CLEARLOWBIT ;0 same for row 1
  104. psrlq mm0, 1 ;0 mm0 = row 0 >> 1 (per byte)
  105. pand mm1, EXTRACTLOWBIT ;0 mm1 = rounding bit (row 0 | row 1) & 1
  106. movq mm4, [ecx+2*PITCH] ;1 mm4 = source row 2
  107. psrlq mm2, 1 ;0 mm2 = row 1 >> 1 (reused by line 1)
  108. paddb mm0, mm1 ;0 add rounding bit
  109. movq mm5, mm4 ;2 mm5 = copy of row 2
  110. paddb mm0, mm2 ;0 mm0 = output line 0
  111. por mm3, mm4 ;1 mm3 = row 1 | row 2
  112. pand mm4, CLEARLOWBIT ;1 clear bit 0 of each byte before the shift
  113. pand mm3, EXTRACTLOWBIT ;1 mm3 = rounding bit for line 1
  114. movq [edx+0*PITCH], mm0 ;0 store output line 0
  115. psrlq mm4, 1 ;1 mm4 = row 2 >> 1 (reused by line 2)
  116. movq mm0, [ecx+3*PITCH] ;2 mm0 = source row 3
  117. paddb mm2, mm3 ;1 add rounding bit
  118. movq mm1, mm0 ;3 mm1 = copy of row 3
  119. paddb mm2, mm4 ;1 mm2 = output line 1
  120. por mm5, mm0 ;2 mm5 = row 2 | row 3
  121. pand mm0, CLEARLOWBIT ;2 clear bit 0 of each byte before the shift
  122. movq [edx+1*PITCH], mm2 ;1 store output line 1
  123. psrlq mm0, 1 ;2 mm0 = row 3 >> 1 (reused by line 3)
  124. paddb mm4, mm0 ;2 mm4 = (row 2 >> 1) + (row 3 >> 1)
  125. pand mm5, EXTRACTLOWBIT ;2 mm5 = rounding bit for line 2
  126. movq mm2, [ecx+4*PITCH] ;3 mm2 = source row 4
  127. paddb mm4, mm5 ;2 mm4 = output line 2
  128. por mm1, mm2 ;3 mm1 = row 3 | row 4
  129. movq mm3, mm2 ;4 mm3 = copy of row 4
  130. movq [edx+2*PITCH],mm4 ;2 store output line 2
  131. pand mm2, CLEARLOWBIT ;3 clear bit 0 of each byte before the shift
  132. psrlq mm2, 1 ;3 mm2 = row 4 >> 1 (reused by line 4)
  133. pand mm1, EXTRACTLOWBIT ;3 mm1 = rounding bit for line 3
  134. movq mm4, [ecx+5*PITCH] ;4 mm4 = source row 5
  135. paddb mm0, mm1 ;3 add rounding bit
  136. movq mm5, mm4 ;5 mm5 = copy of row 5
  137. paddb mm0, mm2 ;3 mm0 = output line 3
  138. por mm3, mm4 ;4 mm3 = row 4 | row 5
  139. pand mm4, CLEARLOWBIT ;4 clear bit 0 of each byte before the shift
  140. movq [edx+3*PITCH],mm0 ;3 store output line 3
  141. pand mm3, EXTRACTLOWBIT ;4 mm3 = rounding bit for line 4
  142. movq mm0, [ecx+6*PITCH] ;5 mm0 = source row 6
  143. psrlq mm4, 1 ;4 mm4 = row 5 >> 1 (reused by line 5)
  144. movq mm1, mm0 ;6 mm1 = copy of row 6
  145. paddb mm2, mm3 ;4 add rounding bit
  146. paddb mm2, mm4 ;4 mm2 = output line 4
  147. por mm5, mm0 ;5 mm5 = row 5 | row 6
  148. pand mm0, CLEARLOWBIT ;5 clear bit 0 of each byte before the shift
  149. pand mm5, EXTRACTLOWBIT ;5 mm5 = rounding bit for line 5
  150. movq [edx+4*PITCH], mm2 ;4 store output line 4
  151. psrlq mm0, 1 ;5 mm0 = row 6 >> 1 (reused by line 6)
  152. movq mm2, [ecx+7*PITCH] ;6 mm2 = source row 7
  153. paddb mm4, mm5 ;5 add rounding bit
  154. movq mm3, mm2 ;7 mm3 = copy of row 7
  155. paddb mm4, mm0 ;5 mm4 = output line 5
  156. por mm1, mm2 ;6 mm1 = row 6 | row 7
  157. pand mm2, CLEARLOWBIT ;6 clear bit 0 of each byte before the shift
  158. movq [edx+5*PITCH], mm4 ;5 store output line 5
  159. pand mm1, EXTRACTLOWBIT ;6 mm1 = rounding bit for line 6
  160. movq mm4, [ecx+8*PITCH] ;7 mm4 = source row 8 (ninth row read)
  161. psrlq mm2, 1 ;6 mm2 = row 7 >> 1 (reused by line 7)
  162. por mm3, mm4 ;7 mm3 = row 7 | row 8
  163. paddb mm0, mm1 ;6 add rounding bit
  164. paddb mm0, mm2 ;6 mm0 = output line 6
  165. pand mm3, EXTRACTLOWBIT ;7 mm3 = rounding bit for line 7
  166. pand mm4, CLEARLOWBIT ;7 clear bit 0 of each byte before the shift
  167. paddb mm3, mm2 ;7 mm3 = rounding bit + (row 7 >> 1)
  168. movq [edx+6*PITCH], mm0 ;6 store output line 6
  169. psrlq mm4, 1 ;7 mm4 = row 8 >> 1
  170. paddb mm3, mm4 ;7 mm3 = output line 7
  171. ;
  172. ;
  173. ;
  174. movq [edx+7*PITCH], mm3 ;7 store output line 7
  175. ret
  176. EXTRACTLOWBIT TEXTEQU <> ; reset the alias to empty text
  177. CLEARLOWBIT TEXTEQU <> ; reset the alias to empty text
  178. @MMX_Interpolate_Int_Half@8 endp
  179. ; @MMX_Interpolate_Half_Int
  180. ; This routine computes interpolated pels shown by 'X' for an 8x8 block
  181. ; of pels. 'X' is computed by the formula (A+B+1)/2. The input and output
  182. ; pitch is assumed to be 384 (PITCH).
  183. ; A X B X . X . X . X . X . X . X
  184. ; The basic instruction sequence, which forms (A>>1) + (B>>1) + ((A|B)&1), is:
  185. ; movq V0, A
  186. ; movq V2, B
  187. ; movq V1, V0
  188. ; por V1, V2
  189. ; pand V1, 0x0101010101010101
  190. ; pand V0, 0xfefefefefefefefe
  191. ; psrlq V0, 1
  192. ; pand V2, 0xfefefefefefefefe
  193. ; psrlq V2, 1
  194. ; paddb V0, V1
  195. ; paddb V0, V2
  196. ; movq dest, V0
  197. ; The instruction sequence for all lines is 12 instructions.
  198. ; (Nothing is shared between lines: each line has its own left/right pair.)
  199. ; Registers used for lines 0-7 are:
  200. ; line 0: mm0, mm1, mm2
  201. ; line 1: mm3, mm4, mm5
  202. ; line 2: mm0, mm1, mm2
  203. ; line 3: mm3, mm4, mm5
  204. ; line 4: mm0, mm1, mm2
  205. ; line 5: mm3, mm4, mm5
  206. ; line 6: mm0, mm1, mm2
  207. ; line 7: mm3, mm4, mm5
  208. ; Constants 0x0101010101010101 and 0xfefefefefefefefe are in mm6 and mm7,
  209. ; respectively. All of mm0-mm7 are overwritten; no EMMS is executed here.
  210. ; Parameters:
  211. ; ecx = source block, edx = destination block; i.e. it uses the fastcall
  212. ; calling convention. Source rows 0-7 are each read at byte offsets 0 and 1.
  213. ; Performance:
  214. ; 51 cycles ignoring unaligned memory accesses
  215. ; 99 cycles if all loads are unaligned (51+8*6); stores should always be
  216. ; aligned.
  217. ALIGN 4
  218. @MMX_Interpolate_Half_Int@8 proc
  219. EXTRACTLOWBIT TEXTEQU <mm6> ; alias: mm6 holds the 0x01 byte mask
  220. CLEARLOWBIT TEXTEQU <mm7> ; alias: mm7 holds the 0xfe byte mask
  221. movq mm0, [ecx] ;0 mm0,mm1=left pels
  222. ; ; mm2 =right pels
  223. movq mm2, [ecx+1] ;0 mm1 =interp pels
  224. movq mm1, mm0 ;0 mm1 = copy of left pels
  225. movq mm7, CfefefefefefefefeH ; load CLEARLOWBIT mask
  226. por mm1, mm2 ;0 mm1 = left | right
  227. movq mm6, C0101010101010101H ; load EXTRACTLOWBIT mask
  228. pand mm0, CLEARLOWBIT ;0 clear bit 0 of each byte so the 64-bit shift cannot leak into the neighbouring byte
  229. pand mm2, CLEARLOWBIT ;0 same for the right pels
  230. psrlq mm0, 1 ;0 mm0 = left >> 1 (per byte)
  231. psrlq mm2, 1 ;0 mm2 = right >> 1 (per byte)
  232. pand mm1, EXTRACTLOWBIT ;0 mm1 = rounding bit (left | right) & 1
  233. movq mm3, [ecx+1*PITCH] ;1 mm3,mm4=left pels
  234. paddb mm1, mm0 ;0 mm5 =right pels
  235. movq mm5, [ecx+1*PITCH+1] ;1 mm4 =interp pels
  236. paddb mm1, mm2 ;0 mm1 = output line 0
  237. movq mm4, mm3 ;1 mm4 = copy of left pels
  238. pand mm3, CLEARLOWBIT ;1 clear bit 0 of each byte before the shift
  239. movq [edx], mm1 ;0 store output line 0
  240. por mm4, mm5 ;1 mm4 = left | right
  241. psrlq mm3, 1 ;1 mm3 = left >> 1
  242. pand mm5, CLEARLOWBIT ;1 clear bit 0 of each byte before the shift
  243. psrlq mm5, 1 ;1 mm5 = right >> 1
  244. pand mm4, EXTRACTLOWBIT ;1 mm4 = rounding bit for line 1
  245. movq mm0, [ecx+2*PITCH] ;2 mm0,mm1=left pels
  246. paddb mm4, mm3 ;1 mm2 =right pels
  247. movq mm2, [ecx+2*PITCH+1] ;2 mm1 =interp pels
  248. paddb mm4, mm5 ;1 mm4 = output line 1
  249. movq mm1, mm0 ;2 mm1 = copy of left pels
  250. pand mm0, CLEARLOWBIT ;2 clear bit 0 of each byte before the shift
  251. movq [edx+1*PITCH], mm4 ;1 store output line 1
  252. por mm1, mm2 ;2 mm1 = left | right
  253. psrlq mm0, 1 ;2 mm0 = left >> 1
  254. pand mm2, CLEARLOWBIT ;2 clear bit 0 of each byte before the shift
  255. psrlq mm2, 1 ;2 mm2 = right >> 1
  256. pand mm1, EXTRACTLOWBIT ;2 mm1 = rounding bit for line 2
  257. movq mm3, [ecx+3*PITCH] ;3 mm3,mm4=left pels
  258. paddb mm1, mm0 ;2 mm5 =right pels
  259. movq mm5, [ecx+3*PITCH+1] ;3 mm4 =interp pels
  260. paddb mm1, mm2 ;2 mm1 = output line 2
  261. movq mm4, mm3 ;3 mm4 = copy of left pels
  262. pand mm3, CLEARLOWBIT ;3 clear bit 0 of each byte before the shift
  263. movq [edx+2*PITCH], mm1 ;2 store output line 2
  264. por mm4, mm5 ;3 mm4 = left | right
  265. psrlq mm3, 1 ;3 mm3 = left >> 1
  266. pand mm5, CLEARLOWBIT ;3 clear bit 0 of each byte before the shift
  267. psrlq mm5, 1 ;3 mm5 = right >> 1
  268. pand mm4, EXTRACTLOWBIT ;3 mm4 = rounding bit for line 3
  269. movq mm0, [ecx+4*PITCH] ;4 mm0,mm1=left pels
  270. paddb mm4, mm3 ;3 mm2 =right pels
  271. movq mm2, [ecx+4*PITCH+1] ;4 mm1 =interp pels
  272. paddb mm4, mm5 ;3 mm4 = output line 3
  273. movq mm1, mm0 ;4 mm1 = copy of left pels
  274. pand mm0, CLEARLOWBIT ;4 clear bit 0 of each byte before the shift
  275. movq [edx+3*PITCH], mm4 ;3 store output line 3
  276. por mm1, mm2 ;4 mm1 = left | right
  277. psrlq mm0, 1 ;4 mm0 = left >> 1
  278. pand mm2, CLEARLOWBIT ;4 clear bit 0 of each byte before the shift
  279. psrlq mm2, 1 ;4 mm2 = right >> 1
  280. pand mm1, EXTRACTLOWBIT ;4 mm1 = rounding bit for line 4
  281. movq mm3, [ecx+5*PITCH] ;5 mm3,mm4=left pels
  282. paddb mm1, mm0 ;4 mm5 =right pels
  283. movq mm5, [ecx+5*PITCH+1] ;5 mm4 =interp pels
  284. paddb mm1, mm2 ;4 mm1 = output line 4
  285. movq mm4, mm3 ;5 mm4 = copy of left pels
  286. pand mm3, CLEARLOWBIT ;5 clear bit 0 of each byte before the shift
  287. movq [edx+4*PITCH], mm1 ;4 store output line 4
  288. por mm4, mm5 ;5 mm4 = left | right
  289. psrlq mm3, 1 ;5 mm3 = left >> 1
  290. pand mm5, CLEARLOWBIT ;5 clear bit 0 of each byte before the shift
  291. psrlq mm5, 1 ;5 mm5 = right >> 1
  292. pand mm4, EXTRACTLOWBIT ;5 mm4 = rounding bit for line 5
  293. movq mm0, [ecx+6*PITCH] ;6 mm0,mm1=left pels
  294. paddb mm4, mm3 ;5 mm2 =right pels
  295. movq mm2, [ecx+6*PITCH+1] ;6 mm1 =interp pels
  296. paddb mm4, mm5 ;5 mm4 = output line 5
  297. movq mm1, mm0 ;6 mm1 = copy of left pels
  298. pand mm0, CLEARLOWBIT ;6 clear bit 0 of each byte before the shift
  299. movq [edx+5*PITCH], mm4 ;5 store output line 5
  300. por mm1, mm2 ;6 mm1 = left | right
  301. psrlq mm0, 1 ;6 mm0 = left >> 1
  302. pand mm2, CLEARLOWBIT ;6 clear bit 0 of each byte before the shift
  303. psrlq mm2, 1 ;6 mm2 = right >> 1
  304. pand mm1, EXTRACTLOWBIT ;6 mm1 = rounding bit for line 6
  305. movq mm3, [ecx+7*PITCH] ;7 mm3,mm4=left pels
  306. paddb mm1, mm0 ;6 mm5 =right pels
  307. movq mm5, [ecx+7*PITCH+1] ;7 mm4 =interp pels
  308. paddb mm1, mm2 ;6 mm1 = output line 6
  309. movq mm4, mm3 ;7 mm4 = copy of left pels
  310. pand mm3, CLEARLOWBIT ;7 clear bit 0 of each byte before the shift
  311. por mm4, mm5 ;7 mm4 = left | right
  312. psrlq mm3, 1 ;7 mm3 = left >> 1
  313. pand mm4, EXTRACTLOWBIT ;7 mm4 = rounding bit for line 7
  314. pand mm5, CLEARLOWBIT ;7 clear bit 0 of each byte before the shift
  315. psrlq mm5, 1 ;7 mm5 = right >> 1
  316. paddb mm4, mm3 ;7 add left >> 1
  317. movq [edx+6*PITCH], mm1 ;6 store output line 6
  318. paddb mm4, mm5 ;7 mm4 = output line 7
  319. ;
  320. ;
  321. movq [edx+7*PITCH], mm4 ;7 store output line 7
  322. ret
  323. EXTRACTLOWBIT TEXTEQU <> ; reset the alias to empty text
  324. CLEARLOWBIT TEXTEQU <> ; reset the alias to empty text
  325. @MMX_Interpolate_Half_Int@8 endp
  326. ; @MMX_Interpolate_Half_Half
  327. ; This routine computes interpolated pels shown by 'X' for an 8x8 block
  328. ; of pels. 'X' is computed by the formula (A+B+C+D+2)/4. The input and
  329. ; output pitch is assumed to be 384 (PITCH).
  330. ; A B
  331. ; X
  332. ; C D
  333. ; The value (A+B+C+D+2)/4 is computed as (A'+B'+C'+D')+((A*+B*+C*+D*+2)/4)
  334. ; where A = 4*A' + A*, etc. TWO is added to the low sums of source rows 1, 3, 5 and 7 only, so each output line receives it exactly once.
  335. ; Parameters:
  336. ; ecx = source block, edx = destination block; i.e. it uses the fastcall
  337. ; calling convention. Source rows 0-8 are each read at byte offsets 0 and 1; mm0-mm7 are overwritten and no EMMS is executed here.
  338. ; Performance:
  339. ; 84 cycles ignoring unaligned memory accesses
  340. ; 138 cycles if all loads are unaligned (84+9*2*3); stores should always be
  341. ; aligned. Average cycle count will be less than 138.
  342. ALIGN 4
  343. @MMX_Interpolate_Half_Half@8 proc
  344. EXTRACTLOWBITS TEXTEQU <mm6> ; alias: mm6 holds the 0x03 byte mask
  345. CLEARLOWBITS TEXTEQU <mm7> ; alias: mm7 holds the 0xfc byte mask
  346. movq mm0, [ecx] ;0 A(mm0,mm1) B(mm4,mm5)
  347. ; 0
  348. movq mm7, CfcfcfcfcfcfcfcfcH ; C(mm2,mm3) D(mm4,mm5)
  349. movq mm1, mm0 ;0 mm1 = copy of A
  350. movq mm4, [ecx+1] ;0 mm4 = B (row 0, one pel to the right)
  351. pand mm0, CLEARLOWBITS ;0 clear bits 1-0 of each byte so the 64-bit shift cannot leak into the neighbouring byte
  352. movq mm6, C0303030303030303H ; load EXTRACTLOWBITS mask
  353. movq mm5, mm4 ;0 mm5 = copy of B
  354. pand mm4, CLEARLOWBITS ;0 clear bits 1-0 of each byte before the shift
  355. pand mm1, EXTRACTLOWBITS ;0 mm1 = A low (A & 3)
  356. psrlq mm0, 2 ;0 mm0 = A >> 2 (per byte)
  357. pand mm5, EXTRACTLOWBITS ;0 mm5 = B low (B & 3)
  358. psrlq mm4, 2 ;0 mm4 = B >> 2 (per byte)
  359. paddb mm1, mm5 ;0 (A+B) low
  360. movq mm2, [ecx+1*PITCH] ;0 mm2 = C (row 1)
  361. paddb mm0, mm4 ;0 (A+B)/4 high
  362. movq mm4, [ecx+1*PITCH+1] ;0 mm4 = D (row 1, one pel to the right)
  363. movq mm3, mm2 ;0 mm3 = copy of C
  364. pand mm3, EXTRACTLOWBITS ;0 mm3 = C low
  365. movq mm5, mm4 ;0 mm5 = copy of D
  366. pand mm5, EXTRACTLOWBITS ;0 mm5 = D low
  367. pand mm2, CLEARLOWBITS ;0 clear bits 1-0 of each byte before the shift
  368. pand mm4, CLEARLOWBITS ;0 clear bits 1-0 of each byte before the shift
  369. paddb mm3, mm5 ;0 (C+D) low
  370. paddb mm3, TWO ;0 (C+D+2) low = mm3
  371. psrlq mm2, 2 ;0 mm2 = C >> 2
  372. paddb mm1, mm3 ;0 (A+B+C+D+2) low
  373. psrlq mm4, 2 ;0 mm4 = D >> 2
  374. paddb mm2, mm4 ;0 (C+D)/4 high = mm2
  375. psrlq mm1, 2 ;0 (A+B+C+D+2)/4 low dirty
  376. paddb mm0, mm2 ;0 (A+B+C+D)/4 high
  377. pand mm1, EXTRACTLOWBITS ;0 (A+B+C+D+2)/4 low clean
  378. movq mm4, [ecx+2*PITCH] ;1 high(mm2) low(mm3)
  379. paddb mm0, mm1 ;0 1
  380. movq mm1, [ecx+2*PITCH+1] ;1 C(mm4,mm5) D(mm0,mm1)
  381. movq mm5, mm4 ;1 mm5 = copy of C (row 2)
  382. movq [edx], mm0 ;0 store output line 0
  383. movq mm0, mm1 ;1 mm0 = copy of D
  384. pand mm0, CLEARLOWBITS ;1 clear bits 1-0 of each byte before the shift
  385. pand mm4, CLEARLOWBITS ;1 clear bits 1-0 of each byte before the shift
  386. psrlq mm0, 2 ;1 mm0 = D >> 2
  387. pand mm1, EXTRACTLOWBITS ;1 mm1 = D low
  388. psrlq mm4, 2 ;1 mm4 = C >> 2
  389. pand mm5, EXTRACTLOWBITS ;1 mm5 = C low
  390. paddb mm0, mm4 ;1 (C+D)/4 high = mm0
  391. paddb mm1, mm5 ;1 (C+D) low
  392. paddb mm2, mm0 ;1 (A+B+C+D)/4 high
  393. paddb mm3, mm1 ;1 (A+B+C+D+2) low
  394. movq mm4, [ecx+3*PITCH] ;2 mm4 = C (row 3)
  395. psrlq mm3, 2 ;1 (A+B+C+D+2)/4 low dirty
  396. movq mm5, mm4 ;2 high(mm0) low(mm1)
  397. pand mm3, EXTRACTLOWBITS ;1 2
  398. paddb mm2, mm3 ;1 C(mm4,mm5) D(mm2,mm3)
  399. pand mm5, EXTRACTLOWBITS ;2 mm5 = C low
  400. movq mm3, [ecx+3*PITCH+1] ;2 mm3 = D (row 3, one pel to the right)
  401. pand mm4, CLEARLOWBITS ;2 clear bits 1-0 of each byte before the shift
  402. movq [edx+1*PITCH], mm2 ;1 store output line 1
  403. movq mm2, mm3 ;2 mm2 = copy of D
  404. pand mm3, EXTRACTLOWBITS ;2 mm3 = D low
  405. pand mm2, CLEARLOWBITS ;2 clear bits 1-0 of each byte before the shift
  406. psrlq mm4, 2 ;2 mm4 = C >> 2
  407. paddb mm3, mm5 ;2 (C+D) low
  408. paddb mm3, TWO ;2 (C+D+2) low = mm3
  409. psrlq mm2, 2 ;2 mm2 = D >> 2
  410. paddb mm1, mm3 ;2 (A+B+C+D+2) low
  411. paddb mm2, mm4 ;2 (C+D)/4 high = mm2
  412. psrlq mm1, 2 ;2 (A+B+C+D+2)/4 low dirty
  413. paddb mm0, mm2 ;2 (A+B+C+D)/4 high
  414. movq mm4, [ecx+4*PITCH] ;3 high(mm2) low(mm3)
  415. pand mm1, EXTRACTLOWBITS ;2 3
  416. movq mm5, mm4 ;3 C(mm4,mm5) D(mm0,mm1)
  417. paddb mm0, mm1 ;2 mm0 = output line 2
  418. movq mm1, [ecx+4*PITCH+1] ;3 mm1 = D (row 4, one pel to the right)
  419. pand mm4, CLEARLOWBITS ;3 clear bits 1-0 of each byte before the shift
  420. movq [edx+2*PITCH], mm0 ;2 store output line 2
  421. movq mm0, mm1 ;3 mm0 = copy of D
  422. pand mm0, CLEARLOWBITS ;3 clear bits 1-0 of each byte before the shift
  423. pand mm1, EXTRACTLOWBITS ;3 mm1 = D low
  424. psrlq mm0, 2 ;3 mm0 = D >> 2
  425. pand mm5, EXTRACTLOWBITS ;3 mm5 = C low
  426. psrlq mm4, 2 ;3 mm4 = C >> 2
  427. paddb mm1, mm5 ;3 (C+D) low = mm1
  428. paddb mm0, mm4 ;3 (C+D)/4 high = mm0
  429. paddb mm3, mm1 ;3 (A+B+C+D+2) low
  430. paddb mm2, mm0 ;3 (A+B+C+D)/4 high
  431. psrlq mm3, 2 ;3 (A+B+C+D+2)/4 low dirty
  432. movq mm4, [ecx+5*PITCH] ;4 mm4 = C (row 5)
  433. pand mm3, EXTRACTLOWBITS ;3 (A+B+C+D+2)/4 low clean
  434. movq mm5, mm4 ;4 mm5 = copy of C
  435. paddb mm2, mm3 ;3 high(mm0) low(mm1)
  436. movq mm3, [ecx+5*PITCH+1] ;4 4
  437. pand mm4, CLEARLOWBITS ;4 C(mm4,mm5) D(mm2,mm3)
  438. movq [edx+3*PITCH], mm2 ;3 store output line 3
  439. movq mm2, mm3 ;4 mm2 = copy of D
  440. pand mm2, CLEARLOWBITS ;4 clear bits 1-0 of each byte before the shift
  441. pand mm5, EXTRACTLOWBITS ;4 mm5 = C low
  442. psrlq mm4, 2 ;4 mm4 = C >> 2
  443. pand mm3, EXTRACTLOWBITS ;4 mm3 = D low
  444. psrlq mm2, 2 ;4 mm2 = D >> 2
  445. paddb mm3, mm5 ;4 (C+D) low
  446. paddb mm3, TWO ;4 (C+D+2) low = mm3
  447. paddb mm2, mm4 ;4 (C+D)/4 high = mm2
  448. paddb mm1, mm3 ;4 (A+B+C+D+2) low
  449. paddb mm0, mm2 ;4 (A+B+C+D)/4 high
  450. movq mm4, [ecx+6*PITCH] ;5 mm4 = C (row 6)
  451. psrlq mm1, 2 ;4 (A+B+C+D+2)/4 low dirty
  452. movq mm5, mm4 ;5 mm5 = copy of C
  453. pand mm1, EXTRACTLOWBITS ;4 (A+B+C+D+2)/4 low clean
  454. paddb mm0, mm1 ;4 mm0 = output line 4
  455. pand mm4, CLEARLOWBITS ;5 high(mm2) low(mm3)
  456. movq mm1, [ecx+6*PITCH+1] ;5 5
  457. psrlq mm4, 2 ;5 C(mm4,mm5) D(mm0,mm1)
  458. movq [edx+4*PITCH], mm0 ;4 store output line 4
  459. movq mm0, mm1 ;5 mm0 = copy of D
  460. pand mm1, EXTRACTLOWBITS ;5 mm1 = D low
  461. pand mm5, EXTRACTLOWBITS ;5 mm5 = C low
  462. pand mm0, CLEARLOWBITS ;5 clear bits 1-0 of each byte before the shift
  463. paddb mm1, mm5 ;5 (C+D) low = mm1
  464. psrlq mm0, 2 ;5 mm0 = D >> 2
  465. paddb mm3, mm1 ;5 (A+B+C+D+2) low
  466. psrlq mm3, 2 ;5 (A+B+C+D+2)/4 low dirty
  467. paddb mm0, mm4 ;5 (C+D)/4 high = mm0
  468. pand mm3, EXTRACTLOWBITS ;5 (A+B+C+D+2)/4 low clean
  469. paddb mm2, mm0 ;5 (A+B+C+D)/4 high
  470. movq mm4, [ecx+7*PITCH] ;6 high(mm0) low(mm1)
  471. paddb mm2, mm3 ;5 6
  472. movq mm3, [ecx+7*PITCH+1] ;6 C(mm4,mm5) D(mm2,mm3)
  473. movq mm5, mm4 ;6 mm5 = copy of C (row 7)
  474. movq [edx+5*PITCH], mm2 ;5 store output line 5
  475. movq mm2, mm3 ;6 mm2 = copy of D
  476. pand mm5, EXTRACTLOWBITS ;6 mm5 = C low
  477. pand mm3, EXTRACTLOWBITS ;6 mm3 = D low
  478. pand mm2, CLEARLOWBITS ;6 clear bits 1-0 of each byte before the shift
  479. paddb mm3, mm5 ;6 (C+D) low
  480. pand mm4, CLEARLOWBITS ;6 clear bits 1-0 of each byte before the shift
  481. psrlq mm2, 2 ;6 mm2 = D >> 2
  482. paddb mm3, TWO ;6 (C+D+2) low = mm3
  483. psrlq mm4, 2 ;6 mm4 = C >> 2
  484. paddb mm2, mm4 ;6 (C+D)/4 high = mm2
  485. paddb mm1, mm3 ;6 (A+B+C+D+2) low
  486. paddb mm0, mm2 ;6 (A+B+C+D)/4 high
  487. psrlq mm1, 2 ;6 (A+B+C+D+2)/4 low dirty
  488. movq mm4, [ecx+8*PITCH] ;7 high(mm2) low(mm3)
  489. pand mm1, EXTRACTLOWBITS ;6 7
  490. movq mm5, mm4 ;7 C(mm4,mm5) D(mm0,mm1)
  491. paddb mm0, mm1 ;6 mm0 = output line 6
  492. movq mm1, [ecx+8*PITCH+1] ;7 mm1 = D (row 8, one pel to the right)
  493. pand mm4, CLEARLOWBITS ;7 clear bits 1-0 of each byte before the shift
  494. movq [edx+6*PITCH], mm0 ;6 store output line 6
  495. movq mm0, mm1 ;7 mm0 = copy of D
  496. pand mm0, CLEARLOWBITS ;7 clear bits 1-0 of each byte before the shift
  497. pand mm5, EXTRACTLOWBITS ;7 mm5 = C low
  498. psrlq mm4, 2 ;7 mm4 = C >> 2
  499. pand mm1, EXTRACTLOWBITS ;7 mm1 = D low
  500. psrlq mm0, 2 ;7 mm0 = D >> 2
  501. paddb mm1, mm5 ;7 (C+D) low
  502. paddb mm0, mm4 ;7 (C+D)/4 high
  503. paddb mm3, mm1 ;7 (A+B+C+D+2) low
  504. psrlq mm3, 2 ;7 (A+B+C+D+2)/4 low dirty
  505. paddb mm2, mm0 ;7 (A+B+C+D)/4 high
  506. pand mm3, EXTRACTLOWBITS ;7 (A+B+C+D+2)/4 low clean
  507. ;
  508. paddb mm2, mm3 ;7 mm2 = output line 7
  509. ;
  510. ;
  511. ;
  512. movq [edx+7*PITCH], mm2 ;7 store output line 7
  513. ret
  514. EXTRACTLOWBITS TEXTEQU <> ; reset the alias to empty text
  515. CLEARLOWBITS TEXTEQU <> ; reset the alias to empty text
  516. @MMX_Interpolate_Half_Half@8 endp
  517. MMXCODE1 ENDS ; close the MMXCODE1 code segment
  518. END ; end of the assembly source