Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

991 lines
22 KiB

  1. ;--------------------------------------------------------------------------;
  2. ;
  3. ; INTEL Corporation Proprietary Information
  4. ;
  5. ; This listing is supplied under the terms of a license
  6. ; agreement with INTEL Corporation and may not be copied
  7. ; nor disclosed except in accordance with the terms of
  8. ; that agreement.
  9. ;
  10. ; Copyright (c) 1996 Intel Corporation.
  11. ; All Rights Reserved.
  12. ;
  13. ;--------------------------------------------------------------------------;
  14. ;
  15. ; $Header: S:\h26x\src\dec\d3madvpr.asv 1.6 01 Oct 1996 16:45:38 KLILLEVO $
  16. ; $Log: S:\h26x\src\dec\d3madvpr.asv $
  17. ;//
  18. ;// Rev 1.6 01 Oct 1996 16:45:38 KLILLEVO
  19. ;// removed unnecessary local variable and added code to verify
  20. ;// PITCH is 384 at compile-time
  21. ;//
  22. ;// Rev 1.5 01 Oct 1996 11:57:52 KLILLEVO
  23. ;// pairing done, saved about 5*4 = 20 cycles per block = 11880 cycles
  24. ;// per QCIF picture
  25. ;//
  26. ;// Rev 1.4 27 Sep 1996 17:28:40 KLILLEVO
  27. ;// added clipping of extended motion vectors, but pairing is horrible and
  28. ;// needs to be improved
  29. ;//
  30. ;// Rev 1.3 01 Apr 1996 12:35:14 RMCKENZX
  31. ;//
  32. ;// Added MMXCODE1 and MMXDATA1 segments, moved global data
  33. ;// to MMXDATA1 segment.
  34. ;//
  35. ;// Rev 1.2 07 Mar 1996 18:32:16 RMCKENZX
  36. ;//
  37. ;// Re-organized and optimized routine. Interpolaters now
  38. ;// interpolate & weight, driver accumulates and averages. Interpolaters
  39. ;// return results in mm4-mm7. Eliminated include file.
  40. ;//
  41. ;// Rev 1.0 27 Feb 1996 15:03:42 RMCKENZX
  42. ;// Initial revision.
  43. ;
  44. ;--------------------------------------------------------------------------;
  45. ;
  46. ; File:
  47. ; d3madvpr.asm
  48. ;
  49. ; Routines:
  50. ; MMX_AdvancePredict Driver
  51. ; MMxInterpolateAndAccumulate Assembly-called interpolate accumulate
  52. ;
  53. ;--------------------------------------------------------------------------;
  54. .586 ; accept the Pentium instruction set
  55. .MODEL FLAT ; flat 32-bit memory model
  56. ; make all symbols case sensitive
  57. OPTION CASEMAP:NONE
  58. .xlist ; keep the include file out of the listing
  59. include iammx.inc ; presumably supplies the MMX instruction definitions used below -- confirm
  60. .list ; resume listing
  61. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE' ; declare the code segment's attributes; it is reopened below for the routines
  62. MMXCODE1 ENDS
  63. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA' ; declare the data segment's attributes; it is reopened below for the tables
  64. MMXDATA1 ENDS
  65. ;--------------------------------------------------------------------------;
  66. ;
  67. ; MMX_AdvancePredict
  68. ;
  69. ; Description:
  70. ; This routine performs advanced prediction, including overlapped
  71. ; block motion compensation. It uses the assembly routine
  72. ; MMxInterpolateAndAccumulate.
  73. ;
  74. ; This routine is the assembly equivalent of NewAdvancePredict.
  75. ;
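  76. ; Inputs: (dwords pushed onto stack by caller, in the order this code reads them)
  77. ; fpBlockAction flat pointer to block action stream.
  78. ; iNext flat pointer to offsets for 4 neighboring blocks.
  79. ; 0 = left
  80. ; 1 = right
  81. ; 2 = above
  82. ; 3 = below
  83. ; pDst destination pointer; pClipX, pClipY x and y vector clipping tables.
  84. ; (DC, a flat pointer to the decoder catalog, was listed first here, but this code reads no such argument -- confirm against the C prototype.)
  85. ;
  86. ; Register Usage:
  87. ; edi = destination, ebp = block action pointer, esi = source, ebx = weights, eax = clipped vector (al = x, ah = y).
  88. ; mm0 is held at zero for unpacking; mm3 carries the rounding constant and mm4-mm7 the row results.
  89. ; Notes:
  90. ; The vector-to-address arithmetic assumes PITCH is 384 (checked at assembly time).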
  91. ;--------------------------------------------------------------------------;
  92. ; register storage (the four registers pushed by the prologue after the local frame is reserved)
  93. ; ebp esp+00
  94. ; ebx esp+04
  95. ; edi esp+08
  96. ; esi esp+12
  97. ; local variable definitions (esp-relative; valid between the prologue pushes and the epilogue pops)
  98. lpBlockAction EQU esp+16 ; local block action stream pointer (not referenced by the code in this file)
  99. ; esp+20 is unused: lNext is defined once, at esp+32, clear of lClipX and lClipY
  100. lClipX EQU esp+24 ; local copy of pointer to x vector clipping table
  101. lClipY EQU esp+28 ; local copy of pointer to y vector clipping table
  102. lNext EQU esp+32 ; local offsets (4 DWORDS = 16 bytes)
  103. lAccum EQU esp+64 ; accumulator (64 WORDS = 128 bytes)
  104. zero EQU mm0 ; register kept at zero for byte-to-word unpacking
  105. lDst EQU edi ; local destination pointer
  106. ; C input parameters (ebp-relative; valid only while ebp still holds the frame base set by the prologue)
  107. fpBlockAction EQU ebp+08 ; block action stream pointer
  108. iNext EQU ebp+12 ; block action offsets pointer
  109. pDst EQU ebp+16 ; destination pointer
  110. pClipX EQU ebp+20 ; x vector clipping table
  111. pClipY EQU ebp+24 ; y vector clipping table
  112. ; MMX globals
  113. ; the weight tables are each 64 WORDS stored in Quadrant ascending order
  114. WtCt EQU gMMX_WeightCenter ; weights for the prediction from the block's own vector
  115. WtLR EQU gMMX_WeightLeftRight ; weights for the predictions from the left and right neighbors' vectors
  116. WtAB EQU gMMX_WeightAboveBelow ; weights for the predictions from the above and below neighbors' vectors
  117. Round1 EQU gMMX_Round1 ; added before a shift right by 1
  118. Round2 EQU gMMX_Round2 ; added before a shift right by 2
  119. Round4 EQU gMMX_Round4 ; added before the final shift right by 3
  120. PITCH = 384 ; byte distance between consecutive rows of the source and of the destination
  121. FRAMESIZE = 256 ; bytes subtracted from esp for the local frame
  122. ; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****
  123. ;
  124. ; ANY CHANGES TO THE BLOCK ACTION STRUCTURE
  125. ; IN d3dec.h MUST BE ECHOED HERE!!!!
  126. ;
  127. ; **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT **** ALERT ****
  128. ; Offsets into Block Action structure T_BlkAction of length 20
  129. ; see the definition in d3dec.h
  130. i8MVx2 = 1 ; I8 = signed byte
  131. i8MVy2 = 2 ; I8 = signed byte
  132. pRefBlock = 8 ; U32 = unsigned dword
  133. MMXDATA1 SEGMENT
  134. ALIGN 8
  135. gMMX_WeightCenter LABEL DWORD ; each line below is one quadrant: 4 rows of 4 WORD weights (32 bytes)
  136. WORD 5, 5, 5, 4, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6, 5, 5 ; Quadrant I (upper right)
  137. WORD 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6, 6 ; Quadrant II (upper left)
  138. WORD 5, 5, 6, 6, 5, 5, 6, 6, 5, 5, 5, 5, 4, 5, 5, 5 ; Quadrant III (lower left)
  139. WORD 6, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4 ; Quadrant IV (lower right)
  140. gMMX_WeightLeftRight LABEL DWORD ; the driver uses Quadrants II and III for the left neighbor, I and IV for the right
  141. WORD 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 ; Quadrant I
  142. WORD 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 ; Quadrant II
  143. WORD 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1 ; Quadrant III
  144. WORD 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2 ; Quadrant IV
  145. gMMX_WeightAboveBelow LABEL DWORD ; the driver uses Quadrants I and II for the above neighbor, III and IV for the below
  146. WORD 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ; Quadrant I
  147. WORD 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 ; Quadrant II
  148. WORD 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2 ; Quadrant III
  149. WORD 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2 ; Quadrant IV
  150. gMMX_Round1 DWORD 00010001h, 00010001h ; four words of 1
  151. gMMX_Round2 DWORD 00020002h, 00020002h ; four words of 2
  152. gMMX_Round4 DWORD 00040004h, 00040004h ; four words of 4; the three weights at every position sum to 8, hence round by 4 and shift by 3
  153. MMXDATA1 ENDS
  154. ;--------------------------------------------------------------------------;
  155. ;--------------------------------------------------------------------------;
  156. MMXCODE1 SEGMENT ; reopen the code segment; it is closed after the last routine
  157. PUBLIC C MMX_AdvancePredict ; export the driver under the C naming convention
  158. IF PITCH-384 ; true when PITCH is not 384: the next line is then assembled and stops the build
  159. ** error: this code assumes PITCH is 384
  160. ENDIF
  161. ;--------------------------------------------------------------------------;
  162. ; Start Code
  163. ;--------------------------------------------------------------------------;
  164. MMX_AdvancePredict: ; builds one 8x8 block: weighted sum of the center, left, right, above and below predictions, written as 8 rows of 8 bytes to the destination
  165. push ebp ; save caller's ebp
  166. mov ebp, esp ; ebp = frame base for reading the stack arguments
  167. mov edx, [iNext] ; edx = iNext argument (esp still equals ebp at this point)
  168. and esp, -32 ; align stack on cache boundary
  169. sub esp, FRAMESIZE ; reserve the local frame
  170. pxor zero, zero ; zero for unpacking
  171. push esi ; the four saved registers end up at esp+12 .. esp+00
  172. push edi
  173. push ebx
  174. push ebp ; keeps the frame base so the epilogue can undo the stack alignment
  175. mov eax, [pClipX]
  176. mov ebx, [pClipY]
  177. mov [lClipX], eax ; keep the clip-table pointers in locals: ebp is repurposed below
  178. mov [lClipY], ebx
  179. mov lDst, [pDst] ; edi = destination pointer
  180. mov eax, 00[edx] ; eax = neighbor offset 0 (left)
  181. mov ebp, [fpBlockAction] ; ebp = block action pointer from here until the epilogue
  182. mov ebx, 04[edx] ; ebx = neighbor offset 1 (right)
  183. lea eax, [eax+4*eax] ; offset * 5; the 4*ebx index scaling used later makes it * 20
  184. mov ecx, 08[edx] ; ecx = neighbor offset 2 (above)
  185. lea ebx, [ebx+4*ebx]
  186. mov edx, 12[edx] ; edx = neighbor offset 3 (below)
  187. lea ecx, [ecx+4*ecx]
  188. mov 00[lNext], eax ; store the four pre-scaled offsets in lNext
  189. lea edx, [edx+4*edx]
  190. mov 04[lNext], ebx
  191. mov 08[lNext], ecx
  192. mov 12[lNext], edx
  193. ;-----------------------------------------------------------------------;
  194. ; ;
  195. ; Center ;
  196. ; ;
  197. ;-----------------------------------------------------------------------;
  198. xor ecx, ecx ; clear the upper bits so cl can index a table
  199. mov esi, [lClipY] ; esi -> y clipping table
  200. mov cl, i8MVy2[ebp] ; cl = this block's y vector
  201. xor edx, edx
  202. add cl, 64 ; table index = vector + 64 (taken as an unsigned byte)
  203. mov dl, i8MVx2[ebp] ; dl = this block's x vector
  204. add dl, 64
  205. mov ebx, [lClipX] ; ebx -> x clipping table
  206. mov ah, [ecx + esi] ; ah = clipped y vector
  207. mov esi, pRefBlock[ebp] ; esi = reference block address
  208. mov al, [edx + ebx] ; al = clipped x vector
  209. mov dl, ah
  210. shl edx, 24 ; move the y vector to the top byte ...
  211. mov cl, al
  212. sar edx, 18 ; ... and back down with sign: edx = y * 64
  213. xor cl, 080H ; flip the sign bit: cl = x + 128 as an unsigned byte
  214. shr ecx, 1 ; ecx = (x >> 1) + 64
  215. and edx, 0FFFFFF80H ; drop the half-pel bit: edx = (y >> 1) * 128
  216. lea ebx, [WtCt + 32] ; ebx -> center weights, Quadrant II
  217. add esi, ecx
  218. lea edx, [edx + edx*2 - 64] ; edx = (y >> 1) * 384 - 64: 384 is PITCH, -64 cancels the bias left in ecx
  219. add esi, edx ; esi -> source displaced by the whole-pel part of the vector
  220. ; Quadrant II
  221. call MMxInterpolateAndAccumulate ; mm4-mm7 = weighted rows 0-3 of this quadrant
  222. movq mm3, [Round4] ; rounding term for the final shift right by 3
  223. paddw mm4, mm3
  224. add esi, 4 ; step the source 4 bytes right for Quadrant I
  225. paddw mm5, mm3
  226. sub ebx, 32 ; ebx -> center weights, Quadrant I
  227. movq [lAccum+00], mm4 ; accumulator rows are 16 bytes apart; the first 8 bytes hold the left quadrant
  228. paddw mm6, mm3
  229. movq [lAccum+16], mm5
  230. paddw mm7, mm3
  231. movq [lAccum+32], mm6
  232. movq [lAccum+48], mm7
  233. ; Quadrant I
  234. call MMxInterpolateAndAccumulate
  235. movq mm3, [Round4]
  236. paddw mm4, mm3
  237. add esi, 4*PITCH-4 ; down 4 rows and back 4 bytes for Quadrant III
  238. paddw mm5, mm3
  239. add ebx, 64 ; ebx -> center weights, Quadrant III
  240. movq [lAccum+08], mm4 ; the second 8 bytes of each accumulator row hold the right quadrant
  241. paddw mm6, mm3
  242. movq [lAccum+24], mm5
  243. paddw mm7, mm3
  244. movq [lAccum+40], mm6
  245. movq [lAccum+56], mm7
  246. ; Quadrant III
  247. call MMxInterpolateAndAccumulate
  248. movq mm3, [Round4]
  249. paddw mm4, mm3
  250. add esi, 4 ; step the source 4 bytes right for Quadrant IV
  251. paddw mm5, mm3
  252. add ebx, 32 ; ebx -> center weights, Quadrant IV
  253. movq [lAccum+64], mm4 ; accumulator rows 4-7 start at offset 64
  254. paddw mm6, mm3
  255. movq [lAccum+80], mm5
  256. paddw mm7, mm3
  257. movq [lAccum+96], mm6
  258. movq [lAccum+112], mm7
  259. ; Quadrant IV
  260. call MMxInterpolateAndAccumulate
  261. movq mm3, [Round4]
  262. paddw mm4, mm3
  263. mov ebx, 00[lNext] ; ebx = pre-scaled offset of the left neighbor, for the next section
  264. paddw mm5, mm3
  265. movq [lAccum+72], mm4
  266. paddw mm6, mm3
  267. movq [lAccum+88], mm5
  268. paddw mm7, mm3
  269. movq [lAccum+104], mm6
  270. movq [lAccum+120], mm7
  271. ;-----------------------------------------------------------------------;
  272. ; ;
  273. ; Left ;
  274. ; ;
  275. ;-----------------------------------------------------------------------;
  276. xor ecx, ecx
  277. mov esi, [lClipY]
  278. mov cl, i8MVy2[ebp + 4*ebx] ; y vector of the left neighbor (ebx = 5 * offset, so 4*ebx = 20 * offset)
  279. xor edx, edx
  280. add cl, 64
  281. mov dl, i8MVx2[ebp + 4*ebx] ; x vector of the left neighbor
  282. add dl, 64
  283. mov ebx, [lClipX]
  284. mov ah, [ecx + esi] ; ah = clipped y vector
  285. mov esi, pRefBlock[ebp] ; the reference address is still the current block's, not the neighbor's
  286. mov al, [edx + ebx] ; al = clipped x vector
  287. mov dl, ah
  288. shl edx, 24
  289. mov cl, al
  290. sar edx, 18 ; edx = y * 64, sign-extended
  291. xor cl, 080H
  292. shr ecx, 1 ; ecx = (x >> 1) + 64
  293. and edx, 0FFFFFF80H ; edx = (y >> 1) * 128
  294. lea ebx, [WtLR + 32] ; ebx -> left/right weights, Quadrant II
  295. add esi, ecx
  296. lea edx, [edx + edx*2 - 64] ; edx = (y >> 1) * PITCH - 64
  297. add esi, edx ; esi -> displaced source, upper-left quadrant
  298. ; Quadrant II
  299. call MMxInterpolateAndAccumulate
  300. paddw mm4, [lAccum+00] ; add this prediction to the running sums
  301. paddw mm5, [lAccum+16]
  302. paddw mm6, [lAccum+32]
  303. paddw mm7, [lAccum+48]
  304. movq [lAccum+00], mm4
  305. movq [lAccum+16], mm5
  306. movq [lAccum+32], mm6
  307. movq [lAccum+48], mm7
  308. ; Quadrant III
  309. add esi, 4*PITCH ; down 4 rows
  310. add ebx, 32 ; ebx -> left/right weights, Quadrant III
  311. call MMxInterpolateAndAccumulate
  312. paddw mm4, [lAccum+64]
  313. paddw mm5, [lAccum+80]
  314. paddw mm6, [lAccum+96]
  315. paddw mm7, [lAccum+112]
  316. movq [lAccum+64], mm4
  317. movq [lAccum+80], mm5
  318. movq [lAccum+96], mm6
  319. mov ebx, 04[lNext] ; ebx = pre-scaled offset of the right neighbor, for the next section
  320. movq [lAccum+112], mm7
  321. ;-----------------------------------------------------------------------;
  322. ; ;
  323. ; Right ;
  324. ; ;
  325. ;-----------------------------------------------------------------------;
  326. xor ecx, ecx
  327. mov esi, [lClipY]
  328. mov cl, i8MVy2[ebp + 4*ebx] ; y vector of the right neighbor
  329. xor edx, edx
  330. add cl, 64
  331. mov dl, i8MVx2[ebp + 4*ebx] ; x vector of the right neighbor
  332. add dl, 64
  333. mov ebx, [lClipX]
  334. mov ah, [ecx + esi] ; ah = clipped y vector
  335. mov esi, pRefBlock[ebp] ; reference address of the current block
  336. mov al, [edx + ebx] ; al = clipped x vector
  337. mov dl, ah
  338. shl edx, 24
  339. mov cl, al
  340. sar edx, 18
  341. xor cl, 080H
  342. shr ecx, 1 ; ecx = (x >> 1) + 64
  343. and edx, 0FFFFFF80H ; edx = (y >> 1) * 128
  344. lea ebx, [WtLR] ; ebx -> left/right weights, Quadrant I
  345. add esi, ecx
  346. lea edx, [edx + edx*2 - 64] ; edx = (y >> 1) * PITCH - 64
  347. add esi, 4 ; the right-hand quadrants start 4 bytes into the row
  348. add esi, edx
  349. ; Quadrant I
  350. call MMxInterpolateAndAccumulate
  351. paddw mm4, [lAccum+08]
  352. paddw mm5, [lAccum+24]
  353. paddw mm6, [lAccum+40]
  354. paddw mm7, [lAccum+56]
  355. movq [lAccum+08], mm4
  356. movq [lAccum+24], mm5
  357. movq [lAccum+40], mm6
  358. movq [lAccum+56], mm7
  359. ; Quadrant IV
  360. add esi, 4*PITCH ; down 4 rows
  361. add ebx, 96 ; ebx -> left/right weights, Quadrant IV
  362. call MMxInterpolateAndAccumulate
  363. paddw mm4, [lAccum+72]
  364. paddw mm5, [lAccum+88]
  365. paddw mm6, [lAccum+104]
  366. paddw mm7, [lAccum+120]
  367. movq [lAccum+72], mm4
  368. movq [lAccum+88], mm5
  369. movq [lAccum+104], mm6
  370. mov ebx, 08[lNext] ; ebx = pre-scaled offset of the above neighbor, for the next section
  371. movq [lAccum+120], mm7
  372. ;-----------------------------------------------------------------------;
  373. ; ;
  374. ; Above ;
  375. ; ;
  376. ;-----------------------------------------------------------------------;
  377. xor ecx, ecx
  378. mov esi, [lClipY]
  379. mov cl, i8MVy2[ebp + 4*ebx] ; y vector of the above neighbor
  380. xor edx, edx
  381. add cl, 64
  382. mov dl, i8MVx2[ebp + 4*ebx] ; x vector of the above neighbor
  383. add dl, 64
  384. mov ebx, [lClipX]
  385. mov ah, [ecx + esi] ; ah = clipped y vector
  386. mov esi, pRefBlock[ebp] ; reference address of the current block
  387. mov al, [edx + ebx] ; al = clipped x vector
  388. mov dl, ah
  389. shl edx, 24
  390. mov cl, al
  391. sar edx, 18
  392. xor cl, 080H
  393. shr ecx, 1 ; ecx = (x >> 1) + 64
  394. and edx, 0FFFFFF80H ; edx = (y >> 1) * 128
  395. lea ebx, [WtAB] ; ebx -> above/below weights, Quadrant I
  396. add esi, ecx
  397. lea edx, [edx + edx*2 - 64] ; edx = (y >> 1) * PITCH - 64
  398. add esi, 4 ; start with the right-hand quadrant
  399. add esi, edx
  400. ; Quadrant I
  401. call MMxInterpolateAndAccumulate
  402. paddw mm4, [lAccum+08] ; last term for this quadrant: sum, then scale
  403. paddw mm5, [lAccum+24]
  404. psraw mm4, 3 ; divide the weighted sum by 8
  405. paddw mm6, [lAccum+40]
  406. psraw mm5, 3
  407. paddw mm7, [lAccum+56]
  408. psraw mm6, 3
  409. movq [lAccum+08], mm4 ; park the finished words until the left quadrant is ready
  410. psraw mm7, 3
  411. movq [lAccum+24], mm5
  412. movq [lAccum+40], mm6
  413. movq [lAccum+56], mm7
  414. ; Quadrant II
  415. sub esi, 4 ; back to the left-hand quadrant
  416. add ebx, 32 ; ebx -> above/below weights, Quadrant II
  417. call MMxInterpolateAndAccumulate
  418. paddw mm4, [lAccum+00]
  419. paddw mm5, [lAccum+16]
  420. paddw mm6, [lAccum+32]
  421. psraw mm4, 3 ; divide the weighted sum by 8
  422. paddw mm7, [lAccum+48]
  423. psraw mm5, 3
  424. packuswb mm4, [lAccum+08] ; pack with unsigned saturation: Quadrant II bytes low, Quadrant I bytes high
  425. packuswb mm5, [lAccum+24]
  426. movq [lDst+00], mm4 ; destination row 0
  427. psraw mm6, 3
  428. movq [lDst+PITCH], mm5 ; destination row 1
  429. psraw mm7, 3
  430. packuswb mm6, [lAccum+40]
  431. packuswb mm7, [lAccum+56]
  432. movq [lDst+2*PITCH], mm6 ; destination row 2
  433. mov ebx, 12[lNext] ; ebx = pre-scaled offset of the below neighbor, for the next section
  434. movq [lDst+3*PITCH], mm7 ; destination row 3
  435. ;-----------------------------------------------------------------------;
  436. ; ;
  437. ; Below ;
  438. ; ;
  439. ;-----------------------------------------------------------------------;
  440. xor ecx, ecx
  441. mov esi, [lClipY]
  442. mov cl, i8MVy2[ebp + 4*ebx] ; y vector of the below neighbor
  443. xor edx, edx
  444. add cl, 64
  445. mov dl, i8MVx2[ebp + 4*ebx] ; x vector of the below neighbor
  446. add dl, 64
  447. mov ebx, [lClipX]
  448. mov ah, [ecx + esi] ; ah = clipped y vector
  449. mov esi, pRefBlock[ebp] ; reference address of the current block
  450. mov al, [edx + ebx] ; al = clipped x vector
  451. mov dl, ah
  452. shl edx, 24
  453. mov cl, al
  454. sar edx, 18
  455. xor cl, 080H
  456. shr ecx, 1 ; ecx = (x >> 1) + 64
  457. and edx, 0FFFFFF80H ; edx = (y >> 1) * 128
  458. lea ebx, [WtAB + 96] ; ebx -> above/below weights, Quadrant IV
  459. add esi, ecx
  460. lea edx, [edx + edx*2 - 64] ; edx = (y >> 1) * PITCH - 64
  461. add esi, 4*PITCH+4 ; start with the lower-right quadrant
  462. add esi, edx
  463. ; Quadrant IV
  464. call MMxInterpolateAndAccumulate
  465. paddw mm4, [lAccum+72] ; last term for this quadrant: sum, then scale
  466. paddw mm5, [lAccum+88]
  467. psraw mm4, 3 ; divide the weighted sum by 8
  468. paddw mm6, [lAccum+104]
  469. psraw mm5, 3
  470. paddw mm7, [lAccum+120]
  471. psraw mm6, 3
  472. movq [lAccum+72], mm4 ; park the finished words until the left quadrant is ready
  473. psraw mm7, 3
  474. movq [lAccum+88], mm5
  475. movq [lAccum+104], mm6
  476. movq [lAccum+120], mm7
  477. ; Quadrant III
  478. sub esi, 4 ; back to the left-hand quadrant
  479. sub ebx, 32 ; ebx -> above/below weights, Quadrant III
  480. call MMxInterpolateAndAccumulate
  481. paddw mm4, [lAccum+64]
  482. paddw mm5, [lAccum+80]
  483. paddw mm6, [lAccum+96]
  484. psraw mm4, 3 ; divide the weighted sum by 8
  485. paddw mm7, [lAccum+112]
  486. psraw mm5, 3
  487. packuswb mm4, [lAccum+72] ; Quadrant III bytes low, Quadrant IV bytes high
  488. packuswb mm5, [lAccum+88]
  489. movq [lDst+4*PITCH], mm4 ; destination row 4
  490. psraw mm6, 3
  491. movq [lDst+5*PITCH], mm5 ; destination row 5
  492. psraw mm7, 3
  493. packuswb mm6, [lAccum+104]
  494. packuswb mm7, [lAccum+120]
  495. movq [lDst+6*PITCH], mm6 ; destination row 6
  496. movq [lDst+7*PITCH], mm7 ; destination row 7
  497. pop ebp ; ebp = frame base saved by the prologue
  498. pop ebx
  499. pop edi
  500. pop esi
  501. mov esp, ebp ; discard the aligned local frame
  502. pop ebp ; restore caller's ebp
  503. ret ; NOTE(review): the MMX state is not cleared (no emms) here -- presumably the caller does it; confirm
  504. ;--------------------------------------------------------------------------;
  505. ;
  506. ; Routine:
  507. ; MMxInterpolateAndAccumulate
  508. ;
  509. ; Inputs:
  510. ; esi flat pointer to Reference Block Source.
  511. ; it is already adjusted by the motion vector.
  512. ; al x component of motion vector.
  513. ; ah y component of motion vector.
  514. ; ebx flat pointer to Weighting values.
  515. ;
  516. ; Outputs
  517. ; mm4-mm7 Weighted, interpolated values for rows 0-3.
  518. ; Values are in packed word format.
  519. ;
  520. ; Description:
  521. ; This routine performs motion compensation interpolation, weights the
  522. ; results, and returns them in mmx registers 4-7.
  523. ; It works on a single 4x4 Quadrant per call. It is an assembly
  524. ; callable routine with its parameters in registers.
  525. ;
  526. ; Register Usage:
  527. ; This routine modifies no integer registers (the flags are changed by test).
  528. ; mm0 must be zero on entry and is left unchanged; mm4-mm7 are always written; mm1-mm3 are scratch on the interpolating paths.
  529. ;
  530. ; Notes:
  531. ; Only bit 0 of al and bit 0 of ah are examined: each selects whole- or half-pel interpolation in its direction.
  532. ;--------------------------------------------------------------------------;
  533. ; asm input parameters
  534. lpSrc EQU esi ; motion compensated source pointer
  535. lpWt EQU ebx ; pointer to matrix of weights 4x4xWORD
  536. MMxInterpolateAndAccumulate: ; dispatch on the half-pel bits of the vector held in ax
  537. test eax, 100h ; test mvy's parity bit
  538. jnz IAAhalf ; jump when it was odd
  539. test eax, 1 ; test mvx's parity bit
  540. jnz IAAhalf_int ; jump when it was odd
  541. IAAint_int: ; whole-pel in x and y: weight the source pixels directly
  542. movd mm4, [lpSrc] ; 0 - fetch row
  543. movd mm5, [PITCH+lpSrc] ; 1 - fetch row
  544. punpcklbw mm4, zero ; 0 - unpack row
  545. pmullw mm4, 00[lpWt] ; 0 - multiply by weights
  546. movd mm6, [PITCH*2+lpSrc] ; 2 - fetch row (only the 4 bytes that get unpacked)
  547. punpcklbw mm5, zero ; 1 - unpack row
  548. pmullw mm5, 08[lpWt] ; 1 - multiply by weights
  549. punpcklbw mm6, zero ; 2 - unpack row
  550. movd mm7, [PITCH*3+lpSrc] ; 3 - fetch row (only the 4 bytes that get unpacked)
  551. pmullw mm6, 16[lpWt] ; 2 - multiply by weights
  552. punpcklbw mm7, zero ; 3 - unpack row
  553. pmullw mm7, 24[lpWt] ; 3 - multiply by weights
  554. ret
  555. IAAhalf_int: ; half-pel in x only: average each pixel with its right neighbor, rounding up
  556. movq mm4, [lpSrc] ; 0 - fetch row (8 bytes: 5 are needed)
  557. movq mm1, mm4 ; 0 - copy row
  558. psrlq mm4, 8 ; 0 - shift row by one pixel
  559. movq mm5, [PITCH+lpSrc] ; 1 - fetch row
  560. punpcklbw mm4, zero ; 0 - unpack shifted row
  561. movq mm6, [PITCH*2+lpSrc] ; 2 - fetch row
  562. punpcklbw mm1, zero ; 0 - unpack row
  563. movq mm2, mm5 ; 1 - copy row
  564. psrlq mm5, 8 ; 1 - shift row
  565. paddw mm4, [Round1] ; 0 - add in Round
  566. punpcklbw mm5, zero ; 1 - unpack shifted row
  567. paddw mm4, mm1 ; 0 - sum copies of row
  568. punpcklbw mm2, zero ; 1 - unpack row
  569. movq mm3, mm6 ; 2 - copy row
  570. psrlq mm6, 8 ; 2 - shift row
  571. paddw mm5, [Round1] ; 1 - add in Round
  572. punpcklbw mm6, zero ; 2 - unpack shifted row
  573. paddw mm5, mm2 ; 1 - sum copies of row
  574. punpcklbw mm3, zero ; 2 - unpack row
  575. movq mm7, [PITCH*3+lpSrc] ; 3 - fetch row
  576. psraw mm4, 1 ; 0 - divide by 2
  577. pmullw mm4, 00[lpWt] ; 0 - multiply by weights
  578. psraw mm5, 1 ; 1 - divide by 2
  579. movq mm1, mm7 ; 3 - copy row
  580. psrlq mm7, 8 ; 3 - shift row
  581. paddw mm6, [Round1] ; 2 - add in Round
  582. punpcklbw mm7, zero ; 3 - unpack shifted row
  583. paddw mm6, mm3 ; 2 - sum copies of row
  584. punpcklbw mm1, zero ; 3 - unpack row
  585. paddw mm7, [Round1] ; 3 - add in Round
  586. psraw mm6, 1 ; 2 - divide by 2
  587. pmullw mm5, 08[lpWt] ; 1 - multiply by weights
  588. paddw mm7, mm1 ; 3 - sum copies of row
  589. pmullw mm6, 16[lpWt] ; 2 - multiply by weights
  590. psraw mm7, 1 ; 3 - divide by 2
  591. pmullw mm7, 24[lpWt] ; 3 - multiply by weights
  592. ret
  593. IAAhalf: ; mvy is odd: choose between the y-only and the x-and-y half-pel paths
  594. test eax, 1 ; test mvx's parity bit
  595. jnz IAAhalf_half ; jump when it was odd
  596. IAAint_half: ; half-pel in y only: average each row with the row below it (5 rows read), rounding up
  597. movd mm4, [lpSrc] ; 0 - fetch row
  598. movd mm5, [PITCH+lpSrc] ; 1 - fetch row
  599. punpcklbw mm4, zero ; 0 - unpack row
  600. movd mm6, [PITCH*2+lpSrc] ; 2 - fetch row
  601. punpcklbw mm5, zero ; 1 - unpack row
  602. paddw mm4, [Round1] ; 0 - add in Round
  603. punpcklbw mm6, zero ; 2 - unpack row
  604. paddw mm4, mm5 ; 0 - sum rows
  605. paddw mm5, [Round1] ; 1 - add in Round
  606. movd mm7, [PITCH*3+lpSrc] ; 3 - fetch row
  607. psraw mm4, 1 ; 0 - divide by 2
  608. pmullw mm4, 00[lpWt] ; 0 - multiply by weights
  609. paddw mm5, mm6 ; 1 - sum rows
  610. movd mm3, [PITCH*4+lpSrc] ; 4 - fetch row
  611. punpcklbw mm7, zero ; 3 - unpack row
  612. paddw mm6, [Round1] ; 2 - add in Round
  613. psraw mm5, 1 ; 1 - divide by 2
  614. pmullw mm5, 08[lpWt] ; 1 - multiply by weights
  615. punpcklbw mm3, zero ; 4 - unpack row
  616. paddw mm6, mm7 ; 2 - sum rows
  617. paddw mm7, [Round1] ; 3 - add in Round
  618. paddw mm7, mm3 ; 3 - sum rows
  619. psraw mm6, 1 ; 2 - divide by 2
  620. pmullw mm6, 16[lpWt] ; 2 - multiply by weights
  621. psraw mm7, 1 ; 3 - divide by 2
  622. pmullw mm7, 24[lpWt] ; 3 - multiply by weights
  623. ret
  624. IAAhalf_half: ; half-pel in x and y: average each 2x2 group of pixels (5 rows read), adding 2 before the shift
  625. movq mm4, [lpSrc] ; 0 - fetch row
  626. movq mm5, [PITCH+lpSrc] ; 1 - fetch row
  627. movq mm1, mm4 ; 0 - copy row
  628. movq mm2, mm5 ; 1 - copy row
  629. psrlq mm4, 8 ; 0 - shift row
  630. movq mm6, [PITCH*2+lpSrc] ; 2 - fetch row
  631. punpcklbw mm4, zero ; 0 - unpack shifted row
  632. movq mm3, mm6 ; 2 - copy row
  633. punpcklbw mm1, zero ; 0 - unpack row
  634. paddw mm4, mm1 ; 0 - partial sum both copies of row
  635. psrlq mm5, 8 ; 1 - shift row
  636. paddw mm4, [Round2] ; 0 - add in Round
  637. punpcklbw mm5, zero ; 1 - unpack shifted row
  638. movq mm7, [PITCH*3+lpSrc] ; 3 - fetch row
  639. punpcklbw mm2, zero ; 1 - unpack row
  640. paddw mm5, mm2 ; 1 - partial sum both copies of row
  641. psrlq mm6, 8 ; 2 - shift row
  642. paddw mm4, mm5 ; 0 - add partial sums
  643. punpcklbw mm6, zero ; 2 - unpack shifted row
  644. paddw mm5, [Round2] ; 1 - add in Round
  645. punpcklbw mm3, zero ; 2 - unpack row
  646. paddw mm6, mm3 ; 2 - partial sum both copies of row
  647. movq mm1, mm7 ; 3 - copy row
  648. movq mm2, [PITCH*4+lpSrc] ; 4 - fetch row
  649. psraw mm4, 2 ; 0 - divide by 4
  650. paddw mm5, mm6 ; 1 - add partial sums
  651. psrlq mm7, 8 ; 3 - shift row
  652. paddw mm6, [Round2] ; 2 - add in Round
  653. punpcklbw mm7, zero ; 3 - unpack shifted row
  654. movq mm3, mm2 ; 4 - copy row
  655. punpcklbw mm1, zero ; 3 - unpack row
  656. paddw mm7, mm1 ; 3 - partial sum both copies of row
  657. psrlq mm2, 8 ; 4 - shift row
  658. pmullw mm4, 00[lpWt] ; 0 - multiply by weights
  659. punpcklbw mm2, zero ; 4 - unpack shifted row
  660. paddw mm6, mm7 ; 2 - add partial sums
  661. punpcklbw mm3, zero ; 4 - unpack row
  662. paddw mm7, [Round2] ; 3 - add in Round
  663. psraw mm5, 2 ; 1 - divide by 4
  664. pmullw mm5, 08[lpWt] ; 1 - multiply by weights
  665. paddw mm2, mm3 ; 4 - partial sum both copies of row
  666. paddw mm7, mm2 ; 3 - add partial sums
  667. psraw mm6, 2 ; 2 - divide by 4
  668. pmullw mm6, 16[lpWt] ; 2 - multiply by weights
  669. psraw mm7, 2 ; 3 - divide by 4
  670. pmullw mm7, 24[lpWt] ; 3 - multiply by weights
  671. ret
  672. MMXCODE1 ENDS ; closes the code segment that holds both routines
  673. ; 11111111112222222222333333333344444444445555555555666666666677777777778
  674. ;2345678901234567890123456789012345678901234567890123456789012345678901234567890
  675. END