Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

6016 lines
222 KiB

  1. ;////////////////////////////////////////////////////////////////////////////
  2. ;//
  3. ;// INTEL CORPORATION PROPRIETARY INFORMATION
  4. ;//
  5. ;// This software is supplied under the terms of a license
  6. ;// agreement or nondisclosure agreement with Intel Corporation
  7. ;// and may not be copied or disclosed except in accordance
  8. ;// with the terms of that agreement.
  9. ;//
  10. ;////////////////////////////////////////////////////////////////////////////
  11. ;//
  12. ;// $Header: S:\h26x\src\enc\exmme.asv 1.37 13 Dec 1996 17:19:38 MBODART $
  13. ;//
  14. ;// $Log: S:\h26x\src\enc\exmme.asv $
  15. ;//
  16. ;// Rev 1.37 13 Dec 1996 17:19:38 MBODART
  17. ;// Tuned the ME parameters for H.261.
  18. ;//
  19. ;// Rev 1.36 06 Nov 1996 16:18:24 BNICKERS
  20. ;// Improve performance.
  21. ;//
  22. ;// Rev 1.35 30 Oct 1996 17:30:36 BNICKERS
  23. ;// Fix UMV table for right edge macroblocks.
  24. ;//
  25. ;// Rev 1.34 30 Oct 1996 14:49:20 KLILLEVO
  26. ;// zero motion vectors for intra blocks in PB-frame mode.
  27. ;// This is necessary in the Extended Motion Vector mode
  28. ;//
  29. ;// Rev 1.33 18 Oct 1996 16:57:16 BNICKERS
  30. ;// Fixes for EMV
  31. ;//
  32. ;// Rev 1.32 15 Oct 1996 17:53:04 BNICKERS
  33. ;//
  34. ;// Fix major bug w.r.t. EMV ME.
  35. ;//
  36. ;// Rev 1.31 14 Oct 1996 13:10:14 BNICKERS
  37. ;//
  38. ;// Correct several problems wrt H261 ME.
  39. ;//
  40. ;// Rev 1.30 11 Oct 1996 16:53:12 KLILLEVO
  41. ;//
  42. ;// Fix threshold
  43. ;//
  44. ;// Rev 1.29 11 Oct 1996 16:52:18 KLILLEVO
  45. ;// Another EMV fix.
  46. ;//
  47. ;// Rev 1.28 11 Oct 1996 15:43:16 KLILLEVO
  48. ;// Really fix the handling of the top row of MBs for EMV ME.
  49. ;//
  50. ;// Rev 1.27 11 Oct 1996 15:24:38 BNICKERS
  51. ;// Special handling of top row of MBs for EMV ME.
  52. ;//
  53. ;// Rev 1.26 11 Oct 1996 14:47:42 KLILLEVO
  54. ;// Kill full pel MV for Intra blocks so that EMV of adjacent blocks will work.
  55. ;//
  56. ;// Rev 1.25 10 Oct 1996 16:42:56 BNICKERS
  57. ;// Initial debugging of Extended Motion Vectors.
  58. ;//
  59. ;// Rev 1.24 04 Oct 1996 08:48:02 BNICKERS
  60. ;// Add EMV.
  61. ;//
  62. ;// Rev 1.23 24 Sep 1996 10:42:24 BNICKERS
  63. ;// For H261, zero out motion vectors when classifying MB as intra.
  64. ;//
  65. ;// Rev 1.22 12 Sep 1996 10:56:24 BNICKERS
  66. ;// Add arguments for thresholds and differentials.
  67. ;//
  68. ;// Rev 1.21 22 Jul 1996 15:23:24 BNICKERS
  69. ;// Reduce code size. Implement H261 spatial filter.
  70. ;//
  71. ;// Rev 1.20 18 Jul 1996 16:54:26 KLILLEVO
  72. ;// changed emptythreshold to 40 instead of 128 to remove some blockiness
  73. ;// from the still frame mode on MMX
  74. ;//
  75. ;// Rev 1.19 26 Jun 1996 12:49:02 KLILLEVO
  76. ;// Fix minor booboo left in by Brian.
  77. ;//
  78. ;// Rev 1.18 26 Jun 1996 12:21:50 BNICKERS
  79. ;// Make heuristic ME work without unrestricted motion vectors.
  80. ;//
  81. ;// Rev 1.17 25 Jun 1996 14:24:58 BNICKERS
  82. ;// Implement heuristic motion estimation for MMX, AP mode.
  83. ;//
  84. ;// Rev 1.16 15 May 1996 16:57:14 BNICKERS
  85. ;// Fix SWD tabulation (again)! @#$%!%
  86. ;//
  87. ;// Rev 1.15 15 May 1996 16:53:24 BNICKERS
  88. ;//
  89. ;// Fix SWD tabulation.
  90. ;//
  91. ;// Rev 1.14 15 May 1996 11:33:28 BNICKERS
  92. ;// Bug fix for calc of total SWD.
  93. ;//
  94. ;// Rev 1.13 14 May 1996 12:18:58 BNICKERS
  95. ;// Initial debugging of MMx B-Frame ME.
  96. ;//
  97. ;// Rev 1.12 03 May 1996 14:03:50 BNICKERS
  98. ;//
  99. ;// Minor bug fixes and integration refinements.
  100. ;//
  101. ;// Rev 1.11 02 May 1996 12:00:32 BNICKERS
  102. ;// Initial integration of B Frame ME, MMX version.
  103. ;//
  104. ;// Rev 1.10 16 Apr 1996 16:40:14 BNICKERS
  105. ;// Fix some important but simple bugs. Start adding table inits for B frm ME.
  106. ;//
  107. ;// Rev 1.9 10 Apr 1996 13:13:44 BNICKERS
  108. ;// Recoding of Motion Estimation, Advanced Prediction.
  109. ;//
  110. ;// Rev 1.8 05 Apr 1996 12:28:10 BNICKERS
  111. ;// Improvements to baseline half pel ME.
  112. ;//
  113. ;// Rev 1.7 26 Mar 1996 12:00:22 BNICKERS
  114. ;// Did some tuning for MMx encode.
  115. ;//
  116. ;// Rev 1.6 20 Mar 1996 17:01:44 KLILLEVO
  117. ;// fixed bug in new quant code
  118. ;//
  119. ;// Rev 1.5 20 Mar 1996 15:26:40 KLILLEVO
  120. ;// changed quantization to match IA quantization
  121. ;//
  122. ;// Rev 1.3 15 Mar 1996 15:51:16 BECHOLS
  123. ;// Completed monolithic - Brian
  124. ;//
  125. ;// Rev 1.0 16 Feb 1996 17:12:12 BNICKERS
  126. ;// Initial revision.
  127. ;//
  128. ;////////////////////////////////////////////////////////////////////////////
  129. ;
  130. ; MMxMotionEstimation -- This function performs motion estimation for the
  131. ; macroblocks identified in the input list. This is
  132. ; the MMx version. Conditional assembly selects either
  133. ; the H263 or H261 version.
  134. ;
  135. ; Arguments: See ex5me.asm.
  136. ;
  137. ; Other assumptions: See ex5me.asm. Most of the read-only tables needed in
  138. ; ex5me.asm are not needed here.
  139. ;
  140. OPTION PROLOGUE:None
  141. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
  142. OPTION M510
  143. OPTION CASEMAP:NONE
  ; Motion-estimation tuning constants (thresholds and differentials).
  ; Assembling with H261 defined selects the first set; otherwise the second
  ; set is used (the file header describes the alternative as the H263 build).
  ; BLOCKMOTIONTHRESHOLD, BLOCKMVDIFFERENTIAL and EMPTYTHRESHOLD are identical
  ; in both sets; the other four symbols differ.
  ; NOTE(review): the code that consumes these symbols is not in this part of
  ; the file, so their exact units and comparisons are not documented here.
  144. IFDEF H261
  145. ZEROVECTORTHRESHOLD = 600
  146. NONZEROMVDIFFERENTIAL = 256
  147. BLOCKMOTIONTHRESHOLD = 1152
  148. BLOCKMVDIFFERENTIAL = 768
  149. EMPTYTHRESHOLD = 40
  150. INTERCODINGTHRESHOLD = 300
  151. INTRACODINGDIFFERENTIAL = 200
  152. ELSE
  153. ZEROVECTORTHRESHOLD = 450
  154. NONZEROMVDIFFERENTIAL = 375
  155. BLOCKMOTIONTHRESHOLD = 1152
  156. BLOCKMVDIFFERENTIAL = 768
  157. EMPTYTHRESHOLD = 40
  158. INTERCODINGTHRESHOLD = 1152
  159. INTRACODINGDIFFERENTIAL = 1000
  160. ENDIF
  161. include iammx.inc
  162. include e3inst.inc
  163. include e3mbad.inc
  164. .xlist
  165. include memmodel.inc
  166. .list
  167. include exEDTQ.inc
  168. MMXMEDATA SEGMENT PAGE
  169. ALIGN 16
  170. ; Storage for Target and Reference frames can interleave into 8K of the 16K
  171. ; cache. Pitch must be 384.
  172. ;
  173. ; C# -- Stands for row number "#" of target macroblock in *C*urrent P frame.
  174. ; B# -- Stands for row number "#" of target macroblock in current *B* frame.
  175. ; R# -- Stands for row number "#" of 0MV *R*ef macroblock in past frame.
  176. ; v -- Stands for a row below 0MV, reference macroblock.
  177. ; These same cache lines would hit reference lines >8 above the 0MV.
  178. ; ^ -- Stands for a row above 0MV, reference macroblock.
  179. ; These same cache lines would hit reference lines >8 below the 0MV.
  180. ; +-+-+
  181. ; | | -- A cache line (32 bytes). Position of letters,<, and > indicate
  182. ; +-+-+ which 16 bytes may be used in the cache line.
  183. ;
  184. ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  185. ; |C0 | | v| |Cb | | ^| |B6 | | R6| |
  186. ; |C1 | | v| |Cc | | ^| |B7 | | R7| |
  187. ; |C2 | | v| |Cd | | ^| |B8 | | R8| |
  188. ; |C3 | | v| |Ce | | ^| |B9 | | R9| |
  189. ; |C4 | | v| |Cf | | ^| |Ba | | Ra| |
  190. ; |C5 | | v| |B0 | | R0| |Bb | | Rb| |
  191. ; |C6 | | v| |B1 | | R1| |Bc | | Rc| |
  192. ; |C7 | | v| |B2 | | R2| |Bd | | Rd| |
  193. ; |C8 | | ^| |B3 | | R3| |Be | | Re| |
  194. ; |C9 | | ^| |B4 | | R4| |Bf | | Rf| |
  195. ; |Ca | | ^| |B5 | | R5| +-+-+-+-+-+-+-+-+
  196. ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  197. ;
  198. ; The static storage space used for read-only tables, and the stack usage
  199. ; are coordinated such that they mesh in the data cache, and use only one
  200. ; 4K way of the 4-way, 16K cache.
  ; The "(n bytes; first: last)" notes on each table below are running byte
  ; offsets from the first byte reserved here; they must be kept in step with
  ; the data if any table changes size.
  201. ;
  202. ; The first 32 bytes of the static storage space are unallocated, because
  203. ; the top of stack ranges in this area. As local procedure calls are made
  204. ; within this function, return addresses get pushed into these 32 bytes.
  205. ; (32 bytes; 0: 31)
  206. DB 32 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
  207. ;
  208. ; The next 608 bytes of the static storage space are unallocated, because
  209. ; the local stack frame is made to hit cache at these addresses. More of
  210. ; the local stack frame is allocated after a gap of 64 bytes.
  211. ; (608 bytes; 32: 639)
  ; LocalStorage names the first byte of this 608-byte place-holder (offset 32).
  ; DUP (?) reserves the bytes without giving them initial values.
  212. LocalStorage LABEL DWORD
  213. DB 608 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
  214. ; Motion Estimation State Engine adjustments to reference block address to get
  215. ; to next candidate reference block.
  216. ; (64 bytes; 640: 703)
  ;
  ; Encoding: the table holds sixteen DWORD bases, one per 16-value range of
  ; the one-byte adjustment codes (000H-00FH, 010H-01FH, ... 0F0H-0FFH).  Each
  ; EQU listed under a DD is a code inside that DD's range, and each DD has
  ; the range start (and a small bias) subtracted, so that
  ;       DD value + code  =  the address displacement spelled by the name.
  ; Example: VM4HM4 = 030H-4+8 = 034H, and (-4*PITCH-8-030H) + 034H
  ;          = -4*PITCH-4, i.e. four rows up and four bytes left.
  ; Names read as V/H = vertical/horizontal, M/P = minus/plus, followed by the
  ; distance (G stands for 16); NOADJ yields a displacement of 0.
  ; NOTE(review): the lookup itself is in code outside this view; presumably
  ; it is  FullPelMotionVectorAdjustment[code SHR 4] + code  -- confirm.
  217. FullPelMotionVectorAdjustment LABEL DWORD
  218. DD -16*PITCH-8
  219. VMG EQU 000H+0+8
  220. VMGHM8 EQU 000H-8+8
  221. DD -8*PITCH-8-010H
  222. VM8HM8 EQU 010H
  223. DD -8*PITCH-020H
  224. VM8 EQU 020H
  225. VM8HP8 EQU 020H+8
  226. DD -4*PITCH-8-030H
  227. VM4HM8 EQU 030H-8+8
  228. VM4HM4 EQU 030H-4+8
  229. VM4 EQU 030H+0+8
  230. VM4HP4 EQU 030H+4+8
  231. DD -4*PITCH+8-040H
  232. VM4HP8 EQU 040H+8-8
  233. VM4HPG EQU 040H+16-8
  234. DD -2*PITCH-4-050H
  235. VM2HM4 EQU 050H-4+4
  236. VM2HM2 EQU 050H-2+4
  237. VM2HM1 EQU 050H-1+4
  238. VM2 EQU 050H+0+4
  239. VM2HP1 EQU 050H+1+4
  240. VM2HP2 EQU 050H+2+4
  241. VM2HP4 EQU 050H+4+4
  242. VM2HP8 EQU 050H+8+4
  243. DD -1*PITCH-2-060H
  244. VM1HM2 EQU 060H-2+2
  245. VM1HM1 EQU 060H-1+2
  246. VM1 EQU 060H+0+2
  247. VM1HP1 EQU 060H+1+2
  248. VM1HP2 EQU 060H+2+2
  249. VM1HP4 EQU 060H+4+2
  250. DD -16-070H
  251. HMG EQU 070H-16+16
  252. HM8 EQU 070H-8+16
  253. HM4 EQU 070H-4+16
  254. HM3 EQU 070H-3+16
  255. HM2 EQU 070H-2+16
  256. HM1 EQU 070H-1+16
  257. DD -080H
  258. NOADJ EQU 080H
  259. HP1 EQU 080H+1
  260. HP2 EQU 080H+2
  261. HP4 EQU 080H+4
  262. HP8 EQU 080H+8
  263. DD 1*PITCH-2-090H
  264. VP1HM2 EQU 090H-2+2
  265. VP1HM1 EQU 090H-1+2
  266. VP1 EQU 090H+0+2
  267. VP1HP1 EQU 090H+1+2
  268. VP1HP2 EQU 090H+2+2
  269. VP1HP4 EQU 090H+4+2
  270. DD 2*PITCH-4-0A0H
  271. VP2HM4 EQU 0A0H-4+4
  272. VP2HM2 EQU 0A0H-2+4
  273. VP2HM1 EQU 0A0H-1+4
  274. VP2 EQU 0A0H+0+4
  275. VP2HP1 EQU 0A0H+1+4
  276. VP2HP2 EQU 0A0H+2+4
  277. VP2HP4 EQU 0A0H+4+4
  278. VP2HP8 EQU 0A0H+8+4
  279. DD 4*PITCH-8-0B0H
  280. VP4HM8 EQU 0B0H-8+8
  281. VP4HM4 EQU 0B0H-4+8
  282. VP4HM2 EQU 0B0H-2+8
  283. VP4 EQU 0B0H+0+8
  284. VP4HP2 EQU 0B0H+2+8
  285. VP4HP4 EQU 0B0H+4+8
  286. DD 4*PITCH+8-0C0H
  287. VP4HP8 EQU 0C0H+8-8
  288. VP4HPG EQU 0C0H+16-8
  289. DD 8*PITCH-8-0D0H
  290. VP8HM8 EQU 0D0H-8+8
  291. VP8HM4 EQU 0D0H-4+8
  292. DD 8*PITCH-0E0H
  293. VP8 EQU 0E0H+0
  294. VP8HP4 EQU 0E0H+4
  295. VP8HP8 EQU 0E0H+8
  296. DD 16*PITCH-0F0H
  297. VPG EQU 0F0H+0
  298. VPGHP8 EQU 0F0H+8
  299. ; Additional space reserved for stack variables. If more space is needed,
  300. ; it should go here.
  301. ; (160 bytes; 704: 863)
  ; Uninitialized filler only: it keeps the constants that follow at the byte
  ; offsets quoted in their comments (864 onward).
  302. DB 160 DUP (?) ; Static space place-holder. Stack frame hits these addrs.
  303. ; QWORD Constants used by motion estimation, frame differencing, and FDCT.
  304. ; (144 bytes; 864:1007)
  ; Eighteen 8-byte constants.  For the C<16 hex digits> names, the name spells
  ; the 64-bit value most-significant digit first, while the DD pair lists the
  ; low DWORD first (little-endian storage), e.g. C0200010101010101 is stored
  ; as 001010101H followed by 002000101H.
  ; C1..C7 repeat one 16-bit value in all four words; the values equal
  ; cos(k*pi/16) * 32768 for k = 1..7 (e.g. C4 = 05A82H = 23170), which is
  ; presumably the FDCT coefficient set mentioned above -- confirm at the
  ; point of use.
  305. C0101010101010101 DD 001010101H, 001010101H
  306. CFFFF0000FFFF0000 DD 0FFFF0000H, 0FFFF0000H
  307. C0200010101010101 DD 001010101H, 002000101H
  308. C0001000200020001 DD 000020001H, 000010002H
  309. CFFFF00000000FFFF DD 00000FFFFH, 0FFFF0000H
  310. C0000FFFFFFFF0000 DD 0FFFF0000H, 00000FFFFH
  311. CFF000000000000FF DD 0000000FFH, 0FF000000H
  312. C0101010101010002 DD 001010002H, 001010101H
  313. C0100010001000100 DD 001000100H, 001000100H
  314. C0001000100010001 DD 000010001H, 000010001H
  315. C7F7F7F7F7F7F7F7F DD 07F7F7F7FH, 07F7F7F7FH
  316. C1 DD 07D8A7D8AH, 07D8A7D8AH
  317. C2 DD 076417641H, 076417641H
  318. C3 DD 06A6D6A6DH, 06A6D6A6DH
  319. C4 DD 05A825A82H, 05A825A82H
  320. C5 DD 0471D471DH, 0471D471DH
  321. C6 DD 030FC30FCH, 030FC30FCH
  322. C7 DD 018F818F8H, 018F818F8H
  323. ; Distances to Block Action Descriptors for blocks that provide remote vectors
  324. ; for OBMC. Which element accessed depends on edge condition. Top edge is
  325. ; stack based variable, since different instances may have different distances
  326. ; to BAD of block above. Bottom edge is always a constant, regardless of
  327. ; edge condition. This is used in OBMC frame differencing.
  328. ; (16 bytes; 1008:1023)
  ; Each label is a two-DWORD array.  Element 0 is a distance of 0; element 1
  ; is the size of one macroblock descriptor less the size of one T_Blk,
  ; negative for BlockToLeft and positive for BlockToRight.
  ; NOTE(review): which edge condition selects element 0 is decided by code
  ; outside this view.
  329. BlockToLeft DD 0, -SIZEOF T_MacroBlockActionDescr+SIZEOF T_Blk
  330. BlockToRight DD 0, SIZEOF T_MacroBlockActionDescr-SIZEOF T_Blk
  331. ; Table to map linearized motion vector to vertical part, used by motion
  332. ; estimation. (Shift linearized motion vector right by 8 bits, and then
  333. ; use result as index into this array to get vertical MV.)
  334. ; (96 bytes; 1024:1119)
  ;
  ; The label UnlinearizedVertMV sits on the 50th of the 96 bytes, so indexes
  ; -49..-1 reach the entries stored before the label and 0..46 the entries
  ; from the label onward.  All entries are even and rise by 4 every three
  ; bytes.  With PITCH = 384 = 1.5 * 256, one row of displacement moves the
  ; shifted index by 1.5, so the entry reached for a displacement of r rows
  ; is 2*r (presumably a half-pel count -- confirm against the consumer).
  ; The IF below deliberately breaks the assembly for any other PITCH.
  335. IF PITCH-384
  336. *** error: The magic of this table assumes a pitch of 384.
  337. ENDIF
  338. DB -64, -64
  339. DB -62
  340. DB -60, -60
  341. DB -58
  342. DB -56, -56
  343. DB -54
  344. DB -52, -52
  345. DB -50
  346. DB -48, -48
  347. DB -46
  348. DB -44, -44
  349. DB -42
  350. DB -40, -40
  351. DB -38
  352. DB -36, -36
  353. DB -34
  354. DB -32, -32
  355. DB -30
  356. DB -28, -28
  357. DB -26
  358. DB -24, -24
  359. DB -22
  360. DB -20, -20
  361. DB -18
  362. DB -16, -16
  363. DB -14
  364. DB -12, -12
  365. DB -10
  366. DB -8, -8
  367. DB -6
  368. DB -4, -4
  369. DB -2
  370. DB 0
  371. UnlinearizedVertMV DB 0
  372. DB 2
  373. DB 4, 4
  374. DB 6
  375. DB 8, 8
  376. DB 10
  377. DB 12, 12
  378. DB 14
  379. DB 16, 16
  380. DB 18
  381. DB 20, 20
  382. DB 22
  383. DB 24, 24
  384. DB 26
  385. DB 28, 28
  386. DB 30
  387. DB 32, 32
  388. DB 34
  389. DB 36, 36
  390. DB 38
  391. DB 40, 40
  392. DB 42
  393. DB 44, 44
  394. DB 46
  395. DB 48, 48
  396. DB 50
  397. DB 52, 52
  398. DB 54
  399. DB 56, 56
  400. DB 58
  401. DB 60, 60
  402. DB 62
  403. ; Table to provide index value in low byte, and rounding term of 1 in all bytes.
  404. ; Used in frame differencing, when half pel horizontal interpolation is needed.
  405. ; (1024 bytes; 1120:2143)
  ; The REPEAT emits 128 eight-byte entries, one per CNT = 0..127.  In memory
  ; order entry CNT is the bytes CNT+1, 1, 1, 1, 1, 1, 1, 1: the low byte holds
  ; the index added to its own rounding 1.  CNT never exceeds 127, so CNT+1
  ; fits in the low byte and does not carry into the next byte.
  406. Pel_Rnd LABEL DWORD
  407. CNT = 0
  408. REPEAT 128
  409. DD CNT+001010101H, 001010101H
  410. CNT = CNT + 1
  411. ENDM
  412. ; Motion Estimation State Engine Rules.
  413. ; (896 bytes;2144:3039)
  ;
  ; Size check: 10 + 10 starting-state bytes + 2 skipped bytes = 22, then one
  ; 4-byte rule for each of states 10..226 (217 * 4 = 868), then 6 trailing
  ; pad bytes: 22 + 868 + 6 = 896.
  ; StateEngine is StateEngineFirstRule biased by -18, which makes
  ; StateEngine + 4*N the address of rule N (rule 10, the first 4-byte rule,
  ; starts at StateEngineFirstRule + 22).
  ; NOTE(review): presumably the engine indexes rules exactly that way; the
  ; code doing so is outside this view.
  414. StateEngineFirstRule LABEL BYTE ; Rules that govern state engine of estimator.
  415. StateEngine EQU StateEngineFirstRule-20+2
  416. ; Starting States:
  417. IF PITCH-384
  418. *** error: The magic of this table assumes a pitch of 384.
  419. ENDIF
  ; Two 10-byte tables indexed by starting state 0..9 (macroblock position
  ; class).  The second table gives the number of the first rule to run for
  ; that class: 34/42/50/58 are the corner rule chains, 66/82 the upper/lower
  ; edge chains, 98/114 the left/right edge chains and 16 the interior chain.
  ; NOTE(review): the meaning of the 3/0 values in the first table is not
  ; visible from this part of the file.
  420. DB ? ; 0: not used.
  421. DB 3 ; 1: Upper left corner.
  422. DB 3 ; 2: Upper edge.
  423. DB 3 ; 3: Upper right corner.
  424. DB 3 ; 4: Left edge.
  425. DB 3 ; 5: Interior MB, not doing block search.
  426. DB 0 ; 6: Right edge.
  427. DB 0 ; 7: Lower left corner.
  428. DB 0 ; 8: Lower edge.
  429. DB 0 ; 9: Lower right corner.
  430. DB ? ; 0: not used.
  431. DB 34 ; 1: Upper left corner.
  432. DB 66 ; 2: Upper edge.
  433. DB 42 ; 3: Upper right corner.
  434. DB 98 ; 4: Left edge.
  435. DB 16 ; 5: Interior MB, not doing block search.
  436. DB 114 ; 6: Right edge.
  437. DB 50 ; 7: Lower left corner.
  438. DB 82 ; 8: Lower edge.
  439. DB 58 ; 9: Lower right corner.
  440. DB ?,? ; Skip 2 bytes.
  441. LASTINITIALMESTATE EQU 9
  442. ; Interior Telescoping States:
  443. ; Try +/- 8,4,2,1, vertically first, then horizontally.
  ;
  ; Rule layout (4 bytes): two adjustment codes followed by two next-state
  ; numbers.  Tracing the chain shows the first code/state of each pair
  ; belongs to the case where the candidate just tried was WORSE than the best
  ; so far (the code steps back to the best and on to the next candidate) and
  ; the second to the case where it was BETTER.  E.g. rule 17 is entered at
  ; V-8: VPGHP8 returns to V+8 and moves to H+8, HP8 keeps V-8 and moves to
  ; H+8.  The "better/worse" wording in these comments therefore names the two
  ; cases in the opposite order to the bytes.
  ; NOTE(review): derived from the table alone; confirm against the engine.
  ; Next-state value 0FFH appears only in the rules marked "Done".
  444. FIRSTBLOCKMESTATE EQU 10
  445. DB VM2, VM2, 12, 11 ; 10: V+1 better/worse than central. Try V-1.
  446. DB VP2HP1, HP1, 13, 13 ; 11: Accept V+1/V-1 as best. Try H+1.
  447. DB VP1HP1, HP1, 13, 13 ; 12: Accept central/V-1 as best. Try H+1.
  448. DB HM2, HM2, 15, 14 ; 13: H+1 better/worse than central. Try H-1.
  449. DB HP2, NOADJ, 0FFH, 0FFH ; 14: Accept H+1/H-1 as best. Done.
  450. DB HP1, NOADJ, 0FFH, 0FFH ; 15: Accept central/H-1 as best. Done.
  451. DB VMG, VMG, 18, 17 ; 16: V+8 better/worse than central. Try V-8.
  452. DB VPGHP8, HP8, 19, 19 ; 17: Accept V+8/V-8 as best. Try H+8.
  453. DB VP8HP8, HP8, 19, 19 ; 18: Accept central/V-8 as best. Try H+8.
  454. DB HMG, HMG, 21, 20 ; 19: H+8 better/worse than central. Try H-8.
  455. DB VP4HPG, VP4, 22, 22 ; 20: Accept H+8/H-8 as best. Try V+4.
  456. DB VP4HP8, VP4, 22, 22 ; 21: Accept central/H-8 as best. Try V+4.
  457. DB VM8, VM8, 24, 23 ; 22: V+4 better/worse than central. Try V-4.
  458. DB VP8HP4, HP4, 25, 25 ; 23: Accept V+4/V-4 as best. Try H+4.
  459. DB VP4HP4, HP4, 25, 25 ; 24: Accept central/V-4 as best. Try H+4.
  460. DB HM8, HM8, 27, 26 ; 25: H+4 better/worse than central. Try H-4.
  461. DB VP2HP8, VP2, 28, 28 ; 26: Accept H+4/H-4 as best. Try V+2.
  462. DB VP2HP4, VP2, 28, 28 ; 27: Accept central/H-4 as best. Try V+2.
  463. DB VM4, VM4, 30, 29 ; 28: V+2 better/worse than central. Try V-2.
  464. DB VP4HP2, HP2, 31, 31 ; 29: Accept V+2/V-2 as best. Try H+2.
  465. DB VP2HP2, HP2, 31, 31 ; 30: Accept central/V-2 as best. Try H+2.
  466. DB HM4, HM4, 33, 32 ; 31: H+2 better/worse than central. Try H-2.
  467. DB VP1HP4, VP1, 10, 10 ; 32: Accept H+2/H-2 as best. Try V+1.
  468. DB VP1HP2, VP1, 10, 10 ; 33: Accept central/H-2 as best. Try V+1.
  469. ; Boundary States:
  ; Corner chains (rules 34..65).  Each rule's first next-state stays in the
  ; same corner chain; the second next-state leaves it for an edge chain
  ; (70/74/78 upper edge, 86/90/94 lower edge, 101..113 left edge, 117..129
  ; right edge), which matches the search having moved off one of the two
  ; frame edges that meet at the corner.
  ; NOTE(review): the final rule of each chain carries next-state values in
  ; the range 0F5H..0FBH instead of 0FFH; what those values encode is decided
  ; by code outside this view.
  470. ; Upper left corner:
  471. DB VM8HP8, HP8, 35, 101 ; 34: Accept corner/V+8. Try H+8.
  472. DB VP4HM8, VP4, 36, 70 ; 35: Accept corner/H+8. Try V+4.
  473. DB VM4HP4, HP4, 37, 105 ; 36: Accept corner/V+4. Try H+4.
  474. DB VP2HM4, VP2, 38, 74 ; 37: Accept corner/H+4. Try V+2.
  475. DB VM2HP2, HP2, 39, 109 ; 38: Accept corner/V+2. Try H+2.
  476. DB VP1HM2, VP1, 40, 78 ; 39: Accept corner/H+2. Try V+1.
  477. DB VM1HP1, HP1, 41, 113 ; 40: Accept corner/V+1. Try H+1.
  478. DB HM1, NOADJ, 0F5H, 0F7H ; 41: Accept corner/H+1. Done.
  479. ; Upper right corner:
  480. DB VM8HM8, HM8, 43, 117 ; 42: Accept corner/V+8. Try H-8.
  481. DB VP4HP8, VP4, 44, 70 ; 43: Accept corner/H-8. Try V+4.
  482. DB VM4HM4, HM4, 45, 121 ; 44: Accept corner/V+4. Try H-4.
  483. DB VP2HP4, VP2, 46, 74 ; 45: Accept corner/H-4. Try V+2.
  484. DB VM2HM2, HM2, 47, 125 ; 46: Accept corner/V+2. Try H-2.
  485. DB VP1HP2, VP1, 48, 78 ; 47: Accept corner/H-2. Try V+1.
  486. DB VM1HM1, HM1, 49, 129 ; 48: Accept corner/V+1. Try H-1.
  487. DB HP1, NOADJ, 0F6H, 0F7H ; 49: Accept corner/H-1. Done
  488. ; Lower left corner:
  489. DB VP8HP8, HP8, 51, 101 ; 50: Accept corner/V-8. Try H+8.
  490. DB VM4HM8, VM4, 52, 86 ; 51: Accept corner/H+8. Try V-4.
  491. DB VP4HP4, HP4, 53, 105 ; 52: Accept corner/V-4. Try H+4.
  492. DB VM2HM4, VM2, 54, 90 ; 53: Accept corner/H+4. Try V-2.
  493. DB VP2HP2, HP2, 55, 109 ; 54: Accept corner/V-2. Try H+2.
  494. DB VM1HM2, VM1, 56, 94 ; 55: Accept corner/H+2. Try V-1.
  495. DB VP1HP1, HP1, 57, 113 ; 56: Accept corner/V-1. Try H+1.
  496. DB HM1, NOADJ, 0F9H, 0FBH ; 57: Accept corner/H+1. Done.
  497. ; Lower right corner:
  498. DB VP8HM8, HM8, 59, 117 ; 58: Accept corner/V-8. Try H-8.
  499. DB VM4HP8, VM4, 60, 86 ; 59: Accept corner/H-8. Try V-4.
  500. DB VP4HM4, HM4, 61, 121 ; 60: Accept corner/V-4. Try H-4.
  501. DB VM2HP4, VM2, 62, 90 ; 61: Accept corner/H-4. Try V-2.
  502. DB VP2HM2, HM2, 63, 125 ; 62: Accept corner/V-2. Try H-2.
  503. DB VM1HP2, VM1, 64, 94 ; 63: Accept corner/H-2. Try V-1.
  504. DB VP1HM1, HM1, 65, 129 ; 64: Accept corner/V-1. Try H-1.
  505. DB HP1, NOADJ, 0FAH, 0FBH ; 65: Accept corner/H-1. Done.
  ; Edge chains (rules 66..129).  Along the edge's own axis both directions
  ; are probed; across it only the inward direction is tried.  When the
  ; inward probe wins, the second next-state drops into the interior chain
  ; (rules 19/25/31/13 or 22/28/10); otherwise the first next-state keeps the
  ; search on the edge chain.
  ; NOTE(review): the "Done" rules here end with 0F7H/0FBH/0FDH/0FEH rather
  ; than 0FFH; what those values encode is decided by code outside this view.
  506. ; Upper edge:
  507. DB VM8HP8, HP8, 67, 19 ; 66: Accept central/V+8 as best. Try H+8.
  508. DB HMG, HMG, 69, 68 ; 67: H+8 worse/better than central. Try H-8.
  509. DB VP4HPG, VP4, 70, 70 ; 68: Accept H+8/H-8 as best. Try V+4.
  510. DB VP4HP8, VP4, 70, 70 ; 69: Accept central/H-8 as best. Try V+4.
  511. DB VM4HP4, HP4, 71, 25 ; 70: Accept central/V+4 as best. Try H+4.
  512. DB HM8, HM8, 73, 72 ; 71: H+4 worse/better than central. Try H-4.
  513. DB VP2HP8, VP2, 74, 74 ; 72: Accept H+4/H-4 as best. Try V+2.
  514. DB VP2HP4, VP2, 74, 74 ; 73: Accept central/H-4 as best. Try V+2.
  515. DB VM2HP2, HP2, 75, 31 ; 74: Accept central/V+2 as best. Try H+2.
  516. DB HM4, HM4, 77, 76 ; 75: H+2 worse/better than central. Try H-2.
  517. DB VP1HP4, VP1, 78, 78 ; 76: Accept H+2/H-2 as best. Try V+1.
  518. DB VP1HP2, VP1, 78, 78 ; 77: Accept central/H-2 as best. Try V+1.
  519. DB VM1HP1, HP1, 79, 13 ; 78: Accept central/V+1 as best. Try H+1.
  520. DB HM2, HM2, 81, 80 ; 79: H+1 worse/better than central. Try H-1.
  521. DB HP2, NOADJ, 0F7H, 0F7H ; 80: Accept H+1/H-1 as best. Done.
  522. DB HP1, NOADJ, 0F7H, 0F7H ; 81: Accept central/H-1 as best. Done.
  523. ; Lower edge:
  524. DB VP8HP8, HP8, 83, 19 ; 82: Accept central/V-8 as best. Try H+8.
  525. DB HMG, HMG, 85, 84 ; 83: H+8 worse/better than central. Try H-8.
  526. DB VM4HPG, VM4, 86, 86 ; 84: Accept H+8/H-8 as best. Try V-4.
  527. DB VM4HP8, VM4, 86, 86 ; 85: Accept central/H-8 as best. Try V-4.
  528. DB VP4HP4, HP4, 87, 25 ; 86: Accept central/V-4 as best. Try H+4.
  529. DB HM8, HM8, 89, 88 ; 87: H+4 worse/better than central. Try H-4.
  530. DB VM2HP8, VM2, 90, 90 ; 88: Accept H+4/H-4 as best. Try V-2.
  531. DB VM2HP4, VM2, 90, 90 ; 89: Accept central/H-4 as best. Try V-2.
  532. DB VP2HP2, HP2, 91, 31 ; 90: Accept central/V-2 as best. Try H+2.
  533. DB HM4, HM4, 93, 92 ; 91: H+2 worse/better than central. Try H-2.
  534. DB VM1HP4, VM1, 94, 94 ; 92: Accept H+2/H-2 as best. Try V-1.
  535. DB VM1HP2, VM1, 94, 94 ; 93: Accept central/H-2 as best. Try V-1.
  536. DB VP1HP1, HP1, 95, 13 ; 94: Accept central/V-1 as best. Try H+1.
  537. DB HM2, HM2, 97, 96 ; 95: H+1 worse/better than central. Try H-1.
  538. DB HP2, NOADJ, 0FBH, 0FBH ; 96: Accept H+1/H-1 as best. Done.
  539. DB HP1, NOADJ, 0FBH, 0FBH ; 97: Accept central/H-1 as best. Done.
  540. ; Left edge:
  541. DB VMG, VMG, 100, 99 ; 98: V+8 worse/better than central. Try V-8.
  542. DB VPGHP8, HP8, 101, 101 ; 99: Accept V+8/V-8 as best. Try H+8.
  543. DB VP8HP8, HP8, 101, 101 ; 100: Accept central/V-8 as best. Try H+8.
  544. DB VP4HM8, VP4, 102, 22 ; 101: Accept central/H+8 as best. Try V+4.
  545. DB VM8, VM8, 104, 103 ; 102: V+4 worse/better than central. Try V-4.
  546. DB VP8HP4, HP4, 105, 105 ; 103: Accept V+4/V-4 as best. Try H+4.
  547. DB VP4HP4, HP4, 105, 105 ; 104: Accept central/V-4 as best. Try H+4.
  548. DB VP2HM4, VP2, 106, 28 ; 105: Accept central/H+4 as best. Try V+2.
  549. DB VM4, VM4, 108, 107 ; 106: V+2 worse/better than central. Try V-2.
  550. DB VP4HP2, HP2, 109, 109 ; 107: Accept V+2/V-2 as best. Try H+2.
  551. DB VP2HP2, HP2, 109, 109 ; 108: Accept central/V-2 as best. Try H+2.
  552. DB VP1HM2, VP1, 110, 10 ; 109: Accept central/H+2 as best. Try V+1.
  553. DB VM2, VM2, 112, 111 ; 110: V+1 worse/better than central. Try V-1.
  554. DB VP2HP1, HP1, 113, 113 ; 111: Accept V+1/V-1 as best. Try H+1.
  555. DB VP1HP1, HP1, 113, 113 ; 112: Accept central/V-1 as best. Try H+1.
  556. DB HM1, NOADJ, 0FDH, 0FDH ; 113: Accept central/H+1 as best. Done.
  557. ; Right edge:
  558. DB VPG, VPG, 116, 115 ; 114: V-8 worse/better than central. Try V+8.
  559. DB VMGHM8, HM8, 117, 117 ; 115: Accept V-8/V+8 as best. Try H-8.
  560. DB VM8HM8, HM8, 117, 117 ; 116: Accept central/V+8 as best. Try H-8.
  561. DB VP4HP8, VP4, 118, 22 ; 117: Accept central/H-8 as best. Try V+4.
  562. DB VM8, VM8, 120, 119 ; 118: V+4 worse/better than central. Try V-4.
  563. DB VP8HM4, HM4, 121, 121 ; 119: Accept V+4/V-4 as best. Try H-4.
  564. DB VP4HM4, HM4, 121, 121 ; 120: Accept central/V-4 as best. Try H-4.
  565. DB VP2HP4, VP2, 122, 28 ; 121: Accept central/H-4 as best. Try V+2.
  566. DB VM4, VM4, 124, 123 ; 122: V+2 worse/better than central. Try V-2.
  567. DB VP4HM2, HM2, 125, 125 ; 123: Accept V+2/V-2 as best. Try H-2.
  568. DB VP2HM2, HM2, 125, 125 ; 124: Accept central/V-2 as best. Try H-2.
  569. DB VP1HP2, VP1, 126, 10 ; 125: Accept central/H-2 as best. Try V+1.
  570. DB VM2, VM2, 128, 127 ; 126: V+1 worse/better than central. Try V-1.
  571. DB VP2HM1, HM1, 129, 129 ; 127: Accept V+1/V-1 as best. Try H-1.
  572. DB VP1HM1, HM1, 129, 129 ; 128: Accept central/V-1 as best. Try H-1.
  573. DB HP1, NOADJ, 0FEH, 0FEH ; 129: Accept central/H-1 as best. Done.
  574. ; Exhaustive search, radius 1 here, reaching out to radius 2 further below.
  575. ; . . . . .
  576. ; . 2 5 3 . C = center.
  577. ; . 7 C 8 .
  578. ; . 4 6 1 . # = order to try additional candidates.
  579. ; . . . . .
  ;
  ; Rule layout (4 bytes): adjustment code if the candidate just tried was
  ; worse than the best so far, adjustment code if it was better, next state
  ; if worse, next state if better.  Rules 130..136 keep C as best; each later
  ; run (138.., 145.., 151.., 156.., 160.., 163.., 165) keeps one of #1..#7 as
  ; best.  After #8 has been tried (rules 137, 144, 150, 155, 159, 162, 164,
  ; 165) the search either quits at C or hands off to the radius-2 group that
  ; surrounds the winner:
  ;   #1 -> 199   #2 -> 208   #3 -> 217   #4 -> 190
  ;   #5 -> 184   #6 -> 178   #7 -> 172   #8 -> 166
  ; NOTE(review): layout and hand-off map are derived from the table itself;
  ; the engine code is outside this view.
  580. FIRST_HEURISTIC_EXHAUSTIVE = 130
  581. DB VM2HM2, VM2HM2, 131, 138 ; 130: #1 worse/better than C. Try #2.
  582. DB HP2, HP2, 132, 145 ; 131: #2 worse/better than C. Try #3.
  583. DB VP2HM2, VP2HM2, 133, 151 ; 132: #3 worse/better than C. Try #4.
  584. DB VM2HP1, VM2HP1, 134, 156 ; 133: #4 worse/better than C. Try #5.
  585. DB VP2, VP2, 135, 160 ; 134: #5 worse/better than C. Try #6.
  586. DB VM1HM1, VM1HM1, 136, 163 ; 135: #6 worse/better than C. Try #7.
  587. DB HP2, HP2, 137, 165 ; 136: #7 worse/better than C. Try #8.
  588. DB HM1, HP1, 0FFH, 166 ; 137: If C best, quit. If 8 best, keep going.
  589. DB HP2, HP2, 139, 145 ; 138: #2 worse/better than #1. Try #3.
  590. DB VP2HM2, VP2HM2, 140, 151 ; 139: #3 worse/better than #1. Try #4.
  591. DB VM2HP1, VM2HP1, 141, 156 ; 140: #4 worse/better than #1. Try #5.
  592. DB VP2, VP2, 142, 160 ; 141: #5 worse/better than #1. Try #6.
  593. DB VM1HM1, VM1HM1, 143, 163 ; 142: #6 worse/better than #1. Try #7.
  594. DB HP2, HP2, 144, 165 ; 143: #7 worse/better than #1. Try #8.
  595. DB HP1, HP1, 199, 166 ; 144: #8 worse/better than #1. Take best, go on.
  596. DB VP2HM2, VP2HM2, 146, 151 ; 145: #3 worse/better than #2. Try #4.
  597. DB VM2HP1, VM2HP1, 147, 156 ; 146: #4 worse/better than #2. Try #5.
  598. DB VP2, VP2, 148, 160 ; 147: #5 worse/better than #2. Try #6.
  599. DB VM1HM1, VM1HM1, 149, 163 ; 148: #6 worse/better than #2. Try #7.
  600. DB HP2, HP2, 150, 165 ; 149: #7 worse/better than #2. Try #8.
  601. DB HM3, HP1, 208, 166 ; 150: #8 worse/better than #2. Take best, go on.
  602. DB VM2HP1, VM2HP1, 152, 156 ; 151: #4 worse/better than #3. Try #5.
  603. DB VP2, VP2, 153, 160 ; 152: #5 worse/better than #3. Try #6.
  604. DB VM1HM1, VM1HM1, 154, 163 ; 153: #6 worse/better than #3. Try #7.
  605. DB HP2, HP2, 155, 165 ; 154: #7 worse/better than #3. Try #8.
  606. DB HP1, HP1, 217, 166 ; 155: #8 worse/better than #3. Take best, go on.
  607. DB VP2, VP2, 157, 160 ; 156: #5 worse/better than #4. Try #6.
  608. DB VM1HM1, VM1HM1, 158, 163 ; 157: #6 worse/better than #4. Try #7.
  609. DB HP2, HP2, 159, 165 ; 158: #7 worse/better than #4. Try #8.
  610. DB HM3, HP1, 190, 166 ; 159: #8 worse/better than #4. Take best, go on.
  611. DB VM1HM1, VM1HM1, 161, 163 ; 160: #6 worse/better than #5. Try #7.
  612. DB HP2, HP2, 162, 165 ; 161: #7 worse/better than #5. Try #8.
  613. DB VM2HM1, HP1, 184, 166 ; 162: #8 worse/better than #5. Take best, go on.
  614. DB HP2, HP2, 164, 165 ; 163: #7 worse/better than #6. Try #8.
  ; Rule 164: when #6 (directly below C) stays best, VP2HM1 moves to the
  ; position two rows below C and the search continues at rule 178, the first
  ; rule of the group that probes the three positions below that winner.
  ; The table previously named rule 176 here -- a terminal rule of the
  ; left-hand group -- which ended the search after that single probe, unlike
  ; every other hand-off in this block.
  615. DB VP2HM1, HP1, 178, 166 ; 164: #8 worse/better than #6. Take best, go on.
  616. DB HM3, HP1, 172, 166 ; 165: #8 worse/better than #7. Take best, go on.
  ; Radius-2 follow-up groups (rules 166..225).  Each diagram shows the best
  ; radius-1 candidate X and the numbered radius-2 positions next to it that
  ; the group tries.  In every rule the first adjustment code / next state is
  ; the one whose move returns toward the best so far (candidate just tried
  ; was worse) and the second keeps the candidate just tried; the
  ; "better/worse" wording in these comments names the two cases in the
  ; opposite order to the bytes.  The diagonal groups (190, 199, 208, 217)
  ; reuse rules of the 178 and 184 groups for their last probes.
  ; NOTE(review): derived from the table alone; confirm against the engine.
  617. ; . . . . . C = center.
  618. ; . ~ ~ ~ 2 ~ = tried, but not as good.
  619. ; . ~ C X 1 X = best so far.
  620. ; . ~ ~ ~ 3 # = order to try additional candidates.
  621. ; . . . . .
  622. DB VM1, VM1, 167, 169 ; 166: #1 better/worse than X. Try #2.
  623. DB VP2, VP2, 168, 171 ; 167: #2 better/worse than X. Try #3.
  624. DB VM1HM1, NOADJ, 0FFH,0FFH ; 168: #3 better/worse than X. Take best, quit.
  625. DB VP2, VP2, 170, 171 ; 169: #2 better/worse than #1. Try #3.
  626. DB VM1, NOADJ, 0FFH,0FFH ; 170: #3 better/worse than #1. Take best, quit.
  627. DB VM2, NOADJ, 0FFH,0FFH ; 171: #3 better/worse than #2. Take best, quit.
  628. ; . . . . . C = center.
  629. ; 2 ~ ~ ~ . ~ = tried, but not as good.
  630. ; 1 X C ~ . X = best so far.
  631. ; 3 ~ ~ ~ . # = order to try additional candidates.
  632. ; . . . . .
  633. DB VM1, VM1, 173, 175 ; 172: #1 better/worse than X. Try #2.
  634. DB VP2, VP2, 174, 177 ; 173: #2 better/worse than X. Try #3.
  635. DB VM1HP1, NOADJ, 0FFH,0FFH ; 174: #3 better/worse than X. Take best, quit.
  636. DB VP2, VP2, 176, 177 ; 175: #2 better/worse than #1. Try #3.
  637. DB VM1, NOADJ, 0FFH,0FFH ; 176: #3 better/worse than #1. Take best, quit.
  638. DB VM2, NOADJ, 0FFH,0FFH ; 177: #3 better/worse than #2. Take best, quit.
  639. ; . . . . . C = center.
  640. ; . ~ ~ ~ . ~ = tried, but not as good.
  641. ; . ~ C ~ . X = best so far.
  642. ; . ~ X ~ . # = order to try additional candidates.
  643. ; . 2 1 3 .
  644. DB HM1, HM1, 179, 181 ; 178: #1 better/worse than X. Try #2.
  645. DB HP2, HP2, 180, 183 ; 179: #2 better/worse than X. Try #3.
  646. DB VM1HM1, NOADJ, 0FFH,0FFH ; 180: #3 better/worse than X. Take best, quit.
  647. DB HP2, HP2, 182, 183 ; 181: #2 better/worse than #1. Try #3.
  648. DB HM1, NOADJ, 0FFH,0FFH ; 182: #3 better/worse than #1. Take best, quit.
  649. DB HM2, NOADJ, 0FFH,0FFH ; 183: #3 better/worse than #2. Take best, quit.
  650. ; . 2 1 3 . C = center.
  651. ; . ~ X ~ . ~ = tried, but not as good.
  652. ; . ~ C ~ . X = best so far.
  653. ; . ~ ~ ~ . # = order to try additional candidates.
  654. ; . . . . .
  655. DB HM1, HM1, 185, 187 ; 184: #1 better/worse than X. Try #2.
  656. DB HP2, HP2, 186, 189 ; 185: #2 better/worse than X. Try #3.
  657. DB VP1HM1, NOADJ, 0FFH,0FFH ; 186: #3 better/worse than X. Take best, quit.
  658. DB HP2, HP2, 188, 189 ; 187: #2 better/worse than #1. Try #3.
  659. DB HM1, NOADJ, 0FFH,0FFH ; 188: #3 better/worse than #1. Take best, quit.
  660. DB HM2, NOADJ, 0FFH,0FFH ; 189: #3 better/worse than #2. Take best, quit.
  661. ; . . . . . C = center.
  662. ; . ~ ~ ~ . ~ = tried, but not as good.
  663. ; 1 ~ C ~ . X = best so far.
  664. ; 2 X ~ ~ . # = order to try additional candidates.
  665. ; 4 3 5 . .
  666. DB VP1, VP1, 191, 195 ; 190: #1 better/worse than X. Try #2.
  667. DB VP1HP1, VP1HP1, 178, 192 ; 191: #2 better/worse than X. Try #3.
  668. DB HM1, HM1, 193, 181 ; 192: #3 better/worse than #2. Try #4.
  669. DB HP2, HP2, 194, 183 ; 193: #4 better/worse than #2. Try #5.
  670. DB VM1HM2, NOADJ, 0FFH,0FFH ; 194: #5 better/worse than #2. Take best, quit.
  671. DB VP1HP1, VP1HP1, 196, 192 ; 195: #2 better/worse than #1. Try #3.
  672. DB HM1, HM1, 197, 181 ; 196: #3 better/worse than #1. Try #4.
  673. DB HP2, HP2, 198, 183 ; 197: #4 better/worse than #1. Try #5.
  674. DB VM2HM2, NOADJ, 0FFH,0FFH ; 198: #5 better/worse than #1. Take best, quit.
  675. ; . . . . . C = center.
  676. ; . ~ ~ ~ . ~ = tried, but not as good.
  677. ; . ~ C ~ 1 X = best so far.
  678. ; . ~ ~ X 2 # = order to try additional candidates.
  679. ; . . 4 3 5
  680. DB VP1, VP1, 200, 204 ; 199: #1 better/worse than X. Try #2.
  681. DB VP1HM1, VP1HM1, 178, 201 ; 200: #2 better/worse than X. Try #3.
  682. DB HM1, HM1, 202, 181 ; 201: #3 better/worse than #2. Try #4.
  683. DB HP2, HP2, 203, 183 ; 202: #4 better/worse than #2. Try #5.
  684. DB VM1, NOADJ, 0FFH,0FFH ; 203: #5 better/worse than #2. Take best, quit.
  685. DB VP1HM1, VP1HM1, 205, 201 ; 204: #2 better/worse than #1. Try #3.
  686. DB HM1, HM1, 206, 181 ; 205: #3 better/worse than #1. Try #4.
  687. DB HP2, HP2, 207, 183 ; 206: #4 better/worse than #1. Try #5.
  688. DB VM2, NOADJ, 0FFH,0FFH ; 207: #5 better/worse than #1. Take best, quit.
  689. ; 4 3 5 . . C = center.
  690. ; 2 X ~ ~ . ~ = tried, but not as good.
  691. ; 1 ~ C ~ . X = best so far.
  692. ; . ~ ~ ~ . # = order to try additional candidates.
  693. ; . . . . .
  694. DB VM1, VM1, 209, 213 ; 208: #1 better/worse than X. Try #2.
  695. DB VM1HP1, VM1HP1, 184, 210 ; 209: #2 better/worse than X. Try #3.
  696. DB HM1, HM1, 211, 187 ; 210: #3 better/worse than #2. Try #4.
  697. DB HP2, HP2, 212, 189 ; 211: #4 better/worse than #2. Try #5.
  698. DB VP1HM2, NOADJ, 0FFH,0FFH ; 212: #5 better/worse than #2. Take best, quit.
  699. DB VM1HP1, VM1HP1, 214, 210 ; 213: #2 better/worse than #1. Try #3.
  700. DB HM1, HM1, 215, 187 ; 214: #3 better/worse than #1. Try #4.
  701. DB HP2, HP2, 216, 189 ; 215: #4 better/worse than #1. Try #5.
  702. DB VP2HM2, NOADJ, 0FFH,0FFH ; 216: #5 better/worse than #1. Take best, quit.
  703. ; . . 4 3 5 C = center.
  704. ; . ~ ~ X 2 ~ = tried, but not as good.
  705. ; . ~ C ~ 1 X = best so far.
  706. ; . ~ ~ ~ . # = order to try additional candidates.
  707. ; . . . . .
  708. DB VM1, VM1, 218, 222 ; 217: #1 better/worse than X. Try #2.
  709. DB VM1HM1, VM1HM1, 184, 219 ; 218: #2 better/worse than X. Try #3.
  710. DB HM1, HM1, 220, 187 ; 219: #3 better/worse than #2. Try #4.
  711. DB HP2, HP2, 221, 189 ; 220: #4 better/worse than #2. Try #5.
  712. DB VP1, NOADJ, 0FFH,0FFH ; 221: #5 better/worse than #2. Take best, quit.
  713. DB VM1HM1, VM1HM1, 223, 219 ; 222: #2 better/worse than #1. Try #3.
  714. DB HM1, HM1, 224, 187 ; 223: #3 better/worse than #1. Try #4.
  715. DB HP2, HP2, 225, 189 ; 224: #4 better/worse than #1. Try #5.
  716. DB VP2, NOADJ, 0FFH,0FFH ; 225: #5 better/worse than #1. Take best, quit.
  ; Rule 226 steps one row down and one column right, then enters the
  ; radius-1 ring at rule 130 whichever way the comparison goes.
  717. FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR = 226
  718. DB VP1HP1, VP1HP1, 130, 130 ; 226: Redoing ctr, away from limiting edge.
  ; Six uninitialized pad bytes; they bring the rule area to the 896 bytes
  ; quoted in the header of the state-engine rules.
  719. DB ?, ?, ?, ?, ?, ?
  720. ; Table of values to add to SWDs for half pel reference macroblocks, to cause
  721. ; those that are off the edge of the frame to produce artificially high SWDs.
  722. ; (64 bytes;3040:3103)
  723. InvalidateBadHalfPelMVs LABEL DWORD ; 16 DWORD masks: byte k of entry i is 000H if bit k of i is set, else 0FFH.
  724. DD 0FFFFFFFFH, 0FFFFFF00H, 0FFFF00FFH, 0FFFF0000H ; Entries 0-3.
  725. DD 0FF00FFFFH, 0FF00FF00H, 0FF0000FFH, 0FF000000H ; Entries 4-7.
  726. DD 000FFFFFFH, 000FFFF00H, 000FF00FFH, 000FF0000H ; Entries 8-11.
  727. DD 00000FFFFH, 00000FF00H, 0000000FFH, 000000000H ; Entries 12-15.
  728. ; Tables (interleaved) to select case from next table (below these) to drive
  729. ; the weighting of the future and past predictions in the construction of
  730. ; B-frame reference blocks.
  731. ; (448 bytes;3104:3551)
  732. VertWtSel LABEL BYTE
  733. DB 0
  734. HorzWtSel LABEL BYTE
  735. DB 240
  736. DB 0, 240
  737. DB 0, 240
  738. DB 0, 240
  739. DB 0, 240
  740. DB 0, 240
  741. DB 0, 240
  742. DB 0, 240
  743. DB 0, 240
  744. DB 0, 240
  745. DB 0, 240
  746. DB 0, 240
  747. DB 0, 240
  748. DB 0, 240
  749. DB 0, 240
  750. DB 0, 240
  751. DB 0, 240
  752. DB 0, 240
  753. DB 0, 240
  754. DB 0, 240
  755. DB 0, 240
  756. DB 0, 240
  757. DB 0, 240
  758. DB 0, 240
  759. DB 0, 240
  760. DB 0, 240
  761. DB 0, 240
  762. DB 0, 240
  763. DB 0, 240
  764. DB 0, 240
  765. DB 0, 240
  766. DB 0, 240
  767. DB 0, 240
  768. DB 0, 240
  769. DB 0, 240
  770. DB 0, 240
  771. DB 0, 240
  772. DB 0, 240
  773. DB 0, 240
  774. DB 0, 240
  775. DB 0, 240
  776. DB 0, 240
  777. DB 0, 240
  778. DB 0, 240
  779. DB 0, 240
  780. DB 0, 240
  781. DB 0, 240
  782. DB 0, 240
  783. DB 0, 240
  784. DB 0, 240
  785. DB 1, 0
  786. DB 1, 0
  787. DB 2, 16
  788. DB 2, 16
  789. DB 3, 32
  790. DB 3, 32
  791. DB 4, 48
  792. DB 4, 48
  793. DB 5, 64
  794. DB 5, 64
  795. DB 6, 80
  796. DB 6, 80
  797. DB 7, 96
  798. DB 7, 96
  799. DB 8, 112
  800. DB 8, 112
  801. DB 8, 112
  802. DB 8, 112
  803. DB 8, 112
  804. DB 8, 112
  805. DB 8, 112
  806. DB 8, 112
  807. DB 8, 112
  808. DB 8, 112
  809. DB 8, 112
  810. DB 8, 112
  811. DB 8, 112
  812. DB 8, 112
  813. DB 8, 112
  814. DB 8, 112
  815. DB 8, 112
  816. DB 9, 128
  817. DB 9, 128
  818. DB 10, 144
  819. DB 10, 144
  820. DB 11, 160
  821. DB 11, 160
  822. DB 12, 176
  823. DB 12, 176
  824. DB 13, 192
  825. DB 13, 192
  826. DB 14, 208
  827. DB 14, 208
  828. DB 15, 224
  829. DB 15, 224
  830. DB 0, 240
  831. DB 0, 240
  832. DB 0, 240
  833. DB 0, 240
  834. DB 0, 240
  835. DB 0, 240
  836. DB 0, 240
  837. DB 0, 240
  838. DB 0, 240
  839. DB 0, 240
  840. DB 0, 240
  841. DB 0, 240
  842. DB 0, 240
  843. DB 0, 240
  844. DB 0, 240
  845. DB 0, 240
  846. DB 0, 240
  847. DB 0, 240 ; Chroma starts here
  848. DB 0, 240
  849. DB 0, 240
  850. DB 0, 240
  851. DB 0, 240
  852. DB 0, 240
  853. DB 0, 240
  854. DB 0, 240
  855. DB 0, 240
  856. DB 0, 240
  857. DB 0, 240
  858. DB 0, 240
  859. DB 0, 240
  860. DB 0, 240
  861. DB 0, 240
  862. DB 0, 240
  863. DB 0, 240
  864. DB 0, 240
  865. DB 0, 240
  866. DB 0, 240
  867. DB 0, 240
  868. DB 0, 240
  869. DB 0, 240
  870. DB 0, 240
  871. DB 0, 240
  872. DB 0, 240
  873. DB 0, 240
  874. DB 0, 240
  875. DB 0, 240
  876. DB 0, 240
  877. DB 0, 240
  878. DB 0, 240 ; Luma ends here
  879. DB 0, 240
  880. DB 0, 240
  881. DB 1, 0
  882. DB 1, 0
  883. DB 2, 16
  884. DB 2, 16
  885. DB 3, 32
  886. DB 3, 32
  887. DB 4, 48
  888. DB 4, 48
  889. DB 5, 64
  890. DB 5, 64
  891. DB 6, 80
  892. DB 6, 80
  893. DB 7, 96
  894. DB 7, 96
  895. DB 8, 112
  896. DB 9, 128
  897. DB 9, 128
  898. DB 10, 144
  899. DB 10, 144
  900. DB 11, 160
  901. DB 11, 160
  902. DB 12, 176
  903. DB 12, 176
  904. DB 13, 192
  905. DB 13, 192
  906. DB 14, 208
  907. DB 14, 208
  908. DB 15, 224
  909. DB 15, 224
  910. DB 0, 240
  911. DB 0, 240
  912. DB 0, 240
  913. DB 0, 240
  914. DB 0, 240
  915. DB 0, 240
  916. DB 0, 240
  917. DB 0, 240
  918. DB 0, 240
  919. DB 0, 240
  920. DB 0, 240
  921. DB 0, 240
  922. DB 0, 240
  923. DB 0, 240
  924. DB 0, 240
  925. DB 0, 240
  926. DB 0, 240
  927. DB 0, 240
  928. DB 0, 240
  929. DB 0, 240
  930. DB 0, 240
  931. DB 0, 240
  932. DB 0, 240
  933. DB 0, 240
  934. DB 0, 240
  935. DB 0, 240
  936. DB 0, 240
  937. DB 0, 240
  938. DB 0, 240
  939. DB 0, 240
  940. DB 0, 240
  941. DB 0, 240
  942. DB 0, 240
  943. DB 0, 240
  944. DB 0, 240
  945. DB 0, 240
  946. DB 0, 240
  947. DB 0, 240
  948. DB 0, 240
  949. DB 0, 240
  950. DB 0, 240
  951. DB 0, 240
  952. DB 0, 240
  953. DB 0, 240
  954. DB 0, 240
  955. DB 0, 240
  956. DB 0, 240
  957. DB 0, 240
  958. DB 0, 240
  959. ; Table indexed by VertWtSel and HorzWtSel to get index of weight to apply to
  960. ; future and past predictions in the construction of B-frame reference blocks
  961. ; for frame differencing.
  962. ; (264 bytes;3552:3815)
  963. ;
  964. ; Indexed by VertWtSel[VMV]+HorzWtSel[HMV]+N to get idx of weight for line N.
  965. P8F0 = 0*8
  966. F1P7 = 1*8
  967. F2P6 = 2*8
  968. F3P5 = 3*8
  969. F4P4 = 4*8
  970. F5P3 = 5*8
  971. F6P2 = 6*8
  972. F7P1 = 7*8
  973. F8P0 = 8*8
  974. P1F7 = 9*8
  975. P2F6 = 10*8
  976. P3F5 = 11*8
  977. P4F4 = 12*8
  978. P5F3 = 13*8
  979. P6F2 = 14*8
  980. P7F1 = 15*8
  981. Diff_IdxRefWts LABEL BYTE
  982. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  983. DB F1P7, F1P7, F1P7, F1P7, F1P7, F1P7, F1P7, F1P7
  984. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  985. DB F2P6, F2P6, F2P6, F2P6, F2P6, F2P6, F2P6, F2P6
  986. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  987. DB F3P5, F3P5, F3P5, F3P5, F3P5, F3P5, F3P5, F3P5
  988. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  989. DB F4P4, F4P4, F4P4, F4P4, F4P4, F4P4, F4P4, F4P4
  990. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  991. DB F5P3, F5P3, F5P3, F5P3, F5P3, F5P3, F5P3, F5P3
  992. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  993. DB F6P2, F6P2, F6P2, F6P2, F6P2, F6P2, F6P2, F6P2
  994. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  995. DB F7P1, F7P1, F7P1, F7P1, F7P1, F7P1, F7P1, F7P1
  996. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  997. DB F8P0, F8P0, F8P0, F8P0, F8P0, F8P0, F8P0, F8P0
  998. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  999. DB P1F7, P1F7, P1F7, P1F7, P1F7, P1F7, P1F7, P1F7
  1000. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1001. DB P2F6, P2F6, P2F6, P2F6, P2F6, P2F6, P2F6, P2F6
  1002. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1003. DB P3F5, P3F5, P3F5, P3F5, P3F5, P3F5, P3F5, P3F5
  1004. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1005. DB P4F4, P4F4, P4F4, P4F4, P4F4, P4F4, P4F4, P4F4
  1006. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1007. DB P5F3, P5F3, P5F3, P5F3, P5F3, P5F3, P5F3, P5F3
  1008. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1009. DB P6F2, P6F2, P6F2, P6F2, P6F2, P6F2, P6F2, P6F2
  1010. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1011. DB P7F1, P7F1, P7F1, P7F1, P7F1, P7F1, P7F1, P7F1
  1012. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1013. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1014. DB P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0, P8F0
  1015. BFrmSWDState LABEL BYTE ; State engine rules for finding best motion vector.
  1016. ; (48 bytes; 3816:3863)
  1017. ; 1st number: Horizontal Motion displacement to try, in half pel increments.
  1018. ; 2nd number: Vertical Motion displacement to try, in half pel increments.
  1019. ; 3rd number: Next state to enter if previous best is still best.
  1020. ; 4th number: Next state to enter if this motion is better than previous best.
  1021. DB -2, 0, 4, 8 ; 0 -- ( 0, 0) Try (-2, 0)
  1022. DB 2, 0, 12, 12 ; 4 -- ( 0, 0) Try ( 2, 0)
  1023. DB 4, 0, 12, 12 ; 8 -- (-2, 0) Try ( 2, 0)
  1024. DB 0, -2, 16, 20 ; 12 -- ( N, 0) Try ( N,-2) (N = {-2,0,2})
  1025. DB 0, 2, 24, 24 ; 16 -- ( N, 0) Try ( N, 2)
  1026. DB 0, 4, 24, 24 ; 20 -- ( N,-2) Try ( N, 2)
  1027. DB -1, 0, 28, 32 ; 24 -- ( H, V) Try (H-1, V) NOTE(review): states 24-44 described by analogy with 0-20, using steps of 1; confirm.
  1028. DB 1, 0, 36, 36 ; 28 -- ( H, V) Try (H+1, V)
  1029. DB 2, 0, 36, 36 ; 32 -- (H-1, V) Try (H+1, V)
  1030. DB 0, -1, 40, 44 ; 36 -- ( M, V) Try ( M,V-1) (M = {H-1,H,H+1})
  1031. DB 0, 1, 0, 0 ; 40 -- ( M, V) Try ( M,V+1) Both next-state fields are 0; presumably ends the search -- confirm.
  1032. DB 0, 2, 0, 0 ; 44 -- ( M,V-1) Try ( M,V+1) Both next-state fields are 0; presumably ends the search -- confirm.
  1033. ; Table used by Quant RLE to navigate the zigzag order of quantized coeffs.
  1034. ; Contents of this table are initialized by first entry to MMxEDTQ. In
  1035. ; unlikely event of race condition, it will just get initialized by more
  1036. ; than one encoder instance.
  1037. ; (128 bytes; 3864:3991)
  1038. NextZigZagCoeff LABEL BYTE ; Once filled in, entry [c] holds the value that follows c in InitZigZagCoeff order.
  1039. DB 128 DUP (0FFH) ; 0FFH until initialized. MMxEDTQ tests entry [Q77]; a zero there means already initialized.
  1040. ; Table used to initialize the above table.
  1041. ; (64 bytes: 3992:4055)
  1042. InitZigZagCoeff LABEL BYTE ; Read sequentially by the init loop in MMxEDTQ: each byte is stored at NextZigZagCoeff[previous byte], starting at index 0.
  1043. DB Q01,Q10,Q20,Q11,Q02,Q03,Q12,Q21,Q30,Q40,Q31,Q22,Q13,Q04,Q05,Q14
  1044. DB Q23,Q32,Q41,Q50,Q60,Q51,Q42,Q33,Q24,Q15,Q06,Q07,Q16,Q25,Q34,Q43
  1045. DB Q52,Q61,Q70,Q71,Q62,Q53,Q44,Q35,Q26,Q17,Q27,Q36,Q45,Q54,Q63,Q72
  1046. DB Q73,Q64,Q55,Q46,Q37,Q47,Q56,Q65,Q74,Q75,Q66,Q57,Q67,Q76,Q77, 0 ; The final 0 is stored at [Q77] and stops the init loop.
  1047. ; Constants needed by the Quant RLE phase.
  1048. ; (128 bytes; 4056:4183)
  1049. Recip2QP LABEL DWORD ; One DWORD per QP, read as Recip2QP[QP*4]. Each WORD is 04000H/QP truncated, repeated in both halves.
  1050. WORD 0H, 0H ; QP = 000h (no reciprocal; entry is zero)
  1051. WORD 04000H, 04000H ; QP = 001h
  1052. WORD 02000H, 02000H ; QP = 002h
  1053. WORD 01555H, 01555H ; QP = 003h
  1054. WORD 01000H, 01000H ; QP = 004h
  1055. WORD 00CCCH, 00CCCH ; QP = 005h
  1056. WORD 00AAAH, 00AAAH ; QP = 006h
  1057. WORD 00924H, 00924H ; QP = 007h
  1058. WORD 00800H, 00800H ; QP = 008h
  1059. WORD 0071CH, 0071CH ; QP = 009h
  1060. WORD 00666H, 00666H ; QP = 00Ah
  1061. WORD 005D1H, 005D1H ; QP = 00Bh
  1062. WORD 00555H, 00555H ; QP = 00Ch
  1063. WORD 004ECH, 004ECH ; QP = 00Dh
  1064. WORD 00492H, 00492H ; QP = 00Eh
  1065. WORD 00444H, 00444H ; QP = 00Fh
  1066. WORD 00400H, 00400H ; QP = 010h
  1067. WORD 003C3H, 003C3H ; QP = 011h
  1068. WORD 0038EH, 0038EH ; QP = 012h
  1069. WORD 0035EH, 0035EH ; QP = 013h
  1070. WORD 00333H, 00333H ; QP = 014h
  1071. WORD 0030CH, 0030CH ; QP = 015h
  1072. WORD 002E8H, 002E8H ; QP = 016h
  1073. WORD 002C8H, 002C8H ; QP = 017h
  1074. WORD 002AAH, 002AAH ; QP = 018h
  1075. WORD 0028FH, 0028FH ; QP = 019h
  1076. WORD 00276H, 00276H ; QP = 01Ah
  1077. WORD 0025EH, 0025EH ; QP = 01Bh
  1078. WORD 00249H, 00249H ; QP = 01Ch
  1079. WORD 00234H, 00234H ; QP = 01Dh
  1080. WORD 00222H, 00222H ; QP = 01Eh
  1081. WORD 00210H, 00210H ; QP = 01Fh
  1082. ; Skip over space to get to where the following tables can go. They will
  1083. ; hit the cache at the same point as a portion of the StateEngine states
  1084. ; that aren't used in the heuristic ME mode.
  1085. ; (2056 bytes; 4184:6239)
  1086. DB 2056 DUP (?) ; Static space place-holder.
  1087. ; Table to select base address in next table below to use for particular block
  1088. ; of macroblock. First column provides address of base element of HorzWtSel
  1089. ; to use to map horizontal MV to list of weighting indices to use. Second
  1090. ; column is similar, but for Vertical MV. Third and fourth columns not used.
  1091. ; 6 rows; one for each block in a macroblock.
  1092. ; (88 bytes; 6240:6327)
  1093. LeftRightBlkPosition LABEL DWORD ; Column 1 (HorzWtSel base). Rows are 16 bytes apart.
  1094. DD HorzWtSel+0-64 ; Row 0, column 1.
  1095. UpDownBlkPosition LABEL DWORD ; Column 2 (VertWtSel base); same rows, 4 bytes past LeftRightBlkPosition.
  1096. DD VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH ; Row 0, columns 2-4. 0DEADBEEFH fills the unused columns.
  1097. DD HorzWtSel+32-64, VertWtSel+0-64, 0DEADBEEFH, 0DEADBEEFH ; Row 1.
  1098. DD HorzWtSel+0-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH ; Row 2.
  1099. DD HorzWtSel+32-64, VertWtSel+32-64, 0DEADBEEFH, 0DEADBEEFH ; Row 3.
  1100. DD HorzWtSel+128, VertWtSel+128, 0DEADBEEFH ; Row 4, columns 1-3.
  1101. BlkEmptyFlag LABEL BYTE ; sneak this in here
  1102. DB 16, 0, 32, 0 ; These 4 bytes occupy the unused 4th column of row 4.
  1103. DD HorzWtSel+128, VertWtSel+128 ; Row 5, columns 1-2 only; the table ends here (88 bytes total).
  1104. ; The following table, indexed by MBEdgeType&7, returns a mask which is used to
  1105. ; zero-out the motion vectors for predictors that are off the edge of the
  1106. ; frame. The index is a 3 bit value, each bit being set if the macroblock
  1107. ; is NOT on the corresponding edge. 1 == left; 2 == right; 4 == top;
  1108. ; The value gotten out is (where A==left; B==above; C==above right):
  1109. ; <mask(A) mask(A) mask(C) mask(C) mask(B) mask(B) mask(A) mask(A)>
  1110. ; The mask is 0xFF if the corresponding remote block is NOT off the edge, and
  1111. ; 0x00 if it is off the edge.
  1112. ; (32 bytes: 6328: 6359)
  1113. ValidRemoteVectors LABEL DWORD
  1114. DWORD 0DEADBEEFH ; 0: Can't be on left and right edges at once.
  1115. DWORD 0FF0000FFH ; 1: Top right corner.
  1116. DWORD 000000000H ; 2: Top left corner.
  1117. DWORD 0FF0000FFH ; 3: Top edge.
  1118. DWORD 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  1119. DWORD 0FF00FFFFH ; 5: Right edge.
  1120. DWORD 000FFFF00H ; 6: Left edge.
  1121. DWORD 0FFFFFFFFH ; 7: Central macroblock.
  1122. ; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
  1123. ; to be subtracted with saturation from the predicted motion vector for extended
  1124. ; motion vector search. Since saturation occurs at 0, the values here are
  1125. ; such that the motion vectors are biased to the appropriate point for the
  1126. ; clamping effect. The index is a 4 bit value, each bit being set if the
  1127. ; macroblock is NOT on the corresponding edge. 1 == left; 2 == right;
  1128. ; 4 == top; 8 == bottom. The 8 values being calculated are as follows:
  1129. ; ; [ 0: 7] -- HMV lower limit for signature search
  1130. ; ; [ 8:15] -- HMV lower limit
  1131. ; ; [16:23] -- HMV upper limit for signature search
  1132. ; ; [24:31] -- HMV upper limit
  1133. ; ; [32:39] -- VMV lower limit for signature search
  1134. ; ; [40:47] -- VMV lower limit
  1135. ; ; [48:55] -- VMV upper limit for signature search
  1136. ; ; [56:63] -- VMV upper limit
  1137. ; (88 bytes: 6360:6447)
  1138. EMV_ClampLowerEnd LABEL DWORD
  1139. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
  1140. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
  1141. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
  1142. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
  1143. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  1144. BYTE 87, 94, 97, 100, ; 5: Bottom right corner.
  1145. 87, 94, 97, 100
  1146. BYTE 119, 126, 97, 100, ; 6: Bottom left corner.
  1147. 87, 94, 97, 100
  1148. BYTE 87, 94, 97, 100, ; 7: Bottom edge.
  1149. 87, 94, 97, 100
  1150. DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
  1151. BYTE 87, 94, 97, 100, ; 9: Top right corner.
  1152. 119, 126, 97, 100
  1153. BYTE 119, 126, 97, 100, ; 10: Top left corner.
  1154. 119, 126, 97, 100
  1155. BYTE 87, 94, 97, 100, ; 11: Top edge.
  1156. 119, 126, 97, 100
  1157. DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
  1158. BYTE 87, 94, 97, 100, ; 13: Right edge.
  1159. 87, 94, 97, 100
  1160. BYTE 119, 126, 97, 100, ; 14: Left edge.
  1161. 87, 94, 97, 100
  1162. BYTE 87, 94, 97, 100, ; 15: Central macroblock.
  1163. 87, 94, 97, 100
  1164. ; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
  1165. ; to be added with saturation to the result of the application of the
  1166. ; preceding table, to clamp the upper limit on the motion vector search parameters.
  1167. ; Since saturation occurs at 255, the values here are such that the motion
  1168. ; vectors are biased to the appropriate point for the clamping effect.
  1169. ; (88 bytes: 6448:6535)
  1170. EMV_ClampUpperEnd LABEL DWORD
  1171. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
  1172. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
  1173. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
  1174. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
  1175. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  1176. BYTE 184, 193, 216, 225, ; 5: Bottom right corner.
  1177. 184, 193, 216, 225
  1178. BYTE 216, 225, 184, 193, ; 6: Bottom left corner.
  1179. 184, 193, 216, 225
  1180. BYTE 184, 193, 184, 193, ; 7: Bottom edge.
  1181. 184, 193, 216, 225
  1182. DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
  1183. BYTE 184, 193, 216, 225, ; 9: Top right corner.
  1184. 216, 225, 184, 193
  1185. BYTE 216, 225, 184, 193, ; 10: Top left corner.
  1186. 216, 225, 184, 193
  1187. BYTE 184, 193, 184, 193, ; 11: Top edge.
  1188. 216, 225, 184, 193
  1189. DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
  1190. BYTE 184, 193, 216, 225, ; 13: Right edge.
  1191. 184, 193, 184, 193
  1192. BYTE 216, 225, 184, 193, ; 14: Left edge.
  1193. 184, 193, 184, 193
  1194. BYTE 184, 193, 184, 193, ; 15: Central macroblock.
  1195. 184, 193, 184, 193
  1196. ; The following table, indexed by MBEdgeType, returns a QWORD of unsigned bytes
  1197. ; to be added without saturation to the result of the application of the
  1198. ; preceding table, to return the motion vector search parameters to the
  1199. ; proper range for subsequent use.
  1200. ; (88 bytes: 6536:6623)
  1201. EMV_RestoreRange LABEL DWORD
  1202. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 0: Can't be on all edges at once.
  1203. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 1: Can't be on top and bottom edges at once.
  1204. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 2: Can't be on top and bottom edges at once.
  1205. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 3: Can't be on top and bottom edges at once.
  1206. ; DWORD 0DEADBEEFH, 0DEADBEEFH ; 4: Can't be on left and right edges at once.
  1207. BYTE 120, 255, 88, 225, ; 5: Bottom right corner.
  1208. 120, 255, 88, 225
  1209. BYTE 120, 255, 56, 193, ; 6: Bottom left corner.
  1210. 120, 255, 88, 225
  1211. BYTE 120, 255, 56, 193, ; 7: Bottom edge.
  1212. 120, 255, 88, 225
  1213. DWORD 0DEADBEEFH, 0DEADBEEFH ; 8: Can't be on left and right edges at once.
  1214. BYTE 120, 255, 88, 225, ; 9: Top right corner.
  1215. 120, 255, 56, 193
  1216. BYTE 120, 255, 56, 193, ; 10: Top left corner.
  1217. 120, 255, 56, 193
  1218. BYTE 120, 255, 56, 193, ; 11: Top edge.
  1219. 120, 255, 56, 193
  1220. DWORD 0DEADBEEFH, 0DEADBEEFH ; 12: Can't be on left and right edges at once.
  1221. BYTE 120, 255, 88, 225, ; 13: Right edge.
  1222. 120, 255, 56, 193
  1223. BYTE 120, 255, 56, 193, ; 14: Left edge.
  1224. 120, 255, 56, 193
  1225. BYTE 120, 255, 56, 193, ; 15: Central macroblock.
  1226. 120, 255, 56, 193
  1227. ; Tables indexed by indices fetched from Diff_IdxRefWts. These tables return
  1228. ; a multipler to apply to past or future predictions to construct the
  1229. ; B-frame candidate reference blocks.
  1230. ; (128 bytes;6624:6751)
  1231. FutureWt_FF_or_00 LABEL DWORD ; 16 QWORD masks, one per FnPm/PmFn offset equate; the entry has n bytes of 0FFH (the F count).
  1232. DD 000000000H, 000000000H ; P8F0: no 0FFH bytes.
  1233. DD 000000000H, 0FF000000H ; F1P7: 0FFH bytes fill from the high-order end.
  1234. DD 000000000H, 0FFFF0000H ; F2P6
  1235. DD 000000000H, 0FFFFFF00H ; F3P5
  1236. DD 000000000H, 0FFFFFFFFH ; F4P4
  1237. DD 0FF000000H, 0FFFFFFFFH ; F5P3
  1238. DD 0FFFF0000H, 0FFFFFFFFH ; F6P2
  1239. DD 0FFFFFF00H, 0FFFFFFFFH ; F7P1
  1240. DD 0FFFFFFFFH, 0FFFFFFFFH ; F8P0: all eight bytes 0FFH.
  1241. DD 0FFFFFFFFH, 000FFFFFFH ; P1F7: 0FFH bytes now fill from the low-order end.
  1242. DD 0FFFFFFFFH, 00000FFFFH ; P2F6
  1243. DD 0FFFFFFFFH, 0000000FFH ; P3F5
  1244. DD 0FFFFFFFFH, 000000000H ; P4F4
  1245. DD 000FFFFFFH, 000000000H ; P5F3
  1246. DD 00000FFFFH, 000000000H ; P6F2
  1247. DD 0000000FFH, 000000000H ; P7F1
  1248. MMXMEDATA ENDS
  1249. ;=============================================================================
  1250. .CODE EDTQ
  1251. ASSUME cs : FLAT
  1252. ASSUME ds : FLAT
  1253. ASSUME es : FLAT
  1254. ASSUME fs : FLAT
  1255. ASSUME gs : FLAT
  1256. ASSUME ss : FLAT
  1257. EXTERN MMxDoForwardDCT:NEAR
  1258. EXTERN MMxDoForwardDCTx:NEAR
  1259. EXTERN MMxDoForwardDCTy:NEAR
  1260. IFDEF H261
  1261. ELSE
  1262. EXTERN MMxDoBFrameLumaBlocks:NEAR
  1263. EXTERN MMxDoBFrameChromaBlocks:NEAR
  1264. ENDIF
  1265. MMxEDTQ proc C AMBAS: DWORD,
  1266. ATarg: DWORD,
  1267. APrev: DWORD,
  1268. ABTarg: DWORD,
  1269. AWtFwd: DWORD,
  1270. AWtBwd: DWORD,
  1271. AFrmWd: DWORD,
  1272. ADoHalf: DWORD,
  1273. ADoBlk: DWORD,
  1274. ADoSF: DWORD,
  1275. ADoAP: DWORD,
  1276. ADoB: DWORD,
  1277. ADoLuma: DWORD,
  1278. ADoExtMV:DWORD,
  1279. AQP: DWORD,
  1280. ABQP: DWORD,
  1281. AB0VecT: DWORD,
  1282. ASpaFilT:DWORD,
  1283. ASpaFilD:DWORD,
  1284. ASWDTot: DWORD,
  1285. ABSWDTot:DWORD,
  1286. ACodStr: DWORD,
  1287. ABCodStr:DWORD
  1288. LocalFrameSize = 1536 ; Space needed for locals
  1289. RegStoSize = 16
  1290. ; Arguments:
  1291. MBlockActionStream_arg = RegStoSize + 4
  1292. TargetFrameBaseAddress_arg = RegStoSize + 8
  1293. PreviousFrameBaseAddress_arg = RegStoSize + 12
  1294. BTargetFrameBaseAddress_arg = RegStoSize + 16
  1295. SignatureBaseAddress_arg = RegStoSize + 20
  1296. WeightForwardMotion_arg = RegStoSize + 24
  1297. WeightBackwardMotion_arg = RegStoSize + 28
  1298. FrameWidth = RegStoSize + 32
  1299. DoHalfPelEstimation_arg = RegStoSize + 36
  1300. DoBlockLevelVectors_arg = RegStoSize + 40
  1301. DoSpatialFiltering_arg = RegStoSize + 44
  1302. DoAdvancedPrediction_arg = RegStoSize + 48
  1303. DoBFrame_arg = RegStoSize + 52
  1304. DoLumaBlocksInThisPass_arg = RegStoSize + 56
  1305. DoExtendedMotionVectors_arg = RegStoSize + 60
  1306. QuantizationLevel = RegStoSize + 64
  1307. BQuantizationLevel = RegStoSize + 68
  1308. BFrmZeroVectorThreshold_arg = RegStoSize + 72
  1309. SpatialFiltThreshold_arg = RegStoSize + 76
  1310. SpatialFiltDifferential_arg = RegStoSize + 80
  1311. PSWDTotal = RegStoSize + 84
  1312. PBSWDTotal = RegStoSize + 88
  1313. CodeStreamCursor_arg = RegStoSize + 92
  1314. BCodeStreamCursor_arg = RegStoSize + 96
  1315. EndOfArgList = RegStoSize + 100
  1316. StackOffset TEXTEQU <0>
  1317. CONST_384 TEXTEQU <384>
  1318. push esi
  1319. push edi
  1320. push ebp
  1321. push ebx
  1322. ; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
  1323. mov esi,esp
  1324. and esp,0FFFFF000H
  1325. sub esp,000000FE0H
  1326. IFDEF H261
  1327. mov ebp,PITCH
  1328. CONST_384 TEXTEQU <ebp>
  1329. mov eax,[esi+SpatialFiltThreshold_arg]
  1330. mov ebx,[esi+SpatialFiltDifferential_arg]
  1331. mov SpatialFiltThreshold,eax
  1332. mov SpatialFiltDifferential,ebx
  1333. mov ecx,[esi+TargetFrameBaseAddress_arg]
  1334. mov ebx,[esi+SignatureBaseAddress_arg]
  1335. sub ecx,ebx
  1336. mov eax,[esi+TargetFrameBaseAddress_arg]
  1337. mov SigToTarget,ecx
  1338. add ecx,PITCH*80+64
  1339. neg ecx
  1340. mov TargetToSig_Debiased,ecx
  1341. mov ebx,[esi+PreviousFrameBaseAddress_arg]
  1342. mov PreviousFrameBaseAddress,ebx
  1343. mov TargetFrameBaseAddress,eax
  1344. sub ebx,eax
  1345. mov ecx,[esi+QuantizationLevel]
  1346. mov TargToRef,ebx
  1347. mov eax,[esi+CodeStreamCursor_arg]
  1348. mov ebx,ecx
  1349. mov CodeStreamCursor,eax
  1350. shl ebx,16
  1351. xor edx,edx
  1352. or ebx,ecx
  1353. mov ecx,Recip2QP[ecx*4]
  1354. mov QPDiv2,ebx
  1355. mov Recip2QPToUse,ecx
  1356. mov eax,[esi+DoSpatialFiltering_arg]
  1357. mov DoExtendedMotionVectors,edx
  1358. test eax,eax
  1359. je @f
  1360. mov eax,3
  1361. @@:
  1362. mov DoSpatialFiltering,al
  1363. mov SWDTotal,edx
  1364. mov BestMBHalfPelMV,edx
  1365. mov ebx,PreviousFrameBaseAddress
  1366. mov BlockAbove[0],edx
  1367. sub ebx,16
  1368. mov edx,[esi+FrameWidth]
  1369. mov SpatiallyFilteredMB,ebx
  1370. imul edx,-SIZEOF T_MacroBlockActionDescr/16
  1371. add edx,2*SIZEOF T_Blk
  1372. mov eax,14 ; 14 if restricted MVs and doing heuristic ME.
  1373. mov BlockAbove[4],edx
  1374. mov DoHeuristicME,eax
  1375. ELSE
  1376. mov eax,[esi+DoExtendedMotionVectors_arg]
  1377. test eax,eax
  1378. je @f
  1379. mov eax,7
  1380. @@:
  1381. mov DoExtendedMotionVectors,eax
  1382. mov eax,[esi+BFrmZeroVectorThreshold_arg]
  1383. mov edi,[esi+WeightForwardMotion_arg]
  1384. mov BFrmZeroVectorThreshold,eax
  1385. mov ecx,60
  1386. mov ebx,060606060H
  1387. lea edx,WeightForwardMotion+128
  1388. @@:
  1389. mov eax,[edi+ecx]
  1390. and eax,03F3F3F3FH ; ???
  1391. mov ebp,[edi+ecx+64]
  1392. and ebp,03F3F3F3FH ; ???
  1393. xor eax,ebx
  1394. xor ebp,ebx
  1395. mov [edx+ecx+64],eax
  1396. mov [edx+ecx-128],ebp
  1397. sub ecx,4
  1398. mov ebp,PITCH
  1399. jge @b
  1400. mov edi,[esi+WeightBackwardMotion_arg]
  1401. mov eax,edx
  1402. lea edx,WeightBackwardMotion+128
  1403. mov ecx,60
  1404. sub eax,edx
  1405. jne @b
  1406. CONST_384 TEXTEQU <ebp>
  1407. mov ebx,[esi+PreviousFrameBaseAddress_arg]
  1408. mov eax,[esi+TargetFrameBaseAddress_arg]
  1409. mov PreviousFrameBaseAddress,ebx
  1410. mov TargetFrameBaseAddress,eax
  1411. mov ecx,[esi+BTargetFrameBaseAddress_arg]
  1412. sub ebx,eax
  1413. mov TargToRef,ebx
  1414. sub eax,ecx
  1415. mov BFrameBaseAddress,ecx
  1416. mov BFrameToFuture,eax
  1417. mov ecx,[esi+TargetFrameBaseAddress_arg]
  1418. mov ebx,[esi+SignatureBaseAddress_arg]
  1419. sub ecx,ebx
  1420. mov edx,[esi+FrameWidth]
  1421. mov SigToTarget,ecx
  1422. add ecx,PITCH*80+64
  1423. neg ecx
  1424. imul edx,-SIZEOF T_MacroBlockActionDescr/16
  1425. mov TargetToSig_Debiased,ecx
  1426. mov ecx,[esi+DoBFrame_arg]
  1427. add edx,2*SIZEOF T_Blk
  1428. xor cl,1
  1429. mov BlockAbove[4],edx
  1430. mov IsPlainPFrame,cl
  1431. mov ecx,[esi+QuantizationLevel]
  1432. mov eax,[esi+CodeStreamCursor_arg]
  1433. mov ebx,ecx
  1434. mov CodeStreamCursor,eax
  1435. mov eax,[esi+BCodeStreamCursor_arg]
  1436. mov BCodeStreamCursor,eax
  1437. shl ebx,16
  1438. mov eax,[esi+DoHalfPelEstimation_arg]
  1439. or ebx,ecx
  1440. mov ecx,Recip2QP[ecx*4]
  1441. mov QPDiv2,ebx
  1442. mov Recip2QPToUse,ecx
  1443. mov ecx,[esi+BQuantizationLevel]
  1444. xor edx,edx
  1445. mov ebx,ecx
  1446. shl ebx,16
  1447. mov BestMBHalfPelMV,edx
  1448. or ebx,ecx
  1449. mov ecx,Recip2QP[ecx*4]
  1450. mov BQPDiv2,ebx
  1451. mov BRecip2QPToUse,ecx
  1452. test eax,eax
  1453. je @f
  1454. mov eax,-4
  1455. @@:
  1456. mov DoHalfPelME,eax
  1457. mov eax,[esi+DoBlockLevelVectors_arg]
  1458. mov DoBlockLevelVectors,al
  1459. mov eax,[esi+DoAdvancedPrediction_arg]
  1460. mov DoAdvancedPrediction,al
  1461. mov SWDTotal,edx
  1462. test eax,eax
  1463. lea eax,[eax+14] ; 14 if restricted MVs and doing heuristic ME.
  1464. je @f
  1465. xor eax,eax ; 0 if unrestricted MVs and doing heuristic ME.
  1466. @@:
  1467. mov DoHeuristicME,eax
  1468. mov BSWDTotal,edx
  1469. mov PendingOBMC,edx
  1470. mov BlockAbove[0],edx
  1471. ENDIF
  1472. mov eax,01E98E268H
  1473. mov EMVLimitsForThisMB,eax
  1474. ; ; [ 0: 7] -- HMV lower limit for sig search (biased 128)
  1475. ; ; [ 8:15] -- HMV lower limit (signed)
  1476. ; ; [16:23] -- HMV upper limit for sig search (biased 128)
  1477. ; ; [24:31] -- HMV upper limit (signed)
  1478. mov EMVLimitsForThisMB+4,eax ; Same as for HMV.
  1479. mov edx,[esi+MBlockActionStream_arg]
  1480. mov al,NextZigZagCoeff[Q77]
  1481. test al,al
  1482. je ZigZagCoeffInitialized
  1483. xor ecx,ecx
  1484. lea ebx,InitZigZagCoeff
  1485. xor eax,eax
  1486. @@:
  1487. mov al,[ebx]
  1488. inc ebx
  1489. mov NextZigZagCoeff[ecx],al
  1490. mov ecx,eax
  1491. test eax,eax
  1492. jne @b
  1493. ZigZagCoeffInitialized:
  1494. mov StashESP,esi
  1495. mov eax,[esi+DoLumaBlocksInThisPass_arg]
  1496. test eax,eax
  1497. jne FirstMacroBlock ; Jump if doing luma plane
  1498. jmp FirstMacroBlock_ChromaProcessing
  1499. IntraCodedChromaProcessingDone:
  1500. IFDEF H261
  1501. ELSE
  1502. mov al,IsPlainPFrame
  1503. test al,al
  1504. jne NextMacroBlock_ChromaProcessing
  1505. mov eax,QPDiv2
  1506. mov ebx,BQPDiv2
  1507. call MMxDoBFrameChromaBlocks
  1508. ENDIF
  1509. NextMacroBlock_ChromaProcessing:
  1510. mov bl,[edx].CodedBlocks
  1511. sub edx,-SIZEOF T_MacroBlockActionDescr
  1512. and bl,040H ; Check for end-of-stream
  1513. jne TrulyDone
  1514. FirstMacroBlock_ChromaProcessing:
  1515. mov al,[edx].BlockType ; Chroma handling. Intra? Or Inter?
  1516. mov ecx,TargetFrameBaseAddress
  1517. cmp al,INTRA
  1518. jne ChromaIsInterCoded
  1519. mov esi,[edx].BlkU.BlkOffset
  1520. mov StashBlockType,al
  1521. add esi,ecx
  1522. push eax ; Adjust stack pointer
  1523. StackOffset TEXTEQU <4>
  1524. call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH
  1525. shl bl,4
  1526. mov al,[edx].CodedBlocks
  1527. sub al,bl
  1528. mov esi,[edx].BlkV.BlkOffset
  1529. mov [edx].CodedBlocks,al
  1530. mov ecx,TargetFrameBaseAddress
  1531. add esi,ecx
  1532. call MMxDoForwardDCT ; Block is in target frame; Pitch is PITCH
  1533. shl bl,5
  1534. mov al,[edx].CodedBlocks
  1535. sub al,bl
  1536. pop ecx ; Adjust stack pointer
  1537. StackOffset TEXTEQU <0>
  1538. mov [edx].CodedBlocks,al
  1539. jmp IntraCodedChromaProcessingDone
  1540. ChromaIsInterCoded: ; Inter-coded chroma: difference U then V against their references and forward-DCT each.
  1541. mov edi,[edx].BlkU.BlkOffset ; Get offset of the U block of this macroblock.
  1542. mov ebx,[edx].BlkU.MVs
  1543. add edi,ecx ; ecx = TargetFrameBaseAddress (loaded before the branch to here).
  1544. mov esi,[edx].BlkU.PastRef
  1545. mov StashBlockType,al ; al = [edx].BlockType (loaded before the branch to here).
  1546. IFDEF H261
  1547. mov ecx,2+256*1 ; cl==2 tells SpatialLoopFilter code to do one
  1548. ; ; block. ch==1 causes it to return to here.
  1549. mov TargetMacroBlockBaseAddr,edi ; Store address of U block.
  1550. cmp al,INTERSLF
  1551. je DoSpatialFilterForChroma
  1552. ReturnFromSpatialFilterForU:
  1553. ENDIF
  1554. call DoNonOBMCDifferencing
  1555. ; (Finish differencing the last four lines.)
  1556. movq mm4,[edi+ebp*4] ; T4
  1557. psrlq mm1,1
  1558. movq mm5,[edi+PITCH*5] ; Target line 5.
  1559. psubb mm4,mm0 ; D4 = T4 - P4
  1560. movq mm0,[edi+PITCH*6] ; Target line 6.
  1561. psubb mm5,mm1 ; Difference for line 5.
  1562. movq mm1,[edi+PITCH*7] ; Target line 7.
  1563. pand mm2,mm6
  1564. pand mm3,mm6
  1565. psrlq mm2,1
  1566. movq PelDiffsLine4,mm4 ; Store D4.
  1567. psubb mm0,mm2 ; Difference for line 6.
  1568. movq PelDiffsLine5,mm5
  1569. psrlq mm3,1
  1570. movq PelDiffsLine6,mm0
  1571. psubb mm1,mm3 ; Line 7 difference stays in mm1; NOTE(review): presumably consumed by MMxDoForwardDCTx -- confirm.
  1572. push eax ; Adjust stack pointer
  1573. StackOffset TEXTEQU <4>
  1574. call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
  1575. shl bl,4 ; NOTE(review): bl is presumably a result flag from the DCT call -- confirm.
  1576. mov al,[edx].CodedBlocks
  1577. sub al,bl ; CodedBlocks -= (bl << 4).
  1578. mov ecx,TargetFrameBaseAddress
  1579. mov [edx].CodedBlocks,al
  1580. pop edi ; Adjust stack pointer
  1581. StackOffset TEXTEQU <0>
  1582. mov edi,[edx].BlkV.BlkOffset ; Get offset of the V block of this macroblock.
  1583. mov ebx,[edx].BlkV.MVs
  1584. add edi,ecx ; edi = address of V block in the target frame.
  1585. mov esi,[edx].BlkV.PastRef
  1586. IFDEF H261
  1587. mov ecx,2-256*1 ; cl==2 tells SpatialLoopFilter code to do one
  1588. ; ; block. ch==-1 causes it to return to here.
  1589. mov TargetMacroBlockBaseAddr,edi ; Store address of V block.
  1590. mov al,[edx].BlockType
  1591. cmp al,INTERSLF
  1592. je DoSpatialFilterForChroma
  1593. ReturnFromSpatialFilterForV:
  1594. ENDIF
  1595. call DoNonOBMCDifferencing
  1596. ; (Finish differencing the last four lines.)
  1597. movq mm4,[edi+ebp*4] ; T4
  1598. psrlq mm1,1
  1599. movq mm5,[edi+PITCH*5] ; Target line 5.
  1600. psubb mm4,mm0 ; D4 = T4 - P4
  1601. movq mm0,[edi+PITCH*6] ; Target line 6.
  1602. psubb mm5,mm1 ; Difference for line 5.
  1603. movq mm1,[edi+PITCH*7] ; Target line 7.
  1604. pand mm2,mm6
  1605. pand mm3,mm6
  1606. psrlq mm2,1
  1607. movq PelDiffsLine4,mm4 ; Store D4.
  1608. psubb mm0,mm2 ; Difference for line 6.
  1609. movq PelDiffsLine5,mm5
  1610. psrlq mm3,1
  1611. movq PelDiffsLine6,mm0
  1612. psubb mm1,mm3 ; Line 7 difference stays in mm1; NOTE(review): presumably consumed by MMxDoForwardDCTx -- confirm.
  1613. push eax ; Adjust stack pointer
  1614. StackOffset TEXTEQU <4>
  1615. call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
  1616. shl bl,5 ; Same as for U, but shifted by 5 for V.
  1617. mov al,[edx].CodedBlocks
  1618. sub al,bl ; CodedBlocks -= (bl << 5).
  1619. pop ecx ; Adjust stack pointer
  1620. StackOffset TEXTEQU <0>
  1621. mov [edx].CodedBlocks,al
  1622. jmp IntraCodedChromaProcessingDone ; Rejoin the path shared with intra-coded chroma.
  1623. ;============================================================================
  1624. ; Here we copy the target macroblock, and interpolate left, right, and both.
  1625. ; We also accumulate the target pels for each block. Result is four partial
  1626. ; sums in four packed words. After summing them all up, the final sum will
  1627. ; be the sum of the 64 pels of each block, divided by 2.
  1628. NextMacroBlock: ; Test the current descriptor for end-of-stream, then move on to the next one.
  1629. mov bl,[edx].CodedBlocks ; CodedBlocks of the descriptor edx points at on entry.
  1630. sub edx,-SIZEOF T_MacroBlockActionDescr ; Advance edx by one descriptor (subtracting a negative adds).
  1631. and bl,040H ; Check for end-of-stream (bit 040H of the CodedBlocks just fetched).
  1632. jne Done ; End-of-stream bit was set.
  1633. FirstMacroBlock: ; Set up target and 0-MV reference addresses for the macroblock at edx.
  1634. mov edi,TargetFrameBaseAddress
  1635. mov esi,[edx].BlkY1.BlkOffset ; Get offset of next macroblock to do.
  1636. add edi,esi ; edi = address of target macroblock.
  1637. mov esi,TargToRef
  1638. add esi,edi ; esi = target address + TargToRef, i.e. the 0-MV reference.
  1639. mov TargetMacroBlockBaseAddr,edi
  1640. mov Addr0MVRef,esi
  1641. ;============================================================================
  1642. ; We calculate the 0-motion SWD. We use 32 match points per block, and
  1643. ; write the result separately for each block. If the SWD for the 0-motion
  1644. ; vector is below a threshold, we don't bother searching for other possibly
  1645. ; better motion vectors.
  1646. ;
  1647. ; ebp -- PITCH
  1648. ; esi -- Address of ref block.
  1649. ; edi -- Address of target block.
  1650. ; edx -- MBlockActionStream
  1651. ; ecx -- Not used. Will be linearized MV in non-zero MV search.
  1652. ; ebx -- CurrSWDState, i.e. FirstMEState, times 8
  1653. ; eax -- Scratch
  1654. ; mm7 -- Best SWD for macroblock.
  1655. ; mm0-mm6 Scratch
  1656. ;
  1657. mov cl,[edx].CodedBlocks ; Init CBP for macroblock.
  1658. or cl,03FH ; Indicate all 6 blocks are coded. (Also sets SF for the js below.)
  1659. mov eax,DoHeuristicME ; 0 if unrestricted MVs and heur ME.
  1660. ; ; 14 if restricted MVs and heur ME.
  1661. ; ; 15 if suppressing heuristic ME.
  1662. mov [edx].CodedBlocks,cl
  1663. js IntraByDecree ; Flags are still those of `or cl,03FH`: taken iff bit 7 of CodedBlocks is set.
  1664. xor ebx,ebx ; Avoid partial register stall.
  1665. xor ecx,ecx
  1666. mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
  1667. pcmpeqd mm7,mm7 ; Init previous best SWD to huge.
  1668. mov bl,[edx].FirstMEState ; Test for INTRA-BY-DECREE.
  1669. sub eax,ecx ; Negative iff should do heuristic ME
  1670. ; ; for this macroblock.
  1671. test bl,bl
  1672. je IntraByDecree ; FirstMEState == 0 --> intra by decree.
  1673. sar eax,31 ; eax = -1 if doing heuristic ME, else 0.
  1674. psrlq mm7,2 ; mm7 = 0x3FFFFFFFFFFFFFFF.
  1675. or ebx,eax ; -1 if doing heuristic ME.
  1676. mov al,INTER1MV ; Speculate INTER, 1 motion vector.
  1677. mov [edx].BlockType,al
  1678. psrld mm7,14 ; mm7[32:63]: Previous best SWD = 0x0000FFFF.
  1679. ; ; mm7[ 0:31]: Prev SWD that we diminish = 0x0003FFFF.
  1680. ; ; Since we can't diminish it below 0x00020000, we
  1681. ; ; won't take the short circuit exit from MblkEstQWA.
  1682. ; At this point:
  1683. ; ebp -- PITCH
  1684. ; esi -- Address of upper left block of 0,0 ref area.
  1685. ; edi -- Address of upper left block of target.
  1686. ; edx -- MBlockActionStream
  1687. ; ecx -- Scratch
  1688. ; ebx -- CurrSWDState, i.e. FirstMEState.
  1689. ; eax -- Scratch
  1690. ; mm7 -- Previous best SWD initialized to huge (0xFFFF, 0x3FFFF).
  1691. ; mm0-mm6 -- Scratch
  1692. ;============================================================================
  1693. ; Compute SWD for macroblock.
  1694. ComputeMBSWD: ; Evaluate the candidate reference at esi against the target at edi.
  1695. ; Registers at this point:
  1696. ; ebp -- PITCH
  1697. ; esi -- Address of upper left block of candidate ref area.
  1698. ; edi -- Address of upper left block of target.
  1699. ; edx -- MBlockActionStream
  1700. ; ecx -- Scratch
  1701. ; ebx -- CurrSWDState
  1702. ; eax -- Scratch
  1703. ; mm7 -- Previous best SWD.
  1704. ; mm0-mm6 -- Scratch
  1705. ;
  1706. lea ecx,[ebp+ebp*4] ; Get PITCH*5
  1707. lea eax,[ebp+ebp*2] ; Get PITCH*3
  1708. movq mm0,[esi+PITCH*15] ; FL A: Ref MB, lower left block, line 15.
  1709. psubw mm0,[edi+PITCH*15] ; FL B: Diff for lower left block, line 15.
  1710. movq mm6,[esi+PITCH*15+8] ; FR A
  1711. psllw mm0,8 ; FL C: Extract diffs for line 15 even pels.
  1712. psubw mm6,[edi+PITCH*15+8] ; FR B
  1713. pmaddwd mm0,mm0 ; FL D: Square of diffs for even pels.
  1714. movq mm1,[esi+PITCH*9] ; 9L A
  1715. psllw mm6,8 ; FR C
  1716. psubw mm1,[edi+PITCH*9] ; 9L B
  1717. pmaddwd mm6,mm6 ; FR D
  1718. movq mm5,[esi+PITCH*9+8] ; 9R A
  1719. psllw mm1,8 ; 9L C
  1720. psubw mm5,[edi+PITCH*9+8] ; 9R B
  1721. pmaddwd mm1,mm1 ; 9L D
  1722. movq mm2,[esi+eax*4] ; CL a
  1723. psllw mm5,8 ; 9R C
  1724. psubw mm2,[edi+eax*4] ; CL b
  1725. pmaddwd mm5,mm5 ; 9R D
  1726. movq mm3,[esi+eax*4+8] ; CR a
  1727. pmaddwd mm2,mm2 ; CL c: Square of diffs for odd pels.
  1728. psubw mm3,[edi+eax*4+8] ; CR b
  1729. paddusw mm0,mm1 ; LL + Accumulate SWD for lower left block.
  1730. movq mm1,[esi+eax*1] ; 3L A
  1731. pmaddwd mm3,mm3 ; CR c
  1732. psubw mm1,[edi+eax*1] ; 3L B
  1733. paddusw mm6,mm5 ; LR +
  1734. movq mm5,[esi+eax*1+8] ; 3R A
  1735. psllw mm1,8 ; 3L C
  1736. psubw mm5,[edi+eax*1+8] ; 3R B
  1737. paddusw mm0,mm2 ; LL +
  1738. movq mm2,[esi] ; 0L a
  1739. pmaddwd mm1,mm1 ; 3L D
  1740. psubw mm2,[edi] ; 0L b
  1741. paddusw mm6,mm3 ; LR +
  1742. movq mm3,[esi+8] ; 0R a
  1743. psllw mm5,8 ; 3R C
  1744. psubw mm3,[edi+8] ; 0R b
  1745. pmaddwd mm5,mm5 ; 3R D
  1746. movq mm4,[esi+eax*2] ; 6L a
  1747. pmaddwd mm2,mm2 ; 0L c
  1748. psubw mm4,[edi+eax*2] ; 6L b
  1749. pmaddwd mm3,mm3 ; 0R c
  1750. movq PartSWDForLLBlk,mm0 ; Stash SWD for lines 9,12,15, LL blk.
  1751. paddusw mm0,mm6 ; Sum SWD for lines 9,12,15 LL and LR.
  1752. movq PartSWDForLRBlk,mm6 ; Stash SWD for lines 9,12,15, LR blk.
  1753. pmaddwd mm4,mm4 ; 6L c
  1754. movq mm6,[esi+eax*2+8] ; 6R a
  1755. paddusw mm1,mm2 ; UL +
  1756. psubw mm6,[edi+eax*2+8] ; 6R b
  1757. paddusw mm5,mm3 ; UR +
  1758. movq mm2,[esi+ebp*1] ; 1L A
  1759. pmaddwd mm6,mm6 ; 6R c
  1760. psubw mm2,[edi+ebp*1] ; 1L B
  1761. paddusw mm1,mm4 ; UL +
  1762. movq mm3,[esi+ecx*1] ; 5L A
  1763. paddusw mm0,mm1 ; Sum partial SWD for LL, LR, and UL.
  1764. psubw mm3,[edi+ecx*1] ; 5L B
  1765. paddusw mm5,mm6 ; UR +
  1766. movq mm6,[esi+ebp*4] ; 4L a
  1767. paddusw mm0,mm5 ; Sum partial SWD for all blocks.
  1768. movq PartSWDForURBlk,mm5 ; Stash SWD for lines 0,3,6, UR blk.
  1769. punpckldq mm5,mm0 ; Get low sum into high bits.
  1770. psubw mm6,[edi+ebp*4] ; 4L b
  1771. paddusw mm5,mm0 ; Total up SWD for every third line.
  1772. movq mm0,[esi+ebp*2] ; 2L a
  1773. psrlq mm5,47 ; Position, and double.
  1774. psubw mm0,[edi+ebp*2] ; 2L b
  1775. pcmpgtd mm5,mm7 ; Is 2 * SWD for 6 lines > prev SWD?
  1776. pmaddwd mm0,mm0 ; 2L c
  1777. psllw mm2,8 ; 1L C
  1778. movdf eax,mm5 ; eax = low dword of the compare: -1 iff estimate exceeds mm7[0:31], else 0.
  1779. pmaddwd mm2,mm2 ; 1L D
  1780. test eax,eax
  1781. jne MblkEst_EarlyOut ; Candidate cannot win; leave with eax == -1 (cand not better).
  1782. lea eax,[ecx+ebp*2] ; PITCH*7
  1783. psllw mm3,8 ; 5L C
  1784. paddusw mm1,mm2 ; UL +
  1785. pmaddwd mm3,mm3 ; 5L D
  1786. movq mm5,[esi+eax*1] ; 7L A
  1787. psubw mm5,[edi+eax*1] ; 7L B
  1788. pmaddwd mm6,mm6 ; 4L c
  1789. movq mm2,[esi+PITCH*11+8] ; BR A
  1790. psllw mm5,8 ; 7L C
  1791. psubw mm2,[edi+PITCH*11+8] ; BR B
  1792. paddusw mm1,mm3 ; UL +
  1793. movq mm3,[esi+PITCH*13+8] ; DR A
  1794. paddusw mm1,mm0 ; UL +
  1795. psubw mm3,[edi+PITCH*13+8] ; DR B
  1796. pmaddwd mm5,mm5 ; 7L D
  1797. movq mm0,[esi+ebp*8+8] ; 8R a
  1798. paddusw mm1,mm6 ; UL +
  1799. psubw mm0,[edi+ebp*8+8] ; 8R b
  1800. psllw mm2,8 ; BR C
  1801. movq mm4,[esi+ecx*2+8] ; AR a
  1802. paddusw mm1,mm5 ; UL +
  1803. psubw mm4,[edi+ecx*2+8] ; AR b
  1804. punpckldq mm6,mm1 ; Get low SWD accum to hi order of mm6.
  1805. movq mm5,[esi+eax*2+8] ; ER a
  1806. paddusw mm6,mm1 ; mm6[48:63] is SWD for upper left blk.
  1807. psubw mm5,[edi+eax*2+8] ; ER b
  1808. psrlq mm6,48 ; mm6 is SWD for upper left block.
  1809. psubusw mm7,mm6 ; Diminish prev best SWD by cand UL blk.
  1810. pmaddwd mm2,mm2 ; BR D
  1811. pmaddwd mm0,mm0 ; 8R c
  1812. psllw mm3,8 ; DR C
  1813. movq mm1,[esi+ebp*1+8] ; 1R A
  1814. pmaddwd mm3,mm3 ; DR D
  1815. paddusw mm2,PartSWDForLRBlk ; LR +
  1816. pmaddwd mm4,mm4 ; AR c
  1817. psubw mm1,[edi+ebp*1+8] ; 1R B
  1818. paddusw mm2,mm0 ; LR +
  1819. movq mm0,[esi+ecx*1+8] ; 5R A
  1820. pmaddwd mm5,mm5 ; ER c
  1821. psubw mm0,[edi+ecx*1+8] ; 5R B
  1822. paddusw mm2,mm3 ; LR +
  1823. movq mm3,[esi+eax*1+8] ; 7R A
  1824. paddusw mm2,mm4 ; LR +
  1825. paddusw mm2,mm5 ; LR +
  1826. psllw mm1,8 ; 1R C
  1827. psubw mm3,[edi+eax*1+8] ; 7R B
  1828. punpckldq mm5,mm2 ; Get low SWD accum to hi order of mm5.
  1829. paddusw mm5,mm2 ; mm5[48:63] is SWD for lower right blk.
  1830. pmaddwd mm1,mm1 ; 1R D
  1831. movq mm2,[esi+ebp*2+8] ; 2R a
  1832. psrlq mm5,48 ; mm5 is SWD for lower right block.
  1833. psubusw mm7,mm5 ; Diminish prev best SWD by cand LR blk.
  1834. punpckldq mm6,mm5 ; mm6[0:31] UL SWD; mm6[32:63] LR SWD.
  1835. psubw mm2,[edi+ebp*2+8] ; 2R b
  1836. psllw mm0,8 ; 5R C
  1837. movq mm5,[esi+ebp*4+8] ; 4R a
  1838. pmaddwd mm0,mm0 ; 5R D
  1839. psubw mm5,[edi+ebp*4+8] ; 4R b
  1840. psllw mm3,8 ; 7R C
  1841. paddusw mm1,PartSWDForURBlk ; UR +
  1842. pmaddwd mm3,mm3 ; 7R D
  1843. paddusw mm1,mm0 ; UR +
  1844. pmaddwd mm2,mm2 ; 2R c
  1845. movq mm0,[esi+PITCH*11] ; BL A
  1846. pmaddwd mm5,mm5 ; 4R c
  1847. psubw mm0,[edi+PITCH*11] ; BL B
  1848. paddusw mm1,mm3 ; UR +
  1849. movq mm3,[esi+ecx*2] ; AL a
  1850. paddusw mm1,mm2 ; UR +
  1851. psubw mm3,[edi+ecx*2] ; AL b
  1852. paddusw mm1,mm5 ; UR +
  1853. pmaddwd mm3,mm3 ; AL c
  1854. psllw mm0,8 ; BL C
  1855. movq mm2,[esi+PITCH*13] ; DL A
  1856. pmaddwd mm0,mm0 ; BL D
  1857. psubw mm2,[edi+PITCH*13] ; DL B
  1858. punpckldq mm5,mm1 ; Get low SWD accum to hi order of mm5.
  1859. movq mm4,[esi+ebp*8] ; 8L a
  1860. paddusw mm5,mm1 ; mm5[48:63] is SWD for upper right blk.
  1861. psubw mm4,[edi+ebp*8] ; 8L b
  1862. psllw mm2,8 ; DL C
  1863. movq mm1,[esi+eax*2] ; EL a
  1864. pmaddwd mm2,mm2 ; DL D
  1865. psubw mm1,[edi+eax*2] ; EL b
  1866. pmaddwd mm4,mm4 ; 8L c
  1867. paddusw mm3,PartSWDForLLBlk ; LL +
  1868. pmaddwd mm1,mm1 ; EL c
  1869. paddusw mm3,mm0 ; LL +
  1870. psrlq mm5,48 ; mm5 is SWD for upper right block.
  1871. paddusw mm3,mm2 ; LL +
  1872. psubusw mm7,mm5 ; Diminish prev best SWD by cand UR blk.
  1873. paddusw mm3,mm4 ; LL +
  1874. movq mm0,mm7 ; mm0[0:31] = prev best diminished by UL, LR and UR so far.
  1875. paddusw mm3,mm1 ; LL +
  1876. psrlq mm7,32 ; Get original Best SWD
  1877. punpckldq mm1,mm3 ; Get low SWD accum to hi order of mm1.
  1878. pxor mm2,mm2 ; Zero, for the pcmpeqd below.
  1879. paddusw mm1,mm3 ; mm1[48:63] is SWD for lower left blk.
  1880. psrlq mm1,48 ; mm1 is SWD for lower left block.
  1881. punpckldq mm5,mm1 ; mm5[32:63] SWD for LL. mm5[0:31] SWD for UR.
  1882. psubusw mm0,mm1 ; Diminish by cand LL blk: mm0 = BestSWD dim CandSWD.
  1883. psubusw mm7,mm0 ; BestSWD dim (BestSWD dim CandSWD) --> new best.
  1884. pcmpeqd mm2,mm0 ; [0:31] == 0 iff cand better, else -1.
  1885. ; Registers at this point:
  1886. ; ebp -- PITCH
  1887. ; edi -- Target MacroBlock Base Address.
  1888. ; esi -- Address of upper left block of candidate ref area.
  1889. ; edx -- MBlockActionStream
  1890. ; ebx -- CurrSWDState
  1891. ; mm7 -- New best SWD for macroblock.
  1892. ; mm6 -- [0:31] SWD for upper left; [32:63] SWD for lower right.
  1893. ; mm5 -- [0:31] SWD for upper right; [32:63] SWD for lower left.
  1894. ; mm2 -- [0:31] 0 if cand better, else -1.
  1895. cmp ebx,LASTINITIALMESTATE ; Did we just do zero motion vector?
  1896. jg MEForNonZeroMVDone ; No: a later state's candidate was just evaluated.
  1897. movdf eax,mm7 ; SWD for this candidate.
  1898. punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
  1899. test ebx,ebx
  1900. jns ZeroMVDoneForNonHeuristicME ; Heuristic ME states are negative; fall through for them.
  1901. HeuristicME_EarlyOut: ; Dispatch on heuristic ME state in bl: -1 0-MV, -2 left MV, -3 above MV, -4 signature MV.
  1902. movq mm0,EMVLimitsForThisMB ; Speculate no extended motion vectors.
  1903. pcmpeqb mm1,mm1 ; <FFFF FFFF FFFF FFFF>
  1904. xor ecx,ecx
  1905. cmp bl,-3 ; These flags are also consumed at the jle target (the mov/movdf between don't change them).
  1906. mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
  1907. jle HeuristicME_CaseSigMVDone_or_CaseAboveMVDone ; State -3 or -4.
  1908. sub eax,NONZEROMVDIFFERENTIAL
  1909. inc bl ; -1 (0-MV just done) becomes 0; -2 (left MV just done) stays non-zero.
  1910. mov ebx,DoExtendedMotionVectors ; 7 iff doing extended MVs, else 0.
  1911. jne HeuristicME_CaseLeftMVDone ; Flags from `inc bl`.
  1912. HeuristicME_Case0MVDone:
  1913. movq SWDULandLR,mm6
  1914. pcmpeqb mm4,mm4 ; <FFFF FFFF FFFF FFFF>
  1915. movq SWDURandLL,mm5
  1916. psllw mm4,15 ; <8000 8000 8000 8000>
  1917. cmp eax,ZEROVECTORTHRESHOLD-NONZEROMVDIFFERENTIAL
  1918. ; ; Compare 0-MV against ZeroVectorThreshold.
  1919. jl BelowZeroThresh ; Jump if 0-MV is good enough.
  1920. mov SWDForNon0MVToBeat,eax
  1921. and ebx,ecx ; Elim flag for bottom row. 0 iff no ExtMV.
  1922. mov eax,BlockAbove[4]
  1923. je NotExtendedMVs ; Jump if not doing extended MVs (flags from `and ebx,ecx`).
  1924. ; Below: A==left; B==above; C==above rt.
  1925. movdt mm3,ValidRemoteVectors[ebx*4] ; <mask(A) (C) (B) (A)>
  1926. movq mm2,mm4 ; <8000 8000 8000 8000>
  1927. IF SIZEOF T_MacroBlockActionDescr-128
  1928. **** error: Due to assembler weakness, can't use spaces here, so SIZEOF
  1929. **** T_MacroBlockActionDescr is replaced by constant. If assembly error
  1930. **** occurs, the constant has been changed, and the three instructions in
  1931. **** the next 10 lines have to change.
  1932. ENDIF
  1933. IF SIZEOF T_Blk-16
  1934. **** error: Due to assembler weakness, can't use spaces here, so SIZEOF T_Blk
  1935. **** is replaced by constant. If assembly error occurs, the constant has been
  1936. **** changed, and the three instructions in the next 10 lines have to change.
  1937. ENDIF
  1938. movdt mm0,[edx-128].BestFullPelMBMVs ; <x x Av,h x >
  1939. punpcklbw mm3,mm3 ; mask for both MV parts
  1940. movdt mm1,[edx+eax-2*16+128].BestFullPelMBMVs ; <x x Cv,h x >
  1941. psrlw mm2,8 ; <0080 0080 0080 0080>
  1942. por mm4,mm2 ; <8080 ...> bias value.
  1943. punpcklwd mm1,mm0 ; <Av,h Cv,h x x >
  1944. punpcklwd mm0,[edx+eax-2*16].BestFullPelMBMVs ; <Bv,h Av,h x x >
  1945. ;
  1946. punpckhdq mm0,mm1 ; <Av,h Cv,h Bv,h Av,h>
  1947. ;
  1948. pand mm0,mm3 ; Set to 0 any off edge.
  1949. and ebx,4 ; If zero, we're on the top edge.
  1950. paddb mm0,mm4 ; <Av,h Cv,h Bv,h Av,h> biased
  1951. je @f ; If on top edge, cause LEFT to be taken.
  1952. movq mm1,mm0 ; <Av,h Cv,h Bv,h Av,h>
  1953. psrlq mm0,16 ; <x Av,h Cv,h Bv,h>
  1954. psubusb mm0,mm1 ; <x floor(A-C) floor(C-B) floor(B-A)>
  1955. ;
  1956. paddb mm0,mm1 ; <x max(A,C) max(C,B) max(B,A)>
  1957. ;
  1958. movq mm1,mm0 ; <x max(A,C) max(C,B) max(B,A)>
  1959. psrlq mm0,16 ; <x x max(A,C) max(C,B)>
  1960. pxor mm1,mm0 ; Part of median calc.
  1961. psrlq mm0,16 ; <x x x max(A,C)>
  1962. pxor mm0,mm1 ; <x x x median(A,B,C)> biased by +128.
  1963. ;
  1964. @@:
  1965. punpcklbw mm0,mm0 ; 2 copies of median predictor MVs.
  1966. pcmpeqb mm1,mm1
  1967. punpcklwd mm0,mm0 ; 4 copies. Will now calc the following:
  1968. ; ; [ 0: 7] -- HMV lower limit for sig search
  1969. ; ; [ 8:15] -- HMV lower limit
  1970. ; ; [16:23] -- HMV upper limit for sig search
  1971. ; ; [24:31] -- HMV upper limit
  1972. ; ; [32:39] -- VMV lower limit for sig search
  1973. ; ; [40:47] -- VMV lower limit
  1974. ; ; [48:55] -- VMV upper limit for sig search
  1975. ; ; [56:63] -- VMV upper limit
  1976. ;
  1977. psubusb mm0,EMV_ClampLowerEnd[ecx*8-40]
  1978. psllw mm1,3 ; <FF F8 FF F8 FF F8 FF F8> i.e. Mask to
  1979. ; ; set sig srch range to mult of 8.
  1980. paddusb mm0,EMV_ClampUpperEnd[ecx*8-40]
  1981. psubb mm0,EMV_RestoreRange[ecx*8-40]
  1982. NotExtendedMVs:
  1983. movq SWD0MVURandLL,mm5
  1984. pand mm0,mm1 ; Set sig search at multiples of four. (When jumped to, mm1 is all ones: mm0 unchanged.)
  1985. movq SWD0MVULandLR,mm6
  1986. pcmpeqb mm2,mm2 ; Set cand as worse than 0MV, in case skip.
  1987. movq EMVLimitsForThisMB,mm0
  1988. and cl,1 ; Test the "left" bit of MBEdgeType.
  1989. je HeuristicME_SkipLeftMV ; Bit clear: don't try the MV of the MB to the left.
  1990. mov BestOfFourStartingPoints,esi
  1991. mov ebx,-2 ; Indicate trying MV of MB to left.
  1992. movsx ecx,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBVMV
  1993. movsx eax,[edx-SIZEOF T_MacroBlockActionDescr].BestFullPelMBHMV
  1994. ClampHeurMECandidateToRange: ; In: ecx = cand VMV, eax = cand HMV, ebx = new state. Clamps, then evaluates via ComputeMBSWD.
  1995. movsx esi,PB EMVLimitsForThisMB+5 ; VMV lower limit.
  1996. cmp ecx,esi
  1997. jl ClampVMV_1 ; Below lower limit: clamp to it (esi holds the lower limit).
  1998. movsx esi,PB EMVLimitsForThisMB+7 ; VMV upper limit.
  1999. cmp ecx,esi
  2000. jle @f ; Within range: no clamp.
  2001. ClampVMV_1:
  2002. mov ecx,esi ; Clamp VMV to whichever limit is currently in esi.
  2003. @@:
  2004. movsx esi,PB EMVLimitsForThisMB+1 ; HMV lower limit.
  2005. cmp eax,esi
  2006. jl ClampHMV_1 ; Below lower limit: clamp to it (esi holds the lower limit).
  2007. movsx esi,PB EMVLimitsForThisMB+3 ; HMV upper limit.
  2008. cmp eax,esi
  2009. jle @f ; Within range: no clamp.
  2010. ClampHMV_1:
  2011. mov eax,esi ; Clamp HMV to whichever limit is currently in esi.
  2012. @@:
  2013. sar eax,1 ; Halve HMV.
  2014. lea ecx,[ecx+ecx*2] ; VMV * 3.
  2015. IF PITCH-384
  2016. *** error: The magic here assumes a pitch of 384.
  2017. ENDIF
  2018. shl ecx,6 ; VMV * 3 * 64 == (VMV / 2) * 384, i.e. half the VMV times PITCH.
  2019. mov esi,Addr0MVRef
  2020. add eax,ecx ; Clamped Linearized Motion Vector
  2021. ;
  2022. sub eax,1 ; Carry iff the linearized MV was zero; eax is then -1.
  2023. jc MblkEst_EarlyOut ; Jump if Lin MV is zero.
  2024. lea esi,[esi+eax+1] ; Candidate reference address.
  2025. jmp ComputeMBSWD
  2026. HeuristicME_SkipLeftMV:
  2027. mov BestOfFourStartingPoints,esi
  2028. mov cl,[edx].MBEdgeType ; 1 left | 2 right | 4 top | 8 bottom
  2029. HeuristicME_CaseLeftMVDone:
  2030. movdf eax,mm2 ; eax == 0 iff cand better, else -1.
  2031. mov ebx,BlockAbove[4]
  2032. and cl,4 ; Test the "top" bit of MBEdgeType; flags are used by the je below.
  2033. movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
  2034. punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
  2035. movq SWDURandLL[eax*8],mm5
  2036. pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip.
  2037. mov BestOfFourStartingPoints[eax*4],esi ; Save cand ref addr if better (else toss).
  2038. je HeuristicME_SkipAboveMV ; Flags from `and cl,4`: bit clear --> don't try the MV of the MB above.
  2039. movsx ecx,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBVMV
  2040. movsx eax,[edx+ebx-2*SIZEOF T_Blk].BestFullPelMBHMV
  2041. mov ebx,-3 ; Indicate trying MV of MB above.
  2042. jmp ClampHeurMECandidateToRange
  2043. HeuristicME_CaseSigMVDone_or_CaseAboveMVDone: ; Reached with flags of `cmp bl,-3`.
  2044. HeuristicME_SkipAboveMV: ; Reached with ZF set (from `and cl,4`).
  2045. movdf eax,mm2 ; eax == 0 iff cand better, else -1.
  2046. jne HeuristicME_CaseSigMVDone ; Not equal --> state was below -3, i.e. the signature candidate is done.
  2047. HeuristicME_CaseAboveMVDone: ; Build the target MB's signature contributions, then search the reference signatures.
  2048. mov cl,4 ; Loop counter: four passes of four lines each.
  2049. lea ebx,C0001000100010001
  2050. movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
  2051. pxor mm0,mm0
  2052. movq SWDURandLL[eax*8],mm5
  2053. pxor mm1,mm1
  2054. mov BestOfFourStartingPoints[eax*4],esi ; Save cand ref addr if better (else toss).
  2055. lea esi,TargetSigContribForRowPairs
  2056. movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
  2057. pcmpeqb mm7,mm7 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
  2058. ; ebp -- Pitch
  2059. ; edi -- Address of target macroblock.
  2060. ; esi -- Address at which to store target macroblock's signature contributions.
  2061. ; cl -- Loop counter.
  2062. ; mm0 -- Accumulator for target MB's sig contrib for first four even columns.
  2063. ; mm1 -- Accumulator for target MB's sig contrib for last four even columns.
  2064. movq mm2,[edi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
  2065. pcmpeqb mm5,mm5 ; W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
  2066. paddb mm2,[edi+ebp*1] ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
  2067. psrlw mm5,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
  2068. @@:
  2069. movq mm3,[edi+ebp*2] ; B:<P27 P26 P25 P24 P23 P22 P21 P20>
  2070. movq mm4,mm2 ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
  2071. paddb mm3,[edi+PITCH*3] ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
  2072. psrlw mm2,8 ; W:<P07+P17 P05+P15 P03+P13 P01+P11>
  2073. pmaddwd mm2,[ebx] ; D:<P07+P17+P05+P15 P03+P13+P01+P11>
  2074. movq mm7,mm5 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
  2075. pand mm5,mm3 ; W:<P26+P36 P24+P34 P22+P32 P20+P30>
  2076. psrlw mm3,8 ; W:<P27+P37 P25+P35 P23+P33 P21+P31>
  2077. pmaddwd mm3,[ebx] ; D:<P27+P37+P25+P35 P23+P33+P21+P31>
  2078. paddw mm0,mm5 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>
  2079. movq mm5,[edi+ebp*2+8] ; B:<P2F P2E P2D P2C P2B P2A P29 P28>
  2080. pand mm4,mm7 ; W:<P06+P16 P04+P14 P02+P12 P00+P10>
  2081. paddb mm5,[edi+PITCH*3+8] ; B:<P2F+P3F P2E+P3E P2D+P3D P2C+P3C ...>
  2082. paddw mm0,mm4 ; W:<sum(P*6) sum(P*4) sum(P*2) sum (P*0)>
  2083. movq mm4,[edi+8] ; B:<P0F P0E P0D P0C P0B P0A P09 P08>
  2084. movq mm6,mm7 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
  2085. paddb mm4,[edi+ebp*1+8] ; B:<P0F+P1F P0E+P1E P0D+P1D P0C+P1C ...>
  2086. pand mm7,mm5 ; W:<P2E+P3E P2C+P3C P2A+P3A P28+P38>
  2087. pand mm6,mm4 ; W:<P0E+P1E P0C+P1C P0A+P1A P08+P18>
  2088. psrlw mm5,8 ; W:<P2F+P3F P2D+P3D P2B+P3B P29+P39>
  2089. pmaddwd mm5,[ebx] ; D:<P2F+P3F+P2D+P3D P2B+P3B+P29+P39>
  2090. psrlw mm4,8 ; W:<P0F+P1F P0D+P1D P0B+P1B P09+P19>
  2091. pmaddwd mm4,[ebx] ; D:<P0F+P1F+P0D+P1D P0B+P1B+P09+P19>
  2092. paddw mm1,mm7 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>
  2093. paddw mm1,mm6 ; W:<sum(P*E) sum(P*C) sum(P*A) sum (P*8)>
  2094. lea edi,[edi+ebp*4] ; Advance input cursor
  2095. paddw mm3,mm5 ; D:<P2F+P3F+P2D+P3D+P27+P37+P25+P35
  2096. ; ; P2B+P3B+P29+P39+P23+P33+P21+P31>
  2097. pcmpeqb mm5,mm5 ; Next W:<0xFFFF 0xFFFF 0xFFFF 0xFFFF>
  2098. paddw mm4,mm2 ; D:<P0F+P1F+P0D+P1D+P07+P17+P05+P15
  2099. ; ; P0B+P1B+P09+P19+P03+P13+P01+P11>
  2100. punpckldq mm7,mm3 ; D:<P2B+P3B+P29+P39+P23+P33+P21+P31 junk>
  2101. paddw mm7,mm3 ; [32:47]:<sum of odd pels of lines 2 and 3>
  2102. punpckldq mm6,mm4 ; D:<P0B+P1B+P09+P19+P03+P13+P01+P11 junk>
  2103. movq mm2,[edi] ; Next B:<P07 P06 P05 P04 P03 P02 P01 P00>
  2104. paddw mm6,mm4 ; [32:47]:<sum of odd pels of lines 0 and 1>
  2105. paddb mm2,[edi+ebp*1] ; Next B:<P07+P17 P06+P16 P05+P15 ...>
  2106. punpckhwd mm6,mm7 ; [0:31] W:<Line_2&3_odd Line_0&1_odd>
  2107. mov MBlockActionStream,edx ; Stash descriptor pointer; edx is scratch in the signature search.
  2108. dec cl
  2109. movdf [esi],mm6 ; Save W:<Line_2&3_odd Line_0&1_odd>
  2110. psrlw mm5,8 ; Next W:<0x00FF 0x00FF 0x00FF 0x00FF>
  2111. lea esi,[esi+4] ; Advance output cursor
  2112. jne @b ; Flags from `dec cl` (movdf/psrlw/lea leave them alone).
  2113. ; ebp -- Pitch
  2114. ; edi -- Address of candidate reference MB's signature contribs.
  2115. ; esi -- Address at which target MB's signature contribs were stored, plus 16.
  2116. ; edx -- Scratch.
  2117. ; ecx -- Count down number of lines of signatures to try.
  2118. ; ebx -- Increment to get from end of one line of signatures to start of next.
  2119. ; al -- Count down number of signatures to try in a line.
  2120. ; ah -- Reinits counter of signatures to try in a line.
  2121. ; mm0 -- Target MB's sig contrib for first four even columns.
  2122. ; mm1 -- Target MB's sig contrib for last four even columns.
  2123. ; mm2 -- Target MB's sig contrib for first four pairs of rows, odd columns.
  2124. ; mm3 -- Amount and address of best signature seen so far.
  2125. IF PITCH-384
  2126. *** error: The magic here assumes a pitch of 384.
  2127. ENDIF
  2128. xor eax,eax
  2129. mov ecx,TargetToSig_Debiased
  2130. mov al,EMVLimitsForThisMB+4 ; Lower vert lim for sig srch (half pels)
  2131. xor ebx,ebx
  2132. add edi,ecx
  2133. mov bl,EMVLimitsForThisMB+0 ; Lower horz lim for sig srch (half pels)
  2134. shr ebx,1 ; Halve the horz limit.
  2135. lea ecx,[eax+eax*2] ; Vert limit * 3.
  2136. shl ecx,6 ; Vert limit * 192, i.e. half the vert limit times 384.
  2137. add edi,ebx
  2138. add edi,ecx
  2139. xor ecx,ecx
  2140. add ebx,ebx ; Back to half pel units (low bit now cleared).
  2141. mov cl,EMVLimitsForThisMB+6 ; Upper vert lim for sig srch (half pels)
  2142. sub ecx,eax
  2143. mov al,EMVLimitsForThisMB+2 ; Upper horz lim for sig srch (half pels)
  2144. shr ecx,3 ; Number of lines of sigs to do, minus 1.
  2145. sub eax,ebx
  2146. shr eax,3 ; Number of columns of sigs to do.
  2147. lea ebx,[ebp-1+080000000H]
  2148. sub ebx,eax ; 1/4th amt to add to move to next line.
  2149. mov ah,al
  2150. inc ah ; To reinit cntr for line.
  2151. movq mm2,[esi-16] ; First half of the row-pair contributions stored above.
  2152. pcmpeqd mm3,mm3 ; Set winning signature artificially high.
  2153. movdt mm4,[edi]
  2154. psrld mm3,2
  2155. punpckldq mm4,[edi+4] ; ref sig contribs of left even cols.
  2156. TryNextSignature:
  2157. movdt mm5,[edi+8]
  2158. psubw mm4,mm0 ; diffs for sums of left even columns.
  2159. punpckldq mm5,[edi+12] ; ref sig contribs of right even cols.
  2160. pmaddwd mm4,mm4 ; Squared differences.
  2161. movdt mm6,[edi+ebp*2] ; Sums for first two pairs of rows.
  2162. psubw mm5,mm1 ; diffs for sums of right even columns.
  2163. punpckldq mm6,[edi+PITCH*6] ; Sums for second two pairs of rows.
  2164. pmaddwd mm5,mm5 ; Squared differences.
  2165. movdt mm7,[edi+PITCH*10] ; Sums for third two pairs of rows.
  2166. psubw mm6,mm2 ; Words: diffs for sums of first 4 pairs rows.
  2167. punpckldq mm7,[edi+PITCH*14] ; Sums for last two pairs of rows.
  2168. pmaddwd mm6,mm6 ; Squared differences.
  2169. psubw mm7,[esi-8] ; Words: diffs for sums of last 4 pairs rows.
  2170. paddd mm4,mm5 ; Accumulate squared differences.
  2171. sub al,1 ; Decrement count of signatures left in this line; borrow when line is done.
  2172. pmaddwd mm7,mm7 ; Squared differences.
  2173. sbb edx,edx ; -1 if done with line, else 0.
  2174. paddd mm6,mm4 ; Accumulate squared differences.
  2175. and edx,ebx ; 1/4 Amt to sub to goto next line, else 0.
  2176. paddd mm7,mm6 ; Accumulate squared differences.
  2177. movdt mm5,edi ; Address of this signature
  2178. punpckldq mm6,mm7 ; <low_order_accumulator junk>
  2179. paddd mm7,mm6 ; <full_signature_amt junk>
  2180. psllq mm5,32 ; <Addr_of_this_signature 0>
  2181. lea edi,[edi+edx*4+4] ; advance signature position to next cand.
  2182. punpckhdq mm5,mm7 ; <cand_signature_amt cand_signature_addr>
  2183. sar edx,31 ; -1 if done with line, else 0.
  2184. pcmpgtd mm7,mm3 ; <0xFFFFFFFF if cand not better junk>
  2185. movdt mm4,[edi] ; Start fetching the next candidate's left even column sums.
  2186. punpckhdq mm7,mm7 ; <0xFFFFFFFFFFFFFFFF if cand not better>
  2187. punpckldq mm4,[edi+4]
  2188. pand mm3,mm7 ; 1st_best if cand not better, else 0.
  2189. and dl,ah ; Num cols in a line if done with line, else 0.
  2190. pandn mm7,mm5 ; cand if better than 1st_best, else 0.
  2191. add al,dl ; Reinit col count if finishing with line.
  2192. por mm3,mm7 ; Better of cand and 1st_best.
  2193. sbb ecx,0 ; Decrement line count if just finished line.
  2194. jge TryNextSignature
  2195. movdf ecx,mm3 ; Fetch address of best signature.
  2196. pcmpeqb mm2,mm2 ; Set cand as worse than prev, in case skip.
  2197. mov edi,TargetMacroBlockBaseAddr
  2198. mov ebx,-4 ; Indicate trying MV of best signature.
  2199. sub ecx,edi
  2200. mov eax,SigToTarget
  2201. movdt mm7,BestMBFullPelSWD ; Reload SWD for best full pel MB MV.
  2202. lea esi,[ecx+eax] ; Linearized motion vector
  2203. add eax,ecx ; Linearized motion vector
  2204. sar esi,8 ; Full pel vert lin offset div 256.
  2205. mov edx,MBlockActionStream ; Reload pointer to MBA descriptor.
  2206. shl eax,25 ; Keep only the low 7 bits of the linearized MV ...
  2207. punpckldq mm7,mm7 ; Put best SWD in mm7[0:31] and mm7[32:63].
  2208. movsx ecx,UnlinearizedVertMV[esi] ; Get full pel vert MV component.
  2209. sar eax,24 ; Full pel HMV. (... sign extended and doubled.)
  2210. jmp ClampHeurMECandidateToRange
  2211. HeuristicME_CaseSigMVDone: ; All heuristic candidates tried: restart the search from the best starting point.
  2212. HeuristicME_SkipSigMV:
  2213. movdf eax,mm2 ; eax == 0 iff cand better, else -1.
  2214. pcmpeqd mm0,mm0 ; Init previous best SWD to huge.
  2215. mov ecx,Addr0MVRef ; Start to calc linearized MV.
  2216. mov bh,EMVLimitsForThisMB+1 ; HMV lower limit.
  2217. mov BestOfFourStartingPoints[eax*4],esi ; Save cand ref addr if better (else toss).
  2218. add bh,4 ; bh = HMV lower limit + 4.
  2219. movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
  2220. psrlq mm0,2
  2221. movq SWDURandLL[eax*8],mm5
  2222. psrld mm0,14 ; mm0 = <0x0000FFFF 0x0003FFFF>, the same "huge" value mm7 gets before the 0-MV.
  2223. mov eax,BestOfFourStartingPoints
  2224. mov bl,EMVLimitsForThisMB+5 ; VMV lower limit.
  2225. mov esi,eax ; esi = ref address of the best starting point.
  2226. sub eax,ecx ; Linearized motion vector
  2227. mov ecx,eax ; Linearized motion vector
  2228. add al,al ; Full pel HMV.
  2229. cmp al,bh
  2230. jl ClampHMV_2 ; HMV below (lower limit + 4): clamp to it (bh).
  2231. mov bh,EMVLimitsForThisMB+3 ; HMV upper limit
  2232. sub bh,4 ; bh = HMV upper limit - 4.
  2233. cmp al,bh
  2234. jle NoClampHMV_2 ; HMV at least 4 inside both limits.
  2235. ClampHMV_2: ; bh = the HMV limit being clamped to.
  2236. sar ecx,8 ; Full pel vert lin offset div 256.
  2237. add bl,4 ; bl = VMV lower limit + 4.
  2238. movzx eax,bh ; eax = clamped HMV (zero extended).
  2239. movsx ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
  2240. cmp cl,bl
  2241. jl @f ; VMV below (lower limit + 4): clamp to it (bl).
  2242. mov bl,EMVLimitsForThisMB+7 ; VMV upper limit.
  2243. movq mm7,mm0 ; Reset best SWD to huge.
  2244. sub bl,4 ; bl = VMV upper limit - 4.
  2245. cmp cl,bl
  2246. jle NoClampVMV_2 ; VMV in range: keep it.
  2247. @@:
  2248. movsx ecx,bl ; Clamp VMV to whichever limit is currently in bl.
  2249. movq mm7,mm0 ; Reset best SWD to huge.
  2250. NoClampVMV_2:
  2251. sar eax,1 ; Halve HMV.
  2252. lea ecx,[ecx+ecx*2] ; VMV * 3.
  2253. shl ecx,6 ; VMV * 192.
  2254. mov ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
  2255. mov esi,Addr0MVRef
  2256. add eax,ecx ; Linearized motion vector.
  2257. add esi,eax ; Candidate reference address.
  2258. jmp ComputeMBSWD
  2259. NoClampHMV_2: ; HMV needed no clamp; now check VMV.
  2260. sar ecx,8 ; Full pel vert lin offset div 256.
  2261. add bl,4 ; bl = VMV lower limit + 4.
  2262. mov ah,bl
  2263. movsx ecx,PB UnlinearizedVertMV[ecx] ; Get full pel vert MV component.
  2264. cmp cl,ah
  2265. jl @f ; VMV below (lower limit + 4): clamp to it (ah).
  2266. mov ah,EMVLimitsForThisMB+7 ; VMV upper limit.
  2267. lea esi,[esi+ebp+1] ; Speculate no clamp: first candidate is best start + PITCH + 1.
  2268. sub ah,4 ; ah = VMV upper limit - 4.
  2269. mov ebx,FIRST_HEURISTIC_EXHAUSTIVE ; New state number.
  2270. cmp cl,ah
  2271. jle ComputeMBSWD ; Neither component clamped. (mm7 is not reset on this path.)
  2272. @@:
  2273. movsx ecx,ah ; Clamp VMV to whichever limit is currently in ah.
  2274. movzx eax,al ; eax = HMV (zero extended).
  2275. sar eax,1 ; Halve HMV.
  2276. lea ecx,[ecx+ecx*2] ; VMV * 3.
  2277. shl ecx,6 ; VMV * 192.
  2278. mov ebx,FIRST_HEURISTIC_EXHAUSTIVE_NEW_CTR ; New state number.
  2279. mov esi,Addr0MVRef
  2280. add eax,ecx ; Linearized motion vector.
  2281. add esi,eax ; Candidate reference address.
  2282. movq mm7,mm0 ; Reset best SWD to huge.
  2283. jmp ComputeMBSWD
  2284. ZeroMVDoneForNonHeuristicME: ; In: eax = SWD for the 0-MV; mm6/mm5 = its per-block SWDs; ebx = state.
  2285. movq SWDULandLR,mm6 ; 0-MV is the best so far: record its block SWDs.
  2286. movq SWDURandLL,mm5
  2287. cmp eax,ZEROVECTORTHRESHOLD ; Compare 0-MV against ZeroVectorThreshold.
  2288. jl BelowZeroThresh ; Jump if 0-MV is good enough.
  2289. xor ecx,ecx
  2290. sub eax,NONZEROMVDIFFERENTIAL ; Non-zero MVs must beat the 0-MV by this margin.
  2291. mov cl,StateEngineFirstRule[ebx] ; MV adjustment.
  2292. mov bl,StateEngineFirstRule[ebx+10] ; New state number.
  2293. shl ecx,11 ; Scale the adjustment by 2048.
  2294. mov SWDForNon0MVToBeat,eax
  2295. movq SWD0MVULandLR,mm6 ; Keep a separate copy of the 0-MV block SWDs.
  2296. movq SWD0MVURandLL,mm5
  2297. lea esi,[esi+ecx-PITCH*8] ; First non-zero candidate: ref address + adjustment*2048 - PITCH*8.
  2298. jmp ComputeMBSWD
  2299. MEForNonZeroMVDone: ; A non-zero MV candidate was fully evaluated.
  2300. movdf eax,mm2 ; eax == 0 iff cand better, else -1.
  2301. MblkEst_EarlyOut: ; Also entered directly with eax == -1 (candidate not better).
  2302. xor ecx,ecx
  2303. test ebx,ebx ; Sign of state; consumed by the js below (movq/pcmpeqb/mov leave flags alone).
  2304. movq SWDULandLR[eax*8],mm6 ; Save blk SWDs if better (else toss).
  2305. pcmpeqb mm2,mm2 ; Set cand as worse than 0MV.
  2306. mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.
  2307. js HeuristicME_EarlyOut ; Negative state --> heuristic ME handles the next step.
  2308. add esi,ecx ; Adjust ref addr for horz motion.
  2309. mov bl,StateEngine[eax+ebx*4+3] ; 0:239 -> New state number;
  2310. ; ; 240:255 -> flags which 1/2 pel to do.
  2311. shr ecx,4 ; Index into FullPelMotionVectorAdjustment.
  2312. punpckldq mm7,mm7 ; Put new best in mm7[0:31] and mm7[32:63].
  2313. movq SWDURandLL[eax*8],mm5 ; Save blk SWDs if better (else toss).
  2314. pxor mm6,mm6 ; Speculatively zero to prep for half pel ME.
  2315. add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
  2316. cmp bl,240 ; Terminal state?
  2317. jb ComputeMBSWD ; No: evaluate the next candidate.
  2318. mov eax,esi
  2319. mov ecx,Addr0MVRef ; Start to calc linearized MV.
  2320. sub eax,ecx ; Linearized Motion Vector
  2321. ;
  2322. mov ecx,eax
  2323. ;
  2324. sar eax,8 ; Full pel vert lin offset div 256.
  2325. and cl,07FH ; Full pel HMV
  2326. add cl,cl ; Doubled (presumably to half pel units, as used by EMVLimitsForThisMB -- confirm).
  2327. ;
  2328. mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
  2329. IFDEF H261
  2330. ELSE
  2331. mov eax,DoHalfPelME ; 0 if not, -4 if so.
  2332. test eax,eax
  2333. je SkipHalfPelMBME
  2334. cmp cl,EMVLimitsForThisMB+1 ; Skip half pel ME if at edge of range
  2335. jle SkipHalfPelMBME
  2336. cmp cl,EMVLimitsForThisMB+3
  2337. jge SkipHalfPelMBME
  2338. cmp ch,EMVLimitsForThisMB+5
  2339. jle SkipHalfPelMBME
  2340. cmp ch,EMVLimitsForThisMB+7
  2341. jge SkipHalfPelMBME
  2342. ; Registers:
  2343. ; ebp -- PITCH
  2344. ; esi -- Address of best full pel reference macroblock
  2345. ; edx -- MBlockActionStream
  2346. ; ecx -- Nothing presently.
  2347. ; edi -- Address of target macroblock.
  2348. ; ebx -- 240 + Flags to indicate which half pel ME to do:
  2349. ; 1 --> right; 2 --> left; 4 --> down; 8 --> up
  2350. ; eax -- Count from -4 to -1 for blocks of macroblock.
  2351. ; mm0:mm7 -- Scratch
  2352. movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
  2353. pxor mm7,mm7 ; Prep accumulator for half pel ME.
  2354. call HalfPelMotionEstimation
  2355. movdt mm7,InvalidateBadHalfPelMVs[eax*4] ; Need to inflate SWDs for
  2356. ; ; MVs that go off frame edge.
  2357. mov eax,esi
  2358. mov ebx,Addr0MVRef ; Start to calc linearized MV.
  2359. sub eax,ebx ; Linearized Motion Vector
  2360. punpcklbw mm7,mm7 ; Expand adjustment to words.
  2361. mov ecx,eax ; Linearized Motion Vector
  2362. paddusw mm7,mm3 ; Now have SWDs for half pel MBME.
  2363. sar eax,8 ; Full pel vert lin offset div 256.
  2364. and cl,07FH ; Full pel HMV
  2365. add cl,cl
  2366. movq mm6,mm7
  2367. mov [edx].BestFullPelMBHMV,cl ; Save HMV
  2368. mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
  2369. movdf eax,mm7 ; eax[ 0:15] -- SWD for leftward ref.
  2370. ; ; eax[16:31] -- SWD for rightward ref.
  2371. psrlq mm6,32
  2372. mov [edx].BestFullPelMBVMV,ch ; Save VMV
  2373. mov ebx,eax
  2374. shr eax,16 ; eax -- SWD for leftward ref.
  2375. and ebx,00000FFFFH ; ebx -- SWD for rightward ref.
  2376. cmp eax,ebx
  2377. jg MBME_RightBetterThanLeft
  2378. MBME_LeftBetterThanRight:
        ; Entered with eax = SWD of the half pel leftward ref (no worse than the
        ; rightward one). If left beats the full pel SWD, the better of up/down is
        ; examined and, when that also beats full pel, the matching diagonal is
        ; measured with HalfPelMotionEstimationBothWays.
        ; Every exit to MBME_HalfPelSearchDone leaves eax = chosen ref address,
        ; ebx = its SWD, cl/ch = HMV/VMV stepped by one half pel as needed.
        ; NOTE(review): the diagonal SWD is read from ebx after the call --
        ; confirm that HalfPelMotionEstimationBothWays returns it there.
  2379. cmp eax,BestMBFullPelSWD
  2380. jge MBME_CtrIsBestHMV
  2381. MBME_LeftBestHMV:
  2382. movdf ebx,mm6 ; ebx[ 0:15] -- SWD for downward ref.
  2383. ; ; ebx[16:31] -- SWD for upward ref.
  2384. mov BestHalfPelHorzSWD,eax
  2385. mov eax,ebx
  2386. shr eax,16 ; eax -- SWD for upward ref.
  2387. and ebx,00000FFFFH ; ebx -- SWD for downward ref.
  2388. cmp eax,ebx
  2389. jg MBME_LeftBestHMV_DownBetterThanUp
  2390. MBME_LeftBestHMV_UpBetterThanDown:
  2391. cmp eax,BestMBFullPelSWD
  2392. jge MBME_LeftIsBest
  2393. MBME_LeftBestHMV_UpBestVMV:
  2394. sub esi,PITCH+1 ; Try ref 1/2 pel left and up
  2395. mov BestHalfPelVertSWD,eax
  2396. mov al,4 ; Presumably the block count for the helper (blk-level code passes 1).
  2397. call HalfPelMotionEstimationBothWays
  2398. mov eax,BestHalfPelVertSWD
  2399. lea esi,[esi+ebp*1+1] ; Back to center.
  2400. cmp eax,ebx
  2401. jle MBME_UpBetterThanUpLeft
  2402. MBME_UpLeftBetterThanUp:
  2403. cmp ebx,BestHalfPelHorzSWD
  2404. jge MBME_LeftIsBest
  2405. MBME_UpLeftIsBest:
  2406. dec cl ; Back up the horz MV one to the left.
  2407. lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up
  2408. dec ch ; Back up the vert MV one up.
  2409. jmp MBME_HalfPelSearchDone
  2410. MBME_UpBetterThanUpLeft:
  2411. cmp eax,BestHalfPelHorzSWD
  2412. jg MBME_LeftIsBest
  2413. MBME_UpIsBest:
  2414. mov ebx,eax
  2415. dec ch ; Back up the vert MV one up.
  2416. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
  2417. jmp MBME_HalfPelSearchDone
  2418. MBME_LeftBestHMV_DownBetterThanUp:
  2419. cmp ebx,BestMBFullPelSWD
  2420. jge MBME_LeftIsBest
  2421. MBME_LeftBestHMV_DownBestVMV:
  2422. dec esi ; Try ref 1/2 pel left and down
  2423. mov BestHalfPelVertSWD,ebx
  2424. mov al,4
  2425. call HalfPelMotionEstimationBothWays
  2426. mov eax,BestHalfPelVertSWD
  2427. inc esi ; Back to center.
  2428. cmp eax,ebx
  2429. jle MBME_DownBetterThanDownLeft
  2430. MBME_DownLeftBetterThanDown:
  2431. cmp ebx,BestHalfPelHorzSWD
  2432. jge MBME_LeftIsBest
  2433. MBME_DownLeftIsBest:
  2434. dec cl ; Back up the horz MV one to the left.
  2435. lea eax,[esi-1] ; Best is ref 1/2 pel left and down
  2436. inc ch ; Advance the vert MV one down.
  2437. jmp MBME_HalfPelSearchDone
  2438. MBME_DownBetterThanDownLeft:
  2439. cmp eax,BestHalfPelHorzSWD
  2440. jle MBME_DownIsBest
  2441. MBME_LeftIsBest:
  2442. dec cl ; Back up the horz MV one to the left.
  2443. lea eax,[esi-1] ; Best is ref 1/2 pel left.
  2444. mov ebx,BestHalfPelHorzSWD
  2445. jmp MBME_HalfPelSearchDone
  2446. MBME_RightBetterThanLeft:
        ; Entered with ebx = SWD of the half pel rightward ref, strictly better
        ; than the leftward one. Mirrors the left-hand tree: if right beats the
        ; full pel SWD, the better of up/down is examined and the matching
        ; diagonal is measured with HalfPelMotionEstimationBothWays.
        ; The rightward and downward half pel refs are addressed at esi itself,
        ; so only the "up" cases move esi before the call.
        ; Every exit to MBME_HalfPelSearchDone leaves eax = chosen ref address,
        ; ebx = its SWD, cl/ch = HMV/VMV stepped by one half pel as needed.
  2447. cmp ebx,BestMBFullPelSWD
  2448. jge MBME_CtrIsBestHMV
  2449. MBME_RightBestHMV:
  2450. movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref.
  2451. ; ; eax[16:31] -- SWD for upward ref.
  2452. mov BestHalfPelHorzSWD,ebx
  2453. mov ebx,eax
  2454. shr eax,16 ; eax -- SWD for upward ref.
  2455. and ebx,00000FFFFH ; ebx -- SWD for downward ref.
  2456. cmp eax,ebx
  2457. jg MBME_RightBestHMV_DownBetterThanUp
  2458. MBME_RightBestHMV_UpBetterThanDown:
  2459. cmp eax,BestMBFullPelSWD
  2460. jge MBME_RightIsBest
  2461. MBME_RightBestHMV_UpBestVMV:
  2462. sub esi,ebp ; Try ref 1/2 pel right and up
  2463. mov BestHalfPelVertSWD,eax
  2464. mov al,4
  2465. call HalfPelMotionEstimationBothWays
  2466. mov eax,BestHalfPelVertSWD
  2467. lea esi,[esi+ebp*1] ; Back to center.
  2468. cmp eax,ebx
  2469. jle MBME_UpBetterThanUpRight
  2470. MBME_UpRightBetterThanUp:
  2471. cmp ebx,BestHalfPelHorzSWD
  2472. jge MBME_RightIsBest
  2473. MBME_UpRightIsBest:
  2474. inc cl ; Advance the horz MV one to right.
  2475. lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up
  2476. dec ch ; Back up the vert MV one up.
  2477. jmp MBME_HalfPelSearchDone
  2478. MBME_UpBetterThanUpRight:
  2479. cmp eax,BestHalfPelHorzSWD
  2480. jle MBME_UpIsBest
  2481. MBME_RightIsBest:
  2482. mov ebx,BestHalfPelHorzSWD
  2483. inc cl ; Advance the horz MV one to right.
  2484. mov eax,esi ; Best is ref 1/2 pel right.
  2485. jmp MBME_HalfPelSearchDone
  2486. MBME_RightBestHMV_DownBetterThanUp:
  2487. cmp ebx,BestMBFullPelSWD
  2488. jge MBME_RightIsBest
  2489. MBME_RightBestHMV_DownBestVMV:
  2490. mov BestHalfPelVertSWD,ebx
  2491. mov al,4
  2492. call HalfPelMotionEstimationBothWays ; Try ref 1/2 pel right and down (at esi).
  2493. mov eax,BestHalfPelVertSWD
  2494. cmp eax,ebx
  2495. jle MBME_DownBetterThanDownRight
  2496. MBME_DownRightBetterThanDown:
  2497. cmp ebx,BestHalfPelHorzSWD
  2498. jge MBME_RightIsBest
  2499. MBME_DownRightIsBest:
  2500. inc cl ; Advance the horz MV one to right.
  2501. mov eax,esi
  2502. inc ch ; Advance vert MV one down.
  2503. jmp MBME_HalfPelSearchDone
  2504. MBME_DownBetterThanDownRight:
  2505. cmp eax,BestHalfPelHorzSWD
  2506. jg MBME_RightIsBest
  2507. MBME_DownIsBest:
  2508. mov ebx,eax
  2509. inc ch ; Advance vert MV one down.
  2510. mov eax,esi ; Best is ref 1/2 pel down.
  2511. jmp MBME_HalfPelSearchDone
  2512. MBME_CtrIsBestHMV:
        ; Neither horizontal half pel ref beat the full pel SWD, so only the
        ; vertical pair is considered. Exits with ebx = winning SWD; eax = ref
        ; address is set here for up/down, or at MBME_CenterIsBest otherwise.
  2513. movdf eax,mm6 ; eax[ 0:15] -- SWD for downward ref.
  2514. ; ; eax[16:31] -- SWD for upward ref.
  2515. mov ebx,eax
  2516. shr eax,16 ; eax -- SWD for upward ref.
  2517. and ebx,00000FFFFH ; ebx -- SWD for downward ref.
  2518. cmp eax,ebx
  2519. jge MBME_CtrBestHMV_DownBetterThanUp ; A tie goes to the downward ref here.
  2520. MBME_CtrBestHMV_UpBetterThanDown:
  2521. mov ebx,BestMBFullPelSWD
  2522. cmp eax,ebx
  2523. jge MBME_CenterIsBest ; ebx already holds the full pel SWD for that path.
  2524. ; Up is best.
  2525. mov ebx,eax
  2526. dec ch ; Back up the vert MV one up.
  2527. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
  2528. jmp MBME_HalfPelSearchDone
  2529. MBME_CtrBestHMV_DownBetterThanUp:
  2530. mov eax,ebx
  2531. mov ebx,BestMBFullPelSWD
  2532. cmp eax,ebx
  2533. jge MBME_CenterIsBest
  2534. ; Down is best.
  2535. mov ebx,eax
  2536. inc ch ; Advance the vert MV one down.
  2537. mov eax,esi
  2538. jmp MBME_HalfPelSearchDone
  2539. ENDIF
  2540. SkipHalfPelMBME:
        ; No half pel search was done: the full pel MV in cl/ch and the SWD in mm7
        ; become the macroblock result.
  2541. mov [edx].BestFullPelMBHMV,cl ; Save HMV
  2542. movdf ebx,mm7 ; SWD for best full pel MB MV.
  2543. mov [edx].BestFullPelMBVMV,ch ; Save VMV
  2544. MBME_CenterIsBest:
  2545. mov eax,esi
  2546. MBME_HalfPelSearchDone:
        ; Common exit of the MB-level search: ebx = best SWD, eax = ref address,
        ; cl/ch = HMV/VMV. For H263 the tests that follow decide whether
        ; block-level vectors are tried; any failed test goes to
        ; NoBlockMotionVectors.
  2547. mov BestMBHalfPelSWD,ebx
  2548. mov BestMBHalfPelMV,cl ; Save HMV
  2549. mov BestMBHalfPelRefAddr,eax
  2550. mov BestMBHalfPelMV+1,ch ; Save VMV
  2551. IFDEF H261
  2552. ELSE ; H263
  2553. mov bl,EMVLimitsForThisMB+1 ; Lower limit comparison.
  2554. mov al,DoBlockLevelVectors ; Are we doing block level MVs?
  2555. dec al ; Zero only when DoBlockLevelVectors is 1.
  2556. jne NoBlockMotionVectors
  2557. mov cl,[edx].CodedBlocks ; Fetch coded block pattern.
  2558. add bl,2 ; Full pel HMV must be at least lower limit + 2.
  2559. and cl,080H
  2560. jne NoBlockMotionVectors ; Skip Block ME if forced intra.
  2561. mov al,[edx].BestFullPelMBHMV ; Compare full pel HMV against limits.
  2562. mov cl,EMVLimitsForThisMB+3
  2563. cmp al,bl
  2564. jl NoBlockMotionVectors
  2565. mov bl,EMVLimitsForThisMB+5
  2566. sub cl,2 ; ... and at most upper limit - 2.
  2567. cmp al,cl ; Upper limit comparison.
  2568. jg NoBlockMotionVectors
  2569. mov al,[edx].BestFullPelMBVMV ; Compare full pel VMV against limits.
  2570. add bl,2
  2571. mov cl,EMVLimitsForThisMB+7
  2572. cmp al,bl
  2573. mov ebx,PD [edx].BestFullPelMBVMV-3 ; VMV byte lands in ebx[24:31]; flags kept.
  2574. jl NoBlockMotionVectors
  2575. sar ebx,18 ; Sign-extended VMV now sits at bit 6 and up; low bits are junk.
  2576. sub cl,2
  2577. cmp al,cl ; Upper limit comparison.
  2578. jg NoBlockMotionVectors
  2579. mov ecx,BestMBHalfPelSWD ; Jump if SWD for MB MV < thresh.
  2580. IF PITCH-384
  2581. *** error: The magic here assumes a pitch of 384.
  2582. ENDIF
  2583. and ebx,0FFFFFF80H ; VMV*128
  2584. cmp ecx,BLOCKMOTIONTHRESHOLD
  2585. jle NoBlockMotionVectors
  2586. ;==========================================================================
  2587. ; Starting from the best full pel macroblock motion vector calculated above, we
  2588. ; search for the best block motion vectors.
  2589. ;
  2590. ; ebp -- PITCH
  2591. ; esi -- Address of ref block.
  2592. ; edi -- Address of target block.
  2593. ; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
  2594. ; ecx -- Scratch
  2595. ; ebx -- CurrSWDState
  2596. ; eax -- Scratch
  2597. ; mm7 -- Best SWD for current block
  2598. ; mm6 -- unused.
  2599. ; mm5 -- Best SWD for right block of pair worked on by inner loop.
  2600. ; mm0-mm4 Scratch
  2601. ;
        ; On entry ebx holds the value commented "VMV*128" by the preceding code
        ; and ecx holds BestMBHalfPelSWD.
        ; The saturating subtractions below difference mm7 and the three
        ; HalfPelMBMESWDAccum entries into one quadword per luma block, stored at
        ; BlkLvlSWD+16. NOTE(review): presumably the accumulator entries are
        ; running totals captured after each block -- confirm in
        ; HalfPelMotionEstimation.
  2602. movq mm0,HalfPelMBMESWDAccum+8
  2603. movq mm1,HalfPelMBMESWDAccum+16
  2604. psubusw mm7,mm0
  2605. movq mm2,HalfPelMBMESWDAccum+0
  2606. psubusw mm0,mm1
  2607. movq [edx].BlkY4.BlkLvlSWD+16,mm7
  2608. psubusw mm1,mm2
  2609. movq [edx].BlkY2.BlkLvlSWD+16,mm0
  2610. movq [edx].BlkY3.BlkLvlSWD+16,mm1
  2611. movq [edx].BlkY1.BlkLvlSWD+16,mm2
  2612. movsx eax,[edx].BestFullPelMBHMV
  2613. sar eax,1 ; Stored HMV is doubled; halve it to get a pel offset.
  2614. lea ebx,[ebx+ebx*2] ; Times 3: 128*3 = 384 = PITCH, giving the row offset.
  2615. mov esi,Addr0MVRef
  2616. add ebx,ebp ; One extra row down (V+1).
  2617. mov Addr0MVRefBlk,esi
  2618. add esi,eax
  2619. lea ecx,[ecx+ecx*2] ; Best MBMV SWD times 3.
  2620. add esi,ebx ; Try V+1 first
  2621. shr ecx,2 ; Best MBMV SWD * 3/4.
  2622. mov eax,SWDForNon0MVToBeat
  2623. mov BestBlockRefAddrVP1,esi ; Stash BestBlockRefAddr
  2624. sub ecx,BLOCKMVDIFFERENTIAL ; Best MBMV SWD * 3/4 - Differential.
  2625. lea eax,[eax+eax*2-BLOCKMVDIFFERENTIAL*4] ; Non0MBMVSWDToBeat*3-4*Diff.
  2626. mov LimitForSWDForBlkMV,ecx
  2627. shr eax,2 ; Non0MBMVSWDToBeat * 3/4 - Differential.
  2628. mov ebx,FIRSTBLOCKMESTATE
  2629. cmp eax,ecx
  2630. jg @f ; Keep the smaller of the two limits.
  2631. mov LimitForSWDForBlkMV,eax
  2632. mov ecx,eax
  2633. @@:
  2634. movdt mm5,SWDURandLL ; Get SWD for best MB level full pel MVs, blk 2.
  2635. test ecx,ecx
  2636. jle NoBlockMotionVectors ; No SWD budget left for block MVs.
  2637. movdt mm7,SWDULandLR ; Get SWD for best MB level full pel MVs, blk 1.
  2638. movdf SWDForBlock2Or4,mm5
  2639. ;============================================================================
  2640. ; Compute SWD for block.
  2641. DoBlkMEForNextBlk:
  2642. ComputeBlkSWD:
        ; Accumulates squared differences between the 8-line ref block at esi and
        ; the target block at edi, compares the total with the best so far (mm7),
        ; then steps the StateEngine. Lines 1/3/5/7 are shifted left by 8 before
        ; squaring; lines 0/2/4/6 are squared without the shift.
  2643. movq mm0,[esi+ebp*1]
  2644. psubw mm0,[edi+ebp*1] ; Get diff for line 1.
  2645. movq mm1,[esi+PITCH*3] ; Ref MB, upper left block, Line 3.
  2646. psllw mm0,8 ; Extract diffs for line 1 even pels.
  2647. psubw mm1,[edi+PITCH*3] ; Diff for line 3.
  2648. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
  2649. movq mm2,[esi+PITCH*5] ; Line 5.
  2650. psllw mm1,8
  2651. psubw mm2,[edi+PITCH*5]
  2652. pmaddwd mm1,mm1
  2653. movq mm3,[esi+PITCH*7] ; Line 7.
  2654. psllw mm2,8
  2655. psubw mm3,[edi+PITCH*7]
  2656. pmaddwd mm2,mm2
  2657. movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
  2658. psllw mm3,8
  2659. psubw mm4,[edi] ; Diff for line 0.
  2660. paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
  2661. movq mm1,[esi+ebp*2] ; Line 2.
  2662. pmaddwd mm3,mm3
  2663. psubw mm1,[edi+ebp*2]
  2664. paddusw mm0,mm2
  2665. movq mm2,[esi+ebp*4] ; Line 4.
  2666. pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
  2667. psubw mm2,[edi+ebp*4]
  2668. paddusw mm0,mm3
  2669. movq mm3,[esi+PITCH*6] ; Line 6.
  2670. pmaddwd mm1,mm1
  2671. psubw mm3,[edi+PITCH*6]
  2672. pmaddwd mm2,mm2
  2673. paddusw mm0,mm4
  2674. pmaddwd mm3,mm3
  2675. paddusw mm0,mm1
  2676. ;
  2677. paddusw mm0,mm2
  2678. ;
  2679. paddusw mm0,mm3
  2680. ;
  2681. punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
  2682. movq mm4,mm7 ; Get original Best SWD for block
  2683. paddusw mm1,mm0 ; mm1[48:63] is SWD for block.
  2684. pxor mm2,mm2
  2685. psrlq mm1,48 ; mm1 is SWD for block.
  2686. ;
  2687. psubusw mm4,mm1
  2688. xor ecx,ecx
  2689. pcmpeqd mm2,mm4 ; mm2[0:31] == 0 iff cand better, else -1.
  2690. psubusw mm7,mm4 ; BestSWD dim (BestSWD dim CandSWD) --> new best.
  2691. ;
  2692. ;
  2693. movdf eax,mm2 ; eax == 0 iff cand better, else -1.
  2694. ;
  2695. ; Registers at this point:
  2696. ; ebp -- PITCH
  2697. ; esi -- Address of block of candidate ref area.
  2698. ; edi -- Address of target block.
  2699. ; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
  2700. ; ecx -- Scratch
  2701. ; ebx -- CurrSWDState.
  2702. ; eax -- 0 iff candidate SWD better, else -1.
  2703. ; mm7 -- New best SWD for current block
  2704. ; mm6 -- Unused.
  2705. movq [edx].BlkY1.BlkLvlSWD,mm7 ; Save best blk level SWD.
  2706. pxor mm6,mm6 ; Spec zero to prep for half pel ME.
  2707. mov cl,StateEngine[eax+ebx*4+1] ; Index of MV adjustment.
  2708. mov bl,StateEngine[eax+ebx*4+3] ; New state number; 255 means done.
  2709. add esi,ecx ; Adjust ref addr for horz motion.
  2710. mov eax,DoHalfPelME ; 0 if not, -4 if so.
  2711. shr ecx,4
  2712. cmp bl,240 ; Terminal state?
  2713. jae @f
  2714. add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
  2715. jmp ComputeBlkSWD
  2716. @@:
  2717. add esi,FullPelMotionVectorAdjustment[ecx*4] ; Adjust ref addr for VMV.
  2718. add eax,4 ; Zero iff DoHalfPelME was -4.
  2719. mov ecx,esi ; (mov leaves the flags from the add intact.)
  2720. jne SkipHalfPelBlkME
  2721. ; Registers:
  2722. ; ebp -- PITCH
  2723. ; esi -- Address of best full pel reference macroblock
  2724. ; edx -- Induction variable over luma blocks in MBlockAction Descriptor.
  2725. ; ecx -- Copy of esi.
  2726. ; edi -- Address of target block.
  2727. ; ebx -- Scratch
  2728. ; eax -- Set to 0 to cause HalfPelMotionEstimation to quit after one block.
  2729. ; mm0:mm7 -- Scratch
        ; If esi + PITCH equals BestBlockRefAddrVP1, the block's half pel SWDs
        ; saved at BlkLvlSWD+16 are reused; otherwise they are recomputed by
        ; HalfPelMotionEstimation.
  2730. mov ebx,BestBlockRefAddrVP1
  2731. add ecx,ebp
  2732. cmp ebx,ecx
  2733. jne FullPelBlkMEMovedFromCenter
  2734. movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel blk MV.
  2735. movq mm3,[edx].BlkY1.BlkLvlSWD+16 ; SWDs: H+1, H-1, V+1, V-1.
  2736. jmp FullPelBlkMEDidNotMoveFromCenter
  2737. FullPelBlkMEMovedFromCenter:
  2738. movdf BestBlkFullPelSWD,mm7 ; Stash SWD for best full pel blk MV.
  2739. pxor mm7,mm7 ; Prep accumulator for half pel ME.
  2740. call HalfPelMotionEstimation
  2741. lea esi,[esi+ebp*8+8] ; Fix reference pointer.
  2742. lea edi,[edi+ebp*8+8] ; Fix target pointer.
  2743. FullPelBlkMEDidNotMoveFromCenter:
  2744. mov eax,esi
  2745. mov ebx,Addr0MVRefBlk ; Start to calc linearized MV.
  2746. sub ecx,ebx ; Linearized Motion Vector
  2747. sub eax,ebx ; Linearized Motion Vector
  2748. sar eax,8 ; Full pel vert lin offset div 256.
  2749. and cl,07FH ; Full pel HMV
  2750. movdf ebx,mm3 ; ebx[ 0:15] -- SWD for rightward ref.
  2751. ; ; ebx[16:31] -- SWD for leftward ref.
  2752. psrlq mm3,32 ; mm3[0:31] now holds the down/up SWD pair.
  2753. mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
  2754. mov eax,ebx
  2755. shr eax,16 ; eax -- SWD for leftward ref.
  2756. and ebx,00000FFFFH ; ebx -- SWD for rightward ref.
  2757. cmp eax,ebx
  2758. jg BlkME_RightBetterThanLeft ; Smaller SWD wins; a tie goes to the left.
  2759. BlkME_LeftBetterThanRight:
        ; Block-level version of the left-hand half pel decision tree. Entered
        ; with eax = SWD of the leftward ref (no worse than the rightward one).
        ; Every exit to BlkME_HalfPelSearchDone leaves eax = chosen ref address,
        ; ebx = its SWD, cl/ch = HMV/VMV stepped by one half pel as needed.
        ; After each call to HalfPelMotionEstimationBothWays both esi and edi are
        ; moved forward by 8 lines + 8 pels; esi also gets the probe offset undone.
  2760. add cl,cl ; Double the HMV; the half pel steps below move cl by one.
  2761. mov ebx,BestBlkFullPelSWD
  2762. cmp eax,ebx
  2763. jge BlkME_CtrIsBestHMV
  2764. BlkME_LeftBestHMV:
  2765. movdf ebx,mm3 ; ebx[ 0:15] -- SWD for downward ref.
  2766. ; ; ebx[16:31] -- SWD for upward ref.
  2767. mov BestHalfPelHorzSWD,eax
  2768. mov eax,ebx
  2769. shr eax,16 ; eax -- SWD for upward ref.
  2770. and ebx,00000FFFFH ; ebx -- SWD for downward ref.
  2771. cmp eax,ebx
  2772. jg BlkME_LeftBestHMV_DownBetterThanUp
  2773. BlkME_LeftBestHMV_UpBetterThanDown:
  2774. cmp eax,BestBlkFullPelSWD
  2775. jge BlkME_LeftIsBest
  2776. BlkME_LeftBestHMV_UpBestVMV:
  2777. sub esi,PITCH+1 ; Try ref 1/2 pel left and up
  2778. mov BestHalfPelVertSWD,eax
  2779. mov al,1 ; Presumably the block count for the helper (MB-level code passes 4).
  2780. call HalfPelMotionEstimationBothWays
  2781. lea edi,[edi+ebp*8+8] ; Fix target pointer.
  2782. mov eax,BestHalfPelVertSWD
  2783. lea esi,[esi+PITCH*9+9] ; Back to center.
  2784. cmp eax,ebx
  2785. jle BlkME_UpBetterThanUpLeft
  2786. BlkME_UpLeftBetterThanUp:
  2787. cmp ebx,BestHalfPelHorzSWD
  2788. jge BlkME_LeftIsBest
  2789. BlkME_UpLeftIsBest:
  2790. dec cl ; Back up the horz MV one to the left.
  2791. lea eax,[esi-PITCH-1] ; Best is ref 1/2 pel left and up
  2792. dec ch ; Back up the vert MV one up.
  2793. jmp BlkME_HalfPelSearchDone
  2794. BlkME_UpBetterThanUpLeft:
  2795. cmp eax,BestHalfPelHorzSWD
  2796. jg BlkME_LeftIsBest
  2797. BlkME_UpIsBest:
  2798. dec ch ; Back up the vert MV one up.
  2799. mov ebx,eax
  2800. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
  2801. jmp BlkME_HalfPelSearchDone
  2802. BlkME_LeftBestHMV_DownBetterThanUp:
  2803. cmp ebx,BestBlkFullPelSWD
  2804. jge BlkME_LeftIsBest
  2805. BlkME_LeftBestHMV_DownBestVMV:
  2806. dec esi ; Try ref 1/2 pel left and down
  2807. mov BestHalfPelVertSWD,ebx
  2808. mov al,1
  2809. call HalfPelMotionEstimationBothWays
  2810. lea edi,[edi+ebp*8+8] ; Fix target pointer.
  2811. mov eax,BestHalfPelVertSWD
  2812. lea esi,[esi+ebp*8+9] ; Back to center.
  2813. cmp eax,ebx
  2814. jle BlkME_DownBetterThanDownLeft
  2815. BlkME_DownLeftBetterThanDown:
  2816. cmp ebx,BestHalfPelHorzSWD
  2817. jge BlkME_LeftIsBest
  2818. BlkME_DownLeftIsBest:
  2819. dec cl ; Back up the horz MV one to the left.
  2820. lea eax,[esi-1] ; Best is ref 1/2 pel left and down
  2821. inc ch ; Advance the vert MV one down.
  2822. jmp BlkME_HalfPelSearchDone
  2823. BlkME_DownBetterThanDownLeft:
  2824. cmp eax,BestHalfPelHorzSWD
  2825. jle BlkME_DownIsBest
  2826. BlkME_LeftIsBest:
  2827. dec cl ; Back up the horz MV one to the left.
  2828. lea eax,[esi-1] ; Best is ref 1/2 pel left.
  2829. mov ebx,BestHalfPelHorzSWD
  2830. jmp BlkME_HalfPelSearchDone
  2831. BlkME_RightBetterThanLeft:
        ; Block-level version of the right-hand half pel decision tree. Entered
        ; with ebx = SWD of the rightward ref, strictly better than the leftward.
        ; The rightward and downward half pel refs are addressed at esi itself,
        ; so only the "up" case moves esi before the call.
        ; Every exit to BlkME_HalfPelSearchDone leaves eax = chosen ref address,
        ; ebx = its SWD, cl/ch = HMV/VMV stepped by one half pel as needed.
  2832. add cl,cl ; Double the HMV; the half pel steps below move cl by one.
  2833. mov eax,BestBlkFullPelSWD
  2834. cmp eax,ebx
  2835. jle BlkME_CtrIsBestHMV ; Right does not beat the full pel SWD.
  2836. BlkME_RightBestHMV:
  2837. movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref.
  2838. ; ; eax[16:31] -- SWD for upward ref.
  2839. mov BestHalfPelHorzSWD,ebx
  2840. mov ebx,eax
  2841. shr eax,16 ; eax -- SWD for upward ref.
  2842. and ebx,00000FFFFH ; ebx -- SWD for downward ref.
  2843. cmp eax,ebx
  2844. jg BlkME_RightBestHMV_DownBetterThanUp
  2845. BlkME_RightBestHMV_UpBetterThanDown:
  2846. cmp eax,BestBlkFullPelSWD
  2847. jge BlkME_RightIsBest
  2848. BlkME_RightBestHMV_UpBestVMV:
  2849. sub esi,ebp ; Try ref 1/2 pel right and up
  2850. mov BestHalfPelVertSWD,eax
  2851. mov al,1
  2852. call HalfPelMotionEstimationBothWays
  2853. lea edi,[edi+ebp*8+8] ; Fix target pointer.
  2854. mov eax,BestHalfPelVertSWD
  2855. lea esi,[esi+PITCH*9+8] ; Back to center.
  2856. cmp eax,ebx
  2857. jle BlkME_UpBetterThanUpRight
  2858. BlkME_UpRightBetterThanUp:
  2859. cmp ebx,BestHalfPelHorzSWD
  2860. jge BlkME_RightIsBest
  2861. BlkME_UpRightIsBest:
  2862. inc cl ; Advance the horz MV one to right.
  2863. lea eax,[esi-PITCH] ; Best is ref 1/2 pel right and up
  2864. dec ch ; Back up the vert MV one up.
  2865. jmp BlkME_HalfPelSearchDone
  2866. BlkME_UpBetterThanUpRight:
  2867. cmp eax,BestHalfPelHorzSWD
  2868. jle BlkME_UpIsBest
  2869. BlkME_RightIsBest:
  2870. mov ebx,BestHalfPelHorzSWD
  2871. inc cl ; Advance the horz MV one to right.
  2872. mov eax,esi ; Best is ref 1/2 pel right.
  2873. jmp BlkME_HalfPelSearchDone
  2874. BlkME_RightBestHMV_DownBetterThanUp:
  2875. cmp ebx,BestBlkFullPelSWD
  2876. jge BlkME_RightIsBest
  2877. BlkME_RightBestHMV_DownBestVMV:
  2878. mov BestHalfPelVertSWD,ebx
  2879. mov al,1
  2880. call HalfPelMotionEstimationBothWays ; Try ref 1/2 pel right and down (at esi).
  2881. lea edi,[edi+ebp*8+8] ; Fix target pointer.
  2882. mov eax,BestHalfPelVertSWD
  2883. lea esi,[esi+ebp*8+8] ; Back to center.
  2884. cmp eax,ebx
  2885. jle BlkME_DownBetterThanDownRight
  2886. BlkME_DownRightBetterThanDown:
  2887. cmp ebx,BestHalfPelHorzSWD
  2888. jge BlkME_RightIsBest
  2889. BlkME_DownRightIsBest:
  2890. inc cl ; Advance the horz MV one to right.
  2891. mov eax,esi
  2892. inc ch ; Advance vert MV one down.
  2893. jmp BlkME_HalfPelSearchDone
  2894. BlkME_DownBetterThanDownRight:
  2895. cmp eax,BestHalfPelHorzSWD
  2896. jg BlkME_RightIsBest
  2897. BlkME_DownIsBest:
  2898. inc ch ; Advance vert MV one down.
  2899. mov ebx,eax
  2900. mov eax,esi ; Best is ref 1/2 pel down.
  2901. jmp BlkME_HalfPelSearchDone
  2902. BlkME_CtrIsBestHMV:
        ; Neither horizontal half pel ref beat the block's full pel SWD, so only
        ; the vertical pair is considered. Exits with ebx = winning SWD; eax = ref
        ; address is set here for up/down, or at BlkME_CenterIsBest otherwise.
  2903. movdf eax,mm3 ; eax[ 0:15] -- SWD for downward ref.
  2904. ; ; eax[16:31] -- SWD for upward ref.
  2905. mov ebx,eax
  2906. shr eax,16 ; eax -- SWD for upward ref.
  2907. and ebx,00000FFFFH ; ebx -- SWD for downward ref.
  2908. cmp eax,ebx
  2909. jge BlkME_CtrBestHMV_DownBetterThanUp ; A tie goes to the downward ref here.
  2910. BlkME_CtrBestHMV_UpBetterThanDown:
  2911. mov ebx,BestBlkFullPelSWD
  2912. cmp eax,ebx
  2913. jge BlkME_CenterIsBest ; ebx already holds the full pel SWD for that path.
  2914. ; Up is best.
  2915. mov ebx,eax
  2916. dec ch ; Back up the vert MV one up.
  2917. lea eax,[esi-PITCH] ; Best is ref 1/2 pel up
  2918. jmp BlkME_HalfPelSearchDone
  2919. BlkME_CtrBestHMV_DownBetterThanUp:
  2920. mov eax,ebx
  2921. mov ebx,BestBlkFullPelSWD
  2922. cmp eax,ebx
  2923. jge BlkME_CenterIsBest
  2924. ; Down is best.
  2925. mov ebx,eax
  2926. inc ch ; Advance the vert MV one down.
  2927. mov eax,esi
  2928. jmp BlkME_HalfPelSearchDone
  2929. SkipHalfPelBlkME:
        ; No half pel search for this block: derive the full pel MV from esi
        ; (ecx was set to a copy of esi before the jump here) and take the SWD
        ; from mm7.
  2930. mov eax,esi
  2931. mov ebx,Addr0MVRefBlk ; Start to calc linearized MV.
  2932. sub ecx,ebx ; Linearized Motion Vector
  2933. sub eax,ebx ; Linearized Motion Vector
  2934. sar eax,8 ; Full pel vert lin offset div 256.
  2935. and cl,07FH ; Full pel HMV
  2936. add cl,cl
  2937. ;
  2938. mov ch,UnlinearizedVertMV[eax] ; Get full pel vert MV component.
  2939. ;
  2940. movdf ebx,mm7 ; SWD for best full pel block MV.
  2941. BlkME_CenterIsBest:
  2942. mov eax,esi
  2943. BlkME_HalfPelSearchDone:
        ; Record this block's result (ebx = SWD, eax = ref address, cl/ch = MV),
        ; charge the SWD against the remaining budget, and move on to the next
        ; luma block, or fall out of this section once all four are done.
        ; NOTE(review): the "test dl" checks below use low bits of the descriptor
        ; address as the block index, which presumably relies on the alignment of
        ; the MBlockAction descriptor -- confirm.
  2944. mov [edx].BlkY1.BlkLvlSWD,ebx
  2945. mov [edx].BlkY1.PastRef,eax
  2946. mov [edx].BlkY1.PHMV,cl ; Save HMV
  2947. mov eax,LimitForSWDForBlkMV ; Does block's SWD put us over limit?
  2948. mov [edx].BlkY1.PVMV,ch ; Save VMV
  2949. sub eax,ebx
  2950. jl BlkEst_EarlyOut
  2951. mov LimitForSWDForBlkMV,eax ; Remember how much is left for other blks.
  2952. mov esi,BestBlockRefAddrVP1
  2953. add edi,8 ; Move to blk 2 or 4, V+4.
  2954. mov ecx,Addr0MVRefBlk ; Calc addr of 0MV ref for this blk.
  2955. add esi,8 ; Move to blk 2 or 4, V+4.
  2956. add ecx,8
  2957. mov Addr0MVRefBlk,ecx
  2958. add edx,SIZEOF T_Blk ; Increment to next block.
  2959. test dl,SIZEOF T_Blk ; Just finished blk 1 or 3?
  2960. movdt mm7,SWDForBlock2Or4
  2961. mov ebx,FIRSTBLOCKMESTATE
  2962. jne DoBlkMEForNextBlk ; If so, go do blk 2 or 4.
  2963. lea esi,[esi+ebp*8-8] ; Move to blk 3
  2964. lea ecx,[ecx+ebp*8-16]
  2965. mov BestBlockRefAddrVP1,esi
  2966. lea edi,[edi+ebp*8-16]
  2967. movdt mm5,SWDULandLR+4 ; Get SWD for best MB level MVs, blk 4.
  2968. movdt mm7,SWDURandLL+4 ; Get SWD for best MB level MVs, blk 3.
  2969. movdf SWDForBlock2Or4,mm5
  2970. test dl,2*SIZEOF T_Blk ; Just finishing blk 2?
  2971. mov Addr0MVRefBlk,ecx
  2972. jne DoBlkMEForNextBlk ; If so, go do blk 3.
  2973. ;==============================================================================
  2974. ; Block motion vectors are best.
        ; Reached after all four luma blocks stayed within the SWD budget; edx
        ; points four T_Blk entries past the start of the descriptor here.
        ; Copies the four block SWDs into SWDULandLR / SWDURandLL and marks the
        ; macroblock INTER4MV unless the low 16 bits of all four MVs fields match.
  2975. mov esi,[edx-4*SIZEOF T_Blk].BlkY1.BlkLvlSWD
  2976. mov edi,[edx-4*SIZEOF T_Blk].BlkY4.BlkLvlSWD
  2977. mov SWDULandLR,esi
  2978. mov SWDULandLR+4,edi
  2979. mov esi,[edx-4*SIZEOF T_Blk].BlkY3.BlkLvlSWD
  2980. mov edi,[edx-4*SIZEOF T_Blk].BlkY2.BlkLvlSWD
  2981. mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs
  2982. mov ebx,[edx-4*SIZEOF T_Blk].BlkY2.MVs
  2983. mov ecx,eax
  2984. xor eax,ebx ; eax = blk1 ^ blk2
  2985. xor ecx,[edx-4*SIZEOF T_Blk].BlkY3.MVs ; ecx = blk1 ^ blk3
  2986. xor ebx,[edx-4*SIZEOF T_Blk].BlkY4.MVs ; ebx = blk2 ^ blk4
  2987. mov SWDURandLL,edi
  2988. or eax,ebx
  2989. sub edx,4*SIZEOF T_Blk ; Restore MacroBlockActionStream ptr.
  2990. or eax,ecx ; Non-zero bits mark where any two blocks' MVs differ.
  2991. test eax,0FFFFH ; Only the low two bytes of MVs are compared.
  2992. mov SWDURandLL+4,esi
  2993. je MotionVectorSettled ; All four equal: block type is left unchanged.
  2994. mov al,INTER4MV ; Set type for MB to INTER-coded, 4 MVs.
  2995. mov [edx].BlockType,al
  2996. jmp MotionVectorSettled
  2997. BlkEst_EarlyOut:
        ; Block-level search exceeded its SWD budget: fall back to the single
        ; macroblock-level MV if its SWD is below SWDForNon0MVToBeat, otherwise
        ; to the zero MV (NonZeroMVNotBigEnoughGain).
        ; NOTE(review): here SWD == SWDForNon0MVToBeat selects the zero MV, while
        ; the test at NoBlockMotionVectors keeps the non-zero MV on equality.
  2998. and edx,-1-3*SIZEOF T_Blk ; Clear block-index bits (presumably relies on alignment).
  2999. mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
  3000. BlockMVNotBigEnoughGain: ; Try MB-level motion vector.
  3001. cmp ecx,SWDForNon0MVToBeat
  3002. jge NonZeroMVNotBigEnoughGain
  3003. ENDIF ; H263
  3004. mov ebx,BestMBHalfPelMV
  3005. mov esi,BestMBHalfPelRefAddr ; Reload BestMBHalfPelRefAddr
  3006. NonZeroMBLevelMVBest:
  3007. ; Non-zero macroblock level motion vector is best.
        ; ebx = MV, esi = ref address of blk 1. All four luma blocks get the same
        ; MV; their ref addresses are offset by 8 pels and/or 8 lines.
  3008. mov [edx].BlkY1.MVs,ebx
  3009. mov [edx].BlkY2.MVs,ebx
  3010. mov [edx].BlkY3.MVs,ebx
  3011. mov [edx].BlkY4.MVs,ebx
  3012. mov [edx].BlkY1.PastRef,esi
  3013. lea ecx,[esi+ebp*8] ; Blk 3: eight lines down.
  3014. mov [edx].BlkY3.PastRef,ecx
  3015. add esi,8 ; Blk 2: eight pels right.
  3016. mov [edx].BlkY2.PastRef,esi
  3017. add ecx,8 ; Blk 4: down and right.
  3018. mov [edx].BlkY4.PastRef,ecx
  3019. jmp MotionVectorSettled
  3020. NoBlockMotionVectors:
        ; Block-level MVs were not attempted. Keep the macroblock-level MV when
        ; SWDForNon0MVToBeat >= its SWD; otherwise fall through to the zero MV.
        ; NOTE(review): equality keeps the non-zero MV here, whereas the test at
        ; BlockMVNotBigEnoughGain sends equality to the zero MV.
  3021. mov ecx,BestMBHalfPelSWD ; Get total SWD for macroblock MV.
  3022. mov eax,SWDForNon0MVToBeat
  3023. cmp eax,ecx
  3024. mov ebx,BestMBHalfPelMV ; (The two loads leave the cmp flags intact.)
  3025. mov esi,BestMBHalfPelRefAddr
  3026. jge NonZeroMBLevelMVBest
  3027. NonZeroMVNotBigEnoughGain:
        ; Zero MV wins: restore the block SWDs saved for the 0-MV reference.
  3028. mov esi,Addr0MVRef ; 0-MV ref block.
  3029. movq mm6,SWD0MVULandLR
  3030. movq mm5,SWD0MVURandLL
  3031. movq SWDULandLR,mm6
  3032. movq SWDURandLL,mm5
  3033. BelowZeroThresh:
        ; esi = ref address of blk 1. Store ref addresses for all four luma
        ; blocks and zero every MV field.
  3034. mov [edx].BlkY1.PastRef,esi ; Save address of ref block, all blks.
  3035. lea eax,[esi+8]
  3036. mov [edx].BlkY2.PastRef,eax
  3037. lea eax,[esi+ebp*8]
  3038. mov [edx].BlkY3.PastRef,eax
  3039. add eax,8
  3040. mov [edx].BlkY4.PastRef,eax
  3041. xor eax,eax
  3042. mov [edx].BlkY1.MVs,eax ; Set horz and vert MVs to 0 in all blks.
  3043. mov [edx].BlkY2.MVs,eax
  3044. mov [edx].BlkY3.MVs,eax
  3045. mov [edx].BestFullPelMBHMV,al
  3046. mov [edx].BlkY4.MVs,eax
  3047. mov [edx].BestFullPelMBVMV,al
  3048. mov BestMBHalfPelMV,eax
  3049. MotionVectorSettled:
  3050. IFDEF H261
  3051. ;===============================================================================
  3052. ; For H261, we've settled on the best motion vector. Now we need to determine
  3053. ; if spatial filtering should be done.
  3054. ;
  3055. ; ebp -- PITCH
  3056. ; esi -- Address of block of ref area.
  3057. ; edi -- Address of spatially filtred block.
  3058. ; edx -- MBlockActionStream
  3059. ; ecx -- Loop counter.
  3060. ; ebx -- Address of constant 0x7F in all 8 bytes.
  3061. ; eax -- Scratch
  3062. ; mm7 -- Mask to extract bytes 0 and 7. (High bit of bytes 1:6 must be off).
  3063. ; mm6 -- All bytes -1.
  3064. ; mm5 -- Mask to extract bytes 1:6 and clear bit 8 thereof.
  3065. movdf esi,mm7 ; Restore non-SLF SWD for macroblock.
  3066. cmp esi,SpatialFiltThreshold
  3067. jle SkipSpatialFiltering
  3068. mov ecx,DoSpatialFiltering ; Are we doing spatial filtering?
  3069. mov esi,[edx].BlkY1.PastRef
  3070. test cl,cl
  3071. je SkipSpatialFiltering
  3072. DoSpatialFilterForChroma:
  3073. DoSpatialFilterForLuma:
  3074. movq mm5,C7F7F7F7F7F7F7F7F ; Mask to extract bytes 1:6.
  3075. movdf BestMBFullPelSWD,mm7 ; Stash SWD for best full pel MB MV.
  3076. psllq mm5,16
  3077. psrlq mm5,8
  3078. pcmpeqb mm7,mm7
  3079. pxor mm7,mm5 ; Mask to extract bytes 0 and 7.
  3080. mov edi,SpatiallyFilteredMB
  3081. lea eax,[esi+ebp*4]
  3082. lea ebx,C7F7F7F7F7F7F7F7F ; Address of this useful constant.
  3083. SpatialFilterLoop:
  3084. movq mm0,[esi] ; 0a: <P7 P6 P5 P4 P3 P2 P1 P0>
  3085. pcmpeqb mm6,mm6 ; To add one to all bytes.
  3086. movq mm4,mm0 ; 0b: <P7 P6 P5 P4 P3 P2 P1 P0>
  3087. psllq mm0,16 ; 0c: <P5 P4 P3 P2 P1 P0 0 0>
  3088. movq mm3,[esi+ebp*1]; 1a
  3089. paddb mm0,mm4 ; 0d: <P7+P5 P6+P4 ... P3+P1 P2+P0 jnk jnk >
  3090. movq mm1,mm3 ; 1b
  3091. psrlq mm0,9 ; 0e: <0 (P7+P5)/2 ... (P2+P0)/2 jnk> (dirty)
  3092. SpatialFilterLoop_BlockToRight:
  3093. pand mm0,mm5 ; 0f: <0 (P7+P5)/2 ... (P2+P0)/2 0> (clean)
  3094. psllq mm1,16 ; 1c
  3095. paddb mm0,mm4 ; 0g: <jnk (P7+2P6+P5)/2 ... (P2+2P1+P0)/2 jnk>
  3096. paddb mm1,mm3 ; 1d
  3097. psubb mm0,mm6 ; 0h: <jnk (P7+2P6+P5+2)/2 ... (P2+2P1+P0+2)/2 jnk>
  3098. psrlq mm1,9 ; 1e
  3099. psrlq mm0,1 ; 0i: <jnk (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 jnk>
  3100. pand mm4,mm7 ; 0j: <P7 0 0 0 0 0 0 P0>
  3101. pand mm0,mm5 ; 0k: < 0 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/2 0>
  3102. pand mm1,mm5 ; 1f
  3103. por mm0,mm4 ; 0l: <P7 (P7+2P6+P5+2)/4 ... (P2+2P1+P0+2)/4 P0>
  3104. paddb mm1,mm3 ; 1g
  3105. movq mm2,[esi+ebp*2]; 2a
  3106. psubb mm1,mm6 ; 1h
  3107. movq [edi],mm0 ; 0m: Store line 0 of filtered block. This is R0.
  3108. movq mm4,mm2 ; 2b
  3109. psrlq mm1,1 ; 1i
  3110. pand mm3,mm7 ; 1j
  3111. pand mm1,mm5 ; 1k
  3112. psllq mm2,16 ; 2c
  3113. por mm1,mm3 ; 1l: This is R1
  3114. paddb mm2,mm4 ; 2d
  3115. psubb mm1,mm6 ; 1A: R1+1
  3116. psrlq mm2,9 ; 2e
  3117. pand mm2,mm5 ; 2f
  3118. paddb mm0,mm1 ; 1B: R0+R1+1
  3119. paddb mm2,mm4 ; 2g
  3120. psrlq mm0,1 ; 1C: (R0+R1+1)/2 (dirty)
  3121. pand mm0,[ebx] ; 1D: (R0+R1+1)/2 (clean)
  3122. psubb mm2,mm6 ; 2h
  3123. psrlq mm2,1 ; 2i
  3124. pand mm4,mm7 ; 2j
  3125. movq mm3,[esi+PITCH*3] ; 3a
  3126. pand mm2,mm5 ; 2k
  3127. por mm2,mm4 ; 2l: This is R2.
  3128. movq mm4,mm3 ; 3b
  3129. paddb mm1,mm2 ; 1E & 2B: R1+R2+1
  3130. psllq mm3,16 ; 3c
  3131. psrlq mm1,1 ; 1F & 2C: (R1+R2+1)/2 (dirty)
  3132. paddb mm3,mm4 ; 3d
  3133. pand mm1,[ebx] ; 1G & 2D: (R1+R2+1)/2 (clean)
  3134. psrlq mm3,9 ; 3e
  3135. paddb mm0,mm1 ; 1H: (R0+2R1+R2+2)/2
  3136. pand mm3,mm5 ; 3f
  3137. psrlq mm0,1 ; 1I: (R0+2R1+R2+2)/4 (dirty)
  3138. paddb mm3,mm4 ; 3g
  3139. pand mm0,[ebx] ; 1J: (R0+2R1+R2+2)/4 (clean)
  3140. psubb mm3,mm6 ; 3h
  3141. psrlq mm3,1 ; 3i
  3142. pand mm4,mm7 ; 3j
  3143. movq [edi+ebp*1],mm0 ; 1K: Store line 1 of filtered block.
  3144. pand mm3,mm5 ; 3k
  3145. movq mm0,[eax] ; 4a
  3146. por mm3,mm4 ; 3l
  3147. psubb mm3,mm6 ; 3A: R3+1
  3148. movq mm4,mm0 ; 4b
  3149. paddb mm2,mm3 ; 2E & 3B: R2+R3+1
  3150. psllq mm0,16 ; 4c
  3151. psrlq mm2,1 ; 2F & 3C: (R2+R3+1)/2 (dirty)
  3152. paddb mm0,mm4 ; 4d
  3153. pand mm2,[ebx] ; 2G & 3D: (R2+R3+1)/2 (clean)
  3154. psrlq mm0,9 ; 4e
  3155. paddb mm1,mm2 ; 2H: (R1+2R2+R3+2)/2
  3156. pand mm0,mm5 ; 4f
  3157. psrlq mm1,1 ; 2I: (R1+2R2+R3+2)/4 (dirty)
  3158. paddb mm0,mm4 ; 4g
  3159. pand mm1,[ebx] ; 2J: (R1+2R2+R3+2)/4 (clean)
  3160. psubb mm0,mm6 ; 4h
  3161. psrlq mm0,1 ; 4i
  3162. pand mm4,mm7 ; 4j
  3163. movq [edi+ebp*2],mm1 ; 2K: Store line 2 of filtered block.
  3164. pand mm0,mm5 ; 4k
  3165. movq mm1,[eax+ebp*1] ; 5a
  3166. por mm0,mm4 ; 4l
  3167. movq mm4,mm1 ; 5b
  3168. psllq mm1,16 ; 5c
  3169. paddb mm3,mm0 ; 3E & 4B: R3+R4+1
  3170. paddb mm1,mm4 ; 5d
  3171. add esi,8
  3172. psrlq mm3,1 ; 3F & 4C: (R3+R4+1)/2 (dirty)
  3173. pand mm3,[ebx] ; 3G & 4D: (R3+R4+1)/2 (clean)
  3174. psrlq mm1,9 ; 5e
  3175. paddb mm2,mm3 ; 3H: (R2+2R3+R4+2)/2
  3176. pand mm1,mm5 ; 5f
  3177. psrlq mm2,1 ; 3I: (R2+2R3+R4+2)/4 (dirty)
  3178. paddb mm1,mm4 ; 5g
  3179. pand mm2,[ebx] ; 3J: (R2+2R3+R4+2)/4 (clean)
  3180. psubb mm1,mm6 ; 5h
  3181. psrlq mm1,1 ; 5i
  3182. pand mm4,mm7 ; 5j
  3183. movq [edi+PITCH*3],mm2 ; 3K: Store line 3 of filtered block.
  3184. pand mm1,mm5 ; 5k
  3185. movq mm2,[eax+ebp*2] ; 6a
  3186. por mm1,mm4 ; 5l
  3187. psubb mm1,mm6 ; 5A: R5+1
  3188. movq mm4,mm2 ; 6b
  3189. paddb mm0,mm1 ; 4E & 5B: R4+R5+1
  3190. psllq mm2,16 ; 6c
  3191. psrlq mm0,1 ; 4F & 5C: (R4+R5+1)/2 (dirty)
  3192. paddb mm2,mm4 ; 6d
  3193. pand mm0,[ebx] ; 4G & 5D: (R4+R5+1)/2 (clean)
  3194. psrlq mm2,9 ; 6e
  3195. paddb mm3,mm0 ; 4H: (R3+2R4+R5+2)/2
  3196. pand mm2,mm5 ; 6f
  3197. psrlq mm3,1 ; 4I: (R3+2R4+R5+2)/4 (dirty)
  3198. paddb mm2,mm4 ; 6g
  3199. pand mm3,[ebx] ; 4J: (R3+2R4+R5+2)/4 (clean)
  3200. psubb mm2,mm6 ; 6h
  3201. psrlq mm2,1 ; 6i
  3202. sub cl,2 ; Loop control
  3203. movq [edi+ebp*4],mm3 ; 4K: Store line 4 of filtered block.
  3204. pand mm4,mm7 ; 6j
  3205. movq mm3,[eax+PITCH*3] ; 7a
  3206. pand mm2,mm5 ; 6k
  3207. por mm2,mm4 ; 6l
  3208. movq mm4,mm3 ; 7b
  3209. paddb mm1,mm2 ; 5E & 6B: R5+R6+1
  3210. psllq mm3,16 ; 7c
  3211. psrlq mm1,1 ; 5F & 6C: (R5+R6+1)/2 (dirty)
  3212. paddb mm3,mm4 ; 7d
  3213. pand mm1,[ebx] ; 5G & 6D: (R5+R6+1)/2 (clean)
  3214. psrlq mm3,9 ; 7e
  3215. paddb mm0,mm1 ; 5H: (R4+2R5+R6+2)/2
  3216. pand mm3,mm5 ; 7f
  3217. psrlq mm0,1 ; 5I: (R4+2R5+R6+2)/4 (dirty)
  3218. paddb mm3,mm4 ; 7g
  3219. pand mm0,[ebx] ; 5J: (R4+2R5+R6+2)/4 (clean)
  3220. psubb mm3,mm6 ; 7h
  3221. psrlq mm3,1 ; 7i
  3222. pand mm4,mm7 ; 7j
  3223. movq [edi+PITCH*5],mm0 ; 5K: Store line 5 of filtered block.
  3224. pand mm3,mm5 ; 7k
  3225. psubb mm2,mm6 ; 7A: R6+1
  3226. por mm3,mm4 ; 7l
  3227. paddb mm2,mm3 ; 6E: R6+R7+1
  3228. lea eax,[esi+ebp*4]
  3229. movq mm0,[esi] ; 0a: for next iteration
  3230. psrlq mm2,1 ; 6F: (R6+R7+1)/2 (dirty)
  3231. pand mm2,[ebx] ; 6G: (R6+R7+1)/2 (clean)
  3232. movq mm4,mm0 ; 0b: for next iteration
  3233. movq [edi+PITCH*7],mm3 ; 7m: Store line 7 of filtered block.
  3234. paddb mm1,mm2 ; 6H: (R5+2R6+R7+2)/2
  3235. lea edi,[edi+8] ; Advance output cursor.
  3236. psrlq mm1,1 ; 6I: (R5+2R6+R7+2)/4 (dirty)
  3237. pand mm1,[ebx] ; 6J: (R5+2R6+R7+2)/4 (clean)
  3238. psllq mm0,16 ; 0c: for next iteration
  3239. movq mm3,[esi+ebp*1] ; 1a: for next iteration
  3240. paddb mm0,mm4 ; 0d: for next iteration
  3241. movq [edi+PITCH*6-8],mm1 ; 6K: Store line 6 of filtered block.
  3242. movq mm1,mm3 ; 1b: for next iteration
  3243. psrlq mm0,9 ; 0e: for next iteration
  3244. jg SpatialFilterLoop_BlockToRight
  3245. lea esi,[esi+ebp*8-16]
  3246. lea eax,[eax+ebp*8-16]
  3247. lea edi,[edi+ebp*8-16]
  3248. mov cl,4
  3249. jl SpatialFilterLoop
  ; The spatial loop filter has finished writing its output.  ch selects the
  ; continuation: ch > 0 returns to the U-plane caller, ch < 0 returns to the
  ; V-plane caller, and ch == 0 falls through to score the filtered luma
  ; macroblock against the target.
  3250. SpatialFilterDone:
  3251. mov edi,TargetMacroBlockBaseAddr
  3252. mov esi,SpatiallyFilteredMB
  3253. test ch,ch
  3254. jg ReturnFromSpatialFilterForU
  3255. ; Registers at this point:
  3256. ; ebp -- PITCH
  3257. ; esi -- Address of upper left block of spatially filtered candidate ref area.
  3258. ; edi -- Address of upper left block of target.
  3259. ; edx -- MBlockActionStream
  3260. ; ecx -- Scratch
  3261. ; ebx -- Scratch
  3262. ; eax -- Loop control
  3263. ; mm0-mm4 -- Scratch
  3264. ; mm5,mm6 -- SWD for each block
  3265. ; mm7 -- SWD for macroblock
  3266. ;
  ; Blocks are visited in the order upper-left, upper-right, lower-left,
  ; lower-right.  al drives both loops: 3 -> 1 -> -1 for the upper row, then
  ; 4 -> 2 -> 0 for the lower row (JG = block to the right, JL = next row).
  ;
  ; At loop exit:
  ; mm4 -- SWDs of the upper row (UL in the high dword, UR in the low dword),
  ; copied from mm5 just before mm5 is overwritten for the lower row.
  ; mm5,mm6 -- SWDs of the lower row (LL in the high dword, LR in the low dword).
  3267. movq mm0,[esi+ebp*1] ; Fetch ref line 1 of the first block early.
  3268. pxor mm7,mm7 ; Clear macroblock SWD accumulator.
  3269. mov al,3
  3270. jl ReturnFromSpatialFilterForV ; Flags are still those of TEST CH,CH.
  3271. ComputeSWDforSLFBlock:
  3272. psubw mm0,[edi+ebp*1] ; Get diff for line 1.
  3273. ComputeSWDforSLFBlock_BlkToRight:
  3274. movq mm1,[esi+PITCH*3] ; Ref MB, Line 3.
  3275. psllw mm0,8 ; Extract diffs for line 1 even pels.
  3276. psubw mm1,[edi+PITCH*3] ; Diff for line 3.
  3277. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
  3278. movq mm2,[esi+PITCH*5]
  3279. psllw mm1,8
  3280. psubw mm2,[edi+PITCH*5]
  3281. pmaddwd mm1,mm1
  3282. movq mm3,[esi+PITCH*7]
  3283. psllw mm2,8
  3284. psubw mm3,[edi+PITCH*7]
  3285. pmaddwd mm2,mm2
  3286. movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
  3287. psllw mm3,8
  3288. psubw mm4,[edi] ; Diff for line 0.
  3289. paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
  3290. movq mm1,[esi+ebp*2]
  3291. pmaddwd mm3,mm3
  3292. psubw mm1,[edi+ebp*2]
  3293. paddusw mm0,mm2
  3294. movq mm2,[esi+ebp*4]
  3295. pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
  3296. psubw mm2,[edi+ebp*4]
  3297. paddusw mm0,mm3
  3298. movq mm3,[esi+PITCH*6]
  3299. pmaddwd mm1,mm1
  3300. psubw mm3,[edi+PITCH*6]
  3301. pmaddwd mm2,mm2
  3302. paddusw mm4,mm0
  3303. pmaddwd mm3,mm3
  3304. paddusw mm4,mm1
  3305. add esi,8
  3306. paddusw mm4,mm2
  3307. add edi,8
  3308. movq mm0,[esi+ebp*1] ; Line 1 of the block to the right (speculative).
  3309. paddusw mm4,mm3
  3310. psubw mm0,[edi+ebp*1] ; Get diff for line 1.
  3311. punpckldq mm1,mm4 ; Get low order SWD accum to high order of mm1.
  3312. paddusw mm1,mm4 ; mm1[48:63] is SWD for block.
  3313. psllq mm6,32 ; Shift previous block's SWD left.
  3314. psrlq mm1,48 ; mm1 is SWD for block.
  3315. sub al,2 ; Loop control.
  3316. paddusw mm7,mm1 ; Accumulate SWD for macroblock.
  3317. por mm6,mm1 ; Save current block's SWD.
  3318. movq mm4,mm5 ; Keep previous row's SWD pair (upper row at loop exit).
  3319. jg ComputeSWDforSLFBlock_BlkToRight
  3320. movq mm0,[esi+PITCH*9-16] ; Line 1 of left block of the next row.
  3321. movq mm5,mm6 ; Save SWD pair for the row just finished.
  3322. lea edi,[edi+ebp*8-16]
  3323. lea esi,[esi+ebp*8-16]
  3324. mov al,4
  3325. jl ComputeSWDforSLFBlock ; Flags are still those of SUB AL,2.
  ; The filtered reference is adopted only if its SWD is lower (signed
  ; compare) than the unfiltered SWD minus SpatialFiltDifferential.
  3326. mov ebx,BestMBFullPelSWD ; Restore non-SLF SWD for macroblock.
  3327. mov eax,SpatialFiltDifferential
  3328. sub ebx,eax
  3329. sub edi,PITCH*16+16
  3330. movdf eax,mm7 ; SLF SWD for macroblock.
  3331. cmp eax,ebx
  3332. jge SpatialFilterNotAsGood
  3333. movdf SWDULandLR+4,mm5 ; SWD for block 4 (lower right).
  3334. psrlq mm5,32
  3335. movdf SWDURandLL+4,mm5 ; SWD for block 3 (lower left).
  ; The upper-row SWDs live in mm4.  mm6 is identical to mm5 here (lower row),
  ; so storing from mm6 would give blocks 1 and 2 the SWDs of blocks 3 and 4.
  3336. movdf SWDURandLL,mm4 ; SWD for block 2 (upper right).
  3337. psrlq mm4,32
  3338. movdf SWDULandLR,mm4 ; SWD for block 1 (upper left).
  3339. mov al,INTERSLF
  3340. mov ebx,SpatiallyFilteredMB
  3341. mov [edx].BlockType,al
  3342. sub esi,PITCH*8-8 ; esi was 16 lines below the MB origin: now block 4.
  3343. mov [edx].BlkY4.PastRef,esi
  3344. mov [edx].BlkY1.PastRef,ebx
  3345. sub esi,8 ; Block 3.
  3346. add ebx,8 ; Block 2.
  3347. mov [edx].BlkY3.PastRef,esi
  3348. mov [edx].BlkY2.PastRef,ebx
  3349. SkipSpatialFiltering:
  3350. SpatialFilterNotAsGood:
  3351. ENDIF ; H261
  3352. ;===============================================================================
  3353. ; We've settled on the motion vector that will be used if we do indeed code the
  3354. ; macroblock with inter-coding. We need to determine if some or all of the
  3355. ; blocks can be forced as empty (copy). If all the blocks can be forced
  3356. ; empty, we force the whole macroblock to be empty.
  ;
  ; Each luma block whose SWD does not exceed EMPTYTHRESHOLD (signed compare)
  ; has its bit cleared in CodedBlocks and contributes nothing to the SWD sum
  ; built in ebx.  Only when all four luma bits are still set is the sum
  ; stored in [edx].SWD and compared against INTERCODINGTHRESHOLD.
  ;
  ; When InterBest is reached from this code:
  ; al -- Low four bits of CodedBlocks (luma coded block pattern).
  ; ebx -- SWD that is added to SWDTotal.
  3357. mov esi,EMPTYTHRESHOLD ; Get threshold for forcing a block empty.
  3358. mov ebx,SWDULandLR ; Get SWD for block 1.
  3359. mov al,[edx].CodedBlocks
  3360. cmp ebx,esi ; Is SWD > threshold?
  3361. jg @f
  3362. and al,0FEH ; If not, indicate block 1 is NOT coded.
  3363. xor ebx,ebx ; An empty block adds nothing to the SWD sum.
  3364. @@:
  3365. mov ecx,SWDURandLL ; Get SWD for block 2.
  3366. cmp ecx,esi
  3367. jg @f
  3368. and al,0FDH ; Block 2 is NOT coded.
  3369. xor ecx,ecx
  3370. @@:
  3371. add ebx,ecx
  3372. mov ecx,SWDURandLL+4 ; Get SWD for block 3.
  3373. cmp ecx,esi
  3374. jg @f
  3375. and al,0FBH ; Block 3 is NOT coded.
  3376. xor ecx,ecx
  3377. @@:
  3378. add ebx,ecx
  3379. mov ecx,SWDULandLR+4 ; Get SWD for block 4.
  3380. cmp ecx,esi
  3381. jg @f
  3382. and al,0F7H ; Block 4 is NOT coded.
  3383. xor ecx,ecx
  3384. @@:
  3385. mov [edx].CodedBlocks,al ; Store coded block pattern.
  3386. and al,00FH ; Keep only the four luma bits.
  3387. add ebx,ecx ; ebx is now the sum of SWDs of the coded luma blocks.
  3388. cmp al,00FH ; Are any blks marked empty?
  3389. jne InterBest ; If some blks are empty, can't code as Intra
  3390. mov edi,TargetMacroBlockBaseAddr
  3391. mov [edx].SWD,ebx
  3392. cmp ebx,INTERCODINGTHRESHOLD ; Is InterSWD below inter-coding thresh?
  3393. jae CalculateIntraSWD ; No (unsigned compare): go calculate the intra SWD.
  3394. InterBestX:
  3395. mov ebx,[edx].SWD
  3396. InterBest:
  3397. mov ecx,SWDTotal ; Add to total for this macroblock class.
  3398. add ecx,ebx
  3399. IFDEF H261
  3400. mov SWDTotal,ecx
  3401. ELSE ;H263
  3402. mov bl,DoAdvancedPrediction ; bl is overwritten; the new total is already in ecx.
  3403. mov SWDTotal,ecx
  3404. test bl,bl
  3405. jne OBMCDifferencing ; Advanced Prediction selected: use the OBMC path.
  3406. ENDIF
  3407. ;============================================================================
  3408. ; Perform differencing for the non-empty luma blocks of an Inter-coded
  3409. ; macroblock. This is the non-OBMC case; i.e. Advanced Prediction is
  3410. ; not selected.
  3411. ;
  3412. ; ebp -- PITCH
  3413. ; esi -- Address of reference block.
  3414. ; edi -- Address of target block.
  3415. ; edx -- MBlockActionStream. Used as cursor over luma blocks.
  3416. ; ecx -- Not in use.
  3417. ; ebx -- Scratch. Used to test half pel MV resolution.
  3418. ; eax[0:3] -- Coded block pattern for luma blocks.
  ;
  ; The same sequence is unrolled once per luma block (1 = upper left,
  ; 2 = upper right, 3 = lower left, 4 = lower right):
  ; - skip the block if its bit in al is clear;
  ; - call DoNonOBMCDifferencing, which stores the differences for lines 0-3
  ; and returns the predictions for lines 4-7 in mm0-mm3, with mm1-mm3
  ; still needing their final right shift (mm2, mm3 also the mm6 mask);
  ; - finish lines 4-7 here, storing D4-D6 to PelDiffsLine4..6.  D7 is
  ; left in mm1 (presumably consumed by MMxDoForwardDCTx -- confirm);
  ; - call MMxDoForwardDCTx with one extra dword pushed on the stack;
  ; - subtract bl, shifted to the block's bit position, from CodedBlocks.
  ; NOTE(review): bl is whatever MMxDoForwardDCTx leaves there; presumably 1
  ; when the block turned out to be empty -- confirm against that routine.
  ; cl is used briefly below to stash INTER1MV in StashBlockType.
  3419. mov cl,INTER1MV
  3420. mov ebx,TargetMacroBlockBaseAddr
  3421. mov StashBlockType,cl
  3422. test al,1 ; Don't diff block 1 if marked empty.
  3423. mov edi,ebx ; Target address of block 1 (MOV leaves flags alone).
  3424. je @f
  3425. mov ebx,[edx].BlkY1.MVs
  3426. mov esi,[edx].BlkY1.PastRef
  3427. call DoNonOBMCDifferencing
  3428. ; (Finish differencing the last four lines.)
  3429. movq mm4,[edi+ebp*4] ; T4
  3430. psrlq mm1,1 ; P5: final shift.
  3431. movq mm5,[edi+PITCH*5] ; T5
  3432. psubb mm4,mm0 ; D4 = T4 - P4
  3433. movq mm0,[edi+PITCH*6] ; T6
  3434. psubb mm5,mm1 ; D5 = T5 - P5
  3435. movq mm1,[edi+PITCH*7] ; T7
  3436. pand mm2,mm6 ; P6: mask before shifting.
  3437. pand mm3,mm6 ; P7: mask before shifting.
  3438. psrlq mm2,1 ; P6
  3439. movq PelDiffsLine4,mm4 ; Store D4.
  3440. psubb mm0,mm2 ; D6 = T6 - P6
  3441. movq PelDiffsLine5,mm5 ; Store D5.
  3442. psrlq mm3,1 ; P7
  3443. movq PelDiffsLine6,mm0 ; Store D6.
  3444. psubb mm1,mm3 ; D7 = T7 - P7, left in mm1.
  3445. push eax ; Adjust stack pointer
  3446. StackOffset TEXTEQU <4>
  3447. call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
  3448. mov al,[edx].CodedBlocks
  3449. sub al,bl ; Adjust bit 0 by the value returned in bl.
  3450. mov ebx,TargetMacroBlockBaseAddr
  3451. mov [edx].CodedBlocks,al
  3452. pop edi ; Adjust stack pointer
  3453. StackOffset TEXTEQU <0>
  3454. @@:
  3455. lea edi,[ebx+8] ; Get address of next block to do (block 2).
  3456. test al,2 ; Don't diff block 2 if marked empty.
  3457. je @f
  3458. mov ebx,[edx].BlkY2.MVs
  3459. mov esi,[edx].BlkY2.PastRef
  3460. call DoNonOBMCDifferencing
  3461. ; (Finish differencing the last four lines.)
  3462. movq mm4,[edi+ebp*4] ; T4
  3463. psrlq mm1,1
  3464. movq mm5,[edi+PITCH*5]
  3465. psubb mm4,mm0 ; D4 = T4 - P4
  3466. movq mm0,[edi+PITCH*6]
  3467. psubb mm5,mm1
  3468. movq mm1,[edi+PITCH*7]
  3469. pand mm2,mm6
  3470. pand mm3,mm6
  3471. psrlq mm2,1
  3472. movq PelDiffsLine4,mm4 ; Store D4.
  3473. psubb mm0,mm2
  3474. movq PelDiffsLine5,mm5
  3475. psrlq mm3,1
  3476. movq PelDiffsLine6,mm0
  3477. psubb mm1,mm3
  3478. push eax ; Adjust stack pointer
  3479. StackOffset TEXTEQU <4>
  3480. call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
  3481. shl bl,1 ; Move returned value to bit 1 (block 2).
  3482. mov al,[edx].CodedBlocks
  3483. sub al,bl
  3484. mov ebx,TargetMacroBlockBaseAddr
  3485. mov [edx].CodedBlocks,al
  3486. pop edi ; Adjust stack pointer
  3487. StackOffset TEXTEQU <0>
  3488. @@:
  3489. lea edi,[ebx+ebp*8] ; Get address of next block to do (block 3).
  3490. test al,4 ; Don't diff block 3 if marked empty.
  3491. je @f
  3492. mov ebx,[edx].BlkY3.MVs
  3493. mov esi,[edx].BlkY3.PastRef
  3494. call DoNonOBMCDifferencing
  3495. ; (Finish differencing the last four lines.)
  3496. movq mm4,[edi+ebp*4] ; T4
  3497. psrlq mm1,1
  3498. movq mm5,[edi+PITCH*5]
  3499. psubb mm4,mm0 ; D4 = T4 - P4
  3500. movq mm0,[edi+PITCH*6]
  3501. psubb mm5,mm1
  3502. movq mm1,[edi+PITCH*7]
  3503. pand mm2,mm6
  3504. pand mm3,mm6
  3505. psrlq mm2,1
  3506. movq PelDiffsLine4,mm4 ; Store D4.
  3507. psubb mm0,mm2
  3508. movq PelDiffsLine5,mm5
  3509. psrlq mm3,1
  3510. movq PelDiffsLine6,mm0
  3511. psubb mm1,mm3
  3512. push eax ; Adjust stack pointer
  3513. StackOffset TEXTEQU <4>
  3514. call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
  3515. shl bl,2 ; Move returned value to bit 2 (block 3).
  3516. mov al,[edx].CodedBlocks
  3517. sub al,bl
  3518. mov ebx,TargetMacroBlockBaseAddr
  3519. mov [edx].CodedBlocks,al
  3520. pop edi ; Adjust stack pointer
  3521. StackOffset TEXTEQU <0>
  3522. @@:
  3523. lea edi,[ebx+ebp*8+8] ; Get address of next block to do (block 4).
  3524. test al,8 ; Don't diff block 4 if marked empty.
  3525. je NonOBMCDifferencingDone
  3526. mov ebx,[edx].BlkY4.MVs
  3527. mov esi,[edx].BlkY4.PastRef
  3528. call DoNonOBMCDifferencing
  3529. ; (Finish differencing the last four lines.)
  3530. movq mm4,[edi+ebp*4] ; T4
  3531. psrlq mm1,1
  3532. movq mm5,[edi+PITCH*5]
  3533. psubb mm4,mm0 ; D4 = T4 - P4
  3534. movq mm0,[edi+PITCH*6]
  3535. psubb mm5,mm1
  3536. movq mm1,[edi+PITCH*7]
  3537. pand mm2,mm6
  3538. pand mm3,mm6
  3539. psrlq mm2,1
  3540. movq PelDiffsLine4,mm4 ; Store D4.
  3541. psubb mm0,mm2
  3542. movq PelDiffsLine5,mm5
  3543. psrlq mm3,1
  3544. movq PelDiffsLine6,mm0
  3545. psubb mm1,mm3
  3546. push eax ; Adjust stack pointer
  3547. StackOffset TEXTEQU <4>
  3548. call MMxDoForwardDCTx ; Block is in PelDiffs block; Pitch is 16
  3549. shl bl,3 ; Move returned value to bit 3 (block 4).
  3550. mov al,[edx].CodedBlocks
  3551. sub al,bl
  3552. pop edi ; Adjust stack pointer
  3553. mov [edx].CodedBlocks,al
  3554. StackOffset TEXTEQU <0>
  3555. NonOBMCDifferencingDone:
  3556. IFDEF H261
  3557. ELSE
  ; H263 only: unless this is a plain P frame, also process the B-frame luma
  ; blocks before moving on to the next macroblock.
  3558. mov al,IsPlainPFrame
  3559. test al,al
  3560. jne NextMacroBlock
  3561. movq mm6,C0101010101010101
  3562. pxor mm7,mm7 ; Initialize SWD accumulator
  3563. call MMxDoBFrameLumaBlocks
  3564. ENDIF
  3565. jmp NextMacroBlock
  3566. ;============================================================================
  3567. ; Register usage in the following internal function. This function does
  3568. ; half pel motion estimation for whole macroblocks, or individual blocks.
  3569. ;
  3570. ; ebp -- PITCH
  3571. ; esi -- Address of best full pel reference macroblock. For MBME unchanged
  3572. ; at exit. For BlkME, adjusted by -8-8*PITCH.
  3573. ; edi -- Address of target macroblock. For MBME unchanged at exit. For BlkME,
  3574. ; adjusted by -8-8*PITCH.
  3575. ; edx -- MBlockActionStream
  3576. ; ecx -- Reserved.
  3577. ; ebx -- For MBME: 240 + Flags to indicate which half pel ME to do:
  3578. ; 1 --> right; 2 --> left; 4 --> down; 8 --> up
  3579. ; For BlkME: Garbage
  3580. ; eax -- Count from -4 to -1 for blocks of macroblock. 0 for single block.
  3581. ; mm7 -- Initialized to zero.
  3582. ; mm6 -- Initialized to zero.
  3583. ; mm0:mm7 -- Scratch
  3584. ; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
  3585. ; mm3[16:31] -- SWD for ref 1/2 pel leftward
  3586. ; mm3[32:47] -- SWD for ref 1/2 pel downward
  3587. ; mm3[48:63] -- SWD for ref 1/2 pel upward
  ;
  ; Loop control:
  ; Each pass of the inner loop handles two lines.  The pass count is kept in
  ; bl bits 6 and 7 (the low four bits are preserved): ADD BL,080H carries
  ; out on every second pass, and ADD BL,040H then sets the sign bit on every
  ; second half block, i.e. after all 8 lines of a block.
  ; eax selects the blocks.  For a macroblock they are visited in the order
  ; eax = -4, -2, -3, -1 (down the left column, then down the right column).
  ; After each block mm3 is stored at HalfPelMBMESWDAccum[eax*8+32].
  ; At exit eax holds the low four bits of the incoming bl, zero extended.
  3588. StackOffset TEXTEQU <4>
  3589. HalfPelMotionEstimation:
  3590. and bl,15 ; Keep only the four direction flags; clears the pass counter.
  3591. HalfPelMBMEForUpperBlock:
  3592. HalfPelMEForFirst2LinesOfBlock:
  3593. movq mm0,[esi-PITCH] ; <P^7 P^6 P^5 P^4 P^3 P^2 P^1 P^0>
  3594. movq mm1,[esi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
  3595. movq mm4,[edi+ebp*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
  3596. paddb mm0,mm1 ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
  3597. HalfPelMEForNext2LinesOfBlock:
  3598. movq mm2,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
  3599. psrlw mm0,1 ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
  3600. movq mm5,mm1 ; <P07 P06 P05 P04 P03 P02 P01 P00>
  3601. psllw mm4,8 ; <T16 0 T14 0 T12 0 T10 0>
  3602. HalfPelMBMEForLowerBlock:
  3603. psubw mm0,[edi] ; <(P^7+P07)/2-T07 junk (P^5+P05)/2-T05 junk ...>
  3604. paddb mm5,mm2 ; <P07+P17 P06+P16 P05+P15 P04+P14 ...>
  3605. pmullw mm1,C0101010101010101 ; <(P07+P06)*256+P06 ...>
  3606. psllw mm5,8 ; <(P06+P16) 0 (P04+P14) 0 ...>
  3607. pmaddwd mm0,mm0 ; Square diff for line 0 odd pels, upward ref.
  3608. psrlw mm5,1 ; <(P06+P16)/2 0 (P04+P14)/2 0 ...>
  3609. movq mm3,[edi] ; <T07 T06 T05 T04 T03 T02 T01 T00>
  3610. psubw mm4,mm5 ; <T16-(P06+P16)/2 junk ...>
  3611. pmaddwd mm4,mm4 ; Square diff for line 1 even pels, upward ref.
  3612. psrlw mm1,1 ; <(P07+P06)*128+P06/2 ...>
  3613. psllw mm3,8 ; <T06 0 T04 0 T02 0 T00 0>
  3614. lea edi,[edi+ebp*2] ; Advance Target cursor
  3615. psubw mm3,mm1 ; <T06-(P07+P06)/2 junk T04-(P05+P04)/2 junk ...>
  3616. lea esi,[esi+ebp*2] ; Advance Reference cursor
  3617. psubw mm1,[edi-PITCH*2] ; <(P07+P06)/2-T07 junk (P05+P04)/2-T05 junk ...>
  3618. pmaddwd mm3,mm3 ; Square diff for line 0 even pels, rightward ref.
  3619. pmaddwd mm1,mm1 ; Square diff for line 0 odd pels, leftward ref.
  3620. paddusw mm0,mm4 ; SSD for line 0 and 1, upward ref.
  3621. pand mm0,CFFFF0000FFFF0000 ; Extract SSD for line 0 and 1, upward ref.
  3622. movq mm4,mm2 ; <P17 P16 P15 P14 P13 P12 P11 P10>
  3623. paddusw mm6,mm0 ; Accumulate SSD for line 0 and 1, upward ref.
  3624. psrlq mm4,8 ; < 0 P17 P16 P15 P14 P13 P12 P11>
  3625. pand mm1,CFFFF0000FFFF0000 ; Extract SSD for line 0, leftward ref.
  3626. psrld mm3,16 ; Extract SSD for line 0, rightward ref.
  3627. pmullw mm4,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>
  3628. paddw mm3,mm1 ; SSD for line 0, leftward and rightward refs.
  3629. movq mm1,[esi] ; <P27 P26 P25 P24 P23 P22 P21 P20>
  3630. movq mm0,mm2 ; <P17 P16 P15 P14 P13 P12 P11 P10>
  3631. paddusw mm7,mm3 ; Accumulate SSD for line 0, left and right refs.
  3632. paddb mm2,mm1 ; <P17+P27 P16+P26 P15+P25 P14+P24 ...>
  3633. movq mm3,mm0 ; <P17 P16 P15 P14 P13 P12 P11 P10>
  3634. psrlw mm4,1 ; <P17 (P16+P15)*128+P15/2 ...>
  3635. psubw mm4,[edi-PITCH*1] ; <P17-T17 junk (P16+P15)/2-T15 junk ...>
  3636. psllq mm3,8 ; <P16 P15 P14 P13 P12 P11 P10 0>
  3637. pmullw mm3,C0101010101010002 ; <(P16+P15)*256+P15 ... P10*256*2>
  3638. psrlw mm2,1 ; <(P17+P27)/2 junk (P15+P25)/2 junk ...>
  3639. movq StashMM6,mm6 ; Free mm6 for use as a temporary.
  3640. pmaddwd mm4,mm4 ; Square diff for line 1 odd pels, rightward ref.
  3641. movq mm6,[edi-PITCH*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
  3642. psrlw mm3,1 ; <(P16+P15)*128+P15/2 ... P10*256>
  3643. psubw mm2,[edi-PITCH*1] ; <(P17+P27)/2-T17 junk (P15+P25)/2-T15 junk ...>
  3644. psllw mm6,8 ; <T16 0 T14 0 T12 0 T10 0>
  3645. psubw mm3,mm6 ; <(P16+P15)/2-T16 junk ... P10-T10>
  3646. psrld mm4,16 ; Extract SSD for line 1, rightward ref.
  3647. movq mm6,[edi-PITCH*2] ; <T07 T06 T05 T04 T03 T02 T01 T00>
  3648. pmaddwd mm3,mm3 ; Square diff for line 1 even pels, leftward ref.
  3649. pmaddwd mm2,mm2 ; Square diff for line 1 odd pels, downward ref.
  3650. psllw mm6,8 ; <T06 0 T04 0 T02 0 T00 0>
  3651. paddusw mm7,mm4 ; Accumulate SSD for line 1, rightward ref.
  3652. psubw mm6,mm5 ; <T06-(P06+P16)/2 junk ...>
  3653. pand mm3,CFFFF0000FFFF0000 ; Extract SSD for line 1, leftward ref.
  3654. pmaddwd mm6,mm6 ; Square diff for line 0 even pels, downward ref.
  3655. add bl,080H ; Pass counter: carry out on every second pass.
  3656. psrld mm2,16 ; Extract SSD for line 1, downward ref.
  3657. paddusw mm2,StashMM6 ; Accumulate SSD for line 1, downward ref.
  3658. paddusw mm7,mm3 ; Accumulate SSD for line 1, leftward ref.
  3659. movq mm4,[edi+ebp*1] ; <T17 T16 T15 T14 T13 T12 T11 T10>
  3660. psrld mm6,16 ; Extract SSD for line 0, downward ref.
  3661. paddusw mm6,mm2 ; Accumulate SSD for line 0, downward ref.
  3662. paddb mm0,mm1 ; <P^7+P07 P^6+P06 P^5+P05 P^4+P04 ...>
  3663. punpckldq mm5,mm6 ; Speculatively start to accum partial SWDs.
  3664. jnc HalfPelMEForNext2LinesOfBlock ; Iterate twice, for half a block.
  3665. punpckldq mm3,mm7
  3666. add bl,040H ; Sign bit becomes set after the second half block.
  3667. paddusw mm5,mm6
  3668. jns HalfPelMEForNext2LinesOfBlock ; Iterate twice, for a whole block.
  3669. paddusw mm3,mm7
  3670. psrlw mm0,1 ; <(P^7+P07)/2 junk (P^5+P05)/2 junk ...>
  3671. movq mm2,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
  3672. punpckhdq mm3,mm5 ; mm3[ 0:15] -- SWD for ref 1/2 pel rightward
  3673. ; ; mm3[16:31] -- SWD for ref 1/2 pel leftward
  3674. ; ; mm3[32:47] -- SWD for ref 1/2 pel downward
  3675. ; ; mm3[48:63] -- SWD for ref 1/2 pel upward
  3676. movq mm5,mm1 ; <P07 P06 P05 P04 P03 P02 P01 P00>
  3677. sub bl,080H ; Clear the pass counter; bl is back to its low four bits.
  3678. movq HalfPelMBMESWDAccum[eax*8+32],mm3
  3679. psllw mm4,8 ; <T16 0 T14 0 T12 0 T10 0>
  3680. add eax,2
  3681. jl HalfPelMBMEForLowerBlock ; Iterate twice for 2 blocks.
  3682. lea edi,[edi-PITCH*16+8] ; Up 16 lines, right 8 pels (LEA keeps flags).
  3683. lea esi,[esi-PITCH*16+8]
  3684. lea eax,[eax-3]
  3685. je HalfPelMBMEForUpperBlock ; Iterate twice for macroblock.
  3686. sub edi,16
  3687. xor eax,eax
  3688. sub esi,16
  3689. mov al,bl ; Return the direction flags in eax.
  3690. ret
  3691. StackOffset TEXTEQU <0>
  3692. ;============================================================================
  3693. ; Register usage in the following internal function. This function does
  3694. ; half pel motion estimation in both directions for whole macroblocks, or
  3695. ; individual blocks.
  3696. ;
  3697. ; ebp -- PITCH
  3698. ; esi -- Address of best full pel reference macroblock. For MBME unchanged
  3699. ; at exit. For BlkME, adjusted by -8-8*PITCH.
  3700. ; edi -- Address of target macroblock. For MBME unchanged at exit. For BlkME,
  3701. ; adjusted by -8-8*PITCH.
  3702. ; edx -- MBlockActionStream
  3703. ; ecx -- Reserved. Contains motion vectors.
  3704. ; ebx -- Returns SWD for this reference block or macroblock.
  3705. ; al -- Count from 4 to 1 for blocks of macroblock. 1 for blk only.
  3706. ; mm0:mm6 -- Scratch
  3707. ; mm7 -- Reserved. Contains SWDs for four 1/2 pel refs at main compass points.
  3708. ; mm4 -- Returns SWD for this reference block or macroblock.
  ;
  ; Loop control:
  ; Each pass handles two lines.  The pass count is kept in al bits 6 and 7
  ; above the block count: ADD AL,080H carries out on every second pass and
  ; ADD AL,040H sets the sign bit after the fourth pass (a whole block).
  ; SUB AL,082H then strips the counter and steps the block count by 2.
  ; For a macroblock the blocks are visited in the order al = 4, 2, 3, 1
  ; (down the left column, then down the right column).
  ; mm6 is cleared only on entry, so the SSD keeps accumulating from block to
  ; block.
  3709. StackOffset TEXTEQU <4>
  3710. HalfPelMotionEstimationBothWays:
  3711. movq mm3,C0101010101010101
  3712. pxor mm6,mm6 ; Zero out SSD accumulator.
  3713. HalfPelMBMEForUpperBlockBothWays:
  3714. HalfPelMEForFirst2LinesOfBlockBothWays:
  3715. movq mm0,[esi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
  3716. HalfPelMEForNext2LinesOfBlockBothWays:
  3717. HalfPelMBMEForLowerBlockBothWays:
  3718. movq mm1,[esi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
  3719. pmullw mm0,mm3 ; <(P07+P06)*256+P06 ...>
  3720. movq mm2,[esi+ebp*2] ; <P27 P26 P25 P24 P23 P22 P21 P20>
  3721. pmullw mm3,mm1 ; <(P17+P16)*256+P16 ...>
  3722. movq mm4,mm2 ; <P27 P26 P25 P24 P23 P22 P21 P20>
  3723. psrlq mm2,8 ; < 0 P27 P26 P25 P24 P23 P22 P21>
  3724. pmullw mm2,C0200010101010101 ; <P27*256*2 (P26+P25)*256+P25 ...>
  3725. psrlq mm1,8 ; < 0 P17 P16 P15 P14 P13 P12 P11>
  3726. pmullw mm1,C0200010101010101 ; <P17*256*2 (P16+P15)*256+P15 ...>
  3727. psrlw mm3,2 ; <(P17+P16)/4 junk ...> (w/ 2 frac bits)
  3728. movq mm5,[edi] ; <T07 T06 T05 T04 T03 T02 T01 T00>
  3729. psrlw mm0,2 ; <(P07+P06)/4 junk ...> (w/ 2 frac bits)
  3730. paddw mm3,mm0 ; <(P07+P06+P17+P16)/4 junk ...>
  3731. psrlw mm2,2 ; <P27/2 junk (P26+P25)/4 junk ...>
  3732. psubw mm2,[edi+ebp*1] ; <P27/2-T17 junk (P26+P25)/4-T15 junk ...>
  3733. psrlw mm1,2 ; <P17/2 junk (P16+P15)/4 junk ...>
  3734. paddw mm2,mm1 ; <(P17+P27)/2-T17 junk (P16+P15+P26+P25)/4-T15 junk ...>
  3735. psllw mm5,8 ; <T06 0 T04 0 T02 0 T00 0>
  3736. psubw mm5,mm3 ; <T06-(P07+P06+P17+P16)/4 junk ...>
  3737. pmaddwd mm2,mm2 ; Square diffs for odd pels of line 1.
  3738. pmaddwd mm5,mm5 ; Square diffs for even pels of line 0.
  3739. movq mm0,mm4 ; <P27 P26 P25 P24 P23 P22 P21 P20>
  3740. lea edi,[edi+ebp*2] ; Advance target cursor.
  3741. lea esi,[esi+ebp*2] ; Advance reference cursor.
  3742. paddusw mm6,mm2 ; Accumulate SSD for odd pels of line 1.
  3743. add al,080H ; Pass counter: carry out on every second pass.
  3744. movq mm3,C0101010101010101 ; Reload multiplier for the next pass.
  3745. paddusw mm6,mm5 ; Accumulate SSD for even pels of line 0.
  3746. punpckldq mm4,mm6 ; Speculatively start to accum partial SWDs.
  3747. jnc HalfPelMEForNext2LinesOfBlockBothWays ; Twice, for half a block.
  3748. add al,040H ; Sign bit becomes set after the second half block.
  3749. paddusw mm4,mm6 ; After whole block, SSD is in mm4[48:63].
  3750. psrlq mm4,48
  3751. jns HalfPelMEForNext2LinesOfBlockBothWays ; Twice, for a whole block.
  3752. movdf ebx,mm4 ; Return SWD accumulated so far.
  3753. sub al,082H ; Strip pass counter and step block count by 2.
  3754. jg HalfPelMBMEForLowerBlockBothWays ; Iterate twice for 2 blocks.
  3755. lea edi,[edi-PITCH*16+8] ; Up 16 lines, right 8 pels (LEA keeps flags).
  3756. lea esi,[esi-PITCH*16+8]
  3757. mov al,3 ; Block count for the right column (MOV keeps flags).
  3758. je HalfPelMBMEForUpperBlockBothWays ; Iterate twice for macroblock.
  3759. sub edi,16
  3760. sub esi,16
  3761. ret
  3762. StackOffset TEXTEQU <0>
  3763. ;============================================================================
  3764. ; Register usage in the following internal function. This function is also
  3765. ; called to do frame differencing for chroma blocks.
  3766. ;
  3767. ; ebp -- PITCH
  3768. ; esi -- Address of reference block.
  3769. ; edi -- Address of target block.
  3770. ; edx -- Unavailable. In use by caller.
  3771. ; ecx -- Not in use.
  3772. ; ebx -- Motion vectors for the block. bl[0] indicates whether half-pel
  3773. ; horizontal interpolation is required; bh[0] same for vertical.
  3774. ; This register is then used for scratch purposes.
  3775. ; eax -- Unavailable. In use by caller.
  3776. ; mm0-mm5 -- Scratch
  3777. ; mm6 -- 8 bytes of 0xFE
  3778. ; mm7 -- 8 bytes of -1
  ;
  ; The function stores the differences for lines 0-3 in PelDiffsLine0..3 and
  ; returns with the predictions for lines 4-7 in mm0-mm3, leaving the caller
  ; to finish them:
  ; mm0 -- Prediction for line 4, ready to use.
  ; mm1 -- Prediction for line 5, still needs PSRLQ 1.
  ; mm2,mm3 -- Predictions for lines 6 and 7, still need PAND mm6 and PSRLQ 1.
  ; For H263 the low bits of bl and bh select the horizontal, vertical or
  ; two-way interpolating variants; for H261 only the full pel path below is
  ; assembled.
  3779. StackOffset TEXTEQU <4>
  3780. DoNonOBMCDifferencing: ; Internal Function
  3781. pcmpeqb mm7,mm7 ; mm7 = 8 bytes of -1.
  3782. pcmpeqb mm6,mm6 ; mm6 = 8 bytes of 0xFF for now.
  3783. IFDEF H261
  3784. ELSE ;H263
  3785. shr bl,1 ; bl[0] into carry.
  3786. jc NonOBMCDiff_Horz
  3787. ENDIF
  3788. movq mm1,[esi+ebp*1] ; BC . . . R0Dn
  3789. paddb mm6,mm6 ; mm6 = 8 bytes of 0xFE.
  3790. IFDEF H261
  3791. ELSE ;H263
  3792. shr bh,1 ; bh[0] into carry.
  3793. jc NonOBMCDiff_Vert
  3794. ENDIF
  ; Full pel motion vector: the prediction is the reference block itself.
  3795. psubb mm1,[edi+ebp*1] ; P1 - T1
  3796. pxor mm4,mm4
  3797. movq mm0,[edi] ; T0
  3798. psubb mm4,mm1 ; D1 = T1 - P1
  3799. psubb mm0,[esi] ; D0 = T0 - P0
  3800. movq mm2,[edi+ebp*2] ; T2
  3801. movq mm3,[edi+PITCH*3] ; T3
  3802. psubb mm2,[esi+ebp*2] ; D2 = T2 - P2
  3803. psubb mm3,[esi+PITCH*3] ; D3 = T3 - P3
  3804. movq PelDiffsLine0,mm0 ; Store D0.
  3805. movq PelDiffsLine1,mm4 ; Store D1.
  3806. movq PelDiffsLine2,mm2 ; Store D2.
  3807. movq PelDiffsLine3,mm3 ; Store D3.
  ; NOTE(review): doubling here and halving in the caller only round-trips if
  ; every pel fits in 7 bits -- presumably guaranteed elsewhere; confirm.
  3808. movq mm3,[esi+PITCH*7] ; P7
  3809. movq mm2,[esi+PITCH*6] ; P6
  3810. paddb mm3,mm3 ; Double so that return will fix it.
  3811. movq mm1,[esi+PITCH*5] ; P5
  3812. paddb mm2,mm2 ; Double so that return will fix it.
  3813. movq mm0,[esi+ebp*4] ; P4
  3814. paddb mm1,mm1 ; Double so that return will fix it.
  3815. ret
  3816. IFDEF H261
  3817. ELSE ;H263
  ; Vertical half pel case of DoNonOBMCDifferencing.  On entry mm1 holds
  ; reference line 1, mm6 is 8 bytes of 0xFE and mm7 is 8 bytes of -1.
  ; Lines 0-3 are predicted by the call below and differenced here; the code
  ; then falls through into Get4MoreLinesOfPred_InterpVert to predict lines
  ; 4-7, whose RET returns to the caller of DoNonOBMCDifferencing.
  3818. NonOBMCDiff_Vert: ; 0123 Detail for 0
  3819. movq mm0,[esi] ; C. . R0Up
  3820. psubb mm1,mm7 ; DD . R0Dn+1
  3821. call Get4LinesOfPred_InterpVert
  3822. movq mm5,[edi] ; T0
  3823. psrlq mm1,1 ; O .
  3824. movq mm7,[edi+ebp*1] ; T1
  3825. psubb mm5,mm0 ; D0 = T0 - P0
  3826. movq mm0,mm4 ; Reference line 4 is the upper line for lines 4-7.
  3827. psubb mm7,mm1 ; D1 = T1 - P1
  3828. movq mm1,[edi+ebp*2] ; T2
  3829. pand mm2,mm6 ; .N.
  3830. movq mm4,[edi+PITCH*3] ; T3
  3831. pand mm3,mm6 ; . N
  3832. psrlq mm2,1 ; .O.
  3833. movq PelDiffsLine0,mm5 ; Store D0.
  3834. psubb mm1,mm2 ; D2 = T2 - P2
  3835. movq PelDiffsLine1,mm7 ; Store D1.
  3836. psrlq mm3,1 ; . O
  3837. movq PelDiffsLine2,mm1 ; Store D2.
  3838. psubb mm4,mm3 ; D3 = T3 - P3
  3839. movq mm1,[esi+ebp*1] ; BC . . . R0Dn
  3840. pcmpeqb mm7,mm7 ; Restore mm7 to 8 bytes of -1.
  3841. movq PelDiffsLine3,mm4 ; Store D3.
  3842. psubb mm1,mm7 ; DD . . . R0Dn+1
  3843. ; jmp Get4MoreLinesOfPred_InterpVert
  3844. ;===========================================================================
  3845. ; Internal function to get 4 lines of prediction, interpolating in the
  3846. ; vertical direction. The first 3 lines of the function are scheduled into
  3847. ; the caller's space, and so are commented out here. For 8 lines of prediction,
  3848. ; a second call, to the second entry point, is called after consuming the
  3849. ; outputs of the first function call. Certain registers must remain intact
  3850. ; to convey information from the first call to the second.
  3851. ;
  3852. ; ebp -- PITCH
  3853. ; edi -- Points to target block.
  3854. ; esi -- Points to Upper left corner of 8 column, 9 row block that will be
  3855. ; interpolated vertically to generate prediction.
  3856. ; edx -- Reserved (MBlockActionStream)
  3857. ; ecx -- Not in use.
  3858. ; ebx -- Will be used.
  3859. ; eax -- Reserved.
  3860. ; mm6 -- 8 bytes of 0xFE.
  3861. ; mm7 -- 8 bytes of -1.
  3862. ; mm0-mm5 -- Scratch.
  ;
  ; On entry (set up by the caller):
  ; mm0 -- Reference line 0.
  ; mm1 -- Reference line 1, plus 1 in every byte.
  ; On exit:
  ; mm0 -- Prediction for line 0, finished.
  ; mm1 -- Prediction for line 1, masked; still needs PSRLQ 1.
  ; mm2,mm3 -- Sums for lines 2 and 3; still need PAND mm6 and PSRLQ 1.
  ; mm4 -- Reference line 4.
  ; esi -- Advanced by four lines.
  3863. StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
  3864. Get4LinesOfPred_InterpVert: ; 0123 Details for line 0
  3865. ; movq mm1,[esi+ebp*1] ; BC . R0Dn
  3866. ; movq mm0,[esi] ; C. . R0Up
  3867. ; psubb mm1,mm7 ; DD . R0Dn+1
  3868. Get4MoreLinesOfPred_InterpVert:
  3869. movq mm2,[esi+ebp*2] ; BC.
  3870. paddb mm0,mm1 ; E. . R0Up+R0Dn+1
  3871. movq mm3,[esi+PITCH*3] ; .BC
  3872. paddb mm1,mm2 ; E .
  3873. movq mm4,[esi+ebp*4] ; . BC
  3874. psubb mm3,mm7 ; .DD
  3875. paddb mm2,mm3 ; .E.
  3876. pand mm0,mm6 ; F. . Pre-clean
  3877. paddb mm3,mm4 ; E
  3878. pand mm1,mm6 ; F .
  3879. lea esi,[esi+ebp*4] ; Advance to next four lines.
  3880. psrlq mm0,1 ; G. . P0 = (R0Up + R0Dn + 1) / 2
  3881. ; pand mm2,mm6 ; G.
  3882. ; psrlq mm1,1 ; H .
  3883. ; pand mm3,mm6 ; G
  3884. ; psrlq mm2,1 ; H.
  3885. ; psrlq mm3,1 ; H
  3886. ret
  3887. StackOffset TEXTEQU <4>
  3888. ;===========================================================================
  ; Horizontal half pel case of DoNonOBMCDifferencing.  If bh[0] is also set
  ; the two-way case is taken instead.  Lines 0-3 are predicted by the call
  ; below and differenced here; the code then falls through into
  ; Get4MoreLinesOfPred_InterpHorz to predict lines 4-7, whose RET returns to
  ; the caller of DoNonOBMCDifferencing.
  3889. NonOBMCDiff_Horz:
  3890. movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  3891. paddb mm6,mm6 ; . . 8 bytes of 0xFE
  3892. shr bh,1 ; bh[0] into carry.
  3893. jc NonOBMCDiff_Both
  3894. movq mm7,[edi+PITCH*3] ; T3
  3895. call Get4LinesOfPred_InterpHorz
  3896. movq mm4,[edi] ; T0
  3897. psrlq mm1,1 ; O .
  3898. movq mm5,[edi+ebp*1] ; T1
  3899. psubb mm4,mm0 ; D0 = T0 - P0
  3900. movq mm0,[edi+ebp*2] ; T2
  3901. psubb mm5,mm1 ; D1 = T1 - P1
  3902. movq mm1,[edi+PITCH*3] ; T3
  3903. pand mm2,mm6 ; .N.
  3904. pand mm3,mm6 ; . N
  3905. psrlq mm2,1 ; .O.
  3906. movq PelDiffsLine0,mm4 ; Store D0.
  3907. psubb mm0,mm2 ; D2 = T2 - P2
  3908. movq PelDiffsLine1,mm5 ; Store D1.
  3909. psrlq mm3,1 ; . O
  3910. movq PelDiffsLine2,mm0 ; Store D2.
  3911. psubb mm1,mm3 ; D3 = T3 - P3
  3912. movq mm5,[esi+1] ; <R48 R47 R46 R45 R44 R43 R42 R41>
  3913. ;
  3914. movq PelDiffsLine3,mm1 ; Store D3.
  3915. ;
  3916. ;===========================================================================
  3917. ; Internal function to get 4 lines of prediction, interpolating in the
  3918. ; horizontal direction. The first line of the function is scheduled into
  3919. ; the caller's space, and so is commented out here. For 8 lines of prediction,
  3920. ; a second call, to the second entry point, is called after consuming the
  3921. ; outputs of the first function call. Certain registers must remain intact
  3922. ; to convey information from the first call to the second.
  3923. ;
  3924. ; ebp -- PITCH
  3925. ; edi -- Points to target block.
  3926. ; esi -- Points to Upper left corner of 9 column, 8 row block that will be
  3927. ; interpolated horizontally to generate prediction.
  3928. ; edx -- Reserved (MBlockActionStream)
  3929. ; ecx -- Not in use.
  3930. ; ebx -- Will be used.
  3931. ; eax -- Reserved.
  3932. ; mm6 -- 8 bytes of 0xFE.
  3933. ; mm0-mm5 -- Will be used.
  ;
  ; On entry mm5 must already hold the 8 bytes at [esi+1].  For each line the
  ; leftmost pel is read into bl and used to index the Pel_Rnd table
  ; (presumably each entry supplies the rounding 1s plus that pel in the low
  ; byte, as the step F comments suggest -- confirm against the table).
  ; On exit esi has advanced by four lines.
  3934. StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
  3935. Get4LinesOfPred_InterpHorz:
  3936. Get4MoreLinesOfPred_InterpHorz:
  3937. ; movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  3938. xor ebx,ebx ; . .
  3939. movq mm0,mm5 ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
  3940. mov bl,[esi] ; C. . R00
  3941. psllq mm5,8 ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
  3942. movq mm1,[esi+ebp*1+1] ; A .
  3943. paddb mm0,mm5 ; E. . <R08+R07 ... R02+R01 R01 >
  3944. paddb mm0,Pel_Rnd[ebx*8] ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
  3945. movq mm4,mm1 ; B .
  3946. mov bl,[esi+ebp*1] ; C .
  3947. psllq mm4,8 ; D .
  3948. movq mm2,[esi+ebp*2+1] ; A.
  3949. paddb mm1,mm4 ; E .
  3950. paddb mm1,Pel_Rnd[ebx*8] ; F .
  3951. movq mm5,mm2 ; B.
  3952. mov bl,[esi+ebp*2] ; C.
  3953. psllq mm5,8 ; D.
  3954. movq mm3,[esi+PITCH*3+1] ; A
  3955. paddb mm2,mm5 ; E.
  3956. paddb mm2,Pel_Rnd[ebx*8] ; F.
  3957. movq mm4,mm3 ; B
  3958. mov bl,[esi+PITCH*3] ; C
  3959. psllq mm4,8 ; D
  3960. paddb mm3,mm4 ; E
  3961. pand mm0,mm6 ; G. . pre-cleaned
  3962. paddb mm3,Pel_Rnd[ebx*8] ; F
  3963. psrlq mm0,1 ; H. . P0=<(R08+R07+1)/2 ... (R01+R00+1)/2>
  3964. lea esi,[esi+ebp*4] ; Advance to next four lines.
  3965. pand mm1,mm6 ; G .
  3966. ; pand mm2,mm6 ; G.
  3967. ; psrlq mm1,1 ; H .
  3968. ; pand mm3,mm6 ; G
  3969. ; psrlq mm2,1 ; H.
  3970. ; psrlq mm3,1 ; H
  3971. ret
  3972. StackOffset TEXTEQU <4>
  3973. ; The steps commented out above are scheduled into the mem-ops the caller has
  3974. ; to do at the point of return. As though these ops were done, the registers
  3975. ; look as follows:
  3976. ; mm0 -- Prediction for line 0.
  3977. ; mm1 -- Prediction for line 1.
  3978. ; mm2 -- Prediction for line 2.
  3979. ; mm3 -- Prediction for line 3.
  3980. ; mm6 -- 8 bytes of 0xFE. Must be this when computing pred for next 4 lines.
  3981. ;=============================================================================
  ; Horizontal-and-vertical half pel case of DoNonOBMCDifferencing, reached
  ; from NonOBMCDiff_Horz with mm5 holding [esi+1] and mm6 holding 8 bytes of
  ; 0xFE.  Lines 0-3 are predicted by the call below and differenced here;
  ; the final JMP lets Get4MoreLinesOfPred_InterpBoth predict lines 4-7 on
  ; behalf of the caller of DoNonOBMCDifferencing.
  3982. NonOBMCDiff_Both:
  3983. call Get4LinesOfPred_InterpBoth
  3984. movq mm7,[edi] ; T0
  3985. psrlq mm1,1 ; O .
  3986. psubb mm7,mm0 ; D0 = T0 - P0
  3987. pand mm2,mm6 ; .N.
  3988. movq mm0,[edi+ebp*1] ; T1
  3989. psrlq mm2,1 ; .O.
  3990. movq PelDiffsLine0,mm7 ; Store D0.
  3991. psubb mm0,mm1 ; D1 = T1 - P1
  3992. movq mm7,[edi+ebp*2] ; T2
  3993. pand mm3,mm6 ; . N
  3994. movq PelDiffsLine1,mm0 ; Store D1.
  3995. psrlq mm3,1 ; . O
  3996. movq mm1,[edi+PITCH*3] ; T3
  3997. psubb mm7,mm2 ; D2 = T2 - P2
  3998. psubb mm1,mm3 ; D3 = T3 - P3
  3999. movq mm0,mm4 ; Carry mm4 over in mm0 for the next 4 lines.
  4000. movq PelDiffsLine2,mm7 ; Store D2.
  4001. paddb mm5,mm5 ; . . Prepare for use for next 4 lines.
  4002. movq PelDiffsLine3,mm1 ; Store D3.
  4003. pcmpeqb mm7,mm7 ; Restore mm7 to 8 bytes of -1.
  4004. jmp Get4MoreLinesOfPred_InterpBoth
  4005. ;===========================================================================
  4006. ; Internal function to get 4 lines of prediction, interpolating in both
  4007. ; directions. The first lines of the function are scheduled into the
  4008. ; caller's space, and so are commented out here. For 8 lines of prediction,
  4009. ; a second call, to the second entry point, is made after consuming the
  4010. ; outputs of the first function call. Certain registers must remain intact
  4011. ; to convey information from the first call to the second.
  4012. ;
  4013. ; ebp -- PITCH
  4014. ; edi -- Points to target block.
  4015. ; esi -- Points to Upper left corner of 9*9 block that will be interpolated
  4016. ; horizontally and vertically to generate prediction.
  4017. ; edx -- Reserved (MBlockActionStream)
  4018. ; ecx -- Not in use
  4019. ; ebx -- Will be used.
  4020. ; eax -- Reserved.
  4021. ; mm6 -- 8 bytes of 0xFE.
  4022. ; mm7 -- 8 bytes of -1.
  4023. ; mm0-mm5 -- Scratch
  ;
  ; Step letters used in the comments below, for one reference line:
  ;   A load pels 1..8         B copy                  C load pel 0 into bl
  ;   D shift copy up a byte   E add horizontal pair   F add rounding (Pel_Rnd)
  ;   G copy mask              H extract low ("frac") bit of each byte
  ;   I mask off low bits      J halve
  ; and, combining a line with the line below it:
  ;   K AND the two lines' frac bits   L add that carry to this line's half sum
  ;   M add next line's half sum       N mask off low bits       O halve
  ; Five reference lines (0..4) are read to produce four prediction lines.
  ; NOTE(review): byte sums use paddb with no carry handling, so this
  ; presumably relies on pel values fitting in 7 bits -- confirm.
  4024. StackOffset TEXTEQU <StackDepthVaries_DoNotUseStackVariables>
  4025. Get4LinesOfPred_InterpBoth: ; 01234 Details for line 0
  4026. ; movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01> (done by the caller)
  4027. movq mm1,mm5 ; B. . <R08 R07 R06 R05 R04 R03 R02 R01>
  4028. xor ebx,ebx ; . . Clear ebx so that bl alone can index Pel_Rnd.
  4029. mov bl,[esi] ; C. . R00
  4030. psllq mm5,8 ; D. . <R07 R06 R05 R04 R03 R02 R01 0>
  4031. paddb mm5,mm1 ; E. . <R08+R07 ... R02+R01 R01>
  4032. paddb mm5,Pel_Rnd[ebx*8] ; F. . <R08+R07+1 ... R02+R01+1 R01+R00+1>
  4033. movq mm0,mm6 ; G. . Mask to extract each pel's frac bit.
  4034. pandn mm0,mm5 ; H. . <(R08+R07+1)&1 ...>
  4035. pand mm5,mm6 ; I. . Pre-clean
  ; Second entry point.  Requires mm0 = frac bits and mm5 = pre-cleaned sum of
  ; the line at esi, i.e. the state that steps A-I above leave behind.
  4036. Get4MoreLinesOfPred_InterpBoth: ; . .
  4037. movq mm2,[esi+ebp*1+1] ; A .
  4038. psrlq mm5,1 ; J. . <(R08+R07+1)/2 ... (R01+R00+1)/2)>
  4039. xor ebx,ebx ; . . Cleared again because this entry point skips the xor above.
  4040. movq mm1,mm2 ; B .
  4041. mov bl,[esi+ebp*1] ; C .
  4042. psllq mm2,8 ; D .
  4043. movq mm3,[esi+ebp*2+1] ; .A.
  4044. paddb mm2,mm1 ; E .
  4045. paddb mm2,Pel_Rnd[ebx*8] ; F .
  4046. movq mm1,mm3 ; .B.
  4047. mov bl,[esi+ebp*2] ; .C.
  4048. psllq mm3,8 ; .D.
  4049. movq mm4,[esi+PITCH*3+1] ; . A
  4050. paddb mm3,mm1 ; .E.
  4051. paddb mm3,Pel_Rnd[ebx*8] ; .F.
  4052. movq mm1,mm4 ; . B
  4053. mov bl,[esi+PITCH*3] ; . C
  4054. pand mm0,mm2 ; K. . <(R08+R07+1)&(R18+R17+1)&1 ...>
  4055. paddb mm0,mm5 ; L. . <(R08+R07+1+((R18+R17+1)&1))/2 ...>
  4056. psllq mm4,8 ; . D
  4057. movq mm5,[esi+ebp*4+1] ; . .A Fifth reference line (line 4), pels 1..8.
  4058. paddb mm4,mm1 ; . E
  4059. paddb mm4,Pel_Rnd[ebx*8] ; . F
  4060. movq mm1,mm5 ; . .B
  4061. mov bl,[esi+ebp*4] ; . .C
  4062. psllq mm5,8 ; . .D
  4063. paddb mm5,mm1 ; . .E
  4064. movq mm1,mm6 ; G .
  4065. pandn mm1,mm2 ; H .
  4066. pand mm2,mm6 ; I .
  4067. paddb mm5,Pel_Rnd[ebx*8] ; . .F
  4068. psrlq mm2,1 ; J .
  4069. paddb mm0,mm2 ; M. . <(R08+R07+R18+R17+2)/2 ...>
  4070. pand mm1,mm3 ; K .
  4071. paddb mm1,mm2 ; L .
  4072. movq mm2,mm6 ; .G.
  4073. pandn mm2,mm3 ; .H.
  4074. pand mm3,mm6 ; .I.
  4075. pand mm0,mm6 ; N. . Pre-clean
  4076. psrlq mm3,1 ; .J.
  4077. paddb mm1,mm3 ; M .
  4078. pand mm2,mm4 ; .K.
  4079. paddb mm2,mm3 ; .L.
  4080. movq mm3,mm6 ; . G
  4081. pandn mm3,mm4 ; . H
  4082. pand mm4,mm6 ; . I
  4083. pand mm3,mm5 ; . K
  4084. psrlq mm4,1 ; . J
  4085. paddb mm2,mm4 ; .M.
  4086. paddb mm3,mm4 ; . L
  4087. movq mm4,mm6 ; . .G
  4088. psrlq mm0,1 ; O. . P0 = <(R08+R07+R18+R17+2)/4 ...>
  4089. pandn mm4,mm5 ; . .H Frac bits of line 4, kept in mm4 for the next call.
  4090. pand mm5,mm6 ; . .I
  4091. pand mm1,mm6 ; N .
  4092. psrlq mm5,1 ; . .J
  4093. paddb mm3,mm5 ; . M
  4094. lea esi,[esi+ebp*4] ; Advance to next four lines.
  ; The six steps below are NOT executed here; the caller must perform them.
  ; At the ret, mm0 is finished, mm1 has had step N but not O, and mm2/mm3
  ; have had neither N nor O.
  4095. ; pand mm2,mm6 ; .N.
  4096. ; psrlq mm1,1 ; O .
  4097. ; pand mm3,mm6 ; . N
  4098. ; psrlq mm2,1 ; .O.
  4099. ; paddb mm5,mm5 ; . . Prepare for use for next 4 lines.
  4100. ; psrlq mm3,1 ; . O
  4101. ret
  4102. StackOffset TEXTEQU <4>
  4103. ; The steps commented out above are scheduled into the mem-ops the caller has
  4104. ; to do at the point of return. As though these ops were done, the registers
  4105. ; look as follows:
  4106. ; mm0 -- Prediction for line 0.
  4107. ; mm1 -- Prediction for line 1.
  4108. ; mm2 -- Prediction for line 2.
  4109. ; mm3 -- Prediction for line 3.
  4110. ; mm4 -- Must be moved to mm0 before computing prediction for next 4 lines.
  4111. ; mm5 -- Must be doubled before computing prediction for next 4 lines.
  4112. ; mm6 -- 8 bytes of 0xFE (only read, never written, by this function). Must
  ; be this when computing pred for next 4 lines.
  4113. ; mm7 -- Not referenced by this function; the header above expects 8 bytes
  ; of -1. (An earlier revision of these two notes said mm6 = 0x01 and
  ; mm7 = 0xFE, which does not match the code: mm6 is used as the 0xFE mask.)
  4114. ;=============================================================================
  4115. ENDIF
  4116. StackOffset TEXTEQU <0>
  4117. IFDEF H261
  4118. ELSE ;H263
  ; OBMCDifferencing (assembled only when H261 is not defined):
  ; Marks the current macroblock as having OBMC pending and, if the previous
  ; macroblock was already pending, finishes that previous macroblock now via
  ; DoPendingOBMCDiff.  Unless IsPlainPFrame is non-zero, it then also runs
  ; MMxDoBFrameLumaBlocks with edx temporarily stepped back by one macroblock
  ; action descriptor.  Every path ends at NextMacroBlock.
  4119. OBMCDifferencing:
  4120. mov al,PendingOBMC ; Do OBMC for previous block, if needed..
  4121. mov bl,1
  4122. test al,al ; Was OBMC pending from the previous macroblock?
  4123. mov PendingOBMC,bl ; Flag OBMC as pending (mov leaves the flags intact).
  4124. mov cl,INTER1MV
  4125. je NextMacroBlock ; Nothing was pending: go straight on.
  4126. mov StashBlockType,cl ; Block type handed on for the pending work is INTER1MV.
  4127. call DoPendingOBMCDiff
  4128. mov al,IsPlainPFrame
  4129. test al,al
  4130. jne NextMacroBlock ; Non-zero IsPlainPFrame: skip the B-frame luma work.
  4131. add edx,-SIZEOF T_MacroBlockActionDescr ; Step edx back one descriptor.
  4132. movq mm6,C0101010101010101 ; Constant loaded for MMxDoBFrameLumaBlocks.
  4133. pxor mm7,mm7 ; Initialize SWD accumulator
  4134. call MMxDoBFrameLumaBlocks
  4135. sub edx,-SIZEOF T_MacroBlockActionDescr ; Restore edx.
  4136. jmp NextMacroBlock
  4137. ENDIF
  4138. ;============================================================================
  4139. ; Calculate the IntraSWD
  4140. ;
  4141. ; ebp -- PITCH
  4142. ; esi -- Accumulation for IntraSWD
  4143. ; edi -- Address of target macroblock.
  4144. ; edx -- MBlockActionStream
  4145. ; ecx -- Scratch
  4146. ; ebx -- Amount IntraSWD has to be less than to be the winner.
  4147. ; eax -- Reserved. Holds coded blk pattern, (except undef when IntraByDecree).
  4148. ; mm7 -- SWD total for macroblock.
  4149. ; mm6 -- Average pel value for block 1.
  4150. ; mm5 -- Average pel value for block 2.
  4151. ; mm4 -- Average pel value for block 3.
  4152. ; mm3 -- Average pel value for block 4.
  4153. ; mm0-mm2 Scratch
  4154. ;
  ; NOTE(review): the mm3-mm7 descriptions above do not match the code below,
  ; which reloads mm1-mm7 with pel rows for every block and keeps the running
  ; total in esi; they look like leftovers from an earlier version -- confirm.
  ;
  ; What the code does, per 8*8 luma block:
  ;   1. Sums 32 of the 64 pels in a checkerboard pattern (even columns of even
  ;      lines, odd columns of odd lines) and scales the sum by 8, giving the
  ;      block average as an 8.8 fixed-point word replicated in all 4 words.
  ;   2. Subtracts that average from the other 32 pels (each held in the high
  ;      byte of a word), squares the differences with pmaddwd and accumulates
  ;      them with saturating word adds.
  ;   3. Takes the top 16 bits of the accumulated total as the block's measure,
  ;      adds it to esi and subtracts it from ebx.  As soon as ebx drops to zero
  ;      or below, control leaves for InterBestX.
  ; Blocks are visited in the order 1, 4, 2, 3, tracked in cl.  After block 3
  ; the code falls through into IntraBest.
  ; NOTE(review): the 8.8 average must survive a signed packssdw, which holds
  ; only if pel values are at most 127 -- presumably the 7-bit pel format;
  ; confirm.
  4155. IntraByDecree:
  4156. mov ebx,000080000H ; Set Inter SWD artificially high.
  4157. CalculateIntraSWD:
  4158. sub ebx,INTRACODINGDIFFERENTIAL ; Apply the INTRACODINGDIFFERENTIAL handicap to the threshold.
  4159. mov cl,1 ; Start with block 1.
  4160. movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
  4161. pcmpeqb mm5,mm5 ; All ones; becomes the word mask 0x00FF below.
  ; Loop entry.  Expects mm0 = line 0 of the block at edi, mm5 = all ones,
  ; cl = id of the block about to be processed.
  4162. ComputeIntraSWDForNextBlock:
  4163. movq mm2,[edi+ebp*2] ; <P27 P26 P25 P24 P23 P22 P21 P20>
  4164. psrlw mm5,8 ; 4 words of 0x00FF.
  4165. movq mm4,[edi+ebp*4] ; Line 4.
  4166. paddw mm0,mm2 ; <junk P06+P26 junk P04+P24 ...>
  4167. movq mm6,[edi+PITCH*6] ; Line 6.
  4168. pand mm0,mm5 ; <P06+P26 P04+P24 P02+P22 P00+P20>
  4169. movq mm1,[edi+ebp*1] ; <P17 P16 P15 P14 P13 P12 P11 P10>
  4170. paddw mm4,mm6 ; Lines 4+6.
  4171. movq mm3,[edi+PITCH*3] ; <P37 P36 P35 P34 P33 P32 P31 P30>
  4172. pand mm4,mm5 ; Even columns of lines 4+6.
  4173. movq mm5,[edi+PITCH*5] ; Line 5 (the mask in mm5 is no longer needed).
  4174. paddw mm1,mm3 ; <P17+P37 junk P15+P35 junk ...>
  4175. movq mm7,[edi+PITCH*7] ; Line 7.
  4176. psrlw mm1,8 ; <P17+P37 P15+P35 P13+P33 P11+P31>
  4177. paddw mm0,mm1
  4178. paddw mm5,mm7 ; Lines 5+7.
  4179. paddw mm0,mm4
  4180. psrlw mm5,8 ; Odd columns of lines 5+7.
  4181. paddw mm0,mm5 ; Each of the 4 words now sums 8 pels (32 pels in all).
  4182. pcmpeqw mm5,mm5 ; Get words of -1
  4183. movq mm4,[edi+ebp*4] ; Reload line 4 for the difference pass.
  4184. pmaddwd mm0,mm5 ; <SumHi = Sum3+Sum2 | SumLo = Sum1+Sum0> (negated: multiplier is -1)
  4185. pcmpeqw mm1,mm1
  4186. psllw mm3,8 ; <P36 0 P34 0 P32 0 P30 0>
  4187. movq mm5,[edi+PITCH*5] ; Reload line 5.
  4188. psllw mm1,3 ; 4 words of 0xFFF8
  4189. packssdw mm0,mm0 ; <SumHi | SumLo | SumHi | SumLo>
  4190. mov al,[edx].CodedBlocks ; Fetch coded block pattern.
  4191. pmaddwd mm0,mm1 ; <Sum = SumHi+SumLo | Sum = SumHi+SumLo> (times 8; -8 cancels the negation)
  4192. psllw mm5,8 ; Line 5: even-column pels moved to the high bytes.
  4193. movq mm1,[edi+ebp*1] ; Reload line 1.
  4194. psllw mm7,8 ; Line 7: even-column pels moved to the high bytes.
  4195. ;
  4196. psllw mm1,8 ; Line 1: even-column pels moved to the high bytes.
  4197. ;
  4198. packssdw mm0,mm0 ; <Sum | Sum | Sum | Sum>
  4199. psubw mm1,mm0 ; <P16-Avg frac P14-Avg frac ...>
  4200. psubw mm2,mm0 ; <P27-Avg frac P25-Avg frac ...>
  4201. pmaddwd mm1,mm1 ; Square of diff
  4202. psubw mm3,mm0
  4203. pmaddwd mm2,mm2
  4204. psubw mm4,mm0
  4205. pmaddwd mm3,mm3
  4206. psubw mm5,mm0
  4207. pmaddwd mm4,mm4
  4208. psubw mm6,mm0
  4209. psubw mm7,mm0
  4210. paddusw mm1,mm2 ; Accumulate squares (saturating, word-wise).
  4211. psubw mm0,[edi] ; Line 0 difference, formed as Avg - pel (sign is irrelevant once squared).
  4212. pmaddwd mm5,mm5
  4213. pmaddwd mm6,mm6
  4214. paddusw mm1,mm3
  4215. pmaddwd mm7,mm7
  4216. paddusw mm1,mm4
  4217. pmaddwd mm0,mm0
  4218. paddusw mm1,mm5
  4219. paddusw mm1,mm6
  4220. cmp cl,2 ; Flags are consumed by the jg and je below; the MMX, lea and mov in between leave them intact.
  4221. paddusw mm1,mm7
  4222. ;
  4223. paddusw mm0,mm1 ; mm0 = total of all 8 lines, as two dword halves.
  4224. ;
  4225. punpckldq mm1,mm0 ; High dword of mm1 = low dword of the total.
  4226. ;
  4227. paddusw mm0,mm1 ; High dword of mm0 = high half + low half of the total.
  4228. jg LowerBlkIntraDone ; cl > 2: just finished block 3 or block 4.
  4229. psrlq mm0,48 ; Keep only the top word of the total.
  4230. lea edi,[edi+ebp*8+8] ; Speculate going from blk 1 to blk 4
  4231. mov cl,4 ; Speculate that block 4 is next.
  4232. je Blk2IntraDone ; cl was 2: just finished block 2.
  4233. Blk1IntraDone:
  4234. movdf esi,mm0 ; Start the IntraSWD accumulation (movdf is a macro defined elsewhere).
  4235. sub ebx,esi
  4236. jle InterBestX ; Threshold used up.
  4237. movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
  4238. pcmpeqb mm5,mm5
  4239. jmp ComputeIntraSWDForNextBlock
  4240. LowerBlkIntraDone:
  4241. psrlq mm0,48 ; Keep only the top word of the total.
  4242. sub edi,PITCH*8 ; Speculate going from blk 4 to blk 2
  4243. cmp cl,3
  4244. je Blk3IntraDone ; For block 3 the same subtraction returns edi to block 1's address.
  4245. Blk4IntraDone:
  4246. movdf ecx,mm0
  4247. add esi,ecx ; Accumulate IntraSWD
  4248. sub ebx,ecx
  4249. jle InterBestX
  4250. movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
  4251. pcmpeqb mm5,mm5
  4252. mov cl,2 ; Block 2 is next.
  4253. jmp ComputeIntraSWDForNextBlock
  4254. Blk2IntraDone:
  4255. movdf ecx,mm0
  4256. add esi,ecx ; Accumulate IntraSWD
  4257. sub edi,16 ; Get to blk 3.
  4258. sub ebx,ecx
  4259. jle InterBestX
  4260. movq mm0,[edi] ; <P07 P06 P05 P04 P03 P02 P01 P00>
  4261. pcmpeqb mm5,mm5
  4262. mov cl,3 ; Block 3 is next (and last).
  4263. jmp ComputeIntraSWDForNextBlock
  4264. Blk3IntraDone:
  4265. movdf ecx,mm0
  4266. add esi,ecx ; Accumulate IntraSWD
  4267. sub ebx,ecx
  4268. jle InterBestX ; Otherwise fall through into IntraBest.
  ; IntraBest: the macroblock is to be coded INTRA.  Reached by falling through
  ; from the IntraSWD calculation above.
  ;   * Records esi (the IntraSWD) in the descriptor and adds it to SWDTotal.
  ;   * Sets BlockType to INTRA, zeroes the MVs of all four luma blocks, and
  ;     stores the coded block pattern with bit 7 cleared.
  ;   * (H263 build only) zeroes the best full-pel MB motion vector and, if
  ;     PendingOBMC is 1, clears it and finishes the previous macroblock's OBMC
  ;     differencing (and its B-frame luma blocks unless IsPlainPFrame).
  ;   * Calls MMxDoForwardDCT on each of the four luma blocks of the target
  ;     macroblock and updates CodedBlocks from the value left in bl.
  ;   * (H263 build only) runs MMxDoBFrameLumaBlocks unless IsPlainPFrame.
  ; NOTE(review): bl after MMxDoForwardDCT is presumably 1 when the block came
  ; out empty (so the subtraction clears that block's bit) and 0 otherwise --
  ; confirm in MMxDoForwardDCT.
  4269. IntraBest:
  4270. mov ecx,SWDTotal
  4271. and al,07FH ; Turn off FORCE-INTRA bit.
  4272. mov [edx].SWD,esi ; Record this macroblock's IntraSWD.
  4273. add ecx,esi ; Add to total.
  4274. mov SWDTotal,ecx
  4275. mov cl,INTRA
  4276. mov [edx].BlockType,cl ; Indicate macroblock handling decision.
  4277. xor ecx,ecx
  4278. mov [edx].BlkY1.MVs,ecx ; Zero the motion vectors of all four luma blocks.
  4279. mov [edx].BlkY2.MVs,ecx
  4280. mov [edx].BlkY3.MVs,ecx
  4281. mov [edx].BlkY4.MVs,ecx
  4282. mov [edx].CodedBlocks,al
  4283. IFDEF H261
  4284. ELSE ;H263
  4285. mov al,PendingOBMC ; Do Prev MB if it needs to be OBMC'ed.
  4286. mov [edx].BestFullPelMBHMV,cl ; Kill MVs so extended EMV of other
  4287. ; ; blocks will work right.
  4288. dec al ; Zero result means PendingOBMC was 1; the mov below keeps the flags.
  4289. mov [edx].BestFullPelMBVMV,cl
  4290. jne @f
  4291. mov PendingOBMC,al ; Go on to next MB, unless the prev MB
  4292. ; ; needs to be finished (OBMC).
  4293. mov cl,INTER1MV
  4294. mov StashBlockType,cl
  4295. call DoPendingOBMCDiff
  4296. mov al,IsPlainPFrame
  4297. test al,al
  4298. jne @f
  4299. add edx,-SIZEOF T_MacroBlockActionDescr ; Step edx back one descriptor.
  4300. movq mm6,C0101010101010101
  4301. pxor mm7,mm7 ; Initialize SWD accumulator
  4302. call MMxDoBFrameLumaBlocks
  4303. sub edx,-SIZEOF T_MacroBlockActionDescr ; Restore edx.
  4304. @@:
  4305. ENDIF
  4306. mov cl,INTRA
  4307. mov esi,TargetMacroBlockBaseAddr ; Luma block 1.
  4308. mov StashBlockType,cl
  4309. push eax ; Adjust stack pointer
  ; The pushed value itself is never used; it is discarded by the pop edi below.
  4310. StackOffset TEXTEQU <4>
  4311. call MMxDoForwardDCT
  4312. mov al,[edx].CodedBlocks
  4313. mov esi,TargetMacroBlockBaseAddr
  4314. sub al,bl ; Update bit 0 (block 1).
  4315. add esi,8 ; Luma block 2.
  4316. mov [edx].CodedBlocks,al
  4317. call MMxDoForwardDCT
  4318. shl bl,1 ; Move the result into bit 1 (block 2).
  4319. mov al,[edx].CodedBlocks
  4320. sub al,bl
  4321. mov esi,TargetMacroBlockBaseAddr
  4322. mov [edx].CodedBlocks,al
  4323. add esi,PITCH*8 ; Luma block 3.
  4324. call MMxDoForwardDCT
  4325. shl bl,2 ; Move the result into bit 2 (block 3).
  4326. mov al,[edx].CodedBlocks
  4327. sub al,bl
  4328. mov esi,TargetMacroBlockBaseAddr
  4329. mov [edx].CodedBlocks,al
  4330. add esi,PITCH*8+8 ; Luma block 4.
  4331. call MMxDoForwardDCT
  4332. shl bl,3 ; Move the result into bit 3 (block 4).
  4333. mov al,[edx].CodedBlocks
  4334. sub al,bl
  4335. pop edi ; Adjust stack pointer
  4336. StackOffset TEXTEQU <0>
  4337. mov [edx].CodedBlocks,al
  4338. IFDEF H261
  4339. ELSE
  4340. mov al,IsPlainPFrame
  4341. test al,al
  4342. jne NextMacroBlock ; Non-zero IsPlainPFrame: no B-frame work.
  4343. movq mm6,C0101010101010101
  4344. pxor mm7,mm7 ; Initialize SWD accumulator
  4345. call MMxDoBFrameLumaBlocks
  4346. ENDIF
  4347. jmp NextMacroBlock
  4348. IFDEF H261
  4349. ELSE; H263
  4350. StackOffset TEXTEQU <4>
  4351. DoPendingOBMCDiff: ; Internal function
  4352. ;============================================================================
  4353. ; Perform differencing for the non-empty luma blocks of an Inter-coded
  4354. ; macroblock. This is the OBMC case; i.e. Advanced Prediction is selected.
  ;
  ; The macroblock processed is the one BEFORE the descriptor edx points at
  ; (see PrevMBAD below).  For each luma block whose bit is set in CodedBlocks
  ; the code sets up the inputs of DoOBMCForBlock:
  ;   esi -- the block's own motion vectors
  ;   ecx -- distance to the descriptor of the block supplying the left MV
  ;   eax -- distance to the descriptor of the block supplying the right MV
  ;   DistToBADforBlockAbove / DistToBADforBlockBelow -- likewise, for the
  ;          blocks above and below
  ; Neighbours inside the same macroblock are fixed multiples of SIZEOF T_Blk;
  ; neighbours in other macroblocks come from the BlockToLeft, BlockToRight and
  ; BlockAbove tables, indexed by bits of MBEdgeType (1 = left, 2 = right,
  ; 4 = top).  Blocks 3 and 4 use distance 0 for the block below, i.e. the
  ; block itself.
  ; After each call, bl, shifted to the block's bit position, is subtracted
  ; from CodedBlocks.
  ; edx is advanced by SIZEOF T_Blk between blocks, so PrevMBAD tracks the
  ; block being processed; fields of the macroblock itself are then reached
  ; with a negative T_Blk offset.  edx is restored before the ret.
  ; Sets mm6 = 8 bytes of 0xFE and mm7 = 8 bytes of -1.
  ; NOTE(review): bl on return from DoOBMCForBlock is presumably 1 when the
  ; block came out empty and 0 otherwise -- confirm in DoOBMCForBlock.
  4355. PrevMBAD EQU [edx-SIZEOF T_MacroBlockActionDescr]
  4356. pcmpeqb mm6,mm6
  4357. pcmpeqb mm7,mm7 ; 8 bytes of -1
  4358. paddb mm6,mm6 ; 8 bytes of 0xFE
  4359. mov al,PrevMBAD.CodedBlocks ; Bits 0- 3 set for non-empty Y blks.
  4360. test al,1 ; Check if block 1 empty.
  4361. je OBMCDoneForBlock1
  4362. xor ebx,ebx
  4363. mov eax,SIZEOF T_Blk ; Blk to right is blk 2 of this MB.
  4364. mov bl,PrevMBAD.MBEdgeType
  4365. mov ecx,1 ; Mask to extract left edge indicator.
  4366. and ecx,ebx ; Extract left edge indicator.
  4367. and ebx,4 ; Extract top edge indicator.
  4368. mov esi,PrevMBAD.BlkY1.MVs
  4369. lea edi,[eax*2] ; Blk below is blk 3 of this MB.
  4370. mov DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.
  4371. mov edi,BlockAbove[ebx] ; Blk above is blk 3 of mb above, or off
  4372. ; ; upper edge.
  4373. mov ecx,BlockToLeft[ecx*4] ; Blk to left is blk 2 of mb to the
  4374. ; ; left, or off left edge.
  4375. mov DistToBADforBlockAbove,edi
  4376. call DoOBMCForBlock
  4377. mov al,PrevMBAD.CodedBlocks ; Bits 0- 3 set for non-empty Y blks.
  4378. sub al,bl ; Update bit 0 (block 1).
  4379. mov PrevMBAD.CodedBlocks,al
  4380. OBMCDoneForBlock1:
  4381. add edx,SIZEOF T_Blk ; PrevMBAD now addresses block 2.
  4382. test al,2 ; Check if block 2 empty.
  4383. je OBMCDoneForBlock2
  4384. xor ebx,ebx
  4385. mov eax,2 ; Mask to extract right edge indicator.
  4386. mov bl,PrevMBAD[-SIZEOF T_Blk].MBEdgeType
  4387. mov edi,2*SIZEOF T_Blk ; Blk below is blk 4 of this MB.
  4388. and eax,ebx ; Extract right edge indicator.
  4389. and ebx,4 ; Extract top edge indicator.
  4390. mov DistToBADforBlockBelow,edi ; Stash BAD offset for lower remote MV.
  4391. lea ecx,[edi-3*SIZEOF T_Blk] ; Blk to left is blk 1 of this MB.
  4392. mov eax,BlockToRight[eax*2] ; Blk to right is blk 1 of mb to the
  4393. ; ; right, or off right edge.
  4394. mov edi,BlockAbove[ebx] ; Blk above is blk 4 of mb above, or off
  4395. ; ; upper edge.
  4396. mov esi,PrevMBAD.BlkY1.MVs ; Block 2's own MVs (edx was advanced by one T_Blk).
  4397. mov DistToBADforBlockAbove,edi
  4398. call DoOBMCForBlock
  4399. shl bl,1 ; Move the result into bit 1 (block 2).
  4400. mov al,PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks
  4401. sub al,bl
  4402. mov PrevMBAD[-1*SIZEOF T_Blk].CodedBlocks,al
  4403. OBMCDoneForBlock2:
  4404. add edx,SIZEOF T_Blk ; PrevMBAD now addresses block 3.
  4405. test al,4 ; Check if block 3 empty.
  4406. je OBMCDoneForBlock3
  4407. xor ecx,ecx
  4408. xor ebx,ebx ; Blk below is this block.
  4409. mov cl,PrevMBAD[-2*SIZEOF T_Blk].MBEdgeType
  4410. mov eax,SIZEOF T_Blk ; Blk to right is blk 4 of this MB.
  4411. and ecx,1 ; Extract left edge indicator.
  4412. mov DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.
  4413. lea edi,[eax-3*SIZEOF T_Blk] ; Blk above is blk 1 of this MB.
  4414. mov esi,PrevMBAD.BlkY1.MVs ; Block 3's own MVs.
  4415. mov DistToBADforBlockAbove,edi
  4416. mov ecx,BlockToLeft[ecx*4] ; Blk to left is blk 1 of mb to the
  4417. ; ; left, or off left edge.
  ; NOTE(review): the same table entry gave "blk 2 of mb to the left" for block
  ; 1; with edx two T_Blk further on, it presumably yields blk 4 (not blk 1) of
  ; the mb to the left -- confirm against the BlockToLeft table.
  4418. call DoOBMCForBlock
  4419. shl bl,2 ; Move the result into bit 2 (block 3).
  4420. mov al,PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks
  4421. sub al,bl
  4422. mov PrevMBAD[-2*SIZEOF T_Blk].CodedBlocks,al
  4423. OBMCDoneForBlock3:
  4424. add edx,SIZEOF T_Blk ; PrevMBAD now addresses block 4.
  4425. test al,8 ; Check if block 4 empty.
  4426. je OBMCDoneForBlock4
  4427. xor eax,eax
  4428. xor ebx,ebx ; Blk below is this block.
  4429. mov al,PrevMBAD[-3*SIZEOF T_Blk].MBEdgeType
  4430. mov ecx,-SIZEOF T_Blk ; Blk to left is blk 3 of this MB.
  4431. and eax,2 ; Extract right edge indicator.
  4432. mov DistToBADforBlockBelow,ebx ; Stash BAD offset for lower remote MV.
  4433. lea edi,[ecx*2] ; Blk above is blk 2 of this MB.
  4434. mov esi,PrevMBAD.BlkY1.MVs ; Block 4's own MVs.
  4435. mov DistToBADforBlockAbove,edi
  4436. mov eax,BlockToRight[eax*2] ; Blk to right is blk 1 of mb to the
  4437. ; ; right, or off right edge.
  ; NOTE(review): the same table entry gave "blk 1 of mb to the right" for
  ; block 2; with edx two T_Blk further on, it presumably yields blk 3 (not
  ; blk 1) of the mb to the right -- confirm against the BlockToRight table.
  4438. call DoOBMCForBlock
  4439. shl bl,3 ; Move the result into bit 3 (block 4).
  4440. mov al,PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks
  4441. sub al,bl
  4442. mov PrevMBAD[-3*SIZEOF T_Blk].CodedBlocks,al
  4443. OBMCDoneForBlock4:
  4444. sub edx,3*SIZEOF T_Blk ; Get back to MacroBlock Action Descriptor
  4445. ret
  4446. StackOffset TEXTEQU <8>
  4447. DoOBMCForBlock: ; Internal Function
  4448. ; Present register contents.
  4449. ; ebp -- PITCH
  4450. ; esi -- Motion vectors for current block.
  4451. ; ecx -- Distance from BAD of blk we're doing to BAD for block that provides
  4452. ; remote MV from left.
  4453. ; eax -- Distance from BAD of blk we're doing to BAD for block that provides
  4454. ; remote MV from right.
  4455. ; edx -- MBlockActionStream, adjusted to reach BAD of blk we are doing OBMC to.
  4456. ; doing OBMC)
  4457. ; mm7 -- 8 bytes of -1.
  4458. ; mm6 -- 8 bytes of 0xFE.
  4459. ;
  4460. ; In the body of this code:
  4461. ;
  4462. ; edx -- Unchanged.
  4463. ; edi -- Saved to memory. Then used for address of destination for storing
  4464. ; remote prediction blocks.
  4465. ; ebp -- PITCH.
  4466. ; esi -- Pointer to 8*8, 8*9, 9*8, or 9*9 remote reference areas, which are
  4467. ; then interpolated and stored at edi.
  4468. ; ecx, eax -- Inputs are used, then these are scratch.
  4469. ; ebx -- Scratch
  4470. ; mm7 -- 8 bytes of -1
  4471. ; mm6 -- 8 bytes of 0xFE
  4472. ; mm0-mm5 -- Scratch
  4473. ; Compute left remote prediction block.
  4474. lea edi,PrevMBAD[ecx]
  4475. and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to left.
  4476. lea ebx,CentralPred
  4477. mov AddrOfLeftPred,ebx ; Speculate that left remote MV == center MV.
  4478. mov AddrOfRightPred,ebx ; Speculate that right remote MV == center MV.
  4479. mov bl,[edi].BlockType
  4480. cmp bl,INTRA
  4481. je LeftEqCtr ; Jump if INTRA. (Use central)
  4482. mov ebx,PrevMBAD[ecx].BlkY1.MVs
  4483. and ebx,00000FFFFH ; Blk to left may have B MVs set. Clear them.
  4484. cmp esi,ebx
  4485. je LeftEqCtr
  4486. mov edi,PrevMBAD[ecx].BlkY1.BlkOffset
  4487. mov esi,PrevMBAD[ecx].BlkY1.PastRef ; Get ref addr using left remote.
  4488. sub esi,edi
  4489. mov edi,PrevMBAD.BlkY1.BlkOffset
  4490. add esi,edi
  4491. lea edi,LeftPred
  4492. call GetPredForCenterLeftOrRight
  4493. pand mm2,mm6
  4494. psrlq mm1,1
  4495. movq [edi+32],mm0
  4496. psrlq mm2,1
  4497. movq [edi+40],mm1
  4498. pand mm3,mm6
  4499. movq [edi+48],mm2
  4500. psrlq mm3,1
  4501. lea ecx,PrevMBAD[eax]
  4502. and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.
  4503. mov esi,PrevMBAD.BlkY1.MVs
  4504. movq [edi+56],mm3
  4505. pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
  4506. ; Compute right remote prediction block.
  4507. mov AddrOfLeftPred,edi
  4508. mov bl,[ecx].BlockType
  4509. cmp bl,INTRA
  4510. je RightEqCtrButLeftNeCtr ; Jump if INTRA.(Use central)
  4511. mov ebx,PrevMBAD[eax].BlkY1.MVs
  4512. cmp esi,ebx
  4513. je RightEqCtrButLeftNeCtr
  4514. mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote.
  4515. mov edi,PrevMBAD[eax].BlkY1.BlkOffset
  4516. RightNeCtr:
  4517. sub esi,edi
  4518. mov edi,PrevMBAD.BlkY1.BlkOffset
  4519. add esi,edi
  4520. lea edi,RightPred
  4521. call GetPredForCenterLeftOrRight
  4522. pand mm2,mm6
  4523. psrlq mm1,1
  4524. movq [edi+32],mm0
  4525. psrlq mm2,1
  4526. movq [edi+40],mm1
  4527. pand mm3,mm6
  4528. movq [edi+48],mm2
  4529. psrlq mm3,1
  4530. mov AddrOfRightPred,edi
  4531. ;
  4532. movq [edi+56],mm3
  4533. pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
  4534. RightEqCtrButLeftNeCtr:
  4535. ; Compute central prediction block.
  4536. mov ebx,PrevMBAD.BlkY1.MVs
  4537. mov esi,PrevMBAD.BlkY1.PastRef
  4538. lea edi,CentralPred
  4539. mov eax,DistToBADforBlockBelow
  4540. call GetPredForCenterLeftOrRight
  4541. pand mm2,mm6
  4542. psrlq mm1,1
  4543. movq [edi+32],mm0
  4544. psrlq mm2,1
  4545. movq [edi+40],mm1
  4546. pand mm3,mm6
  4547. movq [edi+48],mm2
  4548. psrlq mm3,1
  4549. lea ecx,PrevMBAD[eax]
  4550. and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.
  4551. mov esi,PrevMBAD.BlkY1.MVs
  4552. movq [edi+56],mm3
  4553. pcmpeqb mm7,mm7
  4554. mov bl,[ecx].BlockType
  4555. mov ecx,PrevMBAD.BlkY1.BlkOffset
  4556. cmp bl,INTRA
  4557. je BelowEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
  4558. ; Compute bottom remote prediction block.
  4559. mov ebx,PrevMBAD[eax].BlkY1.MVs
  4560. mov edi,AddrOfLeftPred
  4561. cmp esi,ebx
  4562. jne BelowNeCtr
  4563. BelowEqCtrButSidesDiffer:
  4564. paddb mm1,mm1 ; Prep mm0-3, which have ctr, for reuse below.
  4565. paddb mm2,mm2
  4566. paddb mm3,mm3
  4567. mov edi,AddrOfLeftPred
  4568. jmp BelowEqCtr
  4569. BelowNeCtr:
  4570. mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
  4571. mov eax,PrevMBAD[eax].BlkY1.BlkOffset
  4572. sub esi,eax
  4573. lea eax,[ecx+ebp*4]
  4574. call GetPredForAboveOrBelow
  4575. BelowEqCtr:
  4576. ; Compute difference for lines 4 thru 7.
  4577. ; Lines 4 and 5: Cols 0,1,6, and 7 treated same. Cols 2-5 treated same.
  4578. mov esi,AddrOfRightPred
  4579. mov ebx,TargetFrameBaseAddress
  4580. movdt mm5,[edi+48] ; 6B: < 0 0 0 0 R63 R62 R61 R60>
  4581. pand mm2,mm6
  4582. punpckldq mm5,[esi+48+4] ; 6C: <L67 L66 L65 L64 R63 R62 R61 R60>
  4583. pand mm3,mm6
  4584. movq mm4,CFFFF00000000FFFF ; 6D: < FF FF 00 00 00 00 FF FF>
  4585. psrlq mm2,1 ; 6A: <B67 B66 B65 B64 B63 B62 B61 B60>
  4586. pand mm4,mm5 ; 6E: <L67 L66 00 00 00 00 R61 R60>
  4587. paddb mm5,mm2 ; 6F: <B67+L67 ... B65+L65 ...>
  4588. pand mm2,C0000FFFFFFFF0000 ; 6G: < 00 00 B65 B64 B63 B62 00 00>
  4589. psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>
  4590. paddb mm2,mm4 ; 6H: <L67 L66 B65 B64 B63 B62 R61 R60>
  4591. add ecx,ebx ; Address of target block.
  4592. movdt mm4,[edi+56] ; 7B: < 0 0 0 0 R73 R72 R71 R70>
  4593. psubb mm5,mm2 ; 6I: <B67 B66 L65 L64 R63 R62 B61 B60>
  4594. paddb mm5,CentralPred+48 ; 6J: <C67+B67 ... C65+L65 ...>
  4595. psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>
  4596. punpckldq mm4,[esi+56+4] ; 7C: <L77 L76 L75 L74 R73 R72 R71 R70>
  4597. pand mm5,mm6 ; 6K: <C67+B67 ... C65+L65 ...> pre-cleaned
  4598. mov eax,DistToBADforBlockAbove
  4599. psrlq mm5,1 ; 6L: <(C67+B67)/2 ... (C65+L65)/2 ...>
  4600. paddb mm2,mm5 ; 6M: <(C67+B67+2L67)/2 ...
  4601. ; ; (C65+2B65+L65)/2 ...>
  4602. lea ebx,PelDiffs
  4603. movq mm5,CFF000000000000FF ; 7D: < FF 00 00 00 00 00 00 FF>
  4604. pand mm2,mm6 ; 6N: pre-cleaned
  4605. pandn mm5,CentralPred+56 ; 7E: < 00 C76 C75 C74 C73 C72 C71 00>
  4606. psrlq mm2,1 ; 6O: <(C67+B67+2L67)/4 ...
  4607. ; ; (C65+2B65+L65)/4 ...>
  4608. paddb mm2,CentralPred+48 ; 6P: <(5C67+B67+2L67)/4 ...
  4609. ; ; (5C65+2B65+L65)/4 ...>
  4610. paddb mm5,mm4 ; 7F: <L77 C76+L76 ...>
  4611. pand mm4,CFF000000000000FF ; 7G: <L77 00 00 00 00 00 00 L70>
  4612. psubb mm2,mm7 ; 6Q: <(5C67+B67+2L67+4)/4 ...
  4613. ; ; (5C65+2B65+L65+4)/4 ...>
  4614. paddb mm4,mm5 ; 7H: <2L77 C76+L76 ...>
  4615. pand mm2,mm6 ; 6R: pre-cleaned
  4616. movq mm5,[ecx+PITCH*6] ; 6T: T6
  4617. psrlq mm2,1 ; 6S: P6 = <(5C67+B67+2L67+4)/8 ...
  4618. ; ; (5C65+2B65+L65+4)/8 ...>
  4619. psubb mm5,mm2 ; 6U: D6 = T6 - P6
  4620. ;
  4621. ; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0
  4622. movdt mm2,[edi+32] ; 4B: < 0 0 0 0 R43 R42 R41 R40>
  4623. pand mm4,mm6 ; 7I: <2L77 C76+L76 ...> pre-cleaned
  4624. movq [ebx+6*16],mm5 ; 6V: Store D6.
  4625. psrlq mm4,1 ; 7J: <2L77/2 (C76+L76)/2 ...>
  4626. punpckldq mm2,[esi+32+4] ; 4C: <L47 L46 L45 L44 R43 R42 R41 R40>
  4627. paddb mm3,mm4 ; 7K: <(2B77+2L77)/2 (C76+2B76+L76)/2 ...>
  4628. movq mm5,CFFFF00000000FFFF ; 4D: < FF FF 00 00 00 00 FF FF>
  4629. pand mm3,mm6 ; 7L: pre-cleaned
  4630. movq mm4,CentralPred+32 ; 4E: <C47 C46 C45 C44 C43 C42 C41 C40>
  4631. psrlq mm3,1 ; 7M: <(2B77+2L77)/4 (C76+2B76+L76)/4 ...>
  4632. paddb mm3,CentralPred+56 ; 7N: <(4C77+2B77+2L77)/4
  4633. ; ; (5C76+2B76+L76)/4 ...>
  4634. pand mm5,mm4 ; 4F: <C47 C46 00 00 00 00 C41 C40>
  4635. psubb mm3,mm7 ; 7O: <(4C77+2B77+2L77+4)/4
  4636. ; ; (5C76+2B76+L76+4)/4 ...>
  4637. paddb mm4,mm2 ; 4G: <C47+L47 ... C45+L45 ...>
  4638. pand mm2,C0000FFFFFFFF0000 ; 4H: < 00 00 L45 L44 R43 R42 00 00>
  4639. pand mm3,mm6 ; 7P: <(4C77+2B77+2L77+4)/4
  4640. ; ; (5C76+2B76+L76+4)/4 ...> pre-cleaned
  4641. paddb mm2,mm5 ; 4I: <C47 C46 L45 L44 R43 R42 C41 C40>
  4642. psrlq mm3,1 ; 7Q: P7 = <(4C77+2B77+2L77+4)/8
  4643. ; ; (5C76+2B76+L76+4)/8 ...>
  4644. movdt mm5,[edi+40] ; 5B: < 0 0 0 0 R53 R52 R51 R50>
  4645. psubb mm4,mm2 ; 4J: <L47 L46 C45 C44 C43 C42 R41 R40>
  4646. punpckldq mm5,[esi+40+4] ; 5C: <L57 L56 L55 L54 R53 R52 R51 R50>
  4647. paddb mm0,mm2 ; 4K: <C47+B47 ... B45+L45 ...>
  4648. movq mm2,[ecx+PITCH*7] ; 7R: T7
  4649. pand mm0,mm6 ; 4L: <C47+B47 ... B45+L45 ...> pre-cleaned
  4650. psubb mm2,mm3 ; 7S: D7 = T7 - P7
  4651. psrlq mm0,1 ; 4M: <(C47+B47)/2 ... (B45+L45)/2 ...>
  4652. movq mm3,CFFFF00000000FFFF ; 5D: < FF FF 00 00 00 00 FF FF>
  4653. paddb mm0,mm4 ; 4N: <(C47+B47+2L47)/2 ...
  4654. ; ; (2C45+B45+L45)/2 ...>
  4655. movq mm4,CentralPred+40 ; 5E: <C57 C56 C55 C54 C53 C52 C51 C50>
  4656. pand mm0,mm6 ; 4O: pre-cleaned
  4657. pand mm3,mm4 ; 5F: <C57 C56 00 00 00 00 C51 C50>
  4658. paddb mm4,mm5 ; 5G: <C57+L57 ... C55+L55 ...>
  4659. pand mm5,C0000FFFFFFFF0000 ; 5H: < 00 00 L55 L54 R53 R52 00 00>
  4660. psrlq mm0,1 ; 4P: <(C47+B47+2L47)/4 ...
  4661. ; ; (2C45+B45+L45)/4 ...>
  4662. paddb mm0,CentralPred+32 ; 4Q: <(5C47+B47+2L47)/4 ...
  4663. ; ; (6C45+B45+L45)/4 ...>
  4664. paddb mm5,mm3 ; 5I: <C57 C56 L55 L54 R53 R52 C51 C50>
  4665. psubb mm4,mm5 ; 5J: <L57 L56 C55 C54 C53 C52 R51 R50>
  4666. paddb mm1,mm5 ; 5K: <C57+B57 ... B55+L55 ...>
  4667. pand mm1,mm6 ; 5L: <C57+B57 ... B55+L55 ...> pre-cleaned
  4668. psubb mm0,mm7 ; 4R: <(5C47+B47+2L47+4)/4 ...
  4669. ; ; (6C45+B45+L45+4)/4 ...>
  4670. pand mm0,mm6 ; 4S: pre-cleaned
  4671. psrlq mm1,1 ; 5M: <(C57+B57)/2 ... (B55+L55)/2 ...>
  4672. paddb mm1,mm4 ; 5N: <(C57+B57+2L57)/2 ...
  4673. ; ; (2C55+B55+L55)/2 ...>
  4674. psrlq mm0,1 ; 4T: P4 = <(5C47+B47+2L47+4)/8 ...
  4675. ; ; (6C45+B45+L45+4)/8 ...>
  4676. movq mm3,[ecx+PITCH*5] ; 5U: T5
  4677. pand mm1,mm6 ; 5O: pre-cleaned
  4678. movq mm4,[ecx+ebp*4] ; 4U: T4
  4679. psrlq mm1,1 ; 5P: <(C57+B57+2L57)/4 ...
  4680. ; ; (2C55+B55+L55)/4 ...>
  4681. paddb mm1,CentralPred+40 ; 5Q: <(5C57+B57+2L57)/4 ...
  4682. ; ; (6C55+B55+L55)/4 ...>
  4683. psubb mm4,mm0 ; 4V: D4 = T4 - P4
  4684. lea esi,PrevMBAD[eax]
  4685. psubb mm1,mm7 ; 5R: <(5C57+B57+2L57+4)/4 ...
  4686. ; ; (6C55+B55+L55+4)/4 ...>
  4687. and esi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
  4688. pand mm1,mm6 ; 5S: pre-cleaned
  4689. movq [ebx+7*16],mm2 ; 7T
  4690. psrlq mm1,1 ; 5T: P5 = <(5C57+B57+2L57+4)/8 ...
  4691. ; ; (6C55+B55+L55+4)/8 ...>
  4692. movq [ebx+4*16],mm4 ; 4W: Store D4.
  4693. psubb mm3,mm1 ; 5V: D5 = T5 - P5
  4694. mov cl,[esi].BlockType ; Bottom bit set if above neighbor is INTRA.
  4695. mov esi,PrevMBAD.BlkY1.MVs
  4696. movq [ebx+5*16],mm3 ; 5W: Store D5.
  4697. cmp cl,INTRA
  4698. je AboveEqCtrButSidesDiffer ; Jump if INTRA. (Use central)
  4699. ; Compute top remote prediction block.
  4700. mov ebx,PrevMBAD[eax].BlkY1.MVs
  4701. and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them.
  4702. mov ecx,PrevMBAD.BlkY1.BlkOffset
  4703. cmp esi,ebx
  4704. jne AboveNeCtr
  4705. AboveEqCtrButSidesDiffer:
  4706. movq mm3,CentralPred+24 ; Prep mm0-3, which have ctr, for reuse below.
  4707. movq mm2,CentralPred+16
  4708. paddb mm3,mm3
  4709. movq mm1,CentralPred+8
  4710. paddb mm2,mm2
  4711. movq mm0,CentralPred
  4712. paddb mm1,mm1
  4713. mov ecx,PrevMBAD.BlkY1.BlkOffset
  4714. jmp AboveEqCtr
  4715. AboveNeCtr:
  4716. mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
  4717. mov eax,PrevMBAD[eax].BlkY1.BlkOffset
  4718. sub esi,eax
  4719. mov eax,ecx
  4720. call GetPredForAboveOrBelow
  4721. AboveEqCtr:
  4722. ; Compute difference for lines 0 thru 3.
  4723. mov esi,AddrOfRightPred
  4724. mov ebx,TargetFrameBaseAddress
  4725. movdt mm5,[edi+8] ; 1B: < 0 0 0 0 R13 R12 R11 R10>
  4726. psrlq mm1,1 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10>
  4727. punpckldq mm5,[esi+8+4] ; 1C: <L17 L16 L15 L14 R13 R12 R11 R10>
  4728. pand mm3,mm6
  4729. movq mm4,CFFFF00000000FFFF ; 1D: < FF FF 00 00 00 00 FF FF>
  4730. psrlq mm3,1 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>: mm0
  4731. pand mm4,mm5 ; 1E: <L17 L16 00 00 00 00 R11 R10>
  4732. paddb mm5,mm1 ; 1F: <A17+L17 ... A15+L15 ...>
  4733. pand mm1,C0000FFFFFFFF0000 ; 1G: < 00 00 A15 A14 A13 A12 00 00>
  4734. pand mm2,mm6
  4735. paddb mm5,CentralPred+8 ; 1H: <C17+A17+L17 ... C15+A15+L15 ...>
  4736. paddb mm1,mm4 ; 1I: <L17 L16 A15 A14 A13 A12 R11 R10>
  4737. ; 0A: <A07 A06 A05 A04 A03 A02 A01 A00>:mm0
  4738. movdt mm4,[edi] ; 0B: < 0 0 0 0 R03 R02 R01 R00>
  4739. psubb mm5,mm1 ; 1J: <C17+A17 ... C15+L15 ...>
  4740. punpckldq mm4,[esi+4] ; 0C: <L07 L06 L05 L04 R03 R02 R01 R00>
  4741. pand mm5,mm6 ; 1K: <C17+A17 ... C15+L15 ...> pre-cleaned
  4742. add ecx,ebx ; Address of target block.
  4743. psrlq mm5,1 ; 1L: <(C17+A17)/2 ... (C15+L15)/2 ...>
  4744. paddb mm1,mm5 ; 1M: <(C17+A17+2L17)/2 ...
  4745. ; ; (C15+2A15+L15)/2 ...>
  4746. psrlq mm2,1 ; 2A: <A27 A26 A25 A24 A23 A22 A21 A20>
  4747. movq mm5,CFF000000000000FF ; 0D: < FF 00 00 00 00 00 00 FF>
  4748. pand mm1,mm6 ; 1N: pre-cleaned
  4749. pandn mm5,CentralPred ; 0E: < 00 C06 C05 C04 C03 C02 C01 00>
  4750. psrlq mm1,1 ; 1O: <(C17+A17+2L17)/4 ...
  4751. ; ; (C15+2A15+L15)/4 ...>
  4752. paddb mm1,CentralPred+8 ; 1P: <(5C17+A17+2L17)/4 ...
  4753. ; ; (5C15+2A15+L15)/4 ...>
  4754. paddb mm5,mm4 ; 0F: <L07 C06+L06 ...>
  4755. pand mm4,CFF000000000000FF ; 0G: <L07 00 00 00 00 00 00 L00>
  4756. psubb mm1,mm7 ; 1Q: <(5C17+A17+2L17+4)/4 ...
  4757. ; ; (5C15+2A15+L15+4)/4 ...>
  4758. paddb mm4,mm5 ; 0H: <2L07 C06+L06 ...>
  4759. pand mm1,mm6 ; 1R: pre-cleaned
  4760. movq mm5,[ecx+ebp*1] ; 1T: T1
  4761. psrlq mm1,1 ; 1S: P1 = <(5C17+A17+2L17+4)/8 ...
  4762. ; ; (5C15+2A15+L15+4)/8 ...>
  4763. psubb mm5,mm1 ; 1U: D1 = T1 - P1
  4764. ;
  4765. movdt mm1,[edi+24] ; 3B: < 0 0 0 0 R33 R32 R31 R30>
  4766. pand mm4,mm6 ; 0I: <2L07 C06+L06 ...> pre-cleaned
  4767. movq PelDiffsLine1,mm5 ; 1V: Store D1.
  4768. psrlq mm4,1 ; 0J: <2L07/2 (C06+L06)/2 ...>
  4769. punpckldq mm1,[esi+24+4] ; 3C: <L37 L36 L35 L34 R33 R32 R31 R30>
  4770. paddb mm0,mm4 ; 0K: <(2A07+2L07)/2 (C06+2A06+L06)/2 ...>
  4771. movq mm5,CFFFF00000000FFFF ; 3D: < FF FF 00 00 00 00 FF FF>
  4772. pand mm0,mm6 ; 0L: pre-cleaned
  4773. movq mm4,CentralPred+24 ; 3E: <C37 C36 C35 C34 C33 C32 C31 C30>
  4774. psrlq mm0,1 ; 0M: <(2A07+2L07)/4 (C06+2A06+L06)/4 ...>
  4775. paddb mm0,CentralPred ; 0N: <(4C07+2A07+2L07)/4
  4776. ; ; (5C06+2A06+L06)/4 ...>
  4777. pand mm5,mm4 ; 3F: <C37 C36 00 00 00 00 C31 C30>
  4778. psubb mm0,mm7 ; 0O: <(4C07+2A07+2L07+4)/4
  4779. ; ; (5C06+2A06+L06+4)/4 ...>
  4780. paddb mm4,mm1 ; 3G: <C37+L37 ... C35+L35 ...>
  4781. pand mm1,C0000FFFFFFFF0000 ; 3H: < 00 00 L35 L34 R33 R32 00 00>
  4782. pand mm0,mm6 ; 0P: <(4C07+2A07+2L07+4)/4
  4783. ; ; (5C06+2A06+L06+4)/4 ...> pre-cleaned
  4784. paddb mm1,mm5 ; 3I: <C37 C36 L35 L34 R33 R32 C31 C30>
  4785. psrlq mm0,1 ; 0Q: P0 = <(4C07+2A07+2L07+4)/8
  4786. ; ; (5C06+2A06+L06+4)/8 ...>
  4787. movdt mm5,[edi+16] ; 2B: < 0 0 0 0 R23 R22 R21 R20>
  4788. psubb mm4,mm1 ; 3J: <L37 L36 C35 C34 C33 C32 R31 R30>
  4789. punpckldq mm5,[esi+16+4] ; 2C: <L27 L26 L25 L24 R23 R22 R21 R20>
  4790. paddb mm3,mm1 ; 3K: <C37+A37 ... A35+L35 ...>
  4791. movq mm1,[ecx] ; 0R: T0
  4792. pand mm3,mm6 ; 3L: <C37+A37 ... A35+L35 ...> pre-cleaned
  4793. psubb mm1,mm0 ; 0S: D0 = T0 - P0
  4794. psrlq mm3,1 ; 3M: <(C37+A37)/2 ... (A35+L35)/2 ...>
  4795. movq mm0,CFFFF00000000FFFF ; 2D: < FF FF 00 00 00 00 FF FF>
  4796. paddb mm3,mm4 ; 3N: <(C37+A37+2L37)/2 ...
  4797. ; ; (2C35+A35+L35)/2 ...>
  4798. movq mm4,CentralPred+16 ; 2E: <C27 C26 C25 C24 C23 C22 C21 C20>
  4799. pand mm3,mm6 ; 3O: pre-cleaned
  4800. pand mm0,mm4 ; 2F: <C27 C26 00 00 00 00 C21 C20>
  4801. paddb mm4,mm5 ; 2G: <C27+L27 ... C25+L25 ...>
  4802. pand mm5,C0000FFFFFFFF0000 ; 2H: < 00 00 L25 L24 R23 R22 00 00>
  4803. psrlq mm3,1 ; 3P: <(C37+A37+2L37)/4 ...
  4804. ; ; (2C35+A35+L35)/4 ...>
  4805. paddb mm3,CentralPred+24 ; 3Q: <(5C37+A37+2L37)/4 ...
  4806. ; ; (6C35+A35+L35)/4 ...>
  4807. paddb mm5,mm0 ; 2I: <C27 C26 L25 L24 R23 R22 C21 C20>
  4808. psubb mm4,mm5 ; 2J: <L27 L26 C25 C24 C23 C22 R21 R20>
  4809. paddb mm2,mm5 ; 2K: <C27+A27 ... A25+L25 ...>
  4810. pand mm2,mm6 ; 2L: <C27+A27 ... A25+L25 ...> pre-cleaned
  4811. psubb mm3,mm7 ; 3R: <(5C37+A37+2L37+4)/4 ...
  4812. ; ; (6C35+A35+L35+4)/4 ...>
  4813. pand mm3,mm6 ; 3S: pre-cleaned
  4814. psrlq mm2,1 ; 2M: <(C27+A27)/2 ... (A25+L25)/2 ...>
  4815. paddb mm2,mm4 ; 2N: <(C27+A27+2L27)/2 ...
  4816. ; ; (2C25+A25+L25)/2 ...>
  4817. psrlq mm3,1 ; 3T: P3 = <(5C37+A37+2L37+4)/8 ...
  4818. ; ; (6C35+A35+L35+4)/8 ...>
  4819. movq mm0,[ecx+ebp*2] ; 2U: T2
  4820. pand mm2,mm6 ; 2O: pre-cleaned
  4821. movq mm4,[ecx+PITCH*3] ; 3U: T3
  4822. psrlq mm2,1 ; 2P: <(C27+A27+2L27)/4 ...
  4823. ; ; (2C25+A25+L25)/4 ...>
  4824. paddb mm2,CentralPred+16 ; 2Q: <(5C27+A27+2L27)/4 ...
  4825. ; ; (6C25+A25+L25)/4 ...>
  4826. psubb mm4,mm3 ; 3V: D3 = T3 - P3
  4827. movq PelDiffsLine0,mm1 ; 0T
  4828. psubb mm2,mm7 ; 2R: <(5C27+A27+2L27+4)/4 ...
  4829. ; ; (6C25+A25+L25+4)/4 ...>
  4830. movq PelDiffsLine3,mm4 ; 3W: Store D3.
  4831. pand mm2,mm6 ; 2S: pre-cleaned
  4832. psrlq mm2,1 ; 2T: P2 = <(5C27+A27+2L27+4)/8 ...
  4833. ; ; (6C25+A25+L25+4)/8 ...>
  4834. ;
  4835. psubb mm0,mm2 ; 2V: D2 = T2 - P2
  4836. ;
  4837. ;
  4838. ;
  4839. movq PelDiffsLine2,mm0 ; 2W: Store D2.
  4840. ;
  4841. jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
  4842. LeftEqCtr:
  4843. ; Left remote motion vector was same as center.
  4844. ; Compute right remote prediction block.
  4845. lea edi,PrevMBAD[eax] ; eax is set before this label (not visible here); presumably dist to right blk's BAD.
  4846. and edi,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk to right.
  4847. mov esi,PrevMBAD.BlkY1.MVs ; Central block's motion vectors.
  4848. ; The AND above masks the block address down to its macroblock descriptor
  4849. ; (assumes descriptors are size-aligned) so that BlockType can be read.
  4850. mov cl,[edi].BlockType
  4851. mov ebx,PrevMBAD[eax].BlkY1.MVs ; Right remote block's motion vectors.
  4852. cmp cl,INTRA
  4853. je LeftEqCtrAndRightEqCtr ; Jump if INTRA. (Use central)
  4854. cmp esi,ebx ; Central vs. right remote MVs. The MOVs below leave the flags intact.
  4855. mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using right remote.
  4856. mov edi,PrevMBAD[eax].BlkY1.BlkOffset ; Right remote block's BlkOffset.
  4857. jne RightNeCtr ; MVs differ: go to RightNeCtr. Otherwise fall through.
  4858. ; Left and right remote motion vectors were same as center.
  4859. ; Compute central prediction block.
  4860. LeftEqCtrAndRightEqCtr:
  4861. mov ebx,PrevMBAD.BlkY1.MVs ; Central MVs, consumed by the call below.
  4862. mov esi,PrevMBAD.BlkY1.PastRef ; Central reference block address.
  4863. lea edi,CentralPred ; Where the central prediction is written.
  4864. mov eax,DistToBADforBlockBelow ; Used after the call to locate the block below.
  4865. call GetPredForCenterLeftOrRight ; Lines 0-3 stored at [edi+0..24]; lines 4-7 come back in mm0-mm3.
  4866. pand mm2,mm6 ; Line 6: pre-clean before the final shift.
  4867. psrlq mm1,1 ; Line 5: final >>1.
  4868. movq [edi+32],mm0 ; Store central line 4.
  4869. psrlq mm2,1 ; Line 6: final >>1.
  4870. movq [edi+40],mm1 ; Store central line 5.
  4871. pand mm3,mm6 ; Line 7: pre-clean before the final shift.
  4872. movq [edi+48],mm2 ; Store central line 6.
  4873. psrlq mm3,1 ; Line 7: final >>1.
  4874. lea ecx,PrevMBAD[eax]
  4875. and ecx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk below.
  4876. mov esi,PrevMBAD.BlkY1.MVs ; Central MVs, for comparison with the block below.
  4877. movq [edi+56],mm3 ; Store central line 7.
  4878. pcmpeqb mm7,mm7 ; . . Restore 8 bytes of -1
  4879. mov bl,[ecx].BlockType ; Block type of the macroblock holding the block below.
  4880. mov ecx,PrevMBAD.BlkY1.BlkOffset ; Central block's BlkOffset.
  4881. cmp bl,INTRA
  4882. mov edi,AddrOfLeftPred
  4883. mov ebx,PrevMBAD[eax].BlkY1.MVs ; MVs of the block below. (Flags from the CMP are unaffected.)
  4884. je BottomHalfAllSame ; Jump if INTRA. (Use central)
  4885. ; Compute bottom remote prediction block.
  4886. cmp esi,ebx ; Central vs. below MVs. The MOVs that follow leave the flags intact.
  4887. mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using below remote.
  4888. mov eax,PrevMBAD[eax].BlkY1.BlkOffset ; BlkOffset of the block below.
  4889. je BottomHalfAllSame ; Same MVs: central lines 4-7 in mm0-mm3 are used as is.
  4890. sub esi,eax ; esi = below block's PastRef minus its BlkOffset.
  4891. lea eax,[ecx+ebp*4] ; eax = central BlkOffset + 4 lines; the callee adds eax to esi.
  4892. call GetPredForAboveOrBelow ; Returns 4 lines of the below-remote prediction ("B") in mm0-mm3.
  4893. ; Compute difference for lines 4 thru 7. Only the remote motion vector below
  4894. ; was different than the central motion vector.
  4895. ; 4A: <B47 B46 B45 B44 B43 B42 B41 B40>: mm0
  4896. movq mm5,CentralPred+48 ; 6b: <C67 C66 C65 C64 C63 C62 C61 C60>
  4897. pand mm2,mm6 ; . . pre-clean line 6 ahead of 6a
  4898. movq mm4,CentralPred+32 ; 4B: <C47 C46 C45 C44 C43 C42 C41 C40>
  4899. psrlq mm2,1 ; 6a: <B67 B66 B65 B64 B63 B62 B61 B60>
  4900. paddb mm2,mm5 ; 6c: <C67+B67 ... C65+B65 ...>
  4901. paddb mm0,mm4 ; 4C: <C47+B47>
  4902. pand mm0,mm6 ; 4D: <C47+B47> pre-cleaned
  4903. psrlq mm1,1 ; 5A: <B57 B56 B55 B54 B53 B52 B51 B50>
  4904. pand mm2,mm6 ; 6d: <C67+B67 ... C65+B65 ...> pre-cleaned
  4905. psrlq mm0,1 ; 4E: <(C47+B47)/2 ...>
  4906. paddb mm0,mm4 ; 4F: <(3C47+B47)/2 ...>
  4907. psrlq mm2,1 ; 6e: <(C67+B67)/2 ... (C65+B65)/2 ...>
  4908. pmullw mm2,C0001000200020001 ; 6f: <(C67+B67)/2 ... (2C65+2B65)/2 ...>
  4909. pand mm0,mm6 ; 4G: <(3C47+B47)/2 ...> pre-cleaned
  4910. pand mm3,mm6 ; . . pre-clean line 7 ahead of 7A
  4911. psrlq mm0,1 ; 4H: <(3C47+B47)/4 ...>
  4912. paddb mm0,mm4 ; 4I: <(7C47+B47)/4 ...>
  4913. psrlq mm3,1 ; 7A: <B77 B76 B75 B74 B73 B72 B71 B70>
  4914. movq mm4,C0000FFFFFFFF0000 ; 6g: < 00 00 FF FF FF FF 00 00>
  4915. psubb mm0,mm7 ; 4J: <(7C47+B47+4)/4 ...>
  4916. pandn mm4,mm5 ; 6h: <C67 C66 00 00 00 00 C61 C60>
  4917. psubb mm5,mm7 ; 6i: <C67+1 ... C65+1 ...>
  4918. paddb mm2,mm4 ; 6j: <(3C67+B67)/2 ... (2C65+2B65)/2 ...>
  4919. pand mm0,mm6 ; 4K: <(7C47+B47+4)/4 ...> pre-cleaned
  4920. movq mm4,CentralPred+40 ; 5B: <C57 C56 C55 C54 C53 C52 C51 C50>
  4921. pand mm2,mm6 ; 6k: pre-cleaned
  4922. paddb mm1,mm4 ; 5C: <C57+B57 ...>
  4923. psrlq mm0,1 ; 4L: <(7C47+B47+4)/8 ...>
  4924. pand mm1,mm6 ; 5D: <C57+B57 ...> pre-cleaned
  4925. psrlq mm2,1 ; 6l: <(3C67+B67)/4 ... (2C65+2B65)/4 ...>
  4926. paddb mm2,mm5 ; 6m: <(7C67+B67+4)/4 ... (6C65+2B65+4)/4...>
  4927. psrlq mm1,1 ; 5E: <(C57+B57)/2 ...>
  4928. movq mm5,CentralPred+56 ; 7B: <C77 C76 C75 C74 C73 C72 C71 C70>
  4929. paddb mm1,mm4 ; 5F: <(3C57+B57)/2 ...>
  4930. paddb mm3,mm5 ; 7C: <C77+B77>
  4931. pand mm1,mm6 ; 5G: <(3C57+B57)/2 ...> pre-cleaned
  4932. pand mm3,mm6 ; 7D: <C77+B77> pre-cleaned
  4933. psrlq mm1,1 ; 5H: <(3C57+B57)/4 ...>
  4934. paddb mm1,mm4 ; 5I: <(7C57+B57)/4 ...>
  4935. psrlq mm3,1 ; 7E: <(C77+B77)/2 ...>
  4936. psubb mm1,mm7 ; 5J: <(7C57+B57+4)/4 ...>
  4937. paddb mm3,mm5 ; 7F: <(3C77+B77)/2 ...>
  4938. pand mm1,mm6 ; 5K: <(7C57+B57+4)/4 ...> pre-cleaned
  4939. psubb mm3,mm7 ; 7G: <(3C77+B77+2)/2 ...>
  4940. pand mm2,mm6 ; 6n: pre-cleaned
  4941. psrlq mm1,1 ; 5L: <(7C57+B57+4)/8 ...>
  4942. pand mm3,mm6 ; 7H: <(3C77+B77+2)/2 ...> pre-cleaned
  4943. psrlq mm2,1 ; 6o: <(7C67+B67+4)/8 ... (6C65+2B65+4)/8...>
  4944. psrlq mm3,1 ; 7I: <(3C77+B77+2)/4 ...>
  4945. BottomHalfAllSame: ; In (on the paths visible here): mm0-mm3 = P4..P7, ecx = central BlkOffset.
  4946. mov ebx,TargetFrameBaseAddress
  4947. mov eax,DistToBADforBlockAbove ; Used below to locate the block above.
  4948. mov esi,PrevMBAD.BlkY1.MVs ; Central MVs, for comparison with the block above.
  4949. movq mm5,[ecx+ebx+PITCH*5] ; 5M: T5
  4950. add ecx,ebx ; Address of target block.
  4951. lea ebx,PrevMBAD[eax]
  4952. and ebx,-SIZEOF T_MacroBlockActionDescr ; Addr of MBD for blk above.
  4953. psubb mm5,mm1 ; 5N: D5 = T5 - P5
  4954. movq mm4,[ecx+ebp*4] ; 4M: T4
  4955. movq mm1,[ecx+PITCH*7] ; 7J: T7
  4956. psubb mm4,mm0 ; 4N: D4 = T4 - P4
  4957. movq mm0,[ecx+PITCH*6] ; 6p: T6
  4958. psubb mm1,mm3 ; 7K: D7 = T7 - P7
  4959. movq PelDiffsLine4,mm4 ; 4O: Store D4.
  4960. psubb mm0,mm2 ; 6q: D6 = T6 - P6
  4961. movq PelDiffsLine5,mm5 ; 5O: Store D5.
  4962. movq PelDiffsLine6,mm0 ; 6r: Store D6.
  4963. movq PelDiffsLine7,mm1 ; 7L: Store D7.
  4964. mov cl,[ebx].BlockType ; Block type of the macroblock holding the block above.
  4965. cmp cl,INTRA
  4966. mov ecx,PrevMBAD.BlkY1.BlkOffset ; Central block's BlkOffset. (MOV leaves the flags intact.)
  4967. mov ebx,PrevMBAD[eax].BlkY1.MVs ; MVs of the block above.
  4968. je SidesEqCtrAndAboveEqCtr ; Jump if INTRA. (Use central)
  4969. ; Compute top remote prediction block.
  4970. and ebx,00000FFFFH ; Blk above may have B MVs set. Clear them.
  4971. cmp esi,ebx ; Central vs. above MVs. The MOVs that follow leave the flags intact.
  4972. mov esi,PrevMBAD[eax].BlkY1.PastRef ; Get ref addr using above remote.
  4973. mov eax,PrevMBAD[eax].BlkY1.BlkOffset ; BlkOffset of the block above.
  4974. jne SidesEqCtrButAboveNeCtr ; MVs differ: blend in the above-remote prediction.
  4975. SidesEqCtrAndAboveEqCtr: ; All remote MVs match center: lines 0-3 use the central prediction alone.
  4976. movq mm0,CentralPred ; Central line 0.
  4977. movq mm1,CentralPred+8 ; Central line 1.
  4978. paddb mm0,mm0 ; Lines 0-2 are doubled because TopHalfAllSame shifts them right by 1.
  4979. movq mm2,CentralPred+16 ; Central line 2.
  4980. paddb mm1,mm1 ; NOTE(review): doubling then halving drops each byte's top bit; presumably pels are 7-bit -- confirm.
  4981. movq mm3,CentralPred+24 ; Central line 3; TopHalfAllSame uses mm3 without a shift, so no doubling.
  4982. paddb mm2,mm2
  4983. jmp TopHalfAllSame
  4984. SidesEqCtrButAboveNeCtr:
  4985. sub esi,eax ; esi = above block's PastRef minus its BlkOffset.
  4986. mov eax,ecx ; eax = central BlkOffset; the callee adds eax to esi.
  4987. call GetPredForAboveOrBelow ; Returns 4 lines of the above-remote prediction ("A") in mm0-mm3.
  4988. ; Compute difference for lines 0 thru 3. Only the remote motion vector above
  4989. ; was different than the central motion vector.
  4990. movq mm5,CentralPred+8 ; 1b: <C17 C16 C15 C14 C13 C12 C11 C10>
  4991. pand mm3,mm6 ; . . pre-clean line 3 ahead of 3A
  4992. movq mm4,CentralPred+24 ; 3B: <C37 C36 C35 C34 C33 C32 C31 C30>
  4993. psrlq mm3,1 ; 3A: <A37 A36 A35 A34 A33 A32 A31 A30>
  4994. paddb mm3,mm4 ; 3C: <C37+A37 ...>
  4995. psrlq mm1,1 ; 1A: <A17 A16 A15 A14 A13 A12 A11 A10>
  4996. paddb mm1,mm5 ; 1c: <C17+A17 ... C15+A15 ...>
  4997. pand mm3,mm6 ; 3D: <C37+A37 ...> pre-cleaned
  4998. pand mm1,mm6 ; 1d: <C17+A17 ... C15+A15 ...> pre-cleaned
  4999. psrlq mm3,1 ; 3E: <(C37+A37)/2 ...>
  5000. paddb mm3,mm4 ; 3F: <(3C37+A37)/2 ...>
  5001. psrlq mm1,1 ; 1e: <(C17+A17)/2 ... (C15+A15)/2 ...>
  5002. pmullw mm1,C0001000200020001 ; 1f: <(C17+A17)/2 ... (2C15+2A15)/2 ...>
  5003. pand mm3,mm6 ; 3G: <(3C37+A37)/2 ...> pre-cleaned
  5004. pand mm2,mm6 ; . . pre-clean line 2 ahead of 2a
  5005. psrlq mm3,1 ; 3H: <(3C37+A37)/4 ...>
  5006. paddb mm3,mm4 ; 3I: <(7C37+A37)/4 ...>
  5007. psrlq mm2,1 ; 2a: <A27 A26 A25 A24 A23 A22 A21 A20>
  5008. movq mm4,C0000FFFFFFFF0000 ; 1g: < 00 00 FF FF FF FF 00 00>
  5009. psubb mm3,mm7 ; 3J: <(7C37+A37+4)/4 ...>
  5010. pandn mm4,mm5 ; 1h: <C17 C16 00 00 00 00 C11 C10>
  5011. psubb mm5,mm7 ; 1i: <C17+1 ... C15+1 ...>
  5012. paddb mm1,mm4 ; 1j: <(3C17+A17)/2 ... (2C15+2A15)/2 ...>
  5013. pand mm3,mm6 ; 3K: <(7C37+A37+4)/4 ...> pre-cleaned
  5014. movq mm4,CentralPred+16 ; 2B: <C27 C26 C25 C24 C23 C22 C21 C20>
  5015. pand mm1,mm6 ; 1k: pre-cleaned
  5016. paddb mm2,mm4 ; 2C: <C27+A27 ...>
  5017. psrlq mm3,1 ; 3L: P3 = <(7C37+A37+4)/8 ...>
  5018. pand mm2,mm6 ; 2D: <C27+A27 ...> pre-cleaned
  5019. psrlq mm1,1 ; 1l: <(3C17+A17)/4 ... (2C15+2A15)/4 ...>
  5020. paddb mm1,mm5 ; 1m: <(7C17+A17+4)/4 ... (6C15+2A15+4)/4 ...>
  5021. psrlq mm2,1 ; 2E: <(C27+A27)/2 ...>
  5022. movq mm5,CentralPred ; 0B: <C07 C06 C05 C04 C03 C02 C01 C00>
  5023. paddb mm2,mm4 ; 2F: <(3C27+A27)/2 ...>
  5024. paddb mm0,mm5 ; 0C: <C07+A07 ...>
  5025. pand mm2,mm6 ; 2G: <(3C27+A27)/2 ...> pre-cleaned
  5026. pand mm0,mm6 ; 0D: <C07+A07 ...> pre-cleaned
  5027. psrlq mm2,1 ; 2H: <(3C27+A27)/4 ...>
  5028. paddb mm2,mm4 ; 2I: <(7C27+A27)/4 ...>
  5029. psrlq mm0,1 ; 0E: <(C07+A07)/2 ...>
  5030. psubb mm2,mm7 ; 2J: <(7C27+A27+4)/4 ...>
  5031. paddb mm0,mm5 ; 0F: <(3C07+A07)/2 ...>
  5032. pand mm2,mm6 ; 2K: <(7C27+A27+4)/4 ...> pre-cleaned
  5033. psubb mm0,mm7 ; 0G: <(3C07+A07+2)/2 ...>
  5034. TopHalfAllSame: ; In: ecx = central BlkOffset; mm0-mm2 = lines 0-2 before their last >>1; mm3 = P3.
  5035. mov ebx,TargetFrameBaseAddress
  5036. lea edi,[ecx+ebx] ; Address of target block.
  5037. pand mm1,mm6 ; 1n: pre-cleaned
  5038. movq mm7,[ecx+ebx] ; 0J: T0 (this overwrites the -1 constant in mm7)
  5039. pand mm0,mm6 ; 0H: pre-cleaned
  5040. movq mm5,[edi+PITCH*3] ; 3M: T3
  5041. psrlq mm2,1 ; 2L: P2
  5042. movq mm4,[edi+ebp*2] ; 2M: T2
  5043. psubb mm5,mm3 ; 3N: D3 = T3 - P3
  5044. psubb mm4,mm2 ; 2N: D2 = T2 - P2
  5045. psrlq mm1,1 ; 1o: P1
  5046. movq mm3,[edi+ebp*1] ; 1p: T1
  5047. psubb mm3,mm1 ; 1q: D1 = T1 - P1
  5048. movq PelDiffsLine3,mm5 ; 3O: Store D3.
  5049. psrlq mm0,1 ; 0I: P0
  5050. movq PelDiffsLine2,mm4 ; 2O: Store D2.
  5051. psubb mm7,mm0 ; 0K: D0 = T0 - P0
  5052. movq PelDiffsLine1,mm3 ; 1r: Store D1.
  5053. movq PelDiffsLine0,mm7 ; 0L: Store D0.
  5054. jmp MMxDoForwardDCTy ; Block is in PelDiffs block; Pitch is 16
  5055. ;=============================================================================
  5056. ; This internal function computes the OBMC contribution for the reference
  5057. ; block that uses the left, central, or right remote motion vector.
  5058. ; Lines 0-3 are written to [edi+0], [edi+8], [edi+16], [edi+24]; lines 4-7 are left in mm0-mm3.
  5059. ; ebp -- PITCH
  5060. ; edi -- Address of where to put the contribution.
  5061. ; esi -- Address of reference block.
  5062. ; edx -- Reserved. MBlockActionStream
  5063. ; ecx -- Unavailable.
  5064. ; ebx -- Scratch. Initially the horizontal and vertical motion vectors. Bits 0 and 8 select horz/vert interpolation.
  5065. ; eax -- Unavailable.
  5066. ; mm7 -- 8 bytes of -1
  5067. ; mm6 -- 8 bytes of 0xFE
  5068. ; mm0-mm5 -- Scratch
  5069. StackOffset TEXTEQU <12_ButAccessToLocalVariablesShouldNotBeNeeded>
  5070. GetPredForCenterLeftOrRight:
  5071. shr ebx,1 ; CF = bit 0 of the MVs: horizontal interpolation flag.
  5072. jc HorzInterpInCLRPred
  5073. movq mm1,[esi+ebp*1] ; Reference line 1; used by both paths below.
  5074. and bl,080H ; Original bit 8 of the MVs: vertical interpolation flag.
  5075. je NoInterpInCLRPred
  5076. VertInterpInCLRPred:
  5077. movq mm0,[esi] ; Reference line 0.
  5078. psubb mm1,mm7 ; Line 1 plus 1 in every byte (mm7 = -1).
  5079. call Get4LinesOfPred_InterpVert ; Helper not visible here; its results in mm0-mm3 are stored below as lines 0-3.
  5080. pand mm2,mm6 ; Line 2: pre-clean before the final shift.
  5081. psrlq mm1,1 ; Line 1: final >>1.
  5082. movq [edi+0],mm0 ; Store line 0.
  5083. pand mm3,mm6 ; Line 3: pre-clean before the final shift.
  5084. movq [edi+8],mm1 ; Store line 1.
  5085. psrlq mm2,1 ; Line 2: final >>1.
  5086. movq mm1,[esi+ebp*1] ; Set up for 4 more lines (presumably the helper advanced esi -- confirm).
  5087. psrlq mm3,1 ; Line 3: final >>1.
  5088. movq [edi+16],mm2 ; Store line 2.
  5089. movq mm0,mm4 ; NOTE(review): mm4 presumably carries the next line over from the helper -- confirm.
  5090. movq [edi+24],mm3 ; Store line 3.
  5091. psubb mm1,mm7 ; Plus 1 in every byte, as before the first call.
  5092. jmp Get4MoreLinesOfPred_InterpVert ; Tail jump; presumably that routine's return goes straight to our caller.
  5093. HorzInterpInCLRPred:
  5094. movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  5095. and bl,080H ; Original bit 8 of the MVs: vertical interpolation flag.
  5096. jne BothInterpInCLRPred
  5097. call Get4LinesOfPred_InterpHorz ; Helper not visible here; its results in mm0-mm3 are stored below as lines 0-3.
  5098. pand mm2,mm6 ; Line 2: pre-clean before the final shift.
  5099. psrlq mm1,1 ; Line 1: final >>1.
  5100. movq [edi+0],mm0 ; Store line 0.
  5101. pand mm3,mm6 ; Line 3: pre-clean before the final shift.
  5102. movq [edi+8],mm1 ; Store line 1.
  5103. psrlq mm2,1 ; Line 2: final >>1.
  5104. movq mm5,[esi+1] ; <R48 R47 R46 R45 R44 R43 R42 R41>
  5105. psrlq mm3,1 ; Line 3: final >>1.
  5106. movq [edi+16],mm2 ; Store line 2.
  5107. ;
  5108. movq [edi+24],mm3 ; Store line 3.
  5109. ;
  5110. jmp Get4MoreLinesOfPred_InterpHorz ; Tail jump; presumably that routine's return goes straight to our caller.
  5111. BothInterpInCLRPred:
  5112. call Get4LinesOfPred_InterpBoth ; Helper not visible here; its results in mm0-mm3 are stored below as lines 0-3.
  5113. pand mm2,mm6 ; Line 2: pre-clean before the final shift.
  5114. psrlq mm1,1 ; Line 1: final >>1.
  5115. movq [edi+0],mm0 ; Store line 0.
  5116. pand mm3,mm6 ; Line 3: pre-clean before the final shift.
  5117. movq [edi+8],mm1 ; Store line 1.
  5118. psrlq mm2,1 ; Line 2: final >>1.
  5119. movq mm1,[esi+ebp*1] ; Set up for 4 more lines (presumably the helper advanced esi -- confirm).
  5120. psrlq mm3,1 ; Line 3: final >>1.
  5121. movq [edi+16],mm2 ; Store line 2.
  5122. movq mm0,mm4 ; NOTE(review): mm4 presumably carries state over from the helper -- confirm.
  5123. movq [edi+24],mm3 ; Store line 3.
  5124. psubb mm1,mm7 ; Plus 1 in every byte (mm7 = -1).
  5125. paddb mm5,mm5 ; Doubles mm5; NOTE(review): the reason lies in Get4MoreLinesOfPred_InterpBoth, not visible here.
  5126. jmp Get4MoreLinesOfPred_InterpBoth ; Tail jump; presumably that routine's return goes straight to our caller.
  5127. NoInterpInCLRPred:
  5128. movq mm0,[esi] ; Reference line 0. (mm1 already holds line 1.)
  5129. movq mm2,[esi+ebp*2] ; Reference line 2.
  5130. movq mm3,[esi+PITCH*3] ; Reference line 3.
  5131. movq [edi+0],mm0 ; Lines 0-3 are copied to the output unchanged.
  5132. movq [edi+8],mm1
  5133. movq [edi+16],mm2
  5134. movq [edi+24],mm3
  5135. movq mm3,[esi+PITCH*7] ; Reference line 7.
  5136. movq mm2,[esi+PITCH*6] ; Reference line 6.
  5137. paddb mm3,mm3 ; Lines 5-7 are returned doubled.
  5138. movq mm1,[esi+PITCH*5] ; Reference line 5.
  5139. paddb mm2,mm2
  5140. movq mm0,[esi+ebp*4] ; Reference line 4, returned as is.
  5141. paddb mm1,mm1
  5142. ret ; Out: mm0 = line 4; mm1-mm3 = 2 * lines 5-7.
  5143. ;=============================================================================
  5144. ; This internal function computes the OBMC contribution for the reference
  5145. ; block that uses the remote motion vector from block above or below.
  5146. ; Four lines are produced and left in mm0-mm3; nothing is stored to memory here.
  5147. ; ebp -- PITCH
  5148. ; edi -- Not used.
  5149. ; esi -- Address of reference block (after eax is added in).
  5150. ; edx -- Reserved. MBlockActionStream
  5151. ; ecx -- Unavailable. Must not be changed.
  5152. ; ebx -- Scratch. Initially the horizontal and vertical motion vectors. Bits 0 and 8 select horz/vert interpolation.
  5153. ; eax -- Offset within frame for block being worked on.
  5154. ; mm7 -- 8 bytes of -1
  5155. ; mm6 -- 8 bytes of 0xFE
  5156. ; mm0-mm5 -- Scratch
  5157. GetPredForAboveOrBelow:
  5158. shr ebx,1 ; CF = bit 0 of the MVs: horizontal interpolation flag.
  5159. lea esi,[esi+eax] ; Add the block offset to the reference address. LEA leaves CF intact.
  5160. jc HorzInterpInABPred
  5161. movq mm1,[esi+ebp*1] ; Reference line 1.
  5162. movq mm0,[esi] ; Reference line 0.
  5163. psubb mm1,mm7 ; Line 1 plus 1 in every byte, set up for the vertical-interpolation helper.
  5164. and bl,080H ; Original bit 8 of the MVs: vertical interpolation flag.
  5165. jne Get4LinesOfPred_InterpVert ; Tail jump; presumably that routine's return goes straight to our caller.
  5166. movq mm2,[esi+ebp*2] ; No interpolation: reference line 2.
  5167. paddb mm1,mm7 ; Undo the plus 1 applied above.
  5168. movq mm3,[esi+PITCH*3] ; Reference line 3.
  5169. paddb mm1,mm1 ; Lines 1-3 are returned doubled.
  5170. paddb mm2,mm2
  5171. paddb mm3,mm3
  5172. ret ; Out: mm0 = line 0; mm1-mm3 = 2 * lines 1-3.
  5173. HorzInterpInABPred:
  5174. movq mm5,[esi+1] ; A. . <R08 R07 R06 R05 R04 R03 R02 R01>
  5175. and bl,080H ; Original bit 8 of the MVs: vertical interpolation flag.
  5176. jne Get4LinesOfPred_InterpBoth ; Tail jump, as for the vertical case.
  5177. jmp Get4LinesOfPred_InterpHorz ; Tail jump, as for the vertical case.
  5178. StackOffset TEXTEQU <0>
  5179. ;=============================================================================
  5180. ENDIF
  5181. Done: ; Exit path. The H.263 build first finishes any pending OBMC work.
  5182. IFDEF H261
  5183. ELSE; H263
  5184. mov bl,PendingOBMC
  5185. mov cl,INTER1MV
  5186. test bl,bl
  5187. je TrulyDone ; PendingOBMC is zero: nothing left to do.
  5188. mov StashBlockType,cl ; StashBlockType = INTER1MV (presumably read by DoPendingOBMCDiff -- confirm).
  5189. call DoPendingOBMCDiff
  5190. mov al,IsPlainPFrame
  5191. add edx,-SIZEOF T_MacroBlockActionDescr ; Move edx back by one macroblock descriptor.
  5192. test al,al
  5193. jne TrulyDone ; IsPlainPFrame is nonzero: skip the B-frame luma pass.
  5194. movq mm6,C0101010101010101
  5195. pxor mm7,mm7 ; Initialize SWD accumulator
  5196. call MMxDoBFrameLumaBlocks
  5197. ENDIF
  5198. TrulyDone:
  5199. emms ; Leave MMX state so that x87 floating point is usable again.
  5200. IFDEF H261
  5201. mov eax,SWDTotal ; Read before esp is switched back (presumably SWDTotal is an esp-relative local).
  5202. mov esp,StashESP ; Switch back to the stack pointer saved in StashESP.
  5203. mov edi,[esp+PSWDTotal] ; Pointer to the caller's SWD total.
  5204. mov [edi],eax ; Store SWDTotal through it.
  5205. ELSE
  5206. mov eax,SWDTotal ; Read both totals before esp is switched back (presumably esp-relative locals).
  5207. mov ebx,BSWDTotal
  5208. mov esp,StashESP ; Switch back to the stack pointer saved in StashESP.
  5209. mov edi,[esp+PSWDTotal] ; Pointer to the caller's SWD total.
  5210. mov esi,[esp+PBSWDTotal] ; Pointer to the caller's B-frame SWD total.
  5211. mov [edi],eax ; Store SWDTotal.
  5212. mov [esi],ebx ; Store BSWDTotal.
  5213. ENDIF
  5214. pop ebx ; Restore registers (presumably pushed by the prologue, which is not visible here).
  5215. pop ebp
  5216. pop edi
  5217. pop esi
  5218. rturn ; NOTE(review): rturn is not defined in view; presumably a return macro -- confirm.
  5219. MMxEDTQ endp
  5220. END