Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1879 lines
68 KiB

  1. ;/* *************************************************************************
  2. ;** INTEL Corporation Proprietary Information
  3. ;**
  4. ;** This listing is supplied under the terms of a license
  5. ;** agreement with INTEL Corporation and may not be copied
  6. ;** nor disclosed except in accordance with the terms of
  7. ;** that agreement.
  8. ;**
  9. ;** Copyright (c) 1995 Intel Corporation.
  10. ;** All Rights Reserved.
  11. ;**
  12. ;** *************************************************************************
  13. ;*/
  14. ;////////////////////////////////////////////////////////////////////////////
  15. ;//
  16. ;// $Header: R:\h26x\h26x\src\enc\e35bme.asv 1.10 29 May 1996 15:37:38 BNICKERS $
  17. ;// $Log: R:\h26x\h26x\src\enc\e35bme.asv $
  18. ;//
  19. ;// Rev 1.10 29 May 1996 15:37:38 BNICKERS
  20. ;// Acceleration of IA version of ME.
  21. ;//
  22. ;// Rev 1.9 14 May 1996 12:18:18 BNICKERS
  23. ;// Initial debugging of MMx B-Frame ME.
  24. ;//
  25. ;// Rev 1.8 09 Jan 1996 16:14:46 BNICKERS
  26. ;// Avoid generating delta MV's that make B frame MV's out of range.
  27. ;//
  28. ;// Rev 1.7 27 Dec 1995 15:32:34 RMCKENZX
  29. ;// Added copyright notice
  30. ;//
  31. ;////////////////////////////////////////////////////////////////////////////
  32. ;
  33. ; BFrameMotionEstimation -- This function performs B frame motion estimation for the macroblocks identified in the
  34. ; input list. This is only applicable for H263. This version is tuned for best performance
  35. ; on the Pentium Microprocessor.
  36. ;
  37. ; This function works correctly only if Unrestricted Motion Vectors is enabled. It is not
  38. ; possible to select full pel resolution only; half pel resolution is always selected.
  39. ;
  40. ; Input Arguments:
  41. ;
  42. ; MBlockActionStream
  43. ;
  44. ; The list of macroblocks for which we need to perform motion estimation.
  45. ;
  46. ; Upon input, the following fields must be defined:
  47. ;
  48. ; CodedBlocks -- Bit 6 must be set for the last macroblock to be processed.
  49. ;
  50. ; BlkOffset -- must be defined for each of the blocks in the macroblocks.
  51. ;
  52. ; TargetFrameBaseAddress -- Address of upper left viewable pel in the target Y plane.
  53. ;
  54. ; PreviousFrameBaseAddress -- Address of upper left viewable pel in the Y plane of the previous P frame. Whether this
  55. ; is the reconstructed previous frame, or the original, is up to the caller to decide.
  56. ;
  57. ; FutureFrameBaseAddress -- Address of upper left viewable pel in the Y plane of the future P frame. Whether this
  58. ; is the reconstructed future frame, or the original, is up to the caller to decide.
  59. ;
  60. ; WeightForwardMotion -- Array of 64 signed chars, each element I equal to ((TRb * (I-32)) / TRd). (See H263 spec.)
  61. ;
  62. ; WeightBackwardMotion -- Array of 64 signed chars, each element I equal to ((TRb - TRd) * (I-32) / TRd). (See spec.)
  63. ;
  64. ; ZeroVectorThreshold -- If the SWDB for a macroblock is less than this threshold, we do not bother searching for a
  65. ; better motion vector. Compute as follows, where D is the average tolerable pel difference
  66. ; to satisfy this threshold. (Initial recommendation: D=2 ==> ZVT=384)
  67. ; ZVT = (128 * ((int)((D**1.6)+.5)))
  68. ;
  69. ; NonZeroDifferential -- After searching for the best motion vector (or individual block motion vectors, if enabled),
  70. ; if the macroblock's SWDB is not better than it was for the zero vector -- not better by at
  71. ; least this amount -- then we revert to the zero vector. We are comparing two macroblock
  72. ; SWDs, both calculated as follows: (Initial recommendation: NZD=128)
  73. ; For each of 128 match points, where D is its Abs Diff, accumulate ((int)((D**1.6)+.5))
  74. ;
  75. ; EmptyThreshold -- If the SWD for a block is less than this, the block is forced empty. Compute as follows, where D
  76. ; is the average tolerable pel diff to satisfy threshold. (Initial recommendation: D=3 ==> ET=96)
  77. ; ET = (32 * ((int)((D**1.6)+.5))) NOTE(review): this yields 192 for D=3 and 96 for D=2; confirm the recommendation above.
  78. ;
  79. ; Output Arguments
  80. ;
  81. ; MBlockActionStream
  82. ;
  83. ; These fields are defined as follows upon return:
  84. ;
  85. ; BHMV and BVMV -- The horizontal and vertical motion vectors, in units of a half pel. These values are intended
  86. ; for coding in the macroblock layer.
  87. ;
  88. ; If Horizontal MV indicates a half pel position, the prediction for the upper left pel of the block
  89. ; is the average of the pel at PastRef and the one at PastRef+1.
  90. ;
  91. ; If Vertical MV indicates a half pel position, the prediction for the upper left pel of the block
  92. ; is the average of the pel at PastRef and the one at PastRef+PITCH.
  93. ;
  94. ; If both MVs indicate half pel positions, the prediction for the upper left pel of the block is the
  95. ; average of the pels at PastRef, PastRef+1, PastRef+PITCH, and PastRef+PITCH+1.
  96. ;
  97. ; BestHMVf, BestVMVf, BestHMVb, BestVMVb -- Motion vector components, as described in H263 spec. They are biased
  98. ; by 060H. Only defined for luma blocks. Caller must define for
  99. ; chroma blocks.
  100. ;
  101. ; CandidateHMVf, CandidateVMVf, CandidateHMVb, CandidateVMVb -- Scratch space for this function.
  102. ;
  103. ; CodedBlocksB -- Bits 4 and 5 are turned on, indicating that the U and V blocks should be processed. (If the
  104. ; FDCT function finds them to quantize to empty, it will mark them as empty.)
  105. ;
  106. ; Bits 0 thru 3 are cleared for each of blocks 1 thru 4 that BFrameMotionEstimation forces empty;
  107. ; they are set otherwise.
  108. ;
  109. ; Bits 6 and 7 are left unchanged.
  110. ;
  111. ; SWDB -- Set to the sum of the SWDBs for the four luma blocks in the macroblock. The SWD for any block that is
  112. ; forced empty, is NOT included in the sum.
  113. ;
  114. ; InterSWDTotal -- The sum of the block SWDBs for all Intercoded macroblocks. None of the blocks forced empty are
  115. ; included in this.
  116. ;
  117. ; InterSWDBlocks -- The number of blocks that make up the InterSWDTotal.
  118. ;
  119. ;
  120. ; Other assumptions:
  121. ;
  122. ; For performance reasons, it is assumed that the current and previous frame are 32-byte aligned, and the pitch is a
  123. ; constant 384. Moreover, the current and previous frames must be out of phase by 2K bytes, i.e. must be an odd
  124. ; multiple of 2K bytes apart. This will assure best utilization of the on-chip cache.
  125. ;
  126. ; Many of the techniques described in MotionEstimation are used here. It is wise to study that module before trying
  127. ;to understand this one.
  128. ;
  129. ; Data structures used for bi-directional motion search:
  130. ;
  131. ; Target Macroblock:
  132. ;
  133. ; The target macroblock is copied to the stack frame so that esp can be used as an induction variable for the block:
  134. ;
  135. ; esp+ 0 AAAABBBB AAAABBBB AAAABBBB AAAABBBB
  136. ; esp+ 64 CCCCDDDDEEEEFFFFCCCCDDDDEEEEFFFFCCCCDDDDEEEEFFFFCCCCDDDDEEEEFFFF
  137. ; esp+ 128 LLLLDDDDLLLLLLLLLLLLLLLLLLLLLLLL
  138. ; esp+ 160 +------++------++------++------++------++------++------++------+
  139. ; esp+ 224 | Blk1 || Blk1 || Blk2 || Blk2 || Blk3 || Blk3 || Blk4 || Blk4 |
  140. ; esp+ 288 |Ln 0-3||Ln 4-7||Ln 0-3||Ln 4-7||Ln 0-3||Ln 4-7||Ln 0-3||Ln 4-7|
  141. ; esp+ 352 +------++------++------++------++------++------++------++------+
  142. ; esp+ 416
  143. ;
  144. ; AAAA is the address of the Future Reference Block to be used.
  145. ; BBBB is the address of the Past Reference Block to be used.
  146. ; AAAA, BBBB, and the next 8 bytes are overwritten by the offset to apply when interpolating future ref block or
  147. ; past ref block.
  148. ; CCCC is the accumulated SWD for the current candidate motion vector.
  149. ; DDDD is the accumulated SWD for the best motion vector so far.
  150. ; One extra DDDD occupies esp+132.
  151. ; EEEE is the address at which to transfer control after calculating SWD.
  152. ; FFFF is the accumulated SWD for the zero motion vector.
  153. ; LLLL is space for local variables.
  154. ;
  155. ; Future reference:
  156. ;
  157. ; For each macroblock, the corresponding macroblock from the future frame is copied into the following reference
  158. ; area, wherein all the X's are bytes initialized to 255. When the projection of the B-frame's future motion
  159. ; vector component falls on a byte valued at 255, we know that it is outside the future macroblock, and thus this
  160. ; is a pel that is only predicted from the past reference.
  161. ;
  162. ; esp+ 704 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  163. ; esp+ 744 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  164. ; esp+ 784 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  165. ; esp+ 824 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  166. ; esp+ 864 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  167. ; esp+ 904 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX > Rarely used
  168. ; esp+ 944 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  169. ; esp+ 984 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  170. ; esp+1024 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  171. ; esp+1064 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  172. ; esp+1104 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  173. ; esp+1144 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  174. ; esp+1184 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  175. ; esp+1224 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  176. ; esp+1264 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  177. ; esp+1304 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  178. ; esp+1344 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  179. ; esp+1384 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  180. ; esp+1424 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  181. ; esp+1464 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  182. ; esp+1504 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  183. ; esp+1544 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  184. ; esp+1584 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
  185. ; esp+1624 XXXXXXXXXXXXXXXXXXXXXXXX| || |
  186. ; esp+1664 XXXXXXXXXXXXXXXXXXXXXXXX|Future||Future|
  187. ; esp+1704 XXXXXXXXXXXXXXXXXXXXXXXX| Ref || Ref |
  188. ; esp+1744 XXXXXXXXXXXXXXXXXXXXXXXX| Blk || Blk |
  189. ; esp+1784 XXXXXXXXXXXXXXXXXXXXXXXX| 1 || 2 |
  190. ; esp+1824 XXXXXXXXXXXXXXXXXXXXXXXX| || |
  191. ; esp+1864 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
  192. ; esp+1904 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
  193. ; esp+1944 XXXXXXXXXXXXXXXXXXXXXXXX| || |
  194. ; esp+1984 XXXXXXXXXXXXXXXXXXXXXXXX|Future||Future|
  195. ; esp+2024 XXXXXXXXXXXXXXXXXXXXXXXX| Ref || Ref |
  196. ; esp+2064 XXXXXXXXXXXXXXXXXXXXXXXX| Blk || Blk |
  197. ; esp+2104 XXXXXXXXXXXXXXXXXXXXXXXX| 3 || 4 |
  198. ; esp+2144 XXXXXXXXXXXXXXXXXXXXXXXX| || |
  199. ; esp+2184 XXXXXXXXXXXXXXXXXXXXXXXX+------++------+
  200. ; esp+2224 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  201. ; esp+2264 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  202. ; esp+2304 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  203. ; esp+2344 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  204. ; esp+2384 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  205. ; esp+2424 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  206. ; esp+2464 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  207. ; esp+2504 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  208. ; esp+2544 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  209. ; esp+2584 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  210. ; esp+2624 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  211. ; esp+2664 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  212. ; esp+2704 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  213. ; esp+2744 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  214. ; esp+2784 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  215. ; esp+2824 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX \
  216. ; esp+2864 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX > Rarely used
  217. ; esp+2904 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  218. ; esp+2944 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  219. ; esp+2984 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  220. ; esp+3024 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  221. ; esp+3064 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /
  222. ; esp+3104 XXXXXXXXXXXXXXXXXXXXXXXX________ /
  223. ; esp+3136
  224. ;
  225. ; Past Reference:
  226. ;
  227. ; The past reference search area is taken directly from the past frame. It is not necessary to copy any portion
  228. ; of the past frame to a scratch area.
  229. ;
  230. ;
  231. ; Memory layout of the target macroblock, the future reference macroblock, and the full range for the reference area
  232. ; (as restricted to +/- 7 in vertical, and +/- 7 (expandable to +/- 15) in horizontal), is as shown here. Each box
  233. ; represents a cache line (32 bytes), increasing incrementally from left to right, and then to the next row (like
  234. ; reading a book). The 128 boxes taken as a whole represent 4Kbytes. The boxes are populated as follows:
  235. ;
  236. ; R -- Data from the past reference area. Each box contains 23 of the pels belonging to a line of the reference
  237. ; area. The remaining 7 pels of the line are either in the box to the left (for reference areas used to provide
  238. ; predictions for target macroblocks that begin at an address 0-mod-32), or to the right (for target MBs that begin
  239. ; at an address 16-mod-32). There are 30 R's corresponding to the 30-line limit on the vertical distance we might
  240. ; search. The lowercase r's correspond to the lines above and below zero-vertical-motion.
  241. ;
  242. ; F -- Data from the future reference area. Each box contains a full line (16 pels) for each of two adjacent
  243. ; macroblocks. There are 16 F's corresponding to the 16 lines of the macroblocks.
  244. ;
  245. ; T -- Data from the target macroblock. Each box contains a full line (16 pels) for each of two adjacent
  246. ; macroblocks. There are 16 T's corresponding to the 16 lines of the macroblocks.
  247. ;
  248. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  249. ; | T | | r | | T | | R | | F | | R | |
  250. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  251. ; | T | | r | | T | | R | | F | | R | |
  252. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  253. ; | T | | r | | T | | R | | F | | r | |
  254. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  255. ; | T | | r | | T | | R | | F | | r | |
  256. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  257. ; | T | | r | | T | | R | | F | | r | |
  258. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  259. ; | T | | r | | F | | R | | F | | r | |
  260. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  261. ; | T | | r | | F | | R | | F | | r | |
  262. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  263. ; | T | | r | | F | | R | | F | | r | |
  264. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  265. ; | T | | R | | F | | R | | F | | r | |
  266. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  267. ; | T | | R | | F | | R | | F | | r | |
  268. ; +---+---+---+---+---+---+---+---+---+---+---+---+
  269. ; | T | | R | | F | | R | |
  270. ; +---+---+---+---+---+---+---+---+
  271. ;
  272. ; Thus, in a logical sense, the above data fits into one of the 4K data cache pages, leaving the other for all other
  273. ; data. Care has been taken to assure that the tables and the stack space needed by this function fit nicely into
  274. ; the other data cache page. Only the MBlockActionStream remains to conflict with the above data structures. That
  275. ; is both unavoidable, and of minimal consequence.
  276. OPTION PROLOGUE:None
  277. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
  278. OPTION M510
  279. include e3inst.inc
  280. include e3mbad.inc
  281. .xlist
  282. include memmodel.inc
  283. .list
  284. .DATA
  285. LocalStorage LABEL DWORD ; Local storage goes on the stack at addresses whose lower 12 bits match this address.
  286. DB 544 DUP (?) ; Offsets 0..543 from LocalStorage. Reserved only: low 12 bits match those of heavily used part of stack frame.
  287. SWDState LABEL BYTE ; State engine rules for finding best motion vector. Four bytes per state, starting at offset 544.
  288. ; 1st number: Horizontal Motion displacement to try, in half pel increments.
  289. ; 2nd number: Vertical Motion displacement to try, in half pel increments.
  290. ; 3rd number: Next state to enter if this motion is better than previous best.
  291. ; 4th number: Next state to enter if previous best is still best.
  292. DB -2, 0, 8, 4 ; 0 -- ( 0, 0) Try (-2, 0)
  293. DB 2, 0, 12, 12 ; 4 -- ( 0, 0) Try ( 2, 0)
  294. DB 4, 0, 12, 12 ; 8 -- (-2, 0) Try ( 2, 0)
  295. DB 0, -2, 20, 16 ; 12 -- ( N, 0) Try ( N,-2) (N = {-2,0,2})
  296. DB 0, 2, 24, 24 ; 16 -- ( N, 0) Try ( N, 2)
  297. DB 0, 4, 24, 24 ; 20 -- ( N,-2) Try ( N, 2)
  298. DB -1, 0, 32, 28 ; 24 -- States 24 thru 44 repeat the shape of states 0 thru 20 with displacements of 1 in place of 2.
  299. DB 1, 0, 36, 36 ; 28 -- Horizontal +1; counterpart of state 4.
  300. DB 2, 0, 36, 36 ; 32 -- Horizontal +2; counterpart of state 8.
  301. DB 0, -1, 44, 40 ; 36 -- Vertical -1; counterpart of state 12.
  302. DB 0, 1, 0, 0 ; 40 -- Vertical +1; both next-state fields are 0.
  303. DB 0, 2, 0, 0 ; 44 -- Vertical +2; both next-state fields are 0.
  304. DB 48 DUP (?) ; Additional space for more states, if needed. With the 48 bytes of rules above, this fills offsets 544..639.
  305. DB 64 DUP (?) ; Offsets 640..703. Low 12 bits match those of heavily used part of stack frame.
  306. InterpFutureRef LABEL BYTE ; Map FPEL+FPEL to its average. If one FPEL is out
  307. ; of range (255), map FPEL+FPEL to 255.
  308. CNT = 0
  309. REPEAT 127 ; Entries 0..253: sums 2k and 2k+1 both map to k, for k = 0..126.
  310. DB CNT,CNT
  311. CNT = CNT + 1
  312. ENDM
  313. DB 127 ; Entry 254 maps to 127.
  314. DB 257 DUP (255) ; Entries 255..511 all map to 255. The table is 512 bytes in total (offsets 704..1215).
  315. DB 1472 DUP (?) ; Offsets 1216..2687. Low 12 bits match those of heavily used part of stack frame.
  316. Interp2PastAndFutureRef LABEL BYTE ; Map PPEL+PPEL+FPELavg*2 to 2*average. If
  317. ; FPELavg out of range, map PPEL+PPEL to
  318. ; -(PPEL+PPEL).
  319. CNT = 0
  320. REPEAT 255 ; Emits the pairs 0,0,-1,-1, ... ,-254,-254 (510 bytes, starting at offset 2688).
  321. DB CNT,CNT
  322. CNT = CNT - 1
  323. ENDM
  324. InterpPastAndFutureRef LABEL BYTE ; Map PPEL+FPELavg to 2*average. If
  325. ; FPELavg out of range, map PPEL to
  326. ; 2(PPEL).
  327. CNT = 0
  328. REPEAT 255 ; Emits 0,1,2, ... ,254 (255 bytes).
  329. DB CNT
  330. CNT = CNT + 1
  331. ENDM
  332. CNT = 0
  333. REPEAT 128 ; Emits 0,2,4, ... ,254 (128 bytes).
  334. DB CNT
  335. CNT = CNT + 2
  336. ENDM
  337. DB ?,?,? ; Three filler bytes.
  338. DB 255 ; The byte immediately below the WeightedDiff label.
  339. WeightedDiff LABEL BYTE ; Label placed here because negative pel value is
  340. ; not sign extended, so we need to subtract 256.
  341. DB 191 DUP (255) ; Entries 0..190 are 255.
  342. DB 255,250,243,237,231,225,219,213,207,201,195,189,184,178,172,167
  343. DB 162,156,151,146,141,135,130,126,121,116,111,107,102, 97, 93, 89
  344. DB 84, 80, 76, 72, 68, 64, 61, 57, 53, 50, 46, 43, 40, 37, 34, 31
  345. DB 28, 25, 22, 20, 18, 15, 13, 11, 9, 7, 6, 4, 3, 2, 1, 0
  346. DB 0 ; Entry 255, the centre of the table. The 64 values on either side mirror each other.
  347. DB 0, 1, 2, 3, 4, 6, 7, 9, 11, 13, 15, 18, 20, 22, 25, 28
  348. DB 31, 34, 37, 40, 43, 46, 50, 53, 57, 61, 64, 68, 72, 76, 80, 84
  349. DB 89, 93, 97,102,107,111,116,121,126,130,135,141,146,151,156,162
  350. DB 167,172,178,184,189,195,201,207,213,219,225,231,237,243,250,255
  351. DB 191 DUP (255) ; Entries 320..510 are 255. The table is 511 bytes, ending 4096 bytes after LocalStorage.
  352. .CODE
  353. ASSUME cs : FLAT
  354. ASSUME ds : FLAT
  355. ASSUME es : FLAT
  356. ASSUME fs : FLAT
  357. ASSUME gs : FLAT
  358. ASSUME ss : FLAT
  359. BFRAMEMOTIONESTIMATION proc C AMBAS: DWORD,
  360. ATFBA: DWORD,
  361. APFBA: DWORD,
  362. AFFBA: DWORD,
  363. AWFM: DWORD,
  364. AWBM: DWORD,
  365. AZVT: DWORD,
  366. ANZMVD:DWORD,
  367. AEBT: DWORD,
  368. ASWDT: DWORD,
  369. ASWDB: DWORD
  370. RegisterStorageSize = 16
  371. ; Arguments:
  372. MBlockActionStream_arg = RegisterStorageSize + 4
  373. TargetFrameBaseAddress_arg = RegisterStorageSize + 8
  374. PreviousFrameBaseAddress_arg = RegisterStorageSize + 12
  375. FutureFrameBaseAddress_arg = RegisterStorageSize + 16
  376. WeightForwardMotion_arg = RegisterStorageSize + 20
  377. WeightBackwardMotion_arg = RegisterStorageSize + 24
  378. ZeroVectorThreshold_arg = RegisterStorageSize + 28
  379. NonZeroMVDifferential_arg = RegisterStorageSize + 32
  380. EmptyBlockThreshold_arg = RegisterStorageSize + 36
  381. InterSWDTotal_arg = RegisterStorageSize + 40
  382. InterSWDBlocks_arg = RegisterStorageSize + 44
  383. EndOfArgList = RegisterStorageSize + 48
  384. ; Locals (on local stack frame)
  385. ; 0 thru 415 are Target MV scratch structure, described above, with room for
  386. ; 7 DWORDs of local variables.
  387. Block1 EQU [esp+ 0]
  388. Block2 EQU [esp+ 16]
  389. Block3 EQU [esp+ 32]
  390. Block4 EQU [esp+ 48]
  391. BlockN EQU [esp+ 64]
  392. BlockNM1 EQU [esp+ 48]
  393. TargetBlock EQU 160
  394. FutureRefBlockAddr EQU 0
  395. PastRefBlockAddr EQU 4
  396. FutureRefInterpOffset EQU FutureRefBlockAddr
  397. CandidateSWDAccum EQU 64
  398. BestSWDAccum EQU 68
  399. SWD0MVAccum EQU 72
  400. TransferCase EQU 76
  401. TPITCH EQU 64
  402. ; 416 thru 479 and 640 thru 703 for weighting motion vectors.
  403. WeightForwardMotion EQU [esp+ 416] ; 32 bytes at 416 for positive MV; 32
  404. ; bytes at 640 for negative MV.
  405. WeightBackwardMotion EQU [esp+ 448] ; 32 bytes at 448 for positive MV; 32
  406. ; bytes at 672 for negative MV.
  407. ; 480 thru 543 are stack storage for more local variables.
  408. ; 128:131, 136:159, and 480:543 are available for local variables.
  409. TargetFrameBaseAddress EQU [esp+ 128]
  410. PreviousFrameBaseAddress EQU [esp+ 136]
  411. FutureFrameBaseAddress EQU [esp+ 140]
  412. MBlockActionStream EQU [esp+ 144]
  413. ZeroVectorThreshold EQU [esp+ 148]
  414. NonZeroMVDifferential EQU [esp+ 152]
  415. EmptyBlockThreshold EQU [esp+ 156]
  416. InterSWDTotal EQU [esp+ 480]
  417. InterSWDBlocks EQU [esp+ 484]
  418. StashESP EQU [esp+ 488]
  419. PastMBAddr EQU [esp+ 492]
  420. CurrSWDState EQU [esp+ 496]
  421. CandidateMV EQU [esp+ 500]
  422. BestMV EQU [esp+ 504]
  423. BlkY1_0deltaBiDiMVs EQU [esp+ 508]
  424. BlkY2_0deltaBiDiMVs EQU [esp+ 512]
  425. BlkY3_0deltaBiDiMVs EQU [esp+ 516]
  426. BlkY4_0deltaBiDiMVs EQU [esp+ 520]
  427. FirstTransferCase EQU [esp+ 524]
  428. ; 544: 639 is for static data, namely the state engine rules.
  429. ; 640 thru 703, as stated above is for weighting motion vectors.
  430. ; 704 thru 1215 hit static data structure to interpolate 2 future pels.
  431. ; Future Reference Area also starts at 704 on stack, but collision at 704
  432. ; thru 1215 will occur very infrequently. Future Reference Area continues
  433. ; thru 3135. 2688 thru 3135 collide with the static data structure to
  434. ; interpolate between past and future pels, but that portion of the Future
  435. ; Reference Area is rarely accessed. 3136 thru 3583 continue that static
  436. ; structure. 3584 thru 4095 have the static structure to look up the
  437. ; weighted difference for a target pel and its prediction.
  438. FutureRefArea EQU [esp+ 704]
  439. FutureBlock EQU [esp+1608]
  440. FPITCH EQU 40
  441. push esi
  442. push edi
  443. push ebp
  444. push ebx
  445. ; Adjust stack ptr so that local frame fits nicely in cache w.r.t. other data.
  446. mov esi,esp
  447. sub esp,000001000H
  448. mov ebx, [esp]
  449. sub esp,000001000H
  450. and esp,0FFFFF000H
  451. mov ebx,OFFSET LocalStorage+63
  452. and ebx,000000FC0H
  453. mov edx,PD [esi+MBlockActionStream_arg]
  454. or esp,ebx
  455. mov eax,PD [esi+TargetFrameBaseAddress_arg]
  456. mov TargetFrameBaseAddress,eax
  457. mov eax,PD [esi+PreviousFrameBaseAddress_arg]
  458. mov PreviousFrameBaseAddress,eax
  459. mov eax,PD [esi+FutureFrameBaseAddress_arg]
  460. mov FutureFrameBaseAddress,eax
  461. mov eax,PD [esi+EmptyBlockThreshold_arg]
  462. mov EmptyBlockThreshold,eax
  463. mov eax,PD [esi+ZeroVectorThreshold_arg]
  464. mov ZeroVectorThreshold,eax
  465. mov eax,PD [esi+NonZeroMVDifferential_arg]
  466. mov NonZeroMVDifferential,eax
  467. mov ebx,3116
  468. @@:
  469. mov [esp+ebx],0FFFFFFFFH
  470. sub ebx,4
  471. cmp ebx,688
  472. jae @b
  473. xor ebx,ebx
  474. mov StashESP,esi
  475. mov edi,[esi+WeightForwardMotion_arg]
  476. mov esi,[esi+WeightBackwardMotion_arg]
  477. mov InterSWDBlocks,ebx
  478. mov InterSWDTotal,ebx
  479. mov eax,[edi]
  480. mov ebx,[edi+4]
  481. mov ecx,03F3F3F3FH
  482. mov ebp,060606060H
  483. and eax,ecx
  484. and ebx,ecx
  485. xor eax,ebp
  486. xor ebx,ebp
  487. mov WeightForwardMotion+224,eax
  488. mov WeightForwardMotion+228,ebx
  489. mov eax,[edi+8]
  490. mov ebx,[edi+12]
  491. and eax,ecx
  492. and ebx,ecx
  493. xor eax,ebp
  494. xor ebx,ebp
  495. mov WeightForwardMotion+232,eax
  496. mov WeightForwardMotion+236,ebx
  497. mov eax,[edi+16]
  498. mov ebx,[edi+20]
  499. and eax,ecx
  500. and ebx,ecx
  501. xor eax,ebp
  502. xor ebx,ebp
  503. mov WeightForwardMotion+240,eax
  504. mov WeightForwardMotion+244,ebx
  505. mov eax,[edi+24]
  506. mov ebx,[edi+28]
  507. and eax,ecx
  508. and ebx,ecx
  509. xor eax,ebp
  510. xor ebx,ebp
  511. mov WeightForwardMotion+248,eax
  512. mov WeightForwardMotion+252,ebx
  513. mov eax,[edi+32]
  514. mov ebx,[edi+36]
  515. and eax,ecx
  516. and ebx,ecx
  517. xor eax,ebp
  518. xor ebx,ebp
  519. mov WeightForwardMotion+0,eax
  520. mov WeightForwardMotion+4,ebx
  521. mov eax,[edi+40]
  522. mov ebx,[edi+44]
  523. and eax,ecx
  524. and ebx,ecx
  525. xor eax,ebp
  526. xor ebx,ebp
  527. mov WeightForwardMotion+8,eax
  528. mov WeightForwardMotion+12,ebx
  529. mov eax,[edi+48]
  530. mov ebx,[edi+52]
  531. and eax,ecx
  532. and ebx,ecx
  533. xor eax,ebp
  534. xor ebx,ebp
  535. mov WeightForwardMotion+16,eax
  536. mov WeightForwardMotion+20,ebx
  537. mov eax,[edi+56]
  538. mov ebx,[edi+60]
  539. and eax,ecx
  540. and ebx,ecx
  541. xor eax,ebp
  542. xor ebx,ebp
  543. mov WeightForwardMotion+24,eax
  544. mov WeightForwardMotion+28,ebx
  545. mov eax,[esi]
  546. mov ebx,[esi+4]
  547. and eax,ecx
  548. and ebx,ecx
  549. xor eax,ebp
  550. xor ebx,ebp
  551. mov WeightBackwardMotion+224,eax
  552. mov WeightBackwardMotion+228,ebx
  553. mov eax,[esi+8]
  554. mov ebx,[esi+12]
  555. and eax,ecx
  556. and ebx,ecx
  557. xor eax,ebp
  558. xor ebx,ebp
  559. mov WeightBackwardMotion+232,eax
  560. mov WeightBackwardMotion+236,ebx
  561. mov eax,[esi+16]
  562. mov ebx,[esi+20]
  563. and eax,ecx
  564. and ebx,ecx
  565. xor eax,ebp
  566. xor ebx,ebp
  567. mov WeightBackwardMotion+240,eax
  568. mov WeightBackwardMotion+244,ebx
  569. mov eax,[esi+24]
  570. mov ebx,[esi+28]
  571. and eax,ecx
  572. and ebx,ecx
  573. xor eax,ebp
  574. xor ebx,ebp
  575. mov WeightBackwardMotion+248,eax
  576. mov WeightBackwardMotion+252,ebx
  577. mov eax,[esi+32]
  578. mov ebx,[esi+36]
  579. and eax,ecx
  580. and ebx,ecx
  581. xor eax,ebp
  582. xor ebx,ebp
  583. mov WeightBackwardMotion+0,eax
  584. mov WeightBackwardMotion+4,ebx
  585. mov eax,[esi+40]
  586. mov ebx,[esi+44]
  587. and eax,ecx
  588. and ebx,ecx
  589. xor eax,ebp
  590. xor ebx,ebp
  591. mov WeightBackwardMotion+8,eax
  592. mov WeightBackwardMotion+12,ebx
  593. mov eax,[esi+48]
  594. mov ebx,[esi+52]
  595. and eax,ecx
  596. and ebx,ecx
  597. xor eax,ebp
  598. xor ebx,ebp
  599. mov WeightBackwardMotion+16,eax
  600. mov WeightBackwardMotion+20,ebx
  601. mov eax,[esi+56]
  602. mov ebx,[esi+60]
  603. and eax,ecx
  604. and ebx,ecx
  605. xor eax,ebp
  606. xor ebx,ebp
  607. mov WeightBackwardMotion+24,eax
  608. mov WeightBackwardMotion+28,ebx
  609. jmp FirstMacroBlock
  610. ALIGN 16
  611. NextMacroBlock:
  612. mov bl,[edx].CodedBlocks
  613. add edx,SIZEOF T_MacroBlockActionDescr
  614. and ebx,000000040H ; Check for end-of-stream
  615. jne Done
  616. FirstMacroBlock:
  617. mov esi,[edx].BlkY1.BlkOffset ; Get address of next macroblock to do.
  618. mov edi,TargetFrameBaseAddress
  619. mov eax,FutureFrameBaseAddress
  620. mov ebp,PreviousFrameBaseAddress
  621. lea edi,[esi+edi+PITCH*3]
  622. mov MBlockActionStream,edx ; Stash list ptr.
  623. add ebp,esi
  624. lea esi,[esi+eax+PITCH*15]
  625. mov PastMBAddr,ebp ; Stash addr of past MB w/ zero motion.
  626. mov ecx,FPITCH*15
  627. mov ebp,PITCH
  628. xor eax,eax
  629. @@: ; Copy future reference to scratch area
  630. ; that is surrounded by "255" so we can
  631. ; handle access to this surrounding area
  632. ; as the future ref falls out of the MB.
  633. mov eax,[esi]
  634. mov ebx,[esi+4]
  635. mov FutureBlock[ecx],eax
  636. mov FutureBlock[ecx+4],ebx
  637. mov eax,[esi+8]
  638. mov ebx,[esi+12]
  639. mov FutureBlock[ecx+8],eax
  640. mov FutureBlock[ecx+12],ebx
  641. sub esi,ebp
  642. sub ecx,FPITCH
  643. lea edx,Block1.TargetBlock
  644. jge @b
  645. sar ecx,31
  646. lea ebx,Block1.TargetBlock+TPITCH*3
  647. @@: ; Copy target macroblock to scratch area
  648. ; so that we can pick up the target points
  649. ; from a static offset added to esp.
  650. mov eax,[edi]
  651. mov esi,[edi+8]
  652. add eax,eax
  653. add esi,esi
  654. xor eax,ecx
  655. xor esi,ecx
  656. mov [ebx],eax
  657. mov [ebx+16],esi
  658. mov eax,[edi+ebp*4]
  659. mov esi,[edi+ebp*4+8]
  660. add eax,eax
  661. add esi,esi
  662. xor eax,ecx
  663. xor esi,ecx
  664. mov [ebx+8],eax
  665. mov [ebx+24],esi
  666. mov eax,[edi+ebp*8]
  667. mov esi,[edi+ebp*8+8]
  668. add eax,eax
  669. add esi,esi
  670. xor eax,ecx
  671. xor esi,ecx
  672. mov [ebx+32],eax
  673. mov [ebx+48],esi
  674. mov eax,[edi+PITCH*12]
  675. mov esi,[edi+PITCH*12+8]
  676. add eax,eax
  677. add esi,esi
  678. xor eax,ecx
  679. xor esi,ecx
  680. mov [ebx+40],eax
  681. mov [ebx+56],esi
  682. sub edi,ebp
  683. sub ebx,TPITCH
  684. cmp ebx,edx
  685. jge @b
  686. mov eax,16
  687. lea edi,[edi+ebp*4+4]
  688. test edi,4
  689. lea ebx,Block1.TargetBlock+TPITCH*3+4
  690. jne @b
  691. mov edx,MBlockActionStream
  692. xor ebx,ebx
  693. mov Block4.TransferCase,eax ; After block 4, transfer to done 0-MV.
  694. xor ecx,ecx
  695. mov bl,[edx].BlkY4.PVMV
  696. mov esi,PastMBAddr
  697. mov al,[edx].BlkY4.PHMV
  698. xor ebp,ebp
  699. mov bl,WeightForwardMotion[ebx]
  700. mov [edx].BlkY4.BestVMVf,bl
  701. sar ebx,1 ; CF == 1 if past vert is at half pel.
  702. mov cl,WeightForwardMotion[eax]
  703. adc ebp,ebp ; ebp == 1 if past vert is at half pel.
  704. mov [edx].BlkY4.BestHMVf,cl
  705. sar ecx,1 ; CF == 1 if past horz is at half pel.
  706. IF PITCH-384
  707. **** The magic leaks out if PITCH != 384
  708. ENDIF
  709. lea edi,[ebx+ebx*2] ; Multiply vertical component by PITCH.
  710. adc ebp,ebp ; ebp odd if past horz is at half pel.
  711. mov bl,[edx].BlkY4.PVMV
  712. shl edi,7
  713. lea esi,[esi+ecx-48-48*PITCH+PITCH*8+8]; Add horz full pel disp to ref addr.
  714. add esi,edi ; Add vert full pel disp to past ref addr.
  715. mov bl,WeightBackwardMotion[ebx]
  716. mov [edx].BlkY4.BestVMVb,bl
  717. mov Block4.PastRefBlockAddr,esi ; Stash address of ref block from past.
  718. sar ebx,1 ; CF == 1 if future vert is at half pel.
  719. mov al,WeightBackwardMotion[eax]
  720. adc ebp,ebp ; ebp odd if future vert is at half pel.
  721. mov [edx].BlkY4.BestHMVb,al
  722. sar eax,1 ; CF == 1 if future horz is at half pel.
  723. IF FPITCH-40
  724. **** The magic leaks out if FPITCH != 40
  725. ENDIF
  726. lea edi,[ebx+ebx*4] ; Multiply vertical component by FPITCH.
  727. adc ebp,ebp ; ebp odd if future horz is at half pel.
  728. lea esi,FutureBlock+80-48-48*FPITCH+FPITCH*8+8
  729. lea edi,[eax+edi*8] ; Linearized MV for future ref.
  730. mov Block3.TransferCase,ebp ; Stash case to do after block 3.
  731. add esi,edi
  732. mov bl,[edx].BlkY3.PVMV
  733. mov Block4.FutureRefBlockAddr,esi ; Stash address of ref block from future.
  734. mov al,[edx].BlkY3.PHMV
  735. mov esi,PastMBAddr
  736. mov bl,WeightForwardMotion[ebx]
  737. mov [edx].BlkY3.BestVMVf,bl
  738. xor ebp,ebp
  739. sar ebx,1
  740. mov cl,WeightForwardMotion[eax]
  741. adc ebp,ebp
  742. mov [edx].BlkY3.BestHMVf,cl
  743. sar ecx,1
  744. lea edi,[ebx+ebx*2]
  745. adc ebp,ebp
  746. mov bl,[edx].BlkY3.PVMV
  747. shl edi,7
  748. lea esi,[esi+ecx-48-48*PITCH+PITCH*8]
  749. add esi,edi
  750. mov bl,WeightBackwardMotion[ebx]
  751. mov [edx].BlkY3.BestVMVb,bl
  752. mov Block3.PastRefBlockAddr,esi
  753. sar ebx,1
  754. mov al,WeightBackwardMotion[eax]
  755. adc ebp,ebp
  756. mov [edx].BlkY3.BestHMVb,al
  757. sar eax,1
  758. lea edi,[ebx+ebx*4]
  759. adc ebp,ebp
  760. lea esi,FutureBlock+80-48-48*FPITCH+FPITCH*8
  761. lea edi,[eax+edi*8]
  762. mov Block2.TransferCase,ebp
  763. add esi,edi
  764. mov bl,[edx].BlkY2.PVMV
  765. mov Block3.FutureRefBlockAddr,esi
  766. mov al,[edx].BlkY2.PHMV
  767. mov esi,PastMBAddr
  768. mov bl,WeightForwardMotion[ebx]
  769. mov [edx].BlkY2.BestVMVf,bl
  770. xor ebp,ebp
  771. sar ebx,1
  772. mov cl,WeightForwardMotion[eax]
  773. adc ebp,ebp
  774. mov [edx].BlkY2.BestHMVf,cl
  775. sar ecx,1
  776. lea edi,[ebx+ebx*2]
  777. adc ebp,ebp
  778. mov bl,[edx].BlkY2.PVMV
  779. shl edi,7
  780. lea esi,[esi+ecx-48-48*PITCH+8]
  781. add esi,edi
  782. mov bl,WeightBackwardMotion[ebx]
  783. mov [edx].BlkY2.BestVMVb,bl
  784. mov Block2.PastRefBlockAddr,esi
  785. sar ebx,1
  786. mov al,WeightBackwardMotion[eax]
  787. adc ebp,ebp
  788. mov [edx].BlkY2.BestHMVb,al
  789. sar eax,1
  790. lea edi,[ebx+ebx*4]
  791. adc ebp,ebp
  792. lea esi,FutureBlock+80-48-48*FPITCH+8
  793. lea edi,[eax+edi*8]
  794. mov Block1.TransferCase,ebp
  795. add esi,edi
  796. mov bl,[edx].BlkY1.PVMV
  797. mov Block2.FutureRefBlockAddr,esi
  798. mov al,[edx].BlkY1.PHMV
  799. mov esi,PastMBAddr
  800. mov bl,WeightForwardMotion[ebx]
  801. mov [edx].BlkY1.BestVMVf,bl
  802. xor ebp,ebp
  803. sar ebx,1
  804. mov cl,WeightForwardMotion[eax]
  805. adc ebp,ebp
  806. mov [edx].BlkY1.BestHMVf,cl
  807. sar ecx,1
  808. lea edi,[ebx+ebx*2]
  809. adc ebp,ebp
  810. mov bl,[edx].BlkY1.PVMV
  811. shl edi,7
  812. lea esi,[esi+ecx-48-48*PITCH]
  813. add esi,edi
  814. mov bl,WeightBackwardMotion[ebx]
  815. mov [edx].BlkY1.BestVMVb,bl
  816. mov Block1.PastRefBlockAddr,esi
  817. sar ebx,1
  818. mov al,WeightBackwardMotion[eax]
  819. adc ebp,ebp
  820. mov [edx].BlkY1.BestHMVb,al
  821. sar eax,1
  822. lea ecx,[ebx+ebx*4]
  823. adc ebp,ebp
  824. lea edi,FutureBlock+80-48-48*FPITCH
  825. lea ecx,[eax+ecx*8]
  826. mov eax,ebp
  827. add edi,ecx
  828. mov ebp,00BADBEEFH
  829. mov Block1.BestSWDAccum,ebp
  830. mov Block2.BestSWDAccum,ebp
  831. mov Block3.BestSWDAccum,ebp
  832. mov Block4.BestSWDAccum,ebp
  833. mov BlockN.BestSWDAccum,ebp
  834. xor ebp,ebp
  835. sub esp,64
  836. jmp PD JumpTable[eax*4]
  837. ZeroVectorSWDDone:
  ; Entered with ebp = SWD accumulated through block 4 for the zero-delta pass
  ; (it is stored as Block4.BestSWDAccum below).  Adopts the candidate
  ; accumulators as the best so far, then either leaves via BelowZeroThreshold
  ; (ZeroVectorThreshold >= SWD, signed) or snapshots the zero-delta results
  ; and falls through into SWDLoop to try non-zero MV deltas.
  838. mov eax,ZeroVectorThreshold
  839. mov ebx,Block2.CandidateSWDAccum
  840. cmp eax,ebp ; Flags consumed by the jge below; only MOVs intervene.
  841. mov edi,Block1.CandidateSWDAccum
  842. mov ecx,Block3.CandidateSWDAccum
  843. mov Block1.BestSWDAccum,edi
  844. mov Block2.BestSWDAccum,ebx
  845. mov Block3.BestSWDAccum,ecx
  846. mov Block4.BestSWDAccum,ebp
  847. mov eax,0 ; Set best MV to zero.  (MOV, not XOR, to keep the flags from cmp.)
  848. mov esi,MBlockActionStream
  849. jge BelowZeroThreshold
  ; Remember the zero-delta accumulators and MVs so ZeroMVGoodEnough can
  ; restore them if no candidate beats them by a sufficient margin.
  850. mov Block1.SWD0MVAccum,edi
  851. mov Block2.SWD0MVAccum,ebx
  852. mov Block3.SWD0MVAccum,ecx
  853. mov Block4.SWD0MVAccum,ebp
  854. mov ebx,[esi].BlkY1.BestBiDiMVs
  855. mov ecx,[esi].BlkY2.BestBiDiMVs
  856. mov BlkY1_0deltaBiDiMVs,ebx
  857. mov BlkY2_0deltaBiDiMVs,ecx
  858. mov ebx,[esi].BlkY3.BestBiDiMVs
  859. mov ecx,[esi].BlkY4.BestBiDiMVs
  860. mov BlkY3_0deltaBiDiMVs,ebx
  861. mov BlkY4_0deltaBiDiMVs,ecx
  862. mov ecx,17
  863. xor ebx,ebx ; First ME engine state is zero.
  864. mov Block4.TransferCase,ecx ; After block 4, transfer to done non0-MV.
  865. xor ecx,ecx ; SWDLoop indexes tables with ecx after loading only cl.
  866. mov BlockN.BestSWDAccum,ebp
  867. SWDLoop:
  ; Top of the candidate search.  On entry:
  ;   ebx -- ME engine state (byte offset into SWDState).
  ;   esi -- MBlockActionStream (macroblock being estimated).
  ;   eax -- best MV delta so far (al = horizontal, ah = vertical).
  ;   ecx -- upper 24 bits clear (only cl is loaded before ecx is used as index).
  ; For blocks 4,3,2,1 in turn this computes the forward MV as
  ; WeightForwardMotion[P?MV] + delta, takes the MVDeltaOutOfRange exit when a
  ; component is <= 040H (unsigned), stores the candidate forward MV, shifts the
  ; half-pel bits into that block's transfer case, and stashes the past
  ; reference address.  It then derives the vertical part of the backward MVs
  ; and future reference addresses (here when the vertical delta is non-zero,
  ; at VMVdIsZero otherwise).
  868. mov CurrSWDState,ebx ; Record ME engine state.
  869. mov edx,PD SWDState[ebx] ; dl == HMV; dh == VMV offsets to try.
  870. mov bl,[esi].BlkY4.PVMV
  871. add dl,al ; Try this horizontal MV delta.
  872. add dh,ah ; Try this vertical MV delta.
  873. mov cl,[esi].BlkY4.PHMV
  874. mov BestMV,eax ; Record what the best MV so far is.
  875. mov CandidateMV,edx ; Record the candidate MV delta.
  876. mov bl,WeightForwardMotion[ebx] ; TRb * VMV / TRd
  877. xor ebp,ebp
  878. add bl,dh ; VMVf = TRb * VMV / TRd + VMVd
  879. mov cl,WeightForwardMotion[ecx] ; TRb * HMV / TRd
  880. cmp bl,040H ; If too far up or down, take quick out.
  881. jbe MVDeltaOutOfRange
  882. mov [esi].BlkY4.CandidateVMVf,bl
  883. add cl,dl ; HMVf = TRb * HMV / TRd + HMVd
  884. cmp cl,040H ; If too far left or right, quick out.
  885. jbe MVDeltaOutOfRange
  886. sar ebx,1 ; CF == 1 if past vert is at half pel.
  887. mov [esi].BlkY4.CandidateHMVf,cl
  888. adc ebp,ebp ; ebp == 1 if past vert is at half pel.
  889. mov eax,PastMBAddr
  890. sar ecx,1 ; CF == 1 if past horz is at half pel.
  891. IF PITCH-384
  892. **** The magic leaks out if PITCH != 384
  893. ENDIF
  894. lea edi,[ebx+ebx*2] ; Multiply vertical component by PITCH.
  895. adc ebp,ebp ; ebp odd if past horz is at half pel.
  896. mov bl,[esi].BlkY3.PVMV
  897. shl edi,7 ; edi = vert * 3 * 128 = vert * 384.
  898. mov Block3.TransferCase,ebp ; Stash case to do after block 3.
  899. lea ebp,[eax+ecx-48-48*PITCH+PITCH*8+8] ;Add horz full pel disp to ref addr.
  900. mov cl,[esi].BlkY3.PHMV
  901. add edi,ebp ; Add vert full pel disp to past ref addr.
  902. mov bl,WeightForwardMotion[ebx]
  903. mov Block4.PastRefBlockAddr,edi ; Stash address of ref block from past.
  ; Block 3: same sequence as block 4 above.
  904. xor ebp,ebp
  905. add bl,dh
  906. mov cl,WeightForwardMotion[ecx]
  907. cmp bl,040H
  908. jbe MVDeltaOutOfRange
  909. mov [esi].BlkY3.CandidateVMVf,bl
  910. add cl,dl
  911. cmp cl,040H
  912. jbe MVDeltaOutOfRange
  913. sar ebx,1
  914. mov [esi].BlkY3.CandidateHMVf,cl
  915. adc ebp,ebp
  916. sub eax,48+48*PITCH ; Fold the common bias into eax once for blocks 3, 2 and 1.
  917. sar ecx,1
  918. lea edi,[ebx+ebx*2]
  919. adc ebp,ebp
  920. mov bl,[esi].BlkY2.PVMV
  921. shl edi,7
  922. mov Block2.TransferCase,ebp ; Case to do after block 2 (i.e. for block 3).
  923. lea ebp,[eax+ecx+PITCH*8] ; Block 3 sits 8 rows down, column 0.
  924. mov cl,[esi].BlkY2.PHMV
  925. add edi,ebp
  926. mov bl,WeightForwardMotion[ebx]
  927. mov Block3.PastRefBlockAddr,edi
  ; Block 2.
  928. xor ebp,ebp
  929. add bl,dh
  930. mov cl,WeightForwardMotion[ecx]
  931. cmp bl,040H
  932. jbe MVDeltaOutOfRange
  933. mov [esi].BlkY2.CandidateVMVf,bl
  934. add cl,dl
  935. cmp cl,040H
  936. jbe MVDeltaOutOfRange
  937. sar ebx,1
  938. mov [esi].BlkY2.CandidateHMVf,cl
  939. adc ebp,ebp
  940. sar ecx,1
  941. lea edi,[ebx+ebx*2]
  942. adc ebp,ebp
  943. mov bl,[esi].BlkY1.PVMV
  944. shl edi,7
  945. mov Block1.TransferCase,ebp ; Case to do after block 1 (i.e. for block 2).
  946. lea ebp,[eax+ecx+8] ; Block 2 sits in row 0, 8 columns right.
  947. mov cl,[esi].BlkY1.PHMV
  948. add edi,ebp
  949. mov bl,WeightForwardMotion[ebx]
  950. mov Block2.PastRefBlockAddr,edi
  ; Block 1.
  951. xor ebp,ebp
  952. add bl,dh
  953. mov cl,WeightForwardMotion[ecx]
  954. cmp bl,040H
  955. jbe MVDeltaOutOfRange
  956. mov [esi].BlkY1.CandidateVMVf,bl
  957. add cl,dl
  958. cmp cl,040H
  959. jbe MVDeltaOutOfRange
  960. sar ebx,1
  961. mov [esi].BlkY1.CandidateHMVf,cl
  962. adc ebp,ebp
  963. sar ecx,1
  964. lea edi,[ebx+ebx*2]
  965. adc ebp,ebp
  966. add eax,ecx
  967. shl edi,7
  968. mov FirstTransferCase,ebp ; Case used to dispatch block 1 itself.
  969. add edi,eax
  970. test dh,dh ; Is vertical component MV delta zero?
  971. mov Block1.PastRefBlockAddr,edi
  972. je VMVdIsZero
  ; Vertical delta is non-zero: backward vertical MV is derived from the
  ; forward candidate (CandidateVMVf - PVMV) for each block.
  973. lea edi,FutureBlock+80-48-48*FPITCH
  974. xor eax,eax
  975. mov bl,[esi].BlkY4.PVMV
  976. mov al,[esi].BlkY4.CandidateVMVf
  977. mov ebp,Block3.TransferCase ; Reload transfer case (computed goto idx)
  978. sub al,bl ; -VMVb = -(VMVf - VMV)
  979. mov [esi].BlkY4.CandidateVMVb,al
  980. mov cl,[esi].BlkY3.PVMV
  981. sar eax,1 ; CF == 1 if future vert is at half pel.
  982. mov bl,[esi].BlkY3.CandidateVMVf
  983. adc ebp,ebp ; ebp odd if future vert is at half pel.
  984. IF FPITCH-40
  985. **** The magic leaks out if FPITCH != 40
  986. ENDIF
  987. mov Block3.TransferCase,ebp ; Stash case to do after block 3.
  988. lea eax,[eax+eax*4] ; Multiply vertical component by FPITCH.
  989. mov ebp,Block2.TransferCase
  990. sub bl,cl
  991. lea eax,[edi+eax*8+FPITCH*8+8] ; Addr of ref blk w/ vert MV.
  992. mov [esi].BlkY3.CandidateVMVb,bl
  993. sar ebx,1
  994. mov Block4.FutureRefBlockAddr,eax ; Stash address of ref block from future.
  995. adc ebp,ebp
  996. mov cl,[esi].BlkY2.PVMV
  997. mov Block2.TransferCase,ebp
  998. lea eax,[ebx+ebx*4]
  999. mov bl,[esi].BlkY2.CandidateVMVf
  1000. mov ebp,Block1.TransferCase
  1001. lea eax,[edi+eax*8+FPITCH*8]
  1002. sub bl,cl
  1003. mov Block3.FutureRefBlockAddr,eax
  1004. mov [esi].BlkY2.CandidateVMVb,bl
  1005. sar ebx,1
  1006. mov dh,[esi].BlkY1.PVMV ; dh (vertical delta) is no longer needed; reuse as scratch.
  1007. adc ebp,ebp
  1008. mov cl,[esi].BlkY1.CandidateVMVf
  1009. mov Block1.TransferCase,ebp
  1010. sub cl,dh
  1011. mov [esi].BlkY1.CandidateVMVb,cl
  1012. lea eax,[ebx+ebx*4]
  1013. sar ecx,1
  1014. mov ebp,FirstTransferCase
  1015. adc ebp,ebp
  1016. lea eax,[edi+eax*8+8]
  1017. mov Block2.FutureRefBlockAddr,eax
  1018. lea eax,[ecx+ecx*4]
  1019. mov FirstTransferCase,ebp
  1020. test dl,dl ; Is horizontal component MV delta zero?
  1021. lea edi,[edi+eax*8]
  1022. mov eax,0 ; MOV keeps the flags from the test above for the je below.
  1023. mov Block1.FutureRefBlockAddr,edi
  1024. je HMVdIsZero
  1025. HMVdIsNonZero:
  ; Horizontal delta is non-zero.  For each block the backward horizontal MV is
  ; derived from the forward candidate (CandidateHMVf - PHMV); its half-pel bit
  ; is shifted into that block's transfer case and its full-pel part is added
  ; to the future reference address computed earlier.  Finishes by dispatching
  ; block 1 through JumpTable with esi = block 1 past ref, edi = block 1 future
  ; ref and ebp = 0 (SWD accumulator).
  1026. mov cl,[esi].BlkY4.CandidateHMVf
  1027. mov bl,[esi].BlkY4.PHMV
  1028. mov ebp,Block3.TransferCase
  1029. sub cl,bl ; -HMVb = -(HMVf - HMV)
  1030. mov [esi].BlkY4.CandidateHMVb,cl
  1031. mov edi,Block4.FutureRefBlockAddr ; Load addr of ref blk to factor in horz.
  1032. sar ecx,1 ; CF == 1 if future horz is at half pel.
  1033. mov bl,[esi].BlkY3.PHMV
  1034. adc ebp,ebp ; ebp odd if future horz is at half pel.
  1035. add edi,ecx ; Factor in HMVb.
  1036. mov Block3.TransferCase,ebp ; Stash case to do after block 3.
  1037. mov cl,[esi].BlkY3.CandidateHMVf
  1038. sub cl,bl
  1039. mov Block4.FutureRefBlockAddr,edi ; Stash address of ref block from future.
  1040. mov ebp,Block2.TransferCase
  1041. mov [esi].BlkY3.CandidateHMVb,cl
  1042. sar ecx,1
  1043. mov edi,Block3.FutureRefBlockAddr
  1044. adc ebp,ebp
  1045. add edi,ecx
  1046. mov Block2.TransferCase,ebp
  1047. mov Block3.FutureRefBlockAddr,edi
  1048. mov cl,[esi].BlkY2.CandidateHMVf
  1049. mov bl,[esi].BlkY2.PHMV
  1050. mov ebp,Block1.TransferCase
  1051. sub cl,bl
  1052. mov [esi].BlkY2.CandidateHMVb,cl
  1053. mov edi,Block2.FutureRefBlockAddr
  1054. sar ecx,1
  1055. mov bl,[esi].BlkY1.PHMV
  1056. adc ebp,ebp
  1057. add edi,ecx
  1058. mov Block1.TransferCase,ebp
  1059. mov cl,[esi].BlkY1.CandidateHMVf
  1060. sub cl,bl
  1061. mov Block2.FutureRefBlockAddr,edi
  1062. mov eax,FirstTransferCase
  1063. mov [esi].BlkY1.CandidateHMVb,cl
  1064. sar ecx,1
  1065. mov edi,Block1.FutureRefBlockAddr
  1066. adc eax,eax ; eax = completed transfer case for block 1.
  1067. add edi,ecx
  1068. mov esi,Block1.PastRefBlockAddr
  1069. sub esp,64 ; The SWD routines advance esp by 16 per block; back up over 4 blocks.
  1070. xor ebp,ebp ; Clear SWD accumulator.
  1071. jmp PD JumpTable[eax*4]
  1072. VMVdIsZero:
  ; Vertical delta is zero.  The backward vertical MV for each block is taken
  ; straight from WeightBackwardMotion[PVMV] instead of being derived from the
  ; forward candidate.  Blocks are handled two at a time: ebp carries one
  ; transfer case and dh (free, since the vertical delta is zero) carries the
  ; low byte of the other.  Falls through to HMVdIsZero, or branches to
  ; HMVdIsNonZero when the horizontal delta (dl) is non-zero.
  1073. mov bl,[esi].BlkY4.PVMV
  1074. mov cl,[esi].BlkY3.PVMV
  1075. mov ebp,Block3.TransferCase
  1076. mov dh,PB Block2.TransferCase
  1077. mov bl,WeightBackwardMotion[ebx]
  1078. lea edi,FutureBlock+80-48-48*FPITCH
  1079. mov [esi].BlkY4.CandidateVMVb,bl
  1080. mov cl,WeightBackwardMotion[ecx]
  1081. sar ebx,1 ; CF == 1 if future vert is at half pel.
  1082. mov [esi].BlkY3.CandidateVMVb,cl
  1083. adc ebp,ebp ; ebp odd if future vert is at half pel.
  1084. sar ecx,1
  1085. lea eax,[ebx+ebx*4] ; Multiply vertical component by FPITCH.
  1086. adc dh,dh ; Same half-pel bit for block 3, into the case done after block 2.
  1087. mov Block3.TransferCase,ebp ; Stash case to do after block 3.
  1088. lea ebp,[edi+eax*8+FPITCH*8+8] ; Addr of ref blk w/ vert MV factored in.
  1089. lea eax,[ecx+ecx*4]
  1090. mov PB Block2.TransferCase,dh
  1091. mov Block4.FutureRefBlockAddr,ebp ; Stash address of ref block from future.
  1092. lea ebp,[edi+eax*8+FPITCH*8]
  1093. mov bl,[esi].BlkY2.PVMV
  1094. mov Block3.FutureRefBlockAddr,ebp
  1095. mov cl,[esi].BlkY1.PVMV
  1096. mov ebp,Block1.TransferCase
  1097. mov bl,WeightBackwardMotion[ebx]
  1098. mov dh,PB FirstTransferCase
  1099. mov [esi].BlkY2.CandidateVMVb,bl
  1100. sar ebx,1
  1101. mov cl,WeightBackwardMotion[ecx]
  1102. adc ebp,ebp
  1103. mov [esi].BlkY1.CandidateVMVb,cl
  1104. sar ecx,1
  1105. lea eax,[ebx+ebx*4]
  1106. adc dh,dh
  1107. mov Block1.TransferCase,ebp
  1108. lea ebp,[edi+eax*8+8]
  1109. lea eax,[ecx+ecx*4]
  1110. mov PB FirstTransferCase,dh
  1111. mov Block2.FutureRefBlockAddr,ebp
  1112. lea ebp,[edi+eax*8]
  1113. test dl,dl ; Is horizontal component MV delta zero?
  1114. mov Block1.FutureRefBlockAddr,ebp
  1115. jne HMVdIsNonZero
  1116. HMVdIsZero:
  ; Horizontal delta is zero.  The backward horizontal MV for each block is
  ; taken from WeightBackwardMotion[PHMV]; its half-pel bit completes that
  ; block's transfer case and its full-pel part is added to the future
  ; reference address.  Blocks 4 and 3 are done first, then 2 and 1, after
  ; which block 1 is dispatched through JumpTable (edx = its transfer case,
  ; esi/edi = its past/future reference addresses, ebp = 0).
  1117. mov bl,[esi].BlkY4.PHMV
  1118. mov cl,[esi].BlkY3.PHMV
  1119. mov ebp,Block3.TransferCase
  1120. mov edx,Block2.TransferCase
  1121. mov bl,WeightBackwardMotion[ebx]
  1122. mov eax,Block4.FutureRefBlockAddr
  1123. mov [esi].BlkY4.CandidateHMVb,bl
  1124. mov cl,WeightBackwardMotion[ecx]
  1125. sar ebx,1 ; CF == 1 if future horz is at half pel.
  1126. mov [esi].BlkY3.CandidateHMVb,cl
  1127. adc ebp,ebp ; ebp odd if future horz is at half pel.
  1128. add eax,ebx ; Addr of ref blk w/ horz MV factored in.
  1129. sar ecx,1
  1130. mov Block3.TransferCase,ebp ; Stash case to do after block 3.
  1131. adc edx,edx
  1132. mov edi,Block3.FutureRefBlockAddr
  1133. mov Block4.FutureRefBlockAddr,eax ; Stash address of ref block from future.
  1134. add edi,ecx
  1135. mov Block2.TransferCase,edx ; Stash case to do after block 2.
  1136. mov Block3.FutureRefBlockAddr,edi
  1137. mov bl,[esi].BlkY2.PHMV
  1138. mov cl,[esi].BlkY1.PHMV
  1139. mov ebp,Block1.TransferCase
  1140. mov edx,FirstTransferCase
  1141. mov bl,WeightBackwardMotion[ebx]
  1142. mov eax,Block2.FutureRefBlockAddr
  1143. mov [esi].BlkY2.CandidateHMVb,bl
  1144. mov cl,WeightBackwardMotion[ecx]
  1145. sar ebx,1 ; CF == 1 if future horz is at half pel.
  1146. mov [esi].BlkY1.CandidateHMVb,cl
  1147. adc ebp,ebp ; ebp odd if future horz is at half pel.
  1148. add eax,ebx ; Addr of ref blk w/ horz MV factored in.
  1149. sar ecx,1
  1150. mov Block1.TransferCase,ebp ; Stash case to do after block 1.
  1151. adc edx,edx ; edx = completed transfer case for block 1.
  1152. mov edi,Block1.FutureRefBlockAddr
  1153. mov Block2.FutureRefBlockAddr,eax ; Stash address of ref block from future.
  1154. add edi,ecx
  1155. mov esi,Block1.PastRefBlockAddr
  1156. sub esp,64 ; The SWD routines advance esp by 16 per block; back up over 4 blocks.
  1157. xor ebp,ebp ; Clear SWD accumulator.
  1158. jmp PD JumpTable[edx*4]
  1159. MVDeltaOutOfRange:
  ; Candidate delta produced an out-of-range forward MV.  Treat it as a failed
  ; candidate: ebp = 0 makes OutOfRangeHandlingDone read SWDState[ebx+3] and
  ; BestMV[0], i.e. keep the current best MV.
  1160. xor ebp,ebp
  1161. mov ebx,CurrSWDState ; Restore ME engine state.
  1162. jmp OutOfRangeHandlingDone
  1163. TakeEarlyOut:
  ; Entered from an SWD routine when the running SWD already exceeds the best
  ; for that block.  esp is somewhere inside the target-block area, so it is
  ; rounded down to a 64-byte boundary and then moved up by 64; until that
  ; final add, the stack-relative variables are addressed with +64.
  ; The candidate is rejected: the next state comes from SWDState[ebx+3] and
  ; the best MV is left unchanged.
  1164. sub esp,4 ; So an esp sitting exactly on a 64-byte boundary rounds to the one below.
  1165. xor ecx,ecx
  1166. and esp,0FFFFFFC0H
  1167. mov ebx,CurrSWDState+64
  1168. mov esi,MBlockActionStream+64
  1169. mov eax,BestMV+64
  1170. add esp,64
  1171. mov bl,SWDState[ebx+3] ; Next ME engine state after a rejected candidate.
  1172. test bl,bl
  1173. jne SWDLoop ; Non-zero state: more candidates to try.
  1174. mov ecx,Block4.SWD0MVAccum
  1175. mov ebp,Block4.BestSWDAccum
  1176. jmp CandidatesDone
  1177. NonZeroVectorSWDDone:
  ; All four blocks were evaluated for the candidate without an early out, so
  ; the candidate becomes the new best: copy the candidate MVs and SWD
  ; accumulators over the best ones.  ebp = -1 then selects SWDState[ebx+2] and
  ; BestMV[-4] below.  NOTE(review): BestMV[-4] is presumably CandidateMV --
  ; confirm against the stack-frame layout.
  1178. mov ebx,CurrSWDState
  1179. mov esi,MBlockActionStream
  1180. xor ecx,ecx
  1181. mov ebp,-1
  1182. mov eax,[esi].BlkY1.CandidateBiDiMVs
  1183. mov edx,[esi].BlkY2.CandidateBiDiMVs
  1184. mov [esi].BlkY1.BestBiDiMVs,eax
  1185. mov [esi].BlkY2.BestBiDiMVs,edx
  1186. mov eax,[esi].BlkY3.CandidateBiDiMVs
  1187. mov edx,[esi].BlkY4.CandidateBiDiMVs
  1188. mov [esi].BlkY3.BestBiDiMVs,eax
  1189. mov [esi].BlkY4.BestBiDiMVs,edx
  1190. mov eax,Block1.CandidateSWDAccum
  1191. mov edx,Block2.CandidateSWDAccum
  1192. mov Block1.BestSWDAccum,eax
  1193. mov Block2.BestSWDAccum,edx
  1194. mov eax,Block3.CandidateSWDAccum
  1195. mov edx,Block4.CandidateSWDAccum
  1196. mov Block3.BestSWDAccum,eax
  1197. mov Block4.BestSWDAccum,edx
  1198. mov BlockN.BestSWDAccum,edx
  1199. OutOfRangeHandlingDone:
  ; Shared tail.  ebp is 0 (candidate rejected) or -1 (candidate accepted) and
  ; picks both the next-state byte and which MV becomes eax for the next pass.
  1200. mov bl,SWDState[ebx+ebp*1+3]
  1201. mov eax,BestMV[ebp*4]
  1202. test bl,bl
  1203. jne SWDLoop ; Non-zero state: more candidates to try.
  1204. mov ecx,Block4.SWD0MVAccum
  1205. mov ebp,Block4.BestSWDAccum
  1206. CandidatesDone:
  ; Search finished.  ecx = zero-delta SWD total, ebp = best SWD total,
  ; eax = best MV delta (al horz, ah vert), esi = MBlockActionStream.
  ; If the best candidate does not beat the zero delta by at least
  ; NonZeroMVDifferential (signed compare), revert to the zero-delta results.
  1207. sub ecx,ebp
  1208. mov ebx,NonZeroMVDifferential
  1209. cmp ecx,ebx
  1210. jge ZeroMVNotGoodEnough
  1211. ZeroMVGoodEnough:
  ; Restore the accumulators and MVs that were snapshotted for the zero delta.
  1212. xor eax,eax
  1213. mov esi,MBlockActionStream
  1214. mov edi,Block1.SWD0MVAccum
  1215. mov ebx,Block2.SWD0MVAccum
  1216. mov ecx,Block3.SWD0MVAccum
  1217. mov ebp,Block4.SWD0MVAccum
  1218. mov Block1.BestSWDAccum,edi
  1219. mov Block2.BestSWDAccum,ebx
  1220. mov Block3.BestSWDAccum,ecx
  1221. mov Block4.BestSWDAccum,ebp
  1222. mov ebx,BlkY1_0deltaBiDiMVs
  1223. mov edi,BlkY2_0deltaBiDiMVs
  1224. mov [esi].BlkY1.BestBiDiMVs,ebx
  1225. mov [esi].BlkY2.BestBiDiMVs,edi
  1226. mov ebx,BlkY3_0deltaBiDiMVs
  1227. mov edi,BlkY4_0deltaBiDiMVs
  1228. mov [esi].BlkY3.BestBiDiMVs,ebx
  1229. mov [esi].BlkY4.BestBiDiMVs,edi
  1230. BelowZeroThreshold:
  1231. ZeroMVNotGoodEnough:
  ; Common exit.  Here eax = chosen MV delta, esi = MBlockActionStream and
  ; ebp = Block4.BestSWDAccum (running total through block 4).  Records the MV
  ; delta for all four luma blocks, then recovers each block's own SWD as the
  ; difference of consecutive accumulators and marks blocks whose SWD is at or
  ; below EmptyBlockThreshold as not coded, removing their SWD from the total.
  1232. mov [esi].BlkY1.BHMV,al
  1233. mov [esi].BlkY2.BHMV,al
  1234. mov [esi].BlkY3.BHMV,al
  1235. mov [esi].BlkY4.BHMV,al
  1236. mov [esi].BlkY1.BVMV,ah
  1237. mov [esi].BlkY2.BVMV,ah
  1238. mov [esi].BlkY3.BVMV,ah
  1239. mov [esi].BlkY4.BVMV,ah
  1240. mov al,[esi].CodedBlocksB ; Fetch coded block pattern.
  1241. mov edi,EmptyBlockThreshold ; Get threshold for forcing block empty?
  1242. or al,03FH ; Initially set all blocks coded.
  1243. mov ecx,Block3.BestSWDAccum
  1244. mov ebx,InterSWDBlocks
  1245. mov edx,ebp
  1246. sub edx,ecx ; Get SWD for block 4.
  1247. cmp edx,edi ; Is it below empty threshold?
  1248. jg @f
  1249. and al,0F7H ; If so, indicate block 4 is NOT coded.
  1250. dec ebx
  1251. sub ebp,edx ; Drop block 4's SWD from the macroblock total.
  1252. @@:
  1253. mov edx,Block2.BestSWDAccum
  1254. sub ecx,edx ; Get SWD for block 3.
  1255. cmp ecx,edi
  1256. jg @f
  1257. and al,0FBH ; Block 3 is NOT coded.
  1258. dec ebx
  1259. sub ebp,ecx
  1260. @@:
  1261. mov ecx,Block1.BestSWDAccum
  1262. sub edx,ecx ; Get SWD for block 2.
  1263. cmp edx,edi
  1264. jg @f
  1265. and al,0FDH ; Block 2 is NOT coded.
  1266. dec ebx
  1267. sub ebp,edx
  1268. @@:
  1269. mov edx,InterSWDTotal
  1270. cmp ecx,edi ; Block 1's accumulator is already its own SWD.
  1271. jg @f
  1272. and al,0FEH ; Block 1 is NOT coded.
  1273. dec ebx
  1274. sub ebp,ecx
  1275. @@:
  1276. mov [esi].CodedBlocksB,al ; Store coded block pattern.
  1277. add ebx,4 ; Net effect: InterSWDBlocks += number of blocks left coded.
  1278. mov InterSWDBlocks,ebx
  1279. add edx,ebp ; Add to total for this macroblock class.
  1280. mov InterSWDTotal,edx
  1281. mov edx,esi
  1282. mov PD [esi].SWDB,ebp
  1283. jmp NextMacroBlock
  1284. BiDiNoInterp:
  ; SWD for one block when neither reference needs half-pel interpolation.
  ; esp -- Pointer to block of target macroblock.
  ; ebp -- SWD accumulator. Must be initialized by caller.
  ; esi -- Pointer to block of reference in past frame.
  ; edi -- Pointer to block of reference in future frame + 80.
  ; al, bl, cl, dl -- Scratch.
  ; Each pass of the inner loop samples six pels of a 4x4 cell: (0,0) and (2,2)
  ; use both references through InterpPastAndFutureRef, the other four
  ; ((0,2),(2,0),(1,1),(1,3),(3,1),(3,3) minus none) use the past reference
  ; only.  esp doubles as the loop counter: it advances by 4 per pass and bits
  ; 2 and 3 of esp end the column and row loops (4 passes, 16 bytes per block).
  1290. xor eax,eax
  1291. xor ebx,ebx
  1292. mov al,[edi-80] ; 00A Fetch pel from future ref.
  1293. mov bl,[esi] ; 00B Fetch pel from previous ref.
  1294. xor ecx,ecx
  1295. xor edx,edx
  1296. @@:
  1297. mov al,InterpPastAndFutureRef[eax+ebx] ; 00C (past+future) or 2*past
  1298. mov bl,BlockN.TargetBlock[0] ; 00D Fetch -2 * target pel.
  1299. mov cl,[edi+FPITCH*2+2-80] ; 22A
  1300. mov dl,[esi+PITCH*2+2] ; 22B
  1301. mov bl,WeightedDiff[ebx+eax] ; 00E Weighted difference.
  1302. mov cl,InterpPastAndFutureRef[ecx+edx] ; 22C
  1303. add ebp,ebx ; 00F Accumulate weighted difference.
  1304. mov dl,BlockN.TargetBlock[TPITCH*2+2] ; 22D
  1305. mov al,[esi+PITCH*0+2] ; 02a Fetch pel from previous ref.
  1306. mov bl,BlockN.TargetBlock[TPITCH*0+2] ; 02b Fetch -2 * target pel.
  1307. mov dl,WeightedDiff[edx+ecx] ; 22E
  1308. mov cl,[esi+PITCH*2+0] ; 20a
  1309. add ebp,edx ; 22F
  1310. mov dl,BlockN.TargetBlock[TPITCH*2+0] ; 20b
  1311. mov bl,WeightedDiff[ebx+eax*2] ; 02c Weighted difference.
  1312. mov al,[esi+PITCH*1+1] ; 11a
  1313. add ebp,ebx ; 02d Accumulate weighted difference.
  1314. mov bl,BlockN.TargetBlock[TPITCH*1+1] ; 11b
  1315. mov dl,WeightedDiff[edx+ecx*2] ; 20c
  1316. mov cl,[esi+PITCH*1+3] ; 13a
  1317. add ebp,edx ; 20d
  1318. mov dl,BlockN.TargetBlock[TPITCH*1+3] ; 13b
  1319. mov bl,WeightedDiff[ebx+eax*2] ; 11c
  1320. mov al,[esi+PITCH*3+1] ; 31a
  1321. add ebp,ebx ; 11d
  1322. mov bl,BlockN.TargetBlock[TPITCH*3+1] ; 31b
  1323. mov dl,WeightedDiff[edx+ecx*2] ; 13c
  1324. mov cl,[esi+PITCH*3+3] ; 33a
  1325. add ebp,edx ; 13d
  1326. mov dl,BlockN.TargetBlock[TPITCH*3+3] ; 33b
  1327. mov bl,WeightedDiff[ebx+eax*2] ; 31c
  1328. add edi,4 ; Move to next 4 columns.
  1329. add ebp,ebx ; 31d
  1330. mov dl,WeightedDiff[edx+ecx*2] ; 33c
  1331. add ebp,edx ; 33d
  1332. add esi,4 ; Move to next 4 columns.
  1333. add esp,4 ; Move to next 4 columns.
  1334. mov al,[edi-80] ; 04A
  1335. mov bl,[esi] ; 04B
  1336. mov cl,4
  1337. and ecx,esp ; Twice, 4 cols each time.
  1338. jne @b
  1339. mov al,[edi-80+FPITCH*4-8] ; 40A
  1340. add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1341. mov cl,8
  1342. add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1343. and ecx,esp ; Twice, 4 rows each time.
  1344. mov bl,[esi] ; 40B  (MOV leaves the flags from the AND intact.)
  1345. jne @b
  ; Block finished.  esp has advanced 16 bytes, so BlockN now names the next
  ; block and BlockNM1 the one just completed.
  1346. mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
  1347. mov eax,BlockN.BestSWDAccum
  1348. cmp ebp,eax
  1349. jg TakeEarlyOut ; Already worse than the best through this block.
  1350. mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
  1351. mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
  1352. mov edi,BlockN.FutureRefBlockAddr ; Fetch next future ref address.
  1353. jmp PD JumpTable[eax*4]
  1354. BiDiFutureHorz LABEL DWORD
  ; Entry points for the cases where only the future reference is at a half-pel
  ; position.  Each loads edx with the distance to the second future pel used
  ; for interpolation (1 = horizontal, FPITCH = vertical, FPITCH+1 = both) and
  ; clears ecx before joining the common loop.
  1355. mov edx,1
  1356. xor ecx,ecx
  1357. jmp BiDiSWDCalc_InterpFuture
  1358. BiDiFutureVert:
  1359. mov edx,FPITCH
  1360. xor ecx,ecx
  1361. jmp BiDiSWDCalc_InterpFuture
  1362. BiDiFutureBoth:
  1363. mov edx,FPITCH+1
  1364. xor ecx,ecx
  1365. ; esp -- Pointer to block of target macroblock.
  1366. ; ebp -- SWD accumulator. Must be initialized by caller.
  1367. ; esi -- Pointer to block of reference in past frame.
  1368. ; edi -- Pointer to block of reference in future frame + 80.
  1369. ; edx -- Distance from future pel to other future pel with which to interp.
  1370. ; al, bl, cl, dl -- Scratch.
  1371. BiDiSWDCalc_InterpFuture:
  1372. mov al,[edi-80] ; 00A Fetch pel from future ref.
  1373. xor ebx,ebx
  1374. @@:
  1375. mov bl,[edi+edx-80] ; 00B Fetch other future ref pel.
  1376. and eax,0000000FFH ; Keep only the pel; eax is used as a table index.
  1377. mov BlockN.FutureRefInterpOffset,edx ; Stash interp offset.
  1378. mov dl,[edi+edx+FPITCH*2+2-80] ; 22B
  1379. mov al,InterpFutureRef[eax+ebx] ; 00C Get interpolated future ref.
  1380. mov bl,[esi] ; 00D Fetch pel from previous ref.
  1381. mov cl,[edi+FPITCH*2+2-80] ; 22A
  1382. and edx,0000000FFH ; Extract pel value.
  1383. mov al,InterpPastAndFutureRef[eax+ebx] ; 00E (past+future) or 2*past
  1384. mov bl,BlockN.TargetBlock[0] ; 00F Fetch -2 * target pel.
  1385. mov cl,InterpFutureRef[ecx+edx] ; 22C
  1386. mov dl,[esi+PITCH*2+2] ; 22D
  1387. mov bl,WeightedDiff[ebx+eax] ; 00G Weighted difference.
  1388. mov al,[esi+PITCH*0+2] ; 02a Fetch pel from previous ref.
  1389. add ebp,ebx ; 00H Accumulate weighted difference.
  1390. mov bl,BlockN.TargetBlock[TPITCH*0+2] ; 02b Fetch -2 * target pel.
  1391. mov cl,InterpPastAndFutureRef[ecx+edx] ; 22E
  1392. mov dl,BlockN.TargetBlock[TPITCH*2+2] ; 22F
  1393. mov bl,WeightedDiff[ebx+eax*2] ; 02c Weighted difference.
  1394. mov al,[esi+PITCH*2+0] ; 20a
  1395. mov dl,WeightedDiff[edx+ecx] ; 22G
  1396. add ebp,ebx ; 02d Accumulate weighted difference.
  1397. add ebp,edx ; 22H
  1398. mov bl,BlockN.TargetBlock[TPITCH*2+0] ; 20b
  1399. mov cl,[esi+PITCH*1+1] ; 11a
  1400. mov dl,BlockN.TargetBlock[TPITCH*1+1] ; 11b
  1401. mov bl,WeightedDiff[ebx+eax*2] ; 20c
  1402. mov al,[esi+PITCH*1+3] ; 13a
  1403. add ebp,ebx ; 20d
  1404. mov dl,WeightedDiff[edx+ecx*2] ; 11c
  1405. add ebp,edx ; 11d
  1406. mov bl,BlockN.TargetBlock[TPITCH*1+3] ; 13b
  1407. mov cl,[esi+PITCH*3+1] ; 31a
  1408. mov dl,BlockN.TargetBlock[TPITCH*3+1] ; 31b
  1409. mov bl,WeightedDiff[ebx+eax*2] ; 13c
  1410. mov al,[esi+PITCH*3+3] ; 33a
  1411. add ebp,ebx ; 13d
  1412. mov bl,BlockN.TargetBlock[TPITCH*3+3] ; 33b
  1413. mov dl,WeightedDiff[edx+ecx*2] ; 31c
  1414. add edi,4 ; Move to next 4 columns.
  1415. add ebp,edx ; 31d
  1416. mov bl,WeightedDiff[ebx+eax*2] ; 33c
  1417. add ebp,ebx ; 33d
  1418. mov edx,BlockN.FutureRefInterpOffset ; Prepare for next iteration.
  1419. add esi,4 ; Move to next 4 columns.
  1420. add esp,4 ; Move to next 4 columns.
  1421. mov al,[edi-80] ; 04A
  1422. mov cl,4
  1423. and ecx,esp ; Twice, 4 cols each time.
  1424. jne @b
  1425. mov al,[edi-80+FPITCH*4-8] ; 40A
  1426. add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1427. mov cl,8
  1428. add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1429. and ecx,esp ; Twice, 4 rows each time.
  1430. jne @b
  ; Block finished.  esp has advanced 16 bytes, so BlockN now names the next
  ; block and BlockNM1 the one just completed.
  1431. mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
  1432. mov eax,BlockN.BestSWDAccum
  1433. cmp ebp,eax
  1434. jg TakeEarlyOut ; Already worse than the best through this block.
  1435. mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
  1436. mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
  1437. mov edi,BlockN.FutureRefBlockAddr ; Fetch next future ref address.
  1438. jmp PD JumpTable[eax*4]
  1439. BiDiPastHorz LABEL DWORD
  ; Entry points for the cases where only the past reference is at a half-pel
  ; position.  Each moves the future reference pointer from edi to edx, loads
  ; edi with the distance to the second past pel used for interpolation
  ; (1 = horizontal, PITCH = vertical, PITCH+1 = both) and clears eax.
  1440. mov edx,edi
  1441. mov edi,1
  1442. xor eax,eax
  1443. jmp BiDiSWDCalc_InterpPast
  1444. BiDiPastVert:
  1445. mov edx,edi
  1446. mov edi,PITCH
  1447. xor eax,eax
  1448. jmp BiDiSWDCalc_InterpPast
  1449. BiDiPastBoth:
  1450. mov edx,edi
  1451. mov edi,PITCH+1
  1452. xor eax,eax
  1453. BiDiSWDCalc_InterpPast:
  1454. ; esp -- Pointer to block of target macroblock.
  1455. ; ebp -- SWD accumulator. Must be initialized by caller.
  1456. ; esi -- Pointer to block of reference in past frame.
  1457. ; edi -- Distance from past pel to other past pel with which to interp.
  1458. ; edx -- Pointer to block of reference in future frame + 80.
  1459. ; al, bl, cl, dl -- Scratch.
  ; edx is also needed as a byte scratch register, so the future pointer is
  ; parked in BlockN.FutureRefBlockAddr at the top of each pass and reloaded at
  ; the bottom.  Pairs of past pels are summed with 8-bit adds; this assumes the
  ; sum fits in a byte (NOTE(review): pels are presumably 7-bit -- confirm).
  1460. mov al,[esi] ; 00A Fetch pel from previous ref.
  1461. xor ebx,ebx
  1462. mov bl,[esi+edi] ; 00B Fetch other past ref pel.
  1463. xor ecx,ecx
  1464. @@:
  1465. add al,bl ; 00C Interp'd past ref, times 2.
  1466. mov bl,[edx-80] ; 00D Fetch pel from future ref.
  1467. mov BlockN.FutureRefBlockAddr,edx ; Park future pointer; dl is about to be reused.
  1468. mov dl,[edx+FPITCH*2+2-80] ; 22D
  1469. mov al,Interp2PastAndFutureRef[eax+ebx*2] ; 00E (past+future) or 2*past.
  1470. mov bl,BlockN.TargetBlock[0] ; 00F Fetch target pel.
  1471. mov cl,[esi+PITCH*2+2] ; 22A
  1472. and edx,0000000FFH ; Extract pel value.
  1473. mov bl,WeightedDiff[eax+ebx] ; 00G Weighted difference.
  1474. mov al,[esi+edi+PITCH*2+2] ; 22B
  1475. add cl,al ; 22C
  1476. mov al,[esi+PITCH*0+2] ; 02a Fetch pel from previous ref.
  1477. add ebp,ebx ; 00H Accumulate weighted diff.
  1478. mov bl,[esi+edi+PITCH*0+2] ; 02b Fetch other past ref pel.
  1479. mov cl,Interp2PastAndFutureRef[ecx+edx*2] ; 22E
  1480. mov dl,BlockN.TargetBlock[TPITCH*2+2] ; 22F
  1481. add al,bl ; 02c Interp'd past ref, times 2.
  1482. mov bl,BlockN.TargetBlock[TPITCH*0+2] ; 02d Fetch -2 * target pel.
  1483. mov dl,WeightedDiff[ecx+edx] ; 22G Weighted difference.
  1484. mov cl,[esi+PITCH*2+0] ; 20a
  1485. add ebp,edx ; 22H
  1486. mov dl,[esi+edi+PITCH*2+0] ; 20b
  1487. add cl,dl ; 20c
  1488. mov dl,BlockN.TargetBlock[TPITCH*2+0] ; 20d
  1489. mov bl,WeightedDiff[eax+ebx] ; 02e Weighted difference.
  1490. mov al,[esi+PITCH*1+1] ; 11a
  1491. add ebp,ebx ; 02f Accumulate weighted diff.
  1492. mov bl,[esi+edi+PITCH*1+1] ; 11b
  1493. add al,bl ; 11c
  1494. mov bl,BlockN.TargetBlock[TPITCH*1+1] ; 11d
  1495. mov dl,WeightedDiff[ecx+edx] ; 20e
  1496. mov cl,[esi+PITCH*1+3] ; 13a
  1497. add ebp,edx ; 20f
  1498. mov dl,[esi+edi+PITCH*1+3] ; 13b
  1499. add cl,dl ; 13c
  1500. mov dl,BlockN.TargetBlock[TPITCH*1+3] ; 13d
  1501. mov bl,WeightedDiff[eax+ebx] ; 11e
  1502. mov al,[esi+PITCH*3+1] ; 31a
  1503. add ebp,ebx ; 11f
  1504. mov bl,[esi+edi+PITCH*3+1] ; 31b
  1505. add al,bl ; 31c
  1506. mov bl,BlockN.TargetBlock[TPITCH*3+1] ; 31d
  1507. mov dl,WeightedDiff[ecx+edx] ; 13e
  1508. mov cl,[esi+PITCH*3+3] ; 33a
  1509. add ebp,edx ; 13f
  1510. mov dl,[esi+edi+PITCH*3+3] ; 33b
  1511. add cl,dl ; 33c
  1512. mov dl,BlockN.TargetBlock[TPITCH*3+3] ; 33d
  1513. mov bl,WeightedDiff[eax+ebx] ; 31e
  1514. add esi,4 ; Move to next 4 columns.
  1515. add ebp,ebx ; 31f
  1516. mov dl,WeightedDiff[ecx+edx] ; 33e
  1517. add ebp,edx ; 33f
  1518. mov edx,BlockN.FutureRefBlockAddr ; Reload the parked future pointer.
  1519. add edx,4 ; Move to next 4 columns.
  1520. add esp,4 ; Move to next 4 columns.
  1521. mov al,[esi] ; 04A
  1522. mov cl,4
  1523. mov bl,[esi+edi] ; 04B
  1524. and ecx,esp ; Twice, 4 cols each time.
  1525. mov cl,8 ; Preload row-loop mask; MOV leaves the flags from the AND intact.
  1526. jne @b
  ; The future pointer lives in edx in this routine; edi is the past interp
  ; offset and must stay constant, so the row advance is applied to edx.
  1527. add edx,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1528. mov al,[esi+PITCH*4-8] ; 40A
  1529. mov bl,[esi+edi+PITCH*4-8] ; 40B
  1530. add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1531. and ecx,esp ; Twice, 4 rows each time.
  1532. jne @b
  ; Block finished.  esp has advanced 16 bytes, so BlockN now names the next
  ; block and BlockNM1 the one just completed.
  1533. mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
  1534. mov eax,BlockN.BestSWDAccum
  1535. cmp ebp,eax
  1536. jg TakeEarlyOut ; Already worse than the best through this block.
  1537. mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
  1538. mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
  1539. mov edi,BlockN.FutureRefBlockAddr ; Fetch next future ref address.
  1540. jmp PD JumpTable[eax*4]
  1541. BiDiSWDCalc_InterpBoth MACRO PastRefInterpOffset
  ; SWD for one block when both references are at half-pel positions.
  ; PastRefInterpOffset is the assemble-time distance from a past pel to the
  ; second past pel used for interpolation.
  1542. ; esp -- Pointer to block of target macroblock.
  1543. ; ebp -- SWD accumulator. Must be initialized by caller.
  1544. ; esi -- Pointer to block of reference in past frame.
  1545. ; edi -- Pointer to block of reference in future frame + 80.
  ; edx -- Distance from future pel to other future pel with which to interp.
  ; eax, ebx, ecx -- Must be zero on entry (callers clear them).
  1546. ; al, bl, cl, dl -- Scratch.
  ; The FutureRefBlockAddr slot of the current block is reused to hold the
  ; future interp offset while dl is used as scratch; it is read back with a
  ; -4 displacement because esp has advanced by 4 in between.
  1547. @@:
  1548. mov bl,[edi-80] ; 00A Fetch pel from future ref.
  1549. mov BlockN.FutureRefBlockAddr,edx ; Park future interp offset.
  1550. mov al,[edi+edx-80] ; 00B Fetch other future ref pel.
  1551. mov dl,[edi+edx+FPITCH*2+2-80] ; 22B
  1552. mov cl,[edi+FPITCH*2+2-80] ; 22A
  1553. add esp,4 ; Move to next 4 columns.
  1554. mov bl,InterpFutureRef[eax+ebx] ; 00C Get interpolated future ref.
  1555. mov al,[esi] ; 00D Fetch pel from previous ref.
  1556. mov dl,InterpFutureRef[ecx+edx] ; 22C
  1557. mov cl,[esi+PITCH*2+2] ; 22D
  1558. lea ebx,[eax+ebx*2] ; 00E Interp'ed future plus one past.
  1559. mov al,[esi+PastRefInterpOffset] ; 00F Fetch other pel from past ref.
  1560. lea edx,[ecx+edx*2] ; 22E
  1561. mov cl,[esi+PITCH*2+2+PastRefInterpOffset] ; 22F
  1562. mov al,Interp2PastAndFutureRef[ebx+eax]; 00G (past+future) or 2*past.
  1563. xor ebx,ebx
  1564. mov bl,BlockN.TargetBlock[0-4] ; 00H Fetch target pel.
  1565. mov cl,Interp2PastAndFutureRef[edx+ecx] ; 22G
  1566. mov dl,BlockN.TargetBlock[TPITCH*2+2-4] ; 22H
  1567. add esi,4 ; Move to next 4 columns.
  1568. and edx,0000000FFH ; Extract target pel; edx is used as a table index.
  1569. mov bl,WeightedDiff[eax+ebx] ; 00I Weighted difference.
  1570. add ebp,ebx ; 00J Accum weighted difference.
  1571. mov al,[esi+PITCH*0+2-4] ; 02a Fetch pel from prev ref.
  1572. mov dl,WeightedDiff[ecx+edx] ; 22I
  1573. mov bl,[esi+PastRefInterpOffset+PITCH*0+2-4]; 02b Fetch other past ref pel.
  1574. add al,bl ; 02c Interp'd past ref, *2.
  1575. mov bl,BlockN.TargetBlock[TPITCH*0+2-4] ; 02d Fetch -2 * target pel.
  1576. add ebp,edx ; 22J
  1577. mov cl,[esi+PITCH*2+0-4] ; 20a
  1578. add edi,4 ; Move to next 4 columns.
  1579. mov dl,[esi+PastRefInterpOffset+PITCH*2+0-4]; 20b
  1580. mov bl,WeightedDiff[eax+ebx] ; 02e Weighted difference.
  1581. mov al,[esi+PITCH*1+1-4] ; 11a
  1582. add ebp,ebx ; 02f Accumulate weighted diff.
  1583. mov bl,[esi+PastRefInterpOffset+PITCH*1+1-4]; 11b
  1584. add cl,dl ; 20c
  1585. mov dl,BlockN.TargetBlock[TPITCH*2+0-4] ; 20d
  1586. add al,bl ; 11c
  1587. mov bl,BlockN.TargetBlock[TPITCH*1+1-4] ; 11d
  1588. mov dl,WeightedDiff[ecx+edx] ; 20e
  1589. mov cl,[esi+PITCH*1+3-4] ; 13a
  1590. add ebp,edx ; 20f
  1591. mov dl,[esi+PastRefInterpOffset+PITCH*1+3-4]; 13b
  1592. mov bl,WeightedDiff[eax+ebx] ; 11e
  1593. mov al,[esi+PITCH*3+1-4] ; 31a
  1594. add ebp,ebx ; 11f
  1595. mov bl,[esi+PastRefInterpOffset+PITCH*3+1-4]; 31b
  1596. add cl,dl ; 13c
  1597. mov dl,BlockN.TargetBlock[TPITCH*1+3-4] ; 13d
  1598. add al,bl ; 31c
  1599. mov bl,BlockN.TargetBlock[TPITCH*3+1-4] ; 31d
  1600. mov dl,WeightedDiff[ecx+edx] ; 13e
  1601. mov cl,[esi+PITCH*3+3-4] ; 33a
  1602. add ebp,edx ; 13f
  1603. mov dl,[esi+PastRefInterpOffset+PITCH*3+3-4]; 33b
  1604. add cl,dl ; 33c
  1605. mov dl,BlockN.TargetBlock[TPITCH*3+3-4] ; 33d
  1606. mov bl,WeightedDiff[eax+ebx] ; 31e
  1607. mov al,4
  1608. add ebp,ebx ; 31f
  1609. mov dl,WeightedDiff[ecx+edx] ; 33e
  1610. add ebp,edx ; 33f
  1611. mov edx,BlockN.FutureRefBlockAddr-4 ; Reload parked future interp offset.
  1612. and eax,esp ; Twice, 4 cols each time.
  1613. jne @b
  1614. add edi,FPITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1615. add esi,PITCH*4-8 ; Move to first 4 cols, next 4 rows.
  1616. mov cl,8
  1617. and ecx,esp ; Twice, 4 rows each time.
  1618. jne @b
  ; Block finished.  esp has advanced 16 bytes, so BlockN now names the next
  ; block and BlockNM1 the one just completed.
  1619. mov BlockNM1.CandidateSWDAccum,ebp ; Store accumulated SWD.
  1620. mov eax,BlockN.BestSWDAccum
  1621. cmp ebp,eax
  1622. jg TakeEarlyOut ; Already worse than the best through this block.
  1623. mov eax,BlockNM1.TransferCase ; Fetch next case to execute.
  1624. mov esi,BlockN.PastRefBlockAddr ; Fetch next past ref address.
  1625. mov edi,BlockN.FutureRefBlockAddr ; Fetch next future ref address.
  1626. jmp PD JumpTable[eax*4]
  1627. ENDM
  ; ---------------------------------------------------------------------------
  ; Entry points for the "past ref interpolated horizontally, future ref
  ; interpolated" cases.  They are listed in JumpTable as entries 5, 6 and 7.
  ; Each stub clears ecx, loads edx with a per-variant constant (1, FPITCH or
  ; FPITCH+1) and joins the shared tail BiDiSWDCalc_InterpBoth_PastByHorz; the
  ; last stub simply falls through into it.
  ; NOTE(review): edx is presumably the offset from a future-ref pel to the
  ; second pel it is averaged with (right / below / diagonal); the code that
  ; consumes it is in the part of the macro not shown here -- confirm.
  ; ---------------------------------------------------------------------------
  1628. BiDiPastHorzFutureHorz LABEL DWORD
  1629. xor ecx,ecx ; ecx = 0.
  1630. mov edx,1 ; Future variant constant: 1.
  1631. jmp BiDiSWDCalc_InterpBoth_PastByHorz
  1632. BiDiPastHorzFutureVert LABEL DWORD
  1633. xor ecx,ecx ; ecx = 0.
  1634. mov edx,FPITCH ; Future variant constant: FPITCH.
  1635. jmp BiDiSWDCalc_InterpBoth_PastByHorz
  1636. BiDiPastHorzFutureBoth LABEL DWORD
  1637. xor ecx,ecx ; ecx = 0.
  1638. mov edx,FPITCH+1 ; Future variant constant: FPITCH+1. Falls through.
  ; Shared tail: eax and ebx are cleared before the macro expansion.  The
  ; macro code visible in this file writes only al/bl and then uses eax+ebx as
  ; a table index, so the upper bits must already be zero.
  1639. BiDiSWDCalc_InterpBoth_PastByHorz:
  1640. xor eax,eax
  1641. xor ebx,ebx
  ; Macro argument 1: NOTE(review): presumably bound to PastRefInterpOffset,
  ; i.e. the second past-ref pel is the one in the next column -- confirm
  ; against the macro header.  The macro body appears to end with a jump
  ; (JumpTable dispatch or TakeEarlyOut), so control does not run on into the
  ; label that follows.
  1642. BiDiSWDCalc_InterpBoth 1
  ; ---------------------------------------------------------------------------
  ; Entry points for the "past ref interpolated vertically, future ref
  ; interpolated" cases.  They are listed in JumpTable as entries 9, 10 and 11.
  ; Each stub clears ecx, loads edx with a per-variant constant (1, FPITCH or
  ; FPITCH+1) and joins the shared tail BiDiSWDCalc_InterpBoth_PastByVert; the
  ; last stub simply falls through into it.
  ; NOTE(review): edx is presumably the offset from a future-ref pel to the
  ; second pel it is averaged with (right / below / diagonal); the code that
  ; consumes it is in the part of the macro not shown here -- confirm.
  ; ---------------------------------------------------------------------------
  1643. BiDiPastVertFutureHorz LABEL DWORD
  1644. xor ecx,ecx ; ecx = 0.
  1645. mov edx,1 ; Future variant constant: 1.
  1646. jmp BiDiSWDCalc_InterpBoth_PastByVert
  1647. BiDiPastVertFutureVert LABEL DWORD
  1648. xor ecx,ecx ; ecx = 0.
  1649. mov edx,FPITCH ; Future variant constant: FPITCH.
  1650. jmp BiDiSWDCalc_InterpBoth_PastByVert
  1651. BiDiPastVertFutureBoth LABEL DWORD
  1652. xor ecx,ecx ; ecx = 0.
  1653. mov edx,FPITCH+1 ; Future variant constant: FPITCH+1. Falls through.
  ; Shared tail: eax and ebx are cleared before the macro expansion.  The
  ; macro code visible in this file writes only al/bl and then uses eax+ebx as
  ; a table index, so the upper bits must already be zero.
  1654. BiDiSWDCalc_InterpBoth_PastByVert:
  1655. xor eax,eax
  1656. xor ebx,ebx
  ; Macro argument PITCH: NOTE(review): presumably bound to
  ; PastRefInterpOffset, i.e. the second past-ref pel is the one in the next
  ; row -- confirm against the macro header.  The macro body appears to end
  ; with a jump (JumpTable dispatch or TakeEarlyOut), so control does not run
  ; on into the label that follows.
  1657. BiDiSWDCalc_InterpBoth PITCH
  ; ---------------------------------------------------------------------------
  ; Entry points for the "past ref interpolated both ways, future ref
  ; interpolated" cases.  They are listed in JumpTable as entries 13, 14 and
  ; 15.  Each stub clears ecx, loads edx with a per-variant constant (1, FPITCH
  ; or FPITCH+1) and joins the shared tail BiDiSWDCalc_InterpBoth_PastByBoth;
  ; the last stub simply falls through into it.
  ; NOTE(review): edx is presumably the offset from a future-ref pel to the
  ; second pel it is averaged with (right / below / diagonal); the code that
  ; consumes it is in the part of the macro not shown here -- confirm.
  ; ---------------------------------------------------------------------------
  1658. BiDiPastBothFutureHorz LABEL DWORD
  1659. xor ecx,ecx ; ecx = 0.
  1660. mov edx,1 ; Future variant constant: 1.
  1661. jmp BiDiSWDCalc_InterpBoth_PastByBoth
  1662. BiDiPastBothFutureVert LABEL DWORD
  1663. xor ecx,ecx ; ecx = 0.
  1664. mov edx,FPITCH ; Future variant constant: FPITCH.
  1665. jmp BiDiSWDCalc_InterpBoth_PastByBoth
  1666. BiDiPastBothFutureBoth LABEL DWORD
  1667. xor ecx,ecx ; ecx = 0.
  1668. mov edx,FPITCH+1 ; Future variant constant: FPITCH+1. Falls through.
  ; Shared tail: eax and ebx are cleared before the macro expansion.  The
  ; macro code visible in this file writes only al/bl and then uses eax+ebx as
  ; a table index, so the upper bits must already be zero.
  1669. BiDiSWDCalc_InterpBoth_PastByBoth:
  1670. xor eax,eax
  1671. xor ebx,ebx
  ; Macro argument PITCH+1: NOTE(review): presumably bound to
  ; PastRefInterpOffset, i.e. the second past-ref pel is the diagonal
  ; neighbour (next row, next column) -- confirm against the macro header.
  ; The macro body appears to end with a jump (JumpTable dispatch or
  ; TakeEarlyOut), so control does not run on into the table that follows.
  1672. BiDiSWDCalc_InterpBoth PITCH+1
  ; ---------------------------------------------------------------------------
  ; JumpTable -- dispatch table of 18 DWORD code addresses.  It is used as
  ;   jmp PD JumpTable[eax*4]
  ; with eax loaded from a TransferCase field, so the order of the entries is
  ; part of the contract with whatever code stores TransferCase values.
  ; Judging by the label names, entries 0..15 are laid out as
  ;   index = 4 * (past interp kind) + (future interp kind),
  ; where kind is 0 = none, 1 = Horz, 2 = Vert, 3 = Both; entries 16 and 17
  ; are the two "SWD done" continuations.
  ; NOTE(review): confirm the index encoding against the code that writes
  ; TransferCase.
  ; ALIGN 4 places the table on a DWORD boundary.
  ; ---------------------------------------------------------------------------
  1673. ALIGN 4
  1674. JumpTable:
  1675. DD BiDiNoInterp ; [ 0]
  1676. DD BiDiFutureHorz ; [ 1]
  1677. DD BiDiFutureVert ; [ 2]
  1678. DD BiDiFutureBoth ; [ 3]
  1679. DD BiDiPastHorz ; [ 4]
  1680. DD BiDiPastHorzFutureHorz ; [ 5]
  1681. DD BiDiPastHorzFutureVert ; [ 6]
  1682. DD BiDiPastHorzFutureBoth ; [ 7]
  1683. DD BiDiPastVert ; [ 8]
  1684. DD BiDiPastVertFutureHorz ; [ 9]
  1685. DD BiDiPastVertFutureVert ; [10]
  1686. DD BiDiPastVertFutureBoth ; [11]
  1687. DD BiDiPastBoth ; [12]
  1688. DD BiDiPastBothFutureHorz ; [13]
  1689. DD BiDiPastBothFutureVert ; [14]
  1690. DD BiDiPastBothFutureBoth ; [15]
  1691. DD ZeroVectorSWDDone ; [16]
  1692. DD NonZeroVectorSWDDone ; [17]
  ; ---------------------------------------------------------------------------
  ; Done -- common exit path of BFRAMEMOTIONESTIMATION.
  ; Copies the InterSWDTotal and InterSWDBlocks working values out through the
  ; pointers held in the InterSWDTotal_arg / InterSWDBlocks_arg stack
  ; arguments, restores four registers and returns.
  ; Both working values are read BEFORE esp is reloaded from StashESP, and the
  ; *_arg pointers are read AFTER it.
  ; NOTE(review): presumably the working values are addressed relative to the
  ; routine's working esp and the *_arg offsets relative to the entry esp
  ; saved in StashESP -- confirm against the frame definitions at the top of
  ; the procedure.
  ; ---------------------------------------------------------------------------
  1693. Done:
  1694. mov ecx,InterSWDTotal ; Fetch result while working esp is live.
  1695. mov edx,InterSWDBlocks ; Ditto.
  1696. mov esp,StashESP ; Switch back to the stashed stack pointer.
  1697. mov edi,[esp+InterSWDTotal_arg] ; edi = caller's output pointer.
  1698. mov [edi],ecx ; *InterSWDTotal_arg = InterSWDTotal.
  1699. mov edi,[esp+InterSWDBlocks_arg] ; edi = caller's output pointer.
  1700. mov [edi],edx ; *InterSWDBlocks_arg = InterSWDBlocks.
  ; Restore registers.  NOTE(review): presumably pushed at entry in the order
  ; esi, edi, ebp, ebx -- the prologue is not visible here.
  1701. pop ebx
  1702. pop ebp
  1703. pop edi
  1704. pop esi
  ; NOTE(review): "rturn" is not an x86 mnemonic; presumably a return macro
  ; supplied by an include file -- confirm.
  1705. rturn
  1706. BFRAMEMOTIONESTIMATION endp
  1707. END