Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1305 lines
52 KiB

  1. ;////////////////////////////////////////////////////////////////////////////
  2. ;//
  3. ;// INTEL CORPORATION PROPRIETARY INFORMATION
  4. ;//
  5. ;// This software is supplied under the terms of a license
  6. ;// agreement or nondisclosure agreement with Intel Corporation
  7. ;// and may not be copied or disclosed except in accordance
  8. ;// with the terms of that agreement.
  9. ;//
  10. ;////////////////////////////////////////////////////////////////////////////
  11. ;//
  12. ;// $Header: R:\h26x\h26x\src\enc\e3mbme.asv 1.5 18 Oct 1996 16:57:08 BNICKERS $
  13. ;//
  14. ;// $Log: R:\h26x\h26x\src\enc\e3mbme.asv $
  15. ;//
  16. ;// Rev 1.5 18 Oct 1996 16:57:08 BNICKERS
  17. ;// Fixes for EMV
  18. ;//
  19. ;// Rev 1.4 12 Sep 1996 10:56:16 BNICKERS
  20. ;// Add arguments for thresholds and differentials.
  21. ;//
  22. ;// Rev 1.3 22 Jul 1996 15:22:48 BNICKERS
  23. ;// Reduce code size. Implement H261 spatial filter.
  24. ;//
  25. ;// Rev 1.2 14 May 1996 12:18:48 BNICKERS
  26. ;// Initial debugging of MMx B-Frame ME.
  27. ;//
  28. ;// Rev 1.1 03 May 1996 14:03:30 BNICKERS
  29. ;//
  30. ;// Minor bug fixes and integration refinements.
  31. ;//
  32. ;// Rev 1.0 02 May 1996 12:00:56 BNICKERS
  33. ;// Initial revision.
  34. ;//
  35. ;////////////////////////////////////////////////////////////////////////////
  36. ;
  37. ; MMxBFrameMotionEstimation -- This function performs motion estimation for the
  38. ; B frame macroblocks identified in the input list.
  39. ; This is the MMx version.
  40. ;
  ; Assembler options: MASM 5.10 compatibility mode; symbol case is preserved.
  41. OPTION M510
  42. OPTION CASEMAP:NONE
  ; Amount subtracted from the 0-delta SWD before it is compared with the best
  ; non-zero-delta SWD in MMxDoBFrameLumaBlocks; the non-zero delta is used only
  ; if its SWD is still lower.
  43. BFRMNONZEROMVDIFFERENTIAL = 400
  ; Luma blocks whose SWD is below this value skip the BFrameDTQ call in
  ; MMxDoBFrameLumaBlocks and contribute no coded-block-pattern bit.
  44. BFRMEMPTYTHRESHOLD = 256
  ; Project include files; their listing output is suppressed by .xlist/.list.
  45. .xlist
  46. include e3inst.inc
  47. include memmodel.inc
  48. include iammx.inc
  49. include exEDTQ.inc
  50. include e3mbad.inc
  51. .list
  52. .CODE EDTQ
  53. EXTERN MMxDoForwardDCT:NEAR
  ; Entry points exported from this file.
  54. PUBLIC MMxDoBFrameLumaBlocks
  55. PUBLIC MMxDoBFrameChromaBlocks
  ; NOTE(review): StackOffset and CONST_384 are text macros that are redefined
  ; at several points below; they are presumably consumed by definitions in the
  ; include files (e.g. stack-relative variable addressing) -- confirm there.
  56. StackOffset TEXTEQU <4>
  57. CONST_384 TEXTEQU <ebp>
  ;===============================================================================
  ; MMxDoBFrameLumaBlocks -- pick the B-frame motion vector delta for one
  ; macroblock and process its four luma blocks.
  ;
  ; Steps, in the order performed below:
  ;  1. Exchange the P-frame and B-frame copies of QPDiv2, CodeStreamCursor and
  ;     Recip2QPToUse (they are exchanged back just before returning).
  ;  2. For each of the four luma blocks, look up the zero-delta forward and
  ;     backward MVs in WeightForwardMotion and compute the SWD for them.
  ;  3. Unless the P-frame MV is zero with BlockType == INTER1MV, or the 0-delta
  ;     SWD is at or below BFrmZeroVectorThreshold, walk the BFrmSWDState table
  ;     trying candidate deltas and remembering the best one.
  ;  4. Keep the non-zero delta only if its SWD is lower than the 0-delta SWD
  ;     minus BFRMNONZEROMVDIFFERENTIAL; otherwise fall back to the 0-delta data.
  ;  5. Call BFrameDTQ for every luma block whose SWD is not below
  ;     BFRMEMPTYTHRESHOLD and store the resulting pattern in CodedBlocksB.
  ;
  ; Entry: edx -- macroblock action descriptor (addressed as BlkY1..BlkY4, BlkU).
  ;        ebp -- pitch, as required by ComputeBFrameSWDForCandRef.
  ; Exit:  edx -- same descriptor address as on entry.
  ;
  ; The block loops terminate by testing the 4*SIZEOF T_Blk bit of the cursor, so
  ; the descriptor address must have that bit clear on entry.
  ; NOTE(review): the 0-delta loop accumulates into mm7 without clearing it, and
  ; the per-block SWDs are later treated as running totals -- mm7 is presumably
  ; zero on entry; confirm against the caller.
  ; NOTE(review): the candidate range tests only reject values <= 040H
  ; (unsigned); no upper-bound test is visible here -- confirm intended range.
  ;===============================================================================
  58. MMxDoBFrameLumaBlocks:
  59. mov eax,QPDiv2 ; Swap these so Quantizer uses right level.
  60. mov ebx,BQPDiv2
  61. mov QPDiv2,ebx
  62. mov BQPDiv2,eax
  63. mov eax,CodeStreamCursor
  64. mov ebx,BCodeStreamCursor
  65. mov CodeStreamCursor,ebx
  66. mov BCodeStreamCursor,eax
  67. mov eax,Recip2QPToUse
  68. mov ebx,BRecip2QPToUse
  69. mov Recip2QPToUse,ebx
  70. mov cl,INTER1MV
  71. mov BRecip2QPToUse,eax
  72. mov StashBlockType,cl
  ; ---- Pass 1: SWD for the zero-delta candidate, one luma block per iteration.
  ; WeightForwardMotion[mv] gives the past-reference component and
  ; WeightForwardMotion[mv+64] the future-reference component.
  73. BFrameSWDLoop_0MV:
  74. mov ecx,[edx].BlkY1.MVs
  75. xor ebx,ebx
  76. mov bl,[edx].BlkY1.PVMV ; P-frame Vertical MV
  77. lea edi,WeightForwardMotion
  78. xor eax,eax
  79. and ecx,0FFH ; P-frame Horizontal MV
  80. mov al,[edi+ebx] ; VMV for past ref.
  81. mov bl,[edi+ebx+64] ; VMV for future ref.
  82. mov [edx].BlkY1.VMVb0Delta,bl
  83. mov bl,[edi+ecx+64] ; HMV for future ref.
  84. mov [edx].BlkY1.HMVb0Delta,bl
  85. mov bl,[edi+ecx] ; HMV for past ref.
  86. mov [edx].BlkY1.VMVf0Delta,al ; Record candidate VMVf.
  87. xor ecx,ecx ; Keep pairing happy.
  88. mov [edx].BlkY1.HMVf0Delta,bl ; Record candidate HMVf.
  89. mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
  90. call ComputeBFrameSWDForCandRef
  91. movdf [edx].BlkY1.BlkLvlSWD0Delta,mm7 ; Stash SWD.
  92. add edx,SIZEOF T_Blk
  93. lea edi,WeightForwardMotion
  94. test dl,4*SIZEOF T_Blk ; Quit when fourth block done.
  95. je BFrameSWDLoop_0MV
  ; ---- Decide whether a search for a non-zero delta is worthwhile.
  ; al|ah|cl is zero only when HMV == 0, VMV == 0 and BlockType == INTER1MV.
  ; The mov between "or al,cl" and "je" leaves the flags untouched.
  96. mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs
  97. mov cl,[edx-4*SIZEOF T_Blk].BlockType
  98. xor cl,INTER1MV
  99. or al,ah
  100. lea esi,[edx-4*SIZEOF T_Blk] ; Reset MacroBlockActionDescr cursor.
  101. or al,cl
  102. mov ecx,[edx-SIZEOF T_Blk].BlkY1.BlkLvlSWD0Delta
  103. je BelowBFrmZeroThreshold ; Jump if P frm macroblock uses 0 motion vector.
  104. xor eax,eax
  105. cmp ecx,BFrmZeroVectorThreshold
  106. mov CurrSWDState,eax ; Record ME engine state.
  107. jle BelowBFrmZeroThreshold
  ; Seed the "best so far" SWDs with the zero-delta results.
  108. mov edx,[esi].BlkY1.BlkLvlSWD0Delta ; Remember 0-MV SWDs.
  109. mov ecx,[esi].BlkY2.BlkLvlSWD0Delta
  110. mov [esi].BlkY1.BestBlkLvlSWD,edx
  111. mov [esi].BlkY2.BestBlkLvlSWD,ecx
  112. mov edx,[esi].BlkY3.BlkLvlSWD0Delta
  113. mov ecx,[esi].BlkY4.BlkLvlSWD0Delta
  114. mov [esi].BlkY3.BestBlkLvlSWD,edx
  115. mov [esi].BlkY4.BestBlkLvlSWD,ecx
  116. mov [esi].BlkU.BestBlkLvlSWD,ecx ; Avoid unintended early out, below.
  117. xor edx,edx ; Set best MV to zero.
  ; ---- Pass 2: candidate search. eax is the current byte offset into
  ; BFrmSWDState; the entry supplies H and V offsets (bytes 0,1) and the next
  ; state to use when the candidate loses (byte 2) or wins (byte 3).
  ; edx holds the best delta so far (dl = horizontal, dh = vertical).
  ; The "add" results select one of three loops; the intervening mov
  ; instructions do not change the flags tested by je.
  118. BFrmSWDLoop:
  119. mov ecx,PD BFrmSWDState[eax] ; cl == HMV; ch == VMV offsets to try.
  120. mov BestMV,edx ; Record what the best MV so far is.
  121. add cl,dl ; Try this horizontal MV delta.
  122. je HMVdIsZero
  123. mov PB CandidateMV,cl ; Record the candidate HMV delta.
  124. add ch,dh ; Try this vertical MV delta.
  125. mov PB CandidateMV+1,ch ; Record the candidate VMV delta.
  126. je VMVdIsZero
  ; Both delta components non-zero: derive forward MVs by adding the delta to
  ; the table value, backward MVs by subtracting the P-frame MV.
  127. VMVdAndHMVdAreNonZero_Loop:
  128. mov edx,[esi].BlkY1.MVs
  129. xor ebx,ebx
  130. mov bl,dl
  131. xor eax,eax
  132. mov al,dh
  133. add esi,SIZEOF T_Blk
  134. mov bl,[edi+ebx] ; TRb * HMV / TRd
  135. pxor mm7,mm7 ; Initialize SWD accumulator
  136. add bl,cl ; HMVf = TRb * HMV / TRd + HMVd
  137. mov al,[edi+eax] ; TRb * VMV / TRd
  138. cmp bl,040H ; If too far left or right, quick out.
  139. jbe MVDeltaOutOfRange
  140. mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl
  141. add al,ch ; VMVf = TRb * VMV / TRd + VMVd
  142. cmp al,040H ; If too far up or down, quick out.
  143. jbe MVDeltaOutOfRange
  144. mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al
  145. sub bl,dl ; -HMVb = -(HMVf - HMV)
  146. mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl
  147. sub al,dh ; -VMVb = -(VMVf - VMV)
  148. test esi,4*SIZEOF T_Blk
  149. mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al
  150. je VMVdAndHMVdAreNonZero_Loop
  151. sub esi,4*SIZEOF T_Blk
  152. jmp CandidateMVsGotten
  ; Vertical delta is zero: vertical MVs come straight from the table (the
  ; backward one from the +64 half); only the horizontal pair is computed.
  153. VMVdIsZero:
  154. VMVdIsZero_Loop:
  155. mov edx,[esi].BlkY1.MVs
  156. xor eax,eax
  157. mov al,dh
  158. xor ebx,ebx
  159. mov bl,dl
  160. add esi,SIZEOF T_Blk
  161. mov dh,[edi+eax+64] ; -VMVb = -((TRb - TRd) * VMV) / TRd
  162. mov al,[edi+eax] ; TRb * VMV / TRd
  163. mov bl,[edi+ebx] ; TRb * HMV / TRd
  164. mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al
  165. add bl,cl ; HMVf = TRb * HMV / TRd + HMVd
  166. mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,dh
  167. cmp bl,040H ; If too far left or right, quick out.
  168. jbe MVDeltaOutOfRange
  169. mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl
  170. sub bl,dl ; -HMVb = -(HMVf - HMV)
  171. test esi,4*SIZEOF T_Blk
  172. mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl
  173. je VMVdIsZero_Loop
  174. sub esi,4*SIZEOF T_Blk
  175. pxor mm7,mm7 ; Initialize SWD accumulator
  176. jmp CandidateMVsGotten
  ; Candidate rejected (out of range, or SWD already no better than the best):
  ; restore the cursor and best MV, and take the "loser" next-state byte.
  177. BFrameEarlyOutForCandidateMV:
  178. MVDeltaOutOfRange:
  179. and esi,-1-7*SIZEOF T_Blk ; Reset block action descr cursor.
  180. mov ebx,CurrSWDState ; Reload ME engine state.
  181. xor eax,eax
  182. mov edx,BestMV ; Previous best MV is still best.
  183. mov al,BFrmSWDState[ebx+2] ; Get next State number.
  184. jmp ProceedWithNextCand
  ; Horizontal delta is zero: horizontal MVs come straight from the table (the
  ; backward one from the +64 half); only the vertical pair is computed.
  185. HMVdIsZero:
  186. mov PB CandidateMV,cl ; Record the candidate HMV delta.
  187. add ch,dh ; Try this vertical MV delta.
  188. mov PB CandidateMV+1,ch ; Record the candidate VMV delta.
  189. HMVdIsZeroLoop:
  190. mov edx,[esi].BlkY1.MVs
  191. xor ebx,ebx
  192. mov bl,dl
  193. xor eax,eax
  194. mov al,dh
  195. add esi,SIZEOF T_Blk
  196. mov dl,[edi+ebx+64] ; -HMVb = -((TRb - TRd) * HMV) / TRd
  197. mov bl,[edi+ebx] ; TRb * HMV / TRd
  198. mov al,[edi+eax] ; TRb * VMV / TRd
  199. mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl
  200. add al,ch ; VMVf = TRb * VMV / TRd + VMVd
  201. mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,dl
  202. cmp al,040H ; If too far up or down, quick out.
  203. jbe MVDeltaOutOfRange
  204. mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al
  205. sub al,dh ; -VMVb = -(VMVf - VMV)
  206. test esi,4*SIZEOF T_Blk
  207. mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al
  208. je HMVdIsZeroLoop
  209. sub esi,4*SIZEOF T_Blk
  210. pxor mm7,mm7 ; Initialize SWD accumulator
  ; ---- Evaluate the candidate: mm7 accumulates the SWD across the blocks, and
  ; after each block the running total is compared with the best total for one
  ; block more ([edx].BlkY2 is the descriptor following the current one).
  211. CandidateMVsGotten:
  212. BFrameSWDLoop_Non0MVCandidate:
  213. xor eax,eax
  214. xor ebx,ebx
  215. mov al,[esi].BlkY1.CandVMVf
  216. mov edi,[esi].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
  217. mov bl,[esi].BlkY1.CandHMVf
  218. mov edx,esi
  219. call ComputeBFrameSWDForCandRef
  220. movdf ecx,mm7
  221. mov eax,[edx].BlkY2.BestBlkLvlSWD
  222. lea esi,[edx+SIZEOF T_Blk] ; Early out if the first N blocks for
  223. cmp ecx,eax ; this cand are worse than the first
  224. jge BFrameEarlyOutForCandidateMV ; N+1 blocks for previous best.
  225. test esi,4*SIZEOF T_Blk ; Quit when fourth block done.
  226. mov [esi-SIZEOF T_Blk].BlkY1.CandBlkLvlSWD,ecx ; Stash SWD.
  227. je BFrameSWDLoop_Non0MVCandidate
  228. ; This candidate is best so far.
  ; Promote the candidate SWDs and MV sets to "best", and take the "winner"
  ; next-state byte.
  229. mov [esi-4*SIZEOF T_Blk].BlkY4.BestBlkLvlSWD,ecx
  230. mov ebx,CurrSWDState ; Reload ME engine state.
  231. mov [esi-4*SIZEOF T_Blk].BlkU.BestBlkLvlSWD,ecx
  232. sub esi,4*SIZEOF T_Blk
  233. xor eax,eax
  234. mov edx,CandidateMV ; Candidate was best MV.
  235. mov ecx,[esi].BlkY3.CandBlkLvlSWD
  236. mov [esi].BlkY3.BestBlkLvlSWD,ecx
  237. mov ecx,[esi].BlkY2.CandBlkLvlSWD
  238. mov [esi].BlkY2.BestBlkLvlSWD,ecx
  239. mov ecx,[esi].BlkY1.CandBlkLvlSWD
  240. mov [esi].BlkY1.BestBlkLvlSWD,ecx
  241. mov ecx,[esi].BlkY4.CandBiDiMVs
  242. mov [esi].BlkY4.BestBiDiMVs,ecx
  243. mov ecx,[esi].BlkY3.CandBiDiMVs
  244. mov [esi].BlkY3.BestBiDiMVs,ecx
  245. mov ecx,[esi].BlkY2.CandBiDiMVs
  246. mov [esi].BlkY2.BestBiDiMVs,ecx
  247. mov ecx,[esi].BlkY1.CandBiDiMVs
  248. mov [esi].BlkY1.BestBiDiMVs,ecx
  249. mov al,BFrmSWDState[ebx+3] ; Get next State number.
  ; A next state of zero ends the search.
  250. ProceedWithNextCand:
  251. mov CurrSWDState,eax ; Record ME engine state.
  252. test eax,eax
  253. lea edi,WeightForwardMotion
  254. jne BFrmSWDLoop
  ; ---- Search finished: accept the best delta only if it is better than the
  ; zero delta by more than BFRMNONZEROMVDIFFERENTIAL.
  255. mov ecx,[esi].BlkY4.BlkLvlSWD0Delta ; 0MV SWD
  256. sub ecx,BFRMNONZEROMVDIFFERENTIAL
  257. mov ebx,[esi].BlkY4.BestBlkLvlSWD ; Best non-0 MV SWD.
  258. cmp ebx,ecx
  259. jge NonZeroBFrmVectorNotGoodEnoughGain
  ; Record the chosen delta (dl = horizontal, dh = vertical) in all four blocks.
  260. mov [esi].BlkY1.BHMV,dl
  261. mov [esi].BlkY2.BHMV,dl
  262. mov [esi].BlkY3.BHMV,dl
  263. mov [esi].BlkY4.BHMV,dl
  264. mov [esi].BlkY1.BVMV,dh
  265. mov [esi].BlkY2.BVMV,dh
  266. mov [esi].BlkY3.BVMV,dh
  267. mov [esi].BlkY4.BVMV,dh
  ; Turn the running totals into per-block SWDs by differencing neighbours.
  268. mov eax,[esi].BlkY4.BestBlkLvlSWD
  269. mov ebx,[esi].BlkY3.BestBlkLvlSWD
  270. sub eax,ebx
  271. mov ecx,[esi].BlkY2.BestBlkLvlSWD
  272. sub ebx,ecx
  273. mov edx,[esi].BlkY1.BestBlkLvlSWD
  274. sub ecx,edx
  275. mov [esi].BlkY4.BestBlkLvlSWD,eax
  276. mov [esi].BlkY3.BestBlkLvlSWD,ebx
  277. mov [esi].BlkY2.BestBlkLvlSWD,ecx
  278. mov [esi].BlkY1.BestBlkLvlSWD,edx
  279. jmp BFrmMVSettled
  ; ---- Zero delta is used: per-block SWDs and MV sets are taken from the
  ; 0-delta results and the stored delta (BHMV/BVMV) is cleared.
  280. BelowBFrmZeroThreshold:
  281. NonZeroBFrmVectorNotGoodEnoughGain:
  282. mov ebx,[esi].BlkY4.BlkLvlSWD0Delta
  283. mov ecx,[esi].BlkY3.BlkLvlSWD0Delta
  284. sub ebx,ecx
  285. mov edx,[esi].BlkY2.BlkLvlSWD0Delta
  286. sub ecx,edx
  287. mov edi,[esi].BlkY1.BlkLvlSWD0Delta
  288. sub edx,edi
  289. mov [esi].BlkY4.BestBlkLvlSWD,ebx
  290. mov [esi].BlkY3.BestBlkLvlSWD,ecx
  291. mov [esi].BlkY2.BestBlkLvlSWD,edx
  292. mov [esi].BlkY1.BestBlkLvlSWD,edi
  293. mov eax,[esi].BlkY1.BiDiMVs0Delta
  294. mov [esi].BlkY1.BestBiDiMVs,eax
  295. mov eax,[esi].BlkY2.BiDiMVs0Delta
  296. mov [esi].BlkY2.BestBiDiMVs,eax
  297. mov eax,[esi].BlkY3.BiDiMVs0Delta
  298. mov [esi].BlkY3.BestBiDiMVs,eax
  299. mov eax,[esi].BlkY4.BiDiMVs0Delta
  300. mov [esi].BlkY4.BestBiDiMVs,eax
  301. xor eax,eax
  302. mov [esi].BlkY1.BHMV,al
  303. mov [esi].BlkY2.BHMV,al
  304. mov [esi].BlkY3.BHMV,al
  305. mov [esi].BlkY4.BHMV,al
  306. mov [esi].BlkY1.BVMV,al
  307. mov [esi].BlkY2.BVMV,al
  308. mov [esi].BlkY3.BVMV,al
  309. mov [esi].BlkY4.BVMV,al
  ; ---- Pass 3: process each luma block and build the coded block pattern.
  ; bl starts as a sentinel (8); each iteration ORs in the block's flag and
  ; shifts right once, so the sentinel falls into CF after the fourth block and
  ; the four flags end up in bits 0..3 (first block in bit 0).
  ; NOTE(review): the loop relies on BFrameDTQ returning with edx unchanged and
  ; ebx set up as an index into BlkEmptyFlag -- confirm in BFrameDTQ.
  310. BFrmMVSettled:
  311. mov edx,esi
  312. mov bl,8 ; Init coded block pattern
  313. BFrmLumaBlkLoop:
  314. mov esi,[edx].BlkY1.BestBlkLvlSWD ; Get SWD for block.
  315. xor eax,eax
  316. mov BFrmCBP,bl
  317. cmp esi,BFRMEMPTYTHRESHOLD ; Below threshold for forcing empty?
  318. mov ecx,BSWDTotal
  319. jl BFrmLumaBlkEmpty
  320. mov eax,[edx].BlkY1.BestBiDiMVs
  321. xor ebx,ebx
  322. add ecx,esi
  323. mov bl,ah
  324. mov BSWDTotal,ecx
  325. and eax,0FFH
  326. call BFrameDTQ
  327. mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0.
  328. mov al,BFrmCBP
  ; On the empty path al is 0 and bl still holds the pattern so far.
  329. BFrmLumaBlkEmpty:
  330. or bl,al ; Factor in CBP bit for this block.
  331. add edx,SIZEOF T_Blk
  332. shr bl,1 ; CF == 1 when sentinel shifted off
  333. jnc BFrmLumaBlkLoop
  334. mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl
  335. sub edx,4*SIZEOF T_Blk
  336. mov eax,QPDiv2 ; Restore these for P frame blocks.
  337. mov ebx,BQPDiv2
  338. mov QPDiv2,ebx
  339. mov BQPDiv2,eax
  340. mov eax,CodeStreamCursor
  341. mov ebx,BCodeStreamCursor
  342. mov CodeStreamCursor,ebx
  343. mov BCodeStreamCursor,eax
  344. mov eax,Recip2QPToUse
  345. mov ebx,BRecip2QPToUse
  346. mov Recip2QPToUse,ebx
  347. mov BRecip2QPToUse,eax
  348. ret
  ;===============================================================================
  ; MMxDoBFrameChromaBlocks -- run BFrameDTQ on the U and V blocks of one
  ; macroblock and merge their flags into CodedBlocksB.
  ;
  ; Entry: eax -- QPDiv2, ebx -- BQPDiv2 (the loads are commented out below
  ;               because the caller has already performed them).
  ;        edx -- macroblock action descriptor.
  ; Exit:  edx -- same descriptor address as on entry.
  ;
  ; The P-frame and B-frame copies of QPDiv2, CodeStreamCursor and Recip2QPToUse
  ; are exchanged on the way in and exchanged back before returning.
  ; NOTE(review): relies on BFrameDTQ returning with edx unchanged and ebx set up
  ; as an index into BlkEmptyFlag -- confirm in BFrameDTQ.
  ;===============================================================================
  349. MMxDoBFrameChromaBlocks:
  350. ; mov eax,QPDiv2 ; Swap these so Quantizer uses right level.
  351. ; mov ebx,BQPDiv2 ; (Loaded in caller.)
  352. mov QPDiv2,ebx
  353. mov BQPDiv2,eax
  354. mov eax,CodeStreamCursor
  355. mov ebx,BCodeStreamCursor
  356. mov CodeStreamCursor,ebx
  357. mov BCodeStreamCursor,eax
  358. mov eax,Recip2QPToUse
  359. mov ebx,BRecip2QPToUse
  360. mov Recip2QPToUse,ebx
  361. mov cl,INTER1MV
  362. mov BRecip2QPToUse,eax
  363. mov StashBlockType,cl
  ; ---- U block: eax = low byte of BestBiDiMVs, ebx = second byte.
  364. mov eax,[edx].BlkU.BestBiDiMVs
  365. xor ebx,ebx
  366. mov bl,ah
  367. and eax,0FFH
  368. add edx,4*SIZEOF T_Blk ; To know we're working on chroma.
  369. call BFrameDTQ
  370. mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0.
  371. mov al,[edx-4*SIZEOF T_Blk].CodedBlocksB
  372. or bl,al ; Factor in CBP bit for this block.
  ; ---- V block: same register setup, cursor advanced one more block.
  373. mov eax,[edx-4*SIZEOF T_Blk].BlkV.BestBiDiMVs
  374. mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl
  375. xor ebx,ebx
  376. mov bl,ah
  377. and eax,0FFH
  378. add edx,SIZEOF T_Blk
  379. call BFrameDTQ
  380. mov bl,BlkEmptyFlag[ebx+2] ; Fetch 32 if block not empty; else 0.
  381. mov al,[edx-5*SIZEOF T_Blk].CodedBlocksB
  382. or bl,al ; Factor in CBP bit for this block.
  383. mov eax,QPDiv2 ; Restore these for P frame blocks.
  384. mov [edx-5*SIZEOF T_Blk].CodedBlocksB,bl
  385. mov ebx,BQPDiv2
  386. mov QPDiv2,ebx
  387. mov BQPDiv2,eax
  388. mov eax,CodeStreamCursor
  389. mov ebx,BCodeStreamCursor
  390. mov CodeStreamCursor,ebx
  391. mov BCodeStreamCursor,eax
  392. mov eax,Recip2QPToUse
  393. mov ebx,BRecip2QPToUse
  394. mov Recip2QPToUse,ebx
  395. mov BRecip2QPToUse,eax
  ; Undo both cursor advances (4 + 1 blocks) so edx is as on entry.
  396. sub edx,5*SIZEOF T_Blk
  397. ret
  398. ;===============================================================================
  ; ComputeBFrameSWDForCandRef -- compute the SWD between one target block in
  ; the B frame and the past-reference block selected by the forward MV, and
  ; add it to the running total in mm7.
  ;
  ; Entry:
  399. ; ebp -- Pitch
  400. ; edi -- Address of (0-MV) block within frame.
  401. ; edx -- Block Action Descriptor cursor
  402. ; ebx -- HMVf (HMV to apply to past reference) biased by 96.
  403. ; eax -- VMVf (VMV to apply to past reference) biased by 96.
  ; mm6 -- 8 bytes of 1 (read by the full-pel-vertical / half-pel-horizontal
  ;        path; every path that modifies mm6 rebuilds this value before ret).
  ; mm7 -- running SWD total.
  ;
  ; Exit: mm7 -- incremented by this block's SWD.
  ; Written: eax, ebx, ecx, esi, edi, mm0-mm5. edx and ebp are not written.
  ;
  ; Bit 0 of each MV selects the half-pel position; one of three code paths is
  ; taken accordingly. The reference address removes the bias of 96 half pels
  ; via the "-48*PITCH-48" term.
  404. StackOffset TEXTEQU <8>
  405. ComputeBFrameSWDForCandRef:
  406. test al,1
  407. mov ecx,PreviousFrameBaseAddress
  408. lea eax,[eax+eax*2] ; Start of VMVf*384
  409. jne ME_VMVfAtHalfPelPosition
  410. ME_VMVfAtFullPelPosition:
  ; The scaling below (eax*3, then shl 6) hard-codes a pitch of 384; the
  ; deliberately unassemblable line guards that assumption.
  411. IF PITCH-384
  412. **** The magic leaks out if PITCH != 384
  413. ENDIF
  414. shl eax,6
  415. add ecx,edi
  416. shr ebx,1 ; CF == 1 iff HMVf is at half pel.
  417. jc ME_VMVfAtFull_HMVfAtHalfPelPosition
  ; ---- Path 1: both components at full pel; reference pels are used as-is.
  ; Odd lines (1,3,5,7) are shifted left 8 before squaring; even lines are not.
  418. ME_VMVfAtFull_HMVfAtFullPelPosition:
  419. lea esi,[ecx+eax-48*PITCH-48]
  420. lea ecx,[ebp+ebp*2]
  421. add esi,ebx ; Address of past reference block.
  422. mov eax,BFrameBaseAddress
  423. add edi,eax ; Address of target block.
  424. lea ebx,[ebp+ebp*4]
  425. movq mm0,[esi+ebp*1]
  426. psubw mm0,[edi+ebp*1] ; Get diff for line 1.
  427. movq mm1,[esi+ecx] ; Ref MB, upper left block, Line 3.
  428. psllw mm0,8 ; Extract diffs for line 1 even pels.
  429. psubw mm1,[edi+ecx] ; Diff for line 3.
  430. pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
  431. movq mm2,[esi+ebx]
  432. psllw mm1,8
  433. psubw mm2,[edi+ebx]
  434. pmaddwd mm1,mm1
  435. movq mm3,[esi+PITCH*7]
  436. psllw mm2,8
  437. psubw mm3,[edi+PITCH*7]
  438. pmaddwd mm2,mm2
  439. movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
  440. psllw mm3,8
  441. psubw mm4,[edi] ; Diff for line 0.
  442. paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
  443. movq mm1,[esi+ebp*2]
  444. pmaddwd mm3,mm3
  445. psubw mm1,[edi+ebp*2]
  446. paddusw mm0,mm2
  447. movq mm2,[esi+ebp*4]
  448. pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
  449. psubw mm2,[edi+ebp*4]
  450. paddusw mm0,mm3
  451. movq mm3,[esi+ecx*2]
  452. pmaddwd mm1,mm1
  453. psubw mm3,[edi+ecx*2]
  454. pmaddwd mm2,mm2
  455. paddusw mm0,mm4
  456. pmaddwd mm3,mm3
  457. paddusw mm0,mm1
  458. ;
  459. paddusw mm0,mm2
  460. ;
  461. paddusw mm0,mm3
  462. ;
  ; Fold the two halves of the accumulator together and add to the total.
  463. punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
  464. ;
  465. paddusw mm0,mm1 ; mm0[48:63] is SWD for block.
  466. ;
  467. psrlq mm0,48 ; SWD for block.
  468. ;
  469. paddd mm7,mm0 ; mm7 is SWD for all four blocks.
  470. ;
  471. ret
  ; ---- Path 2: vertical full pel, horizontal half pel. Multiplying each word
  ; by mm6 (8 bytes of 1) puts the sum of horizontally adjacent pels in the high
  ; byte. Even lines read the reference at esi, odd lines at esi+1.
  472. ME_VMVfAtFull_HMVfAtHalfPelPosition:
  473. lea esi,[ecx+eax-48*PITCH-48]
  474. mov eax,BFrameBaseAddress
  475. add esi,ebx ; Address of past reference block.
  476. add edi,eax ; Address of target block.
  477. lea ecx,[ebp+ebp*2]
  478. movq mm0,mm6 ; 8 bytes of 1
  479. pmullw mm0,[esi] ; <(P07+P06)*256+junk ...>
  480. movq mm1,mm6
  481. pmullw mm1,[esi+ebp*2]
  482. movq mm2,mm6
  483. pmullw mm2,[esi+ebp*4]
  484. movq mm3,mm6
  485. movq mm4,[edi] ; <C07 C06 C05 C04 C03 C02 C01 C00>
  486. psrlw mm0,1 ; <(P07+P06)*256/2+junk ...>
  487. pmullw mm3,[esi+ecx*2]
  488. psllw mm4,8 ; <C06*256 C04*256 C02*256 C00*256>
  489. movq mm5,[edi+ebp*2]
  490. psrlw mm1,1
  491. psubw mm0,mm4 ; <(P07+P06)*256/2-C06*256+junk ...>
  492. psllw mm5,8
  493. movq mm4,[edi+ebp*4]
  494. psrlw mm2,1
  495. psubw mm1,mm5
  496. psllw mm4,8
  497. movq mm5,[edi+ecx*2]
  498. psrlw mm3,1
  499. psubw mm2,mm4
  500. pmaddwd mm0,mm0 ; SSD for even pels of line 0.
  501. pmaddwd mm1,mm1
  502. psllw mm5,8
  503. psubw mm3,mm5
  504. pmaddwd mm2,mm2
  505. pmaddwd mm3,mm3
  506. movq mm5,mm6
  507. pmullw mm6,[esi+ebp*1+1] ; <(P18+P17)*256+junk ...>
  508. movq mm4,mm5
  509. pmullw mm5,[esi+ecx+1]
  510. paddusw mm0,mm1 ; Accum SSD for lines 0 and 2.
  511. paddusw mm2,mm3
  512. movq mm1,mm4
  513. pmullw mm4,[esi+PITCH*5+1]
  514. paddusw mm0,mm2
  515. pmullw mm1,[esi+PITCH*7+1]
  516. psrlw mm6,1 ; <(P18+P17)*256/2+junk ...>
  517. psubw mm6,[edi+ebp*1] ; <(P18+P17)*256/2-C17*256+junk ...>
  518. psrlw mm5,1
  519. psubw mm5,[edi+ecx]
  520. psrlw mm4,1
  521. psubw mm4,[edi+PITCH*5]
  522. pmaddwd mm6,mm6 ; SSD for odd pels of line 1.
  523. pmaddwd mm5,mm5
  524. psrlw mm1,1
  525. psubw mm1,[edi+PITCH*7]
  526. pmaddwd mm4,mm4
  527. pmaddwd mm1,mm1
  528. paddusw mm0,mm6
  529. pxor mm6,mm6
  530. paddusw mm0,mm5
  531. pcmpeqb mm5,mm5
  532. paddusw mm0,mm4
  533. psubb mm6,mm5 ; Restore 8 bytes of 1 (0 - (-1) per byte).
  534. paddusw mm0,mm1
  535. punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
  536. ;
  537. paddusw mm0,mm1 ; mm0[48:63] is SWD for block.
  538. ;
  539. psrlq mm0,48 ; SWD for block.
  540. ;
  541. paddd mm7,mm0 ; mm7 is SWD for all four blocks.
  542. ;
  543. ret
  ; ---- Path 3: vertical half pel (horizontal either). With VMVf odd, eax*192
  ; carries an extra half line, which the "-PITCH/2" term removes. esi addresses
  ; the even reference lines; ecx addresses the odd ones and is one byte further
  ; right when HMVf is at half pel, so vertically adjacent lines are averaged
  ; with that offset. mm6 is used as a byte mask here and rebuilt before ret.
  544. ME_VMVfAtHalfPelPosition:
  545. IF PITCH-384
  546. **** The magic leaks out if PITCH != 384
  547. ENDIF
  548. shl eax,6
  549. lea ecx,[ecx+edi-48*PITCH-48-PITCH/2]
  550. add ecx,eax
  551. mov eax,BFrameBaseAddress
  552. shr ebx,1 ; CF == 1 iff HMVf is at half pel.
  553. mov esi,ecx ; esi and ecx same if HMVf at full pel,
  554. adc ecx,ebx ; but inc ecx if HMVf is at half pel.
  555. add esi,ebx
  556. add edi,eax ; Address of target block.
  557. lea ebx,[ebp+ebp*2]
  558. movq mm0,[esi] ; <P07 P06 ...>
  559. pcmpeqb mm6,mm6
  560. movq mm1,[ecx+ebp*1] ; <P17 P16 ...> or <P18 P17 ...>
  561. psrlw mm6,8
  562. movq mm2,[esi+ebp*2] ; <P27 P26 ...>
  563. paddb mm0,mm1 ; <P07+P17 junk ...> or <P07+P18 junk ...>
  564. movq mm3,[ecx+ebx] ; <P37 P36 ...> or <P38 P37 ...>
  565. paddb mm1,mm2 ; <junk P16+P26 ...> or <junk P17+P26 ...>
  566. movq mm4,[esi+ebp*4] ; <P47 P46 ...>
  567. paddb mm2,mm3 ; <P27+P37 junk ...> or <P27+P38 junk ...>
  568. paddb mm3,mm4 ; <junk P36+P46 ...> or <junk P37+P46 ...>
  569. psrlw mm0,1 ; <(P07+P17)/2 junk ...> or (P07+P18)/2 junk ...>
  570. pand mm1,mm6 ; <P16+P26 ...> or <P17+P26 ...>
  571. psrlw mm2,1 ; <(P27+P37)/2 junk ...> or (P27+P38)/2 junk ...>
  572. movq mm5,[edi+ebp*1] ; <C17 C16 C15 C14 C13 C12 C11 C10>
  573. pand mm3,mm6 ; <P36+P46 ...> or <P37+P46 ...>
  574. movq mm6,[edi+ebx] ; <C37 C36 C35 C34 C33 C32 C31 C30>
  575. psllw mm5,8 ; <C16 0 C14 0 C12 0 C10 0>
  576. psubw mm0,[edi] ; <(P07+P17)/2-C07 junk ...> or ...
  577. psllw mm1,7 ; <(P16+P26)/2 ...> or <(P17+P26)/2 ...>
  578. psubw mm2,[edi+ebp*2] ; <(P27+P37)/2-C27 junk ...> or ...
  579. psllw mm6,8 ; <C36 0 C34 0 C32 0 C30 0>
  580. pmaddwd mm0,mm0 ; SSD of even pels of line 0.
  581. psubw mm1,mm5 ; <(P16+P26)/2-C16 junk ...> or ...
  582. pmaddwd mm1,mm1 ; SSD of odd pels of line 1.
  583. psllw mm3,7 ; <(P36+P46)/2 ...> or <(P37+P46)/2 ...>
  584. pmaddwd mm2,mm2 ; SSD of even pels of line 2.
  585. psubw mm3,mm6 ; <(P36+P46)/2-C36 junk ...> or ...
  586. pmaddwd mm3,mm3 ; SSD of odd pels of line 3.
  587. pcmpeqb mm6,mm6
  588. paddusw mm0,mm1
  ; Lines 4..7 repeat the pattern above; mm4 still holds reference line 4.
  589. movq mm1,[ecx+PITCH*5]
  590. paddusw mm0,mm2
  591. movq mm2,[esi+ebx*2]
  592. paddusw mm0,mm3
  593. movq mm3,[ecx+PITCH*7]
  594. paddb mm4,mm1
  595. paddb mm1,mm2
  596. paddb mm2,mm3
  597. paddb mm3,[esi+ebp*8]
  598. psrlw mm6,8
  599. pand mm1,mm6
  600. psrlw mm4,1
  601. movq mm5,[edi+PITCH*5]
  602. psrlw mm2,1
  603. pand mm3,mm6
  604. psllw mm5,8
  605. movq mm6,[edi+PITCH*7]
  606. psllw mm1,7
  607. psubw mm4,[edi+ebp*4]
  608. psllw mm3,7
  609. psubw mm2,[edi+ebx*2]
  610. psllw mm6,8
  611. pmaddwd mm4,mm4
  612. psubw mm1,mm5
  613. pmaddwd mm2,mm2
  614. psubw mm3,mm6
  615. pmaddwd mm1,mm1
  616. pxor mm6,mm6
  617. pmaddwd mm3,mm3
  618. paddusw mm0,mm4
  619. pcmpeqb mm5,mm5
  620. paddusw mm0,mm1
  621. psubb mm6,mm5 ; Restore 8 bytes of 1.
  622. paddusw mm0,mm2
  623. paddusw mm0,mm3
  624. ;
  625. punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
  626. ;
  627. paddusw mm0,mm1 ; mm0[48:63] is SWD for block.
  628. ;
  629. psrlq mm0,48 ; SWD for block.
  630. ;
  631. paddd mm7,mm0 ; mm7 is SWD for all four blocks.
  632. ;
  633. ret
  634. ;===============================================================================
  635. ; ebp -- Pitch
  636. ; edx -- Block Action Descriptor cursor
  637. ; ebx -- VMVf (VMV to apply to past reference) biased by 96.
  638. ; eax -- HMVf (HMV to apply to past reference) biased by 96.
  639. StackOffset TEXTEQU <8>
  640. BFrameDTQ:
  641. test bl,1
  642. lea ebx,[ebx+ebx*2] ; Start of VMVf*384
  643. mov ecx,PreviousFrameBaseAddress
  644. jne Diff_VMVfAtHalfPelPosition
  645. Diff_VMVfAtFullPelPosition:
  646. IF PITCH-384
  647. **** The magic leaks out if PITCH != 384
  648. ENDIF
  649. shl ebx,6
  650. mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
  651. shr eax,1 ; CF == 1 iff HMVf is at half pel.
  652. jc Diff_VMVfAtFull_HMVfAtHalfPelPosition
  653. Diff_VMVfAtFull_HMVfAtFullPelPosition:
  654. lea esi,[ecx+ebx-48*PITCH-48]
  655. add eax,edi
  656. add esi,eax ; Address of past reference block.
  657. mov ecx,PITCH/4 ; Pitch for past reference blk, div 4.
  658. mov eax,BFrameBaseAddress ; Address of target block.
  659. mov PastRefPitchDiv4,ecx
  660. add edi,eax ; Address of target block.
  661. jmp Diff_GetFutureContribToPred
  662. Diff_VMVfAtHalfPelPosition:
  663. IF PITCH-384
  664. **** The magic leaks out if PITCH != 384
  665. ENDIF
  666. shl ebx,6
  667. mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
  668. shr eax,1 ; CF == 1 iff HMVf is at half pel.
  669. jc Diff_VMVfAtHalf_HMVfAtHalfPelPosition
  670. Diff_VMVfAtHalf_HMVfAtFullPelPosition:
  671. lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias.
  672. add eax,edi
  673. add esi,eax ; Address of past reference block.
  674. lea eax,PelDiffs-32
  675. pcmpeqb mm6,mm6
  676. pcmpeqb mm7,mm7 ; 8 bytes -1
  677. movq mm2,[esi] ; Line0
  678. paddb mm6,mm6 ; 8 bytes of 0xFE.
  679. @@:
  680. movq mm1,[esi+ebp*1] ; Line1
  681. movq mm0,mm2 ; Line0
  682. movq mm2,[esi+ebp*2] ; Line2
  683. psubb mm1,mm7 ; Line1+1
  684. paddb mm0,mm1 ; Line0+Line1+1
  685. paddb mm1,mm2 ; Line1+Line2+1
  686. pand mm0,mm6 ; pre-clean
  687. pand mm1,mm6 ; pre-clean
  688. add eax,32 ; Advance pointer for PelDiffs output.
  689. psrlq mm0,1 ; (Line0+Line1+1)/2
  690. lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
  691. psrlq mm1,1 ; (Line1+Line2+1)/2
  692. movq [eax],mm0 ; Store Past Ref for Line0
  693. movq [eax+16],mm1 ; Store Past Ref for Line1
  694. test al,32 ; Iterate twice
  695. jne @b
  696. test al,64 ; Iterate twice.
  697. mov ecx,4 ; Pitch for past reference blk, div 4.
  698. mov PastRefPitchDiv4,ecx
  699. jne @b
  700. mov eax,BFrameBaseAddress
  701. lea esi,PelDiffs ; Address of interpolated past ref blk.
  702. add edi,eax ; Address of target block.
  703. jmp Diff_GetFutureContribToPred
  704. Diff_VMVfAtFull_HMVfAtHalfPelPosition:
  705. lea esi,[ecx+ebx-48*PITCH-48] ; Begin get pastrefaddr. Del bias.
  706. add eax,edi
  707. add esi,eax ; Address of past reference block.
  708. lea eax,PelDiffs-32
  709. lea ebx,Pel_Rnd
  710. xor ecx,ecx
  711. @@:
  712. movq mm0,[esi+1] ; <P08 P07 P06 P05 P04 P03 P02 P01>
  713. pcmpeqb mm7,mm7
  714. mov cl,[esi] ; P00
  715. movq mm2,mm0 ; <P08 P07 P06 P05 P04 P03 P02 P01>
  716. movq mm1,[esi+ebp*1+1]
  717. psllq mm2,8 ; <P07 P06 P05 P04 P03 P02 P01 0>
  718. paddb mm0,[ebx+ecx*8] ; <P08+1 P07+1 ... P01+P00+1>
  719. movq mm3,mm1
  720. mov cl,[esi+ebp*1]
  721. psllq mm3,8
  722. paddb mm1,mm3
  723. paddb mm0,mm2 ; <P08+P07+1 P07+P06+1 ... P01+P00+1>
  724. paddb mm1,[ebx+ecx*8]
  725. paddb mm7,mm7 ; 8 bytes of 0xFE.
  726. pand mm0,mm7 ; pre-clean
  727. pand mm1,mm7 ; pre-clean
  728. add eax,32 ; Advance pointer for PelDiffs output.
  729. psrlq mm0,1 ; <(P08+P07+1)/2 ...>
  730. lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
  731. psrlq mm1,1
  732. movq [eax],mm0 ; Store Past Ref for Line0
  733. movq [eax+16],mm1 ; Store Past Ref for Line1
  734. test al,32 ; Iterate twice
  735. jne @b
  736. test al,64 ; Iterate twice.
  737. mov cl,4 ; Pitch for past reference blk, div 4.
  738. mov PastRefPitchDiv4,ecx
  739. jne @b
  740. mov eax,BFrameBaseAddress
  741. lea esi,PelDiffs ; Address of interpolated past ref blk.
  742. add edi,eax ; Address of target block.
  743. jmp Diff_GetFutureContribToPred
  744. Diff_VMVfAtHalf_HMVfAtHalfPelPosition:
  745. lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias.
  746. add eax,edi
  747. add esi,eax ; Address of past reference block.
  748. lea eax,PelDiffs-32
  749. lea ebx,Pel_Rnd
  750. xor ecx,ecx
  751. movq mm3,[esi+1] ; 0A: <P08 P07 P06 P05 P04 P03 P02 P01>
  752. pcmpeqb mm7,mm7
  753. mov cl,[esi] ; 0B: P00
  754. movq mm0,mm3 ; 0C: <P08 P07 P06 P05 P04 P03 P02 P01>
  755. paddb mm7,mm7 ; 8 bytes of 0xFE.
  756. psllq mm0,8 ; 0D: <P07 P06 P05 P04 P03 P02 P01 0>
  757. paddb mm3,[ebx+ecx*8] ; 0E: <P08+1 P07+1 ... P01+P00+1>
  758. movq mm6,mm7 ; 8 bytes of 0xFE.
  759. @@:
  760. movq mm1,[esi+ebp*1+1] ; 1A: <P18 P17 P16 P15 P14 P13 P12 P11>
  761. paddb mm0,mm3 ; 0F: <P08+P07+1 ... P01+P00+1>
  762. mov cl,[esi+ebp*1] ; 1B: P10
  763. movq mm3,mm1 ; 1C: <P18 P17 P16 P15 P14 P13 P12 P11>
  764. movq mm2,[esi+ebp*2+1] ; 2A: <P28 P27 P26 P25 P24 P23 P22 P21>
  765. psllq mm3,8 ; 1D: <P17 P16 P15 P14 P13 P12 P11 0>
  766. paddb mm1,[ebx+ecx*8] ; 1E: <P18+1 P17+1 ... P11+P10+1>
  767. movq mm4,mm2 ; 2C: <P28 P27 P26 P25 P24 P23 P22 P21>
  768. mov cl,[esi+ebp*2] ; 2B: P20
  769. paddb mm1,mm3 ; 1F: <P18+P17+1 ... P11+P10+1>
  770. pandn mm6,mm1 ; 0G: <(P18+P17+1)&1 ...>
  771. psllq mm4,8 ; 2D: <P27 P26 P25 P24 P23 P22 P21 0>
  772. paddb mm2,[ebx+ecx*8] ; 2E: <P28+1 P27+1 ... P21+P20+1>
  773. movq mm5,mm6 ; 1G: <(P18+P17+1)&1 ...>
  774. paddb mm2,mm4 ; 2F: <P28+P27+1 ... P21+P20+1>
  775. pand mm6,mm0 ; 0H: <(P18+P17+1)&(P08+P07+1)&1 ...>
  776. pand mm5,mm2 ; 1H: <(P18+P17+1)&(P28+P27+1)&1 ...>
  777. pand mm0,mm7 ; 0I: pre-clean for divide
  778. pand mm1,mm7 ; 1I: pre-clean for divide
  779. psrlq mm0,1 ; 0J: <(P08+P07+1)/2 ...>
  780. movq mm3,mm2 ; Save line 2 for next iter's line 0.
  781. psrlq mm1,1 ; 1J: <(P18+P17+1)/2 ...>
  782. pand mm2,mm7 ; 2I: pre-clean for divide
  783. paddb mm0,mm1 ; 0K: <(P08+P07+1)/2+(P18+P17+1)/2 ...>
  784. paddb mm6,mm0 ; 0L: <(P08+P07+P18+P17+2)/2 ...>
  785. psrlq mm2,1 ; 2J: <(P28+P27+1)/2 ...>
  786. paddb mm1,mm2 ; 1K: <(P18+P17+1)/2+(P28+P27+1)/2 ...>
  787. pand mm6,mm7 ; 0M: pre-clean for divide
  788. paddb mm5,mm1 ; 1L: <(P18+P17+P28+P27+2)/2 ...>
  789. psrlq mm6,1 ; 0M: <(P08+P07+P18+P17+2)/4 ...>
  790. add eax,32 ; Advance pointer for PelDiffs output.
  791. pand mm5,mm7 ; 1M: pre-clean for divide
  792. lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
  793. psrlq mm5,1 ; 1N: <(P18+P17+P28+P27+2)/4 ...>
  794. movq [eax],mm6 ; 0O: Store Past Ref for Line0
  795. pxor mm0,mm0 ; So that add of mm3 is just like movq.
  796. movq [eax+16],mm5 ; 1O: Store Past Ref for Line1
  797. movq mm6,mm7 ; 8 bytes of 0xFE.
  798. test al,32 ; Iterate twice
  799. jne @b
  800. test al,64 ; Iterate twice.
  801. mov cl,4 ; Pitch for past reference blk, div 4.
  802. jne @b
  803. mov eax,BFrameBaseAddress
  804. lea esi,PelDiffs ; Address of interpolated past ref blk.
  805. add edi,eax ; Address of target block.
  806. mov PastRefPitchDiv4,ecx
  807. Diff_GetFutureContribToPred:
  808. ;===============================================================================
  809. ;
  810. ; Registers at entry:
  811. ; edi -- Pointer to target block.
  812. ; esi -- Pointer to past reference.
  813. ; edx -- Block Descriptor within MacroBlockActionDescriptorStream
  814. ;
  815. ; Subsequent assignments:
  816. ;
  817. ; ebp -- Pitch for past reference block, div 4. Loop counter in high 2 bits.
  818. ; ecx -- Pointer to future reference block
  819. ; ebx -- Pointer to list of indices of multipliers to wt past and future refs.
  820. ; eax,edx -- Index of multiplier to weight past and future ref.
  821. xor ecx,ecx
  822. mov eax,edx
  823. IF SIZEOF T_Blk-16
  824. **** The magic leaks out if size of block descriptor is not 16.
  825. ENDIF
  826. mov cl,[edx].BlkY1.BestHMVb ; HMV for future reference block.
  827. and edx,112 ; Extract block number (times 16).
  828. xor ebx,ebx
  829. mov BlockActionDescrCursor,eax
  830. mov bl,[eax].BlkY1.BestVMVb ; VMV for future reference block.
  831. mov eax,LeftRightBlkPosition[edx]
  832. mov ebp,ecx
  833. CONST_384 TEXTEQU <384>
  834. mov edx,UpDownBlkPosition[edx]
  835. mov cl,[eax+ecx*2] ; Get horz part of past/future wt sel.
  836. IF PITCH-384
  837. **** The magic leaks out if PITCH != 384
  838. ENDIF
  839. lea eax,[ebx+ebx*2] ; Start of VMVb*384
  840. mov bl,[edx+ebx*2] ; Get vert part of past/future wt sel.
  841. shl eax,6
  842. mov edx,BFrameToFuture
  843. lea ebx,Diff_IdxRefWts[ecx+ebx] ; Addr of list of wts for refs.
  844. test al,64 ; Is VMVb odd?
  845. lea eax,[eax+edx] ; Begin to get addr futr ref.
  846. jne Diff_VMVbAtHalfPelPosition
  847. Diff_VMVbAtFullPelPosition:
  848. CONST_384 TEXTEQU <384>
  849. shr ebp,1 ; CF == 1 iff HMVf is at half pel.
  850. lea esp,[esp-128]
  851. StackOffset TEXTEQU <136>
  852. lea ecx,[eax+edi-48*PITCH-48]
  853. jc Diff_VMVbAtFull_HMVbAtHalfPelPosition
  854. Diff_VMVbAtFull_HMVbAtFullPelPosition:
  ; Future reference is at a full pel position both ways, so its pels are used
  ; as they are.  Each pass of the loop handles two lines of 8 pels; the loop
  ; runs four passes.  Per line:
  ;   mask = FutureWt_FF_or_00[index from the list at ebx]
  ;   pred = ((F & mask) + P + (P & ~mask)) / 2, i.e. (F+P)/2 where the mask
  ;          is FF and P where it is 00.
  ;   diff = C - pred, stored to PelDiffs.
  ; NOTE(review): the byte-wide adds wrap unless F+P and 2*P fit in 8 bits --
  ; presumably pels are 7-bit here; confirm.
  855. CONST_384 TEXTEQU <384>
  856. add ecx,ebp ; Address of future reference block.
  857. mov ebp,PastRefPitchDiv4 ; Pitch/4 in low bits; high 2 bits serve as pass counter.
  858. xor eax,eax ; Upper 24 bits stay zero; al gets the line 0 weight index.
  859. xor edx,edx ; Upper 24 bits stay zero; dl gets the line 1 weight index.
  860. @@:
  861. StackOffset TEXTEQU <undefined>
  862. mov al,[ebx] ; 0A: Index of weights for line 0.
  863. add esp,32 ; Advance Pel Difference cursor
  864. mov dl,[ebx+1] ; 1A: Index of weights for line 1.
  865. add ebx,2 ; Advance list ptr for ref weights.
  866. movq mm0,[ecx] ; 0B: <F07 F06 F05 F04 F03 F02 F01 F00>
  867. pcmpeqb mm7,mm7 ; 8 bytes of 0xFF
  868. movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
  869. paddb mm7,mm7 ; 8 bytes of 0xFE
  870. movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
  871. pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
  872. pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
  873. paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
  874. movq mm1,[ecx+PITCH] ; 1B: <F17 F16 F15 F14 F13 F12 F11 F10>
  875. paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
  876. movq mm2,FutureWt_FF_or_00[edx]; 1C: <In?FF:00 ...>
  877. pand mm0,mm7 ; 0I: pre-clean: drop bit 0 of each byte so the 64-bit shift can't leak it into the next byte.
  878. movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>  (*4 shifts the counter bits out)
  879. pand mm1,mm2 ; 1E: <In?F17:00 In?F16:00 ...>
  880. pandn mm2,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
  881. paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
  882. movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
  883. psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
  884. psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
  885. paddb mm1,mm2 ; 1H: <In?F17+P17:2P17 ...>
  886. movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
  887. pand mm1,mm7 ; 1I: pre-clean
  888. add edi,PITCH*2 ; Advance Target Blk cursor
  889. psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
  890. StackOffset TEXTEQU <8+96>
  891. movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
  892. StackOffset TEXTEQU <undefined>
  893. psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
  894. add ecx,PITCH*2 ; Advance Future Ref Blk cursor
  895. lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
  896. StackOffset TEXTEQU <8+96>
  897. movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
  898. StackOffset TEXTEQU <undefined>
  899. add ebp,080000000H ; Toggle bit 31; it carries out on every second pass.
  900. jnc @b ; No carry: odd pass done, go do the even one.
  901. test ebp,040000000H ; ZF == 1 iff bit 30 is still clear (only passes 1-2 done).
  902. lea ebp,[ebp+040000000H] ; Bump bit 30; lea keeps ZF from the test.
  903. je @b ; Bit 30 was clear: go do passes 3 and 4.
  904. StackOffset TEXTEQU <8>
  905. mov ebp,16 ; NOTE(review): presumably the PelDiffs row pitch for the DCT (rows were stored 16 apart) -- confirm.
  906. lea esi,PelDiffs ; esi = start of the pel differences just written.
  907. mov edx,BlockActionDescrCursor ; edx = block descriptor saved during set-up.
  908. jmp MMxDoForwardDCT
  909. Diff_VMVbAtHalfPelPosition:
  ; VMVb is odd, so the future reference must be interpolated vertically.
  ; Halve HMVb, reserve the pel-difference area on the stack, form the row
  ; address of the future reference, then dispatch on the bit shifted out of
  ; HMVb.  For odd VMVb the VMVb*PITCH/2 term in eax includes an extra half
  ; line (PITCH/2); subtracting PITCH/2 lands on the upper of the two lines
  ; to be averaged.
  ; NOTE(review): the -48*PITCH-48 term presumably removes a 48-pel bias that
  ; the stored motion vectors carry -- confirm against the code that writes
  ; BestHMVb/BestVMVb.
  910. CONST_384 TEXTEQU <384>
  911. shr ebp,1 ; ebp = HMVb/2.  CF == 1 iff HMVb is at half pel.
  912. lea esp,[esp-128] ; Reserve 128 bytes; the loops add it back 32 per pass.  CF kept.
  913. StackOffset TEXTEQU <136>
  914. lea ecx,[eax+edi-48*PITCH-48-PITCH/2] ; Row address of upper future ref line.  CF kept.
  915. jc Diff_VMVbAtHalf_HMVbAtHalfPelPosition
  916. Diff_VMVbAtHalf_HMVbAtFullPelPosition:
  ; Future reference is at half pel vertically, full pel horizontally.  Each
  ; future prediction line is the rounded average of two vertically adjacent
  ; reference lines: F0 = (f0+f1+1)/2, F1 = (f1+f2+1)/2.  Line f2 is kept in
  ; mm6 and becomes f0 of the next pass.  The blend with the past reference
  ; and the subtraction from the target are as in the full/full case.  Each
  ; pass handles two lines; the loop runs four passes.
  ; NOTE(review): the byte-wide adds wrap unless the sums fit in 8 bits --
  ; presumably pels are 7-bit here; confirm.
  917. CONST_384 TEXTEQU <384>
  918. add ecx,ebp ; Address of future reference block.
  919. mov ebp,PastRefPitchDiv4 ; Pitch/4 in low bits; high 2 bits serve as pass counter.
  920. xor eax,eax ; Upper 24 bits stay zero; al gets the line 0 weight index.
  921. xor edx,edx ; Upper 24 bits stay zero; dl gets the line 1 weight index.
  922. movq mm6,[ecx] ; 0B: <F07 F06 F05 F04 F03 F02 F01 F00>
  923. pcmpeqb mm7,mm7 ; 8 bytes -1
  924. @@:
  925. StackOffset TEXTEQU <undefined>
  926. movq mm1,[ecx+PITCH] ; 1a: <f17 f16 f15 f14 f13 f12 f11 f10>
  927. movq mm0,mm6 ; 0a: <f07 f06 f05 f04 f03 f02 f01 f00>
  928. mov al,[ebx] ; 0A: Index of weights for line 0.
  929. psubb mm1,mm7 ; b: <f17+1 ...>  (subtracting -1 adds the rounding 1)
  930. movq mm6,[ecx+PITCH*2] ; 2a: <f27 f26 f25 f24 f23 f22 f21 f20>
  931. paddb mm0,mm1 ; 0c: <f07+f17+1..>
  932. mov dl,[ebx+1] ; 1A: Index of weights for line 1.
  933. paddb mm7,mm7 ; 8 bytes of 0xFE
  934. paddb mm1,mm6 ; 1c: <f17+f27+1..>
  935. pand mm0,mm7 ; 0d: pre-clean: drop bit 0 of each byte so the 64-bit shift can't leak it into the next byte.
  936. pand mm1,mm7 ; 1d: pre-clean
  937. psrlq mm0,1 ; 0B: <(F07 = f07+f17+1)/2>
  938. movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
  939. psrlq mm1,1 ; 1B: <(F17 = f17+f27+1)/2>
  940. movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
  941. pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
  942. pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
  943. paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
  944. add ebx,2 ; Advance list ptr for ref weights.
  945. paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
  946. movq mm2,FutureWt_FF_or_00[edx]; 1C: <In?FF:00 ...>
  947. pand mm0,mm7 ; 0I: pre-clean
  948. movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>  (*4 shifts the counter bits out)
  949. pand mm1,mm2 ; 1E: <In?F17:00 In?F16:00 ...>
  950. pandn mm2,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
  951. paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
  952. movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
  953. psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
  954. psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
  955. add esp,32 ; Advance Pel Difference cursor
  956. movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
  957. paddb mm1,mm2 ; 1H: <In?F17+P17:2P17 ...>
  958. add ecx,PITCH*2 ; Advance Future Ref Blk cursor
  959. pand mm1,mm7 ; 1I: pre-clean
  960. add edi,PITCH*2 ; Advance Target Blk cursor
  961. psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
  962. StackOffset TEXTEQU <8+96>
  963. movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
  964. StackOffset TEXTEQU <undefined>
  965. psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
  966. pcmpeqb mm7,mm7 ; 8 bytes -1, ready for the next pass's rounding add.
  967. lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
  968. StackOffset TEXTEQU <8+96>
  969. movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
  970. StackOffset TEXTEQU <undefined>
  971. pcmpeqb mm7,mm7 ; 8 bytes -1.  NOTE(review): mm7 is already all ones from the pcmpeqb a few lines up; this repeat looks redundant.
  972. add ebp,080000000H ; Toggle bit 31; it carries out on every second pass.
  973. jnc @b ; No carry: odd pass done, go do the even one.
  974. add ebp,040000000H ; Count a completed pair of passes in bit 30 (rolls into bit 31).
  975. test ebp,ebp ; SF == 1 iff two pairs of passes are done.
  976. jns @b ; Only one pair done: go do passes 3 and 4.
  977. StackOffset TEXTEQU <8>
  978. mov ebp,16 ; NOTE(review): presumably the PelDiffs row pitch for the DCT (rows were stored 16 apart) -- confirm.
  979. lea esi,PelDiffs ; esi = start of the pel differences just written.
  980. mov edx,BlockActionDescrCursor ; edx = block descriptor saved during set-up.
  981. jmp MMxDoForwardDCT
  982. Diff_VMVbAtFull_HMVbAtHalfPelPosition:
  ; Future reference is at full pel vertically, half pel horizontally.  Each
  ; future prediction pel is the rounded average of two horizontally adjacent
  ; reference pels: F0x = (f0x + f0(x+1) + 1)/2.  Pels 1..8 of a line are
  ; loaded as one quadword and added to a copy shifted left one byte (pels
  ; 0..7 with a zero in place of pel 0).  Pel 0 is read separately into al and
  ; used to index Pel_Rnd, which per the step comments supplies the +1 for
  ; every byte plus pel 0 in the lowest byte.  The blend with the past
  ; reference and the subtraction from the target are as in the full/full
  ; case.  Each pass handles two lines; the loop runs four passes.
  ; NOTE(review): the byte-wide adds wrap unless the sums fit in 8 bits --
  ; presumably pels are 7-bit here; confirm.
  983. StackOffset TEXTEQU <136>
  984. CONST_384 TEXTEQU <384>
  985. add ecx,ebp ; Address of future reference block.
  986. mov ebp,PastRefPitchDiv4 ; Pitch/4 in low bits; high 2 bits serve as pass counter.
  987. xor eax,eax ; Upper 24 bits stay zero; al is reused for pel 0 and weight indices.
  988. lea edx,Pel_Rnd ; Base of the 8-byte-per-entry rounding table.
  989. @@:
  990. StackOffset TEXTEQU <undefined>
  991. movq mm0,[ecx+1] ; 0a: <f08 f07 f06 f05 f04 f03 f02 f01>
  992. pcmpeqb mm7,mm7 ; 8 bytes of 0xFF
  993. mov al,[ecx] ; 0b: f00
  994. movq mm2,mm0 ; 0c: <f08 f07 f06 f05 f04 f03 f02 f01>
  995. movq mm1,[ecx+PITCH+1] ; 1a: <f18 f17 f16 f15 f14 f13 f12 f11>
  996. psllq mm2,8 ; 0d: <f07 f06 f05 f04 f03 f02 f01 0>
  997. paddb mm0,[edx+eax*8] ; 0e: <f08+1 f07+1 ... f01+f00+1>
  998. movq mm3,mm1 ; 1c: <f18 f17 f16 f15 f14 f13 f12 f11>
  999. mov al,[ecx+PITCH] ; 1b: f10
  1000. psllq mm3,8 ; 1d: <f17 f16 f15 f14 f13 f12 f11 0>
  1001. paddb mm0,mm2 ; 0f: <f08+f07+1 f07+f06+1 ... f01+f00+1>
  1002. paddb mm1,mm3 ; 1f: <f18+f17 f17+f16 ... f11 >
  1003. paddb mm1,[edx+eax*8] ; 1e: <f18+f17+1 f17+f16+1 ... f11+f10+1>
  1004. paddb mm7,mm7 ; 8 bytes of 0xFE.
  1005. mov al,[ebx] ; 0A: Index of weights for line 0.
  1006. pand mm0,mm7 ; 0g: pre-clean: drop bit 0 of each byte so the 64-bit shift can't leak it into the next byte.
  1007. movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
  1008. psrlq mm0,1 ; 0B: <F07 = (f08+f07+1)/2 ...>
  1009. movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
  1010. pand mm1,mm7 ; 1g: pre-clean
  1011. mov al,[ebx+1] ; 1A: Index of weights for line 1.
  1012. psrlq mm1,1 ; 1B: <F17 = (f18+f17+1)/2 ...>
  1013. pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
  1014. pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
  1015. movq mm4,FutureWt_FF_or_00[eax]; 1C: <In?FF:00 ...>
  1016. paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
  1017. movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>  (*4 shifts the counter bits out)
  1018. paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
  1019. pand mm0,mm7 ; 0I: pre-clean
  1020. pand mm1,mm4 ; 1E: <In?F17:00 In?F16:00 ...>
  1021. pandn mm4,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
  1022. paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
  1023. movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
  1024. psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
  1025. psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
  1026. add esp,32 ; Advance Pel Difference cursor
  1027. add ecx,PITCH*2 ; Advance Future Ref Blk cursor
  1028. paddb mm1,mm4 ; 1H: <In?F17+P17:2P17 ...>
  1029. movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
  1030. pand mm1,mm7 ; 1I: pre-clean
  1031. add edi,PITCH*2 ; Advance Target Blk cursor
  1032. psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
  1033. StackOffset TEXTEQU <8+96>
  1034. movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
  1035. StackOffset TEXTEQU <undefined>
  1036. psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
  1037. add ebx,2 ; Advance list ptr for ref weights.
  1038. lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
  1039. StackOffset TEXTEQU <8+96>
  1040. movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
  1041. StackOffset TEXTEQU <undefined>
  1042. add ebp,080000000H ; Toggle bit 31; it carries out on every second pass.
  1043. jnc @b ; No carry: odd pass done, go do the even one.
  1044. add ebp,040000000H ; Count a completed pair of passes in bit 30 (rolls into bit 31).
  1045. test ebp,ebp ; SF == 1 iff two pairs of passes are done.
  1046. jns @b ; Only one pair done: go do passes 3 and 4.
  1047. StackOffset TEXTEQU <8>
  1048. mov ebp,16 ; NOTE(review): presumably the PelDiffs row pitch for the DCT (rows were stored 16 apart) -- confirm.
  1049. lea esi,PelDiffs ; esi = start of the pel differences just written.
  1050. mov edx,BlockActionDescrCursor ; edx = block descriptor saved during set-up.
  1051. jmp MMxDoForwardDCT
  1052. Diff_VMVbAtHalf_HMVbAtHalfPelPosition:
  ; Future reference is at half pel both ways.  Each future prediction pel is
  ; the rounded average of a 2x2 group of reference pels:
  ;   F0x = (f0x + f0(x+1) + f1x + f1(x+1) + 2)/4.
  ; Horizontal sums h = fx + f(x+1) + 1 are formed per line as in the
  ; full/half case (shifted copy plus Pel_Rnd entry).  Two line sums are then
  ; combined as h0/2 + h1/2 + (h0 & h1 & 1): the last term restores the carry
  ; that the two separate halvings drop, giving (h0+h1)/2 without overflowing
  ; a byte.  A final halving yields the 4-pel average.  The horizontal sum of
  ; line 2 is carried in mm4 into the next pass, where it serves as line 0.
  ; The blend with the past reference and the subtraction from the target are
  ; as in the full/full case.  Each pass handles two lines; the loop runs
  ; four passes.
  ; NOTE(review): the byte-wide adds wrap unless the sums fit in 8 bits --
  ; presumably pels are 7-bit here; confirm.
  1053. StackOffset TEXTEQU <136>
  1054. CONST_384 TEXTEQU <384>
  1055. add ecx,ebp ; Address of future reference block.
  1056. mov ebp,PastRefPitchDiv4 ; Pitch/4 in low bits; high 2 bits serve as pass counter.
  1057. xor eax,eax ; Upper 24 bits stay zero; al is reused for pel 0 and weight indices.
  1058. lea edx,Pel_Rnd ; Base of the 8-byte-per-entry rounding table.
  1059. movq mm4,[ecx+1] ; 0a: <f08 f07 f06 f05 f04 f03 f02 f01>
  1060. pcmpeqb mm7,mm7 ; 8 bytes of 0xFF
  1061. mov al,[ecx] ; 0b: f00
  1062. movq mm0,mm4 ; 0c: <f08 f07 f06 f05 f04 f03 f02 f01>
  1063. paddb mm7,mm7 ; 8 bytes of 0xFE.
  1064. psllq mm0,8 ; 0d: <f07 f06 f05 f04 f03 f02 f01 0>
  1065. paddb mm4,[edx+eax*8] ; 0e: <f08+1 f07+1 ... f01+f00+1>
  1066. movq mm6,mm7 ; 8 bytes of 0xFE.
  1067. @@:
  1068. StackOffset TEXTEQU <undefined>
  1069. movq mm1,[ecx+PITCH+1] ; 1a: <f18 f17 f16 f15 f14 f13 f12 f11>
  1070. paddb mm0,mm4 ; 0f: <f08+f07+1 ... f01+f00+1>
  1071. mov al,[ecx+PITCH] ; 1b: f10
  1072. movq mm3,mm1 ; 1c: <f18 f17 f16 f15 f14 f13 f12 f11>
  1073. movq mm2,[ecx+PITCH*2+1] ; 2a: <f28 f27 f26 f25 f24 f23 f22 f21>
  1074. psllq mm3,8 ; 1d: <f17 f16 f15 f14 f13 f12 f11 0>
  1075. paddb mm1,[edx+eax*8] ; 1e: <f18+1 f17+1 ... f11+f10+1>
  1076. movq mm4,mm2 ; 2c: <f28 f27 f26 f25 f24 f23 f22 f21>
  1077. mov al,[ecx+PITCH*2] ; 2b: f20
  1078. paddb mm1,mm3 ; 1f: <f18+f17+1 ... f11+f10+1>
  1079. pandn mm6,mm1 ; 0g: <(f18+f17+1)&1 ...>  (mm6 was 0xFE bytes, so ~mm6 keeps bit 0 only)
  1080. psllq mm4,8 ; 2d: <f27 f26 f25 f24 f23 f22 f21 0>
  1081. paddb mm2,[edx+eax*8] ; 2e: <f28+1 f27+1 ... f21+f20+1>
  1082. movq mm5,mm6 ; 1g: <(f18+f17+1)&1 ...>
  1083. paddb mm2,mm4 ; 2f: <f28+f27+1 ... f21+f20+1>
  1084. pand mm6,mm0 ; 0h: <(f18+f17+1)&(f08+f07+1)&1 ...>
  1085. pand mm5,mm2 ; 1h: <(f18+f17+1)&(f28+f27+1)&1 ...>
  1086. pand mm0,mm7 ; 0i: pre-clean for divide
  1087. pand mm1,mm7 ; 1i: pre-clean for divide
  1088. psrlq mm0,1 ; 0j: <(f08+f07+1)/2 ...>
  1089. movq mm4,mm2 ; Save line 2 for next iter's line 0.
  1090. psrlq mm1,1 ; 1j: <(f18+f17+1)/2 ...>
  1091. pand mm2,mm7 ; 2i: pre-clean for divide
  1092. paddb mm0,mm1 ; 0k: <(f08+f07+1)/2+(f18+f17+1)/2 ...>
  1093. paddb mm0,mm6 ; 0l: <(f08+f07+f18+f17+2)/2 ...>
  1094. psrlq mm2,1 ; 2j: <(f28+f27+1)/2 ...>
  1095. paddb mm1,mm2 ; 1k: <(f18+f17+1)/2+(f28+f27+1)/2 ...>
  1096. pand mm0,mm7 ; 0m: pre-clean for divide
  1097. mov al,[ebx] ; 0A: Index of weights for line 0.
  1098. paddb mm1,mm5 ; 1l: <(f18+f17+f28+f27+2)/2 ...>
  1099. movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
  1100. pand mm1,mm7 ; 1m: pre-clean for divide
  1101. movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
  1102. psrlq mm0,1 ; 0B: <F07 = (f08+f07+f18+f17+2)/4 ...>
  1103. mov al,[ebx+1] ; 1A: Index of weights for line 1.
  1104. psrlq mm1,1 ; 1B: <F17 = (f18+f17+f28+f27+2)/4 ...>
  1105. pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
  1106. pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
  1107. movq mm5,FutureWt_FF_or_00[eax]; 1C: <In?FF:00 ...>
  1108. paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
  1109. movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>  (*4 shifts the counter bits out)
  1110. paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
  1111. pand mm0,mm7 ; 0I: pre-clean
  1112. pand mm1,mm5 ; 1E: <In?F17:00 In?F16:00 ...>
  1113. pandn mm5,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
  1114. paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
  1115. movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
  1116. psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
  1117. psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
  1118. add esp,32 ; Advance Pel Difference cursor
  1119. paddb mm1,mm5 ; 1H: <In?F17+P17:2P17 ...>
  1120. add ecx,PITCH*2 ; Advance Future Ref Blk cursor
  1121. movq mm5,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
  1122. pand mm1,mm7 ; 1I: pre-clean
  1123. add edi,PITCH*2 ; Advance Target Blk cursor
  1124. psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
  1125. StackOffset TEXTEQU <8+96>
  1126. movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
  1127. StackOffset TEXTEQU <undefined>
  1128. psubb mm5,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
  1129. add ebx,2 ; Advance list ptr for ref weights.
  1130. lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
  1131. StackOffset TEXTEQU <8+96>
  1132. movq PelDiffs+16,mm5 ; 1M: Save pel differences for line 1.
  1133. StackOffset TEXTEQU <undefined>
  1134. pxor mm0,mm0 ; So that add of mm4 is just like movq.
  1135. add ebp,080000000H ; Toggle bit 31; it carries out on every second pass.
  1136. movq mm6,mm7 ; 8 bytes of 0xFE.  (movq leaves the carry from the add intact)
  1137. jnc @b ; No carry: odd pass done, go do the even one.
  1138. add ebp,040000000H ; Count a completed pair of passes in bit 30 (rolls into bit 31).
  1139. test ebp,ebp ; SF == 1 iff two pairs of passes are done.
  1140. jns @b ; Only one pair done: go do passes 3 and 4.
  1141. StackOffset TEXTEQU <8>
  1142. mov ebp,16 ; NOTE(review): presumably the PelDiffs row pitch for the DCT (rows were stored 16 apart) -- confirm.
  1143. lea esi,PelDiffs ; esi = start of the pel differences just written.
  1144. mov edx,BlockActionDescrCursor ; edx = block descriptor saved during set-up.
  1145. jmp MMxDoForwardDCT
  1146. CONST_384 TEXTEQU <ebp> ; Rebind CONST_384 to the register ebp; it was the literal 384 throughout the code above.
  1147. END