Leaked source code of Windows Server 2003
;--------------------------------------------------------------------------;
; INTEL Corporation Proprietary Information
;
; This listing is supplied under the terms of a license
; agreement with INTEL Corporation and may not be copied
; nor disclosed except in accordance with the terms of
; that agreement.
;
; Copyright (c) 1996 Intel Corporation.
; All Rights Reserved.
;
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
;
; D3mBiMot.asm
;
; Description:
;   This module does bi-directional motion compensated prediction for
;   B frames.  It is called after forward prediction has been computed
;   and will average in the backward prediction for those pels where
;   the backward motion vector points inside of the referenced P frame.
;
; MMX Version
;
; Routines:                     prototypes in:
;   MMX_BiMotionComp            none
;
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
;
; $Header: S:\h26x\src\dec\d3mbimot.asv 1.2 01 Apr 1996 12:35:48 RMCKENZX $
; $Log: S:\h26x\src\dec\d3mbimot.asv $
;//
;// Rev 1.2   01 Apr 1996 12:35:48   RMCKENZX
;//
;// Added MMXCODE1 and MMXDATA1 segments, moved global data
;// to MMXDATA1 segment.
;//
;// Rev 1.1   14 Mar 1996 13:58:00   RMCKENZX
;//
;// Optimized routine for speed of execution.
;//
;// Rev 1.0   07 Mar 1996 18:36:36   RMCKENZX
;// Initial revision.
;
;--------------------------------------------------------------------------;
;--------------------------------------------------------------------------;
;
; Routine Name:
;   MMX_BiMotionComp(U32, U32, I32, I32, I32)
;
; Inputs -- C calling convention:
;   pPrev   flat pointer to prediction from previous P frame,
;           used for "forward" motion vector prediction.
;   pCurr   flat pointer into current P frame,
;           to be used for "backward" motion vector prediction.
;   mvx     x component of backward motion vector.
;   mvy     y component of backward motion vector.
;   iNum    block number.
;
; Returns:
;   updates the values pointed to by pPrev.
;
;--------------------------------------------------------------------------;
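;
; For orientation, a minimal C-side sketch of the call (the U32/I32
; typedefs and the caller shown here are assumptions for illustration;
; the real prototypes live elsewhere in the decoder, not in this file):
;
;   typedef unsigned int U32;
;   typedef int          I32;
;
;   void MMX_BiMotionComp(U32 pPrev, U32 pCurr, I32 mvx, I32 mvy, I32 iNum);
;
;   /* after the forward prediction has been written through pPrev: */
;   MMX_BiMotionComp((U32)pPrevBlock, (U32)pCurrBlock, bmvx, bmvy, blkNum);
;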
;
; Version:  .006
; Date:     14 March 1996
; Author:   R. McKenzie
;
;--------------------------------------------------------------------------;
.586
.MODEL FLAT
; make all symbols case sensitive
OPTION CASEMAP:NONE
.xlist
include iammx.inc
.list
MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
MMXCODE1 ENDS
MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
MMXDATA1 ENDS
;-------------------;
;     Stack Use     ;
;-------------------;
; register storage (rel to old stack ptr as saved in ebp)
;   esi              ebp+00
;   edi              ebp+04
;   ebp              ebp+08
;   ebx              ebp+12
;   return address   ebp+16
; C input parameters
pPrev         EQU ebp+20
pCurr         EQU ebp+24
mvx           EQU ebp+28
mvy           EQU ebp+32
iNum          EQU ebp+36
; local variables
uColEnd       EQU esp+00
uRowEnd       EQU esp+02
uColStart     EQU esp+04
uRowStart     EQU esp+06
mmxTempL      EQU esp+08
mmxTempH      EQU esp+16
PITCH         = 384
FRAMESIZE     = 32
MMXDATA1 SEGMENT
ALIGN 8
;                      End        Start
;                    Row  Col   Row  Col
;                     y    x     y    x
mmxFudge      DWORD 001e001eh, 00010001h
              DWORD 001e000eh, 0001fff1h
              DWORD 000e001eh, 0fff10001h
              DWORD 000e000eh, 0fff1fff1h
              DWORD 000e000eh, 00010001h
              DWORD 000e000eh, 00010001h
mmxClipT      DWORD 7ff87ff8h, 7ff77ff7h
mmxClipB      DWORD 7ff77ff7h, 7ff77ff7h
;                                             start
ColStartMask  DWORD 0ffffffffh, 0ffffffffh  ;   0
              DWORD 0ffffff00h, 0ffffffffh  ;   1
              DWORD 0ffff0000h, 0ffffffffh  ;   2
              DWORD 0ff000000h, 0ffffffffh  ;   3
              DWORD 00000000h,  0ffffffffh  ;   4
              DWORD 00000000h,  0ffffff00h  ;   5
              DWORD 00000000h,  0ffff0000h  ;   6
              DWORD 00000000h,  0ff000000h  ;   7  end
ColEndMask    DWORD 00000000h,  00000000h   ;   8   0
              DWORD 000000ffh,  00000000h   ;       1
              DWORD 0000ffffh,  00000000h   ;       2
              DWORD 00ffffffh,  00000000h   ;       3
              DWORD 0ffffffffh, 00000000h   ;       4
              DWORD 0ffffffffh, 000000ffh   ;       5
              DWORD 0ffffffffh, 0000ffffh   ;       6
              DWORD 0ffffffffh, 00ffffffh   ;       7
              DWORD 0ffffffffh, 0ffffffffh  ;       8
ShiftMask     DWORD 7f7f7f7fh, 7f7f7f7fh    ; used for byte shifts
BottomBitMask DWORD 01010101h, 01010101h    ; used for packed averages
Round1        DWORD 00010001h, 00010001h
MMXDATA1 ENDS
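;
; Note the two mask tables overlap on purpose: ColStartMask[8] is the same
; qword as ColEndMask[0].  A C sketch of the 64-bit masks they encode
; (illustration only; col_mask is a hypothetical helper, not decoder code):
;
;   #include <stdint.h>
;
;   /* ff in bytes s..e-1 of a qword: ColStartMask[s] AND ColEndMask[e] */
;   uint64_t col_mask(int s, int e)       /* 0 <= s <= 8,  0 <= e <= 8 */
;   {
;       uint64_t start = (s == 8) ? 0 : ~0ULL << (8 * s);
;       uint64_t end   = (e == 8) ? ~0ULL : ~(~0ULL << (8 * e));
;       return start & end;   /* the pand of mm7 with mm6 in Set Up below */
;   }
;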
;-------------------;
;      Set Up       ;
;-------------------;
MMXCODE1 SEGMENT
PUBLIC C MMX_BiMotionComp
MMX_BiMotionComp:
        push      ebx
        push      ebp
        push      edi
        push      esi
        mov       ebp, esp
        and       esp, -32              ; align the stack on a cache line
        sub       esp, FRAMESIZE        ; make room for locals
        mov       edi, [iNum]
        mov       esi, [pCurr]
;                                               start     end
        movd      mm1, [mvx]            ; mm1 = 0000 0000 .... .mvx
        movd      mm2, [mvy]            ; mm2 = 0000 0000 .... .mvy
        movq      mm0, [mmxFudge+8*edi]
        punpcklwd mm1, mm2              ; mm1 = .... .... .mvy .mvx
        movq      mm3, [mmxClipT]
        punpckldq mm1, mm1              ; mm1 = .mvy .mvx .mvy .mvx
        movq      mm4, [mmxClipB]
        psubw     mm0, mm1
        mov       edi, [pPrev]
        psraw     mm0, 1                ; mm0 = RowStart ColStart RowEnd ColEnd
        mov       ebx, [mvy]
        paddsw    mm0, mm3              ; clip at 8 or higher
        and       ebx, -2               ; 2*(mvy>>1)
        psubusw   mm0, mm4              ; clip at 0 or lower
        shl       ebx, 6                ; 128*(mvy>>1)
        mov       eax, [mvx]
        movq      [uColEnd], mm0
        sar       eax, 1                ; mvx>>1
        lea       ebx, [ebx+2*ebx]      ; PITCH*(mvy>>1)
        add       esi, ebx              ; pCurr += PITCH*(mvy>>1)
        xor       ecx, ecx
        add       esi, eax              ; pCurr += mvx>>1
        xor       edx, edx
        mov       cl, [uColStart]       ; uColStart
        mov       dl, [uColEnd]         ; uColEnd
        cmp       ecx, edx              ; iColCount = ColStart - ColEnd
        jge       hasta_la_vista_baby
        movq      mm6, ColStartMask[8*ecx]
        movq      mm7, ColEndMask[8*edx]
        pxor      mm4, mm4              ; mm4 = 0
        mov       cl, [uRowStart]       ; RowStart
        mov       dl, [uRowEnd]         ; RowEnd
        sub       edx, ecx              ; iRowCount = RowEnd - RowStart
        jle       hasta_la_vista_baby
        pand      mm7, mm6              ; mm7 = ff for those cols to use back pred.
        pxor      mm6, mm6
        shl       ecx, 7                ; 128*RowStart
        mov       eax, [mvx]
        movq      mm5, [ShiftMask]      ; mm5 = 7f 7f 7f 7f 7f 7f 7f 7f
        pcmpeqb   mm6, mm7              ; mm6 is the complement of mm7
        lea       ecx, [ecx+2*ecx]      ; PITCH*RowStart
        mov       ebx, [mvy]
        add       esi, ecx              ; pCurr += PITCH*RowStart
        add       edi, ecx              ; pPrev += PITCH*RowStart
        mov       ecx, PITCH
        and       eax, 1
        je        even_mvx
        and       ebx, 1
        je        odd_even
;
; mvx is odd (horizontal half pel motion)
; mvy is odd (vertical half pel motion)
;
odd_odd:
        movq      mm0, [esi+4]
        movq      mm1, mm0
        psrlq     mm0, 8
        movq      mm2, [esi]
        punpcklbw mm1, mm4
        movq      mm3, mm2
        punpcklbw mm0, mm4
        paddw     mm0, mm1
        psrlq     mm2, 8
        paddw     mm0, [Round1]
        punpcklbw mm3, mm4
        punpcklbw mm2, mm4
        add       esi, ecx
        movq      [mmxTempH], mm0
        paddw     mm2, mm3
        paddw     mm2, [Round1]
        sub       edi, ecx              ; pre-decrement destination pointer
        movq      [mmxTempL], mm2
;
; This loop is 2-folded and works on 2 results (rows) per pass.
; It finishes one result per iteration.
;
; Stage I
;   computes the partial sums of a row with a shifted copy of the row.
;   It stores the partial sums for the next iteration's Stage II.
; Stage II
;   reads the partial sums of the prior row and averages them with the
;   just computed (in Stage I) partial sums of the current row to get
;   the backward prediction.  These computations are done unpacked as
;   16-bit words.  A rounding factor is added to each partial sum before
;   storage.  Then Stage II averages the result (with truncation) with
;   the forward prediction.
;
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging of the forward with backward is done packed in 8-bit bytes by
; dividing both inputs by 2, adding them together, and then adding in an
; adjustment.  To average with truncation, the adjustment is 1 when BOTH
; inputs are odd.  Due to the absence of a byte shift instruction, divide
; by 2 is done by shifting the entire mmx register and then masking off
; (zeroing) bits 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
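;
; The byte-wise average-with-truncation described above, sketched in C
; (illustration only; avg_trunc is a hypothetical helper).  paddb has no
; inter-byte carry; here the masking keeps each byte sum below 256:
;
;   #include <stdint.h>
;
;   uint64_t avg_trunc(uint64_t fwd, uint64_t back)
;   {
;       uint64_t adj = fwd & back & 0x0101010101010101ULL;   /* BOTH odd */
;       fwd  = (fwd  >> 1) & 0x7f7f7f7f7f7f7f7fULL;  /* byte shift via mm5 */
;       back = (back >> 1) & 0x7f7f7f7f7f7f7f7fULL;
;       return fwd + back + adj;  /* per byte: <= 127 + 127 + 1, no carry */
;   }
;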
OddOddLoop:
        movq      mm1, [esi]            ; load left half
        movq      mm0, mm1              ; copy left half
        psrlq     mm1, 8                ; shift left over
        movq      mm3, [esi+4]          ; load right half
        punpcklbw mm0, mm4              ; unpack left half
        movq      mm2, mm3              ; copy right half
        punpcklbw mm1, mm4              ; unpack shifted left half
        paddw     mm1, mm0              ; add left side
        psrlq     mm3, 8                ; shift right over
        paddw     mm1, [Round1]         ; add in round to left
        punpcklbw mm2, mm4              ; unpack right half
        movq      mm0, [mmxTempL]       ; fetch prior row's left half
        punpcklbw mm3, mm4              ; unpack shifted right half
        movq      [mmxTempL], mm1       ; stash this row's left half
        paddw     mm3, mm2              ; add right side
        paddw     mm3, [Round1]         ; add in round to right
        paddw     mm0, mm1              ; sum current & prior lefts
        movq      mm2, [mmxTempH]       ; fetch prior row's right half
        psrlw     mm0, 2                ; divide left sum by four
        movq      [mmxTempH], mm3       ; stash this row's right half
        paddw     mm2, mm3              ; sum current & prior rights
        movq      mm1, [edi+ecx]        ; fetch forward prediction
        psrlw     mm2, 2                ; divide right sum by four
        packuswb  mm0, mm2              ; complete backward prediction
        movq      mm2, mm1              ; copy forward
        pand      mm0, mm7              ; mask off unused bytes
        pand      mm2, mm6              ; create replacement bytes
        por       mm0, mm2              ; new backward prediction
        movq      mm3, mm1              ; copy forward for adjustment
        pand      mm3, mm0              ; adjustment with truncation
        psrlq     mm0, 1                ; divide new backward by 2
        pand      mm0, mm5              ; clear extra bits
        psrlq     mm1, 1                ; divide forward by 2
        pand      mm3, [BottomBitMask]  ; complete adjustment
        pand      mm1, mm5              ; clear extra bits
        paddb     mm0, mm1              ; sum quotients
        add       edi, ecx              ; increment destination pointer
        paddb     mm0, mm3              ; add adjustment
        add       esi, ecx              ; increment source pointer
        movq      [edi], mm0            ; store result
                                        ; *** 1 cycle store penalty ***
        dec       edx                   ; decrement loop control
        jg        OddOddLoop            ; back up if not done
; wrap up and go home
        mov       esp, ebp
        pop       esi
        pop       edi
        pop       ebp
        pop       ebx
        ret
;
; mvx is odd (horizontal half pel motion)
; mvy is even (vertical full pel motion)
;
odd_even:
        sub       edi, ecx              ; pre-decrement destination pointer
;
; This loop is not folded and does 1 result (row) per pass.
;
; It loads the backward predicted row into mm0 and brings in the last
; (eighth) byte through al, which is or'd with the shifted row.  It
; completes the backward prediction (by averaging the row with its
; shifted copy, with round) and averages the result (with truncation)
; with the forward prediction.
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging is done by dividing both inputs by 2, adding them together,
; and then adding in an adjustment.
; To average with round, the adjustment is 1 when EITHER input is odd.
; To average with truncation, the adjustment is 1 when BOTH inputs are odd.
; Due to the absence of a byte shift instruction, divide by 2 is done
; by shifting the entire mmx register and then masking off (zeroing) bits
; 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
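;
; The companion average-with-round used on the two backward inputs (a row
; and its shifted copy here; two adjacent rows in even_odd below), sketched
; in C (illustration only; avg_round is a hypothetical helper -- the only
; difference from avg_trunc above is OR instead of AND for the adjustment):
;
;   #include <stdint.h>
;
;   uint64_t avg_round(uint64_t a, uint64_t b)
;   {
;       uint64_t adj = (a | b) & 0x0101010101010101ULL;    /* EITHER odd */
;       a = (a >> 1) & 0x7f7f7f7f7f7f7f7fULL;
;       b = (b >> 1) & 0x7f7f7f7f7f7f7f7fULL;
;       return a + b + adj;           /* rounds exact halves up, bytewise */
;   }
;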
OddEvenLoop:
        movq      mm0, [esi]            ; fetch backward predicted row
        mov       al, [esi+8]           ; fetch last byte
        movq      mm1, mm0              ; copy row
        movd      mm2, eax              ; last byte
        psrlq     mm0, 8                ; shift row right 1 byte
        movq      mm3, mm1              ; copy row for adjustment
        psllq     mm2, 56               ; move last byte to left end
        por       mm0, mm2              ; or in last byte on left
        psrlq     mm1, 1                ; divide row by 2
        por       mm3, mm0              ; averaging with rounding bit
        psrlq     mm0, 1                ; divide shifted row by 2
        pand      mm0, mm5              ; clear extra bits
        pand      mm1, mm5              ; clear extra bits
        pand      mm3, [BottomBitMask]  ; finish adjustment (with round)
        paddb     mm0, mm1              ; sum quotients
        movq      mm4, [edi+ecx]        ; fetch forward prediction
        paddb     mm3, mm0              ; add adjustment, got back pred.
        movq      mm2, mm4              ; copy forward
        pand      mm3, mm7              ; mask off unused bytes
        movq      mm1, mm4              ; copy forward
        pand      mm2, mm6              ; mask forward copy
        por       mm3, mm2              ; backward with forward replacing
        psrlq     mm4, 1                ; divide forward by 2
        pand      mm1, mm3              ; adjustment for truncation
        psrlq     mm3, 1                ; divide backwards by 2
        pand      mm3, mm5              ; clear extra bits
        pand      mm4, mm5              ; clear extra bits
        pand      mm1, [BottomBitMask]  ; finish adjustment (with truncation)
        paddb     mm4, mm3              ; sum quotients
        paddb     mm4, mm1              ; add adjustment, have result
        add       edi, ecx              ; increment destination pointer
        add       esi, ecx              ; increment source pointer
        dec       edx                   ; decrement loop control
        movq      [edi], mm4            ; save result
        jg        OddEvenLoop           ; loop when not done
; wrap up and go home
        mov       esp, ebp
        pop       esi
        pop       edi
        pop       ebp
        pop       ebx
        ret
;---------------------------;
;  mvx is even -- test mvy  ;
;---------------------------;
even_mvx:
        and       ebx, 1
        je        even_even
;
; mvx is even (horizontal full pel motion)
; mvy is odd (vertical half pel motion)
;
even_odd:
        movq      mm0, [esi]            ; 1: first row
        movq      mm1, [esi+ecx]        ; 1: second row
        movq      mm2, mm0              ; 1: copy for rounding
        por       mm2, mm1              ; 1: averaging with round
        sub       edi, ecx              ; pre-decrement destination pointer
        dec       edx                   ; note that edx is positive on entry
        jz        EvenOddPost
;
; This loop is 2-folded and works on 2 results (rows) per pass.
; It finishes one result per iteration.
; Stage I
;   loads both backward predicted rows into mm0 and mm1, copies the first
;   into mm2, and ors with the second for the rounding adjustment.
; Stage II
;   completes the backward prediction (by averaging the rows with round)
;   and averages the result (with truncation) with the forward prediction.
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging is done by dividing both inputs by 2, adding them together,
; and then adding in an adjustment (in mm2).
; To average with round, the adjustment is 1 when EITHER input is odd.
; To average with truncation, the adjustment is 1 when BOTH inputs are odd.
; Due to the absence of a byte shift instruction, divide by 2 is done
; by shifting the entire mmx register and then masking off (zeroing) bits
; 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
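;
; How the mm6/mm7 byte-select works, sketched in C (illustration only;
; select_back is a hypothetical helper).  mm7 holds ff in the bytes whose
; backward prediction is valid; mm6 is its pcmpeqb-built complement:
;
;   #include <stdint.h>
;
;   uint64_t select_back(uint64_t back, uint64_t fwd, uint64_t mask)
;   {
;       return (back & mask) | (fwd & ~mask); /* pand/pand/por in the loops */
;   }
;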
EvenOddLoop:
        psrlq     mm0, 1                ; 2: divide first row by 2
        add       edi, ecx              ; increment destination pointer
        psrlq     mm1, 1                ; 2: divide second row by 2
        pand      mm0, mm5              ; 2: clear extra bits
        pand      mm2, [BottomBitMask]  ; 2: rounding bits
        pand      mm1, mm5              ; 2: clear extra bits
        movq      mm3, [edi]            ; 2: fetch forward prediction
        paddb     mm1, mm0              ; 2: average backward rows
        paddb     mm1, mm2              ; 2: add in round
        movq      mm4, mm3              ; 2: copy for mask
        pand      mm1, mm7              ; 2: masked backward prediction
        pand      mm4, mm6              ; 2: masked forward prediction
        por       mm4, mm1              ; 2: adjusted backwards prediction
        movq      mm2, mm3              ; 2: copy for rounding
        pand      mm2, mm4              ; 2: averaging with truncation
        psrlq     mm4, 1                ; 2: divide backwards by 2
        psrlq     mm3, 1                ; 2: divide forwards by 2
        pand      mm4, mm5              ; 2: clear extra bits
        pand      mm2, [BottomBitMask]  ; 2: "no-round" bits
        pand      mm3, mm5              ; 2: clear extra bits
        movq      mm0, [esi+ecx]        ; 1: first row
        paddb     mm4, mm3              ; 2: average forward & backwards
        movq      mm1, [esi+2*ecx]      ; 1: second row
        paddb     mm4, mm2              ; 2: add in "no-round" bits
        movq      mm2, mm0              ; 1: copy for rounding
        add       esi, ecx              ; increment source pointer
        movq      [edi], mm4            ; 2: store resulting row
        por       mm2, mm1              ; 1: averaging with rounding bit
        dec       edx                   ; decrement loop count
        jg        EvenOddLoop           ; back up if not done
EvenOddPost:
        psrlq     mm0, 1                ; 2: divide first row by 2
        add       edi, ecx              ; increment destination pointer
        psrlq     mm1, 1                ; 2: divide second row by 2
        pand      mm0, mm5              ; 2: clear extra bits
        pand      mm2, [BottomBitMask]  ; 2: rounding bits
        pand      mm1, mm5              ; 2: clear extra bits
        movq      mm3, [edi]            ; 2: fetch forward prediction
        paddb     mm1, mm0              ; 2: average backward rows
        paddb     mm1, mm2              ; 2: add in round
        movq      mm4, mm3              ; 2: copy for mask
        pand      mm1, mm7              ; 2: masked backward prediction
        pand      mm4, mm6              ; 2: masked forward prediction
        por       mm4, mm1              ; 2: adjusted backwards prediction
        movq      mm2, mm3              ; 2: copy for rounding
        pand      mm2, mm4              ; 2: averaging with truncation
        psrlq     mm4, 1                ; 2: divide backwards by 2
        psrlq     mm3, 1                ; 2: divide forwards by 2
        pand      mm4, mm5              ; 2: clear extra bits
        pand      mm2, [BottomBitMask]  ; 2: "no-round" bits
        pand      mm3, mm5              ; 2: clear extra bits
        paddb     mm4, mm3              ; 2: average forward & backwards
        mov       esp, ebp
        paddb     mm4, mm2              ; 2: add in "no-round" bits
        mov       ecx, edi
        pop       esi
        pop       edi
        pop       ebp
        pop       ebx
        movq      [ecx], mm4            ; 2: store resulting row
        ret
;
; mvx is even (horizontal full pel motion)
; mvy is even (vertical full pel motion)
;
even_even:
        movq      mm1, [edi]            ; 1: forward prediction
        movq      mm0, [esi]            ; 1: backward prediction
        movq      mm2, mm1              ; 1: copy forward for mask
        pand      mm0, mm7              ; 1: mask off unused bytes
        sub       edi, ecx              ; pre-decrement destination pointer
        dec       edx                   ; note that edx is positive on entry
        jz        EvenEvenPost
;
; This loop is 2-folded and works on 2 results (rows) per pass.
; It finishes one result per iteration.
; Stage I
;   loads mm0 and mm1 with the predictions and begins the replacement
;   procedure for the forward prediction.
; Stage II
;   finishes the replacement procedure for the forward prediction and
;   averages that (with truncation) with the backwards prediction.
; Those bytes of the backwards prediction which are not to be used are
; replaced by the corresponding bytes of the forwards prediction prior
; to averaging (using the masks in registers mm6 and mm7).
;
; Averaging is done by dividing both inputs by 2, adding them together,
; and then adding in an adjustment (in mm2).
; To average with round, the adjustment is 1 when EITHER input is odd.
; To average with truncation, the adjustment is 1 when BOTH inputs are odd.
; Due to the absence of a byte shift instruction, divide by 2 is done
; by shifting the entire mmx register and then masking off (zeroing) bits
; 7, 15, ..., and 63 (the old low-order bits) using mm5.
;
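;
; In terms of the sketches above, each selected row of the even/even case
; reduces to (illustration only; all helpers are hypothetical):
;
;   uint64_t fwd  = load64(pPrev);  /* forward prediction (and destination) */
;   uint64_t back = load64(pCurr);  /* backward prediction                  */
;   store64(pPrev, avg_trunc(fwd, select_back(back, fwd, mask)));
;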
EvenEvenLoop:
        pand      mm2, mm6              ; 2: mask corresponding bytes
        add       edi, ecx              ; increment destination pointer
        por       mm0, mm2              ; 2: replace unused back with for.
        movq      mm3, mm1              ; 2: copy forward for adjustment
        pand      mm3, mm0              ; 2: adjustment for truncation
        psrlq     mm0, 1                ; 2: divide back by 2
        psrlq     mm1, 1                ; 2: divide forward by 2
        pand      mm0, mm5              ; 2: clear extra bits
        pand      mm3, [BottomBitMask]  ; 2: finish adjustment
        pand      mm1, mm5              ; 2: clear extra bits
        paddb     mm0, mm1              ; 2: sum quotients
        add       esi, ecx              ; increment source pointer
        movq      mm1, [edi+ecx]        ; 1: forward prediction
        paddb     mm3, mm0              ; 2: add in adjustment
        movq      mm0, [esi]            ; 1: backward prediction
        movq      mm2, mm1              ; 1: copy forward for mask
        movq      [edi], mm3            ; 2: store result
        pand      mm0, mm7              ; 1: mask off unused bytes
        dec       edx                   ; decrement loop control
        jg        EvenEvenLoop          ; loop back when not done
EvenEvenPost:
        pand      mm2, mm6              ; 2: mask corresponding bytes
        add       ecx, edi
        por       mm0, mm2              ; 2: replace unused back with for.
        movq      mm3, mm1              ; 2: copy forward for adjustment
        pand      mm3, mm0              ; 2: adjustment for truncation
        psrlq     mm0, 1                ; 2: divide back by 2
        psrlq     mm1, 1                ; 2: divide forward by 2
        pand      mm0, mm5              ; 2: clear extra bits
        pand      mm3, [BottomBitMask]  ; 2: finish adjustment
        pand      mm1, mm5              ; 2: clear extra bits
        paddb     mm0, mm1              ; 2: sum quotients
        mov       esp, ebp
        paddb     mm3, mm0              ; 2: add in adjustment
        nop
        pop       esi
        pop       edi
        pop       ebp
        pop       ebx
        movq      [ecx], mm3
        ret
;
; "Remember when I promised to kill you last?"
;
bye_bye:
hasta_la_vista_baby:
        mov       esp, ebp
        pop       esi
        pop       edi
        pop       ebp
        pop       ebx
        ret
MMXCODE1 ENDS
;         1111111111222222222233333333334444444444555555555566666666667777777
;234567890123456789012345678901234567890123456789012345678901234567890123456
;--------------------------------------------------------------------------;
END