Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

468 lines
18 KiB

  1. ;-------------------------------------------------------------------------
  2. ; INTEL Corporation Proprietary Information
  3. ;
  4. ; This listing is supplied under the terms of a license
  5. ; agreement with INTEL Corporation and may not be copied
  6. ; nor disclosed except in accordance with the terms of
  7. ; that agreement.
  8. ;
  9. ; Copyright (c) 1996 Intel Corporation.
  10. ; All Rights Reserved.
  11. ;
  12. ;-------------------------------------------------------------------------
  13. ;-------------------------------------------------------------------------
  14. ;// $Header: S:\h26x\src\dec\cx51282.asv
  15. ;//
  16. ;// $Log: S:\h26x\src\dec\cxm1282.asv $
  17. ;//
  18. ;// Rev 1.7 14 Jun 1996 16:30:00 AGUPTA2
  19. ;// Cosmetic changes to adhere to common coding convention.
  20. ;//
  21. ;// Rev 1.6 13 May 1996 11:03:42 AGUPTA2
  22. ;// Final drop from IDC.
  23. ;//
  24. ;// Rev 1.3 02 Apr 1996 16:30:54 RMCKENZX
  25. ;// Corrected two bugs in set-up.
  26. ;//
  27. ;// Rev 1.1 20 Mar 1996 11:19:28 RMCKENZX
  28. ;// March 96 version.
  29. ;//
  30. ;// Rev 1.2 05 Feb 1996 11:45:02 vladip
  31. ;// initial mmx almost optimized version
  32. ;//
  33. ;// Rev 1.1 29 Jan 1996 18:53:38 vladip
  34. ;//
  35. ;// IFDEF TIMING is added
  36. ;//
  37. ;// Rev 1.0 29 Jan 1996 17:28:08 vladip
  38. ;// Initial revision.
  39. ;//
  40. ;// Rev 1.2 03 Nov 1995 14:39:42 BNICKERS
  41. ;// Support YUV12 to CLUT8 zoom by 2.
  42. ;//
  43. ;// Rev 1.1 26 Oct 1995 09:46:10 BNICKERS
  44. ;// Reduce the number of blanks in the "proc" statement because the assembler
  45. ;// sometimes has problems with statements longer than 512 characters long.
  46. ;//
  47. ;// Rev 1.0 25 Oct 1995 17:59:22 BNICKERS
  48. ;// Initial revision.
  49. ;-------------------------------------------------------------------------
  50. ;
  51. ; +---------- Color convertor.
  52. ; |+--------- For both H261 and H263.
  53. ; ||+-------- MMx Version.
  54. ; |||++------ Convert from YUV12.
  55. ; |||||+----- Convert to CLUT8.
  56. ; ||||||+---- Zoom by two.
  57. ; |||||||
  58. ; cxm1282 -- This function performs YUV12 to CLUT8 zoom-by-2 color conversion
  59. ; for H26x. It dithers among 9 chroma points and 26 luma
  60. ; points, mapping the 8 bit luma pels into the 26 luma points by
  61. ; clamping the ends and stepping the luma by 8.
  62. ;
  63. ; 1. The color convertor is destructive; the input Y, U, and V
  64. ; planes will be clobbered. The Y plane MUST be preceded by
  65. ; 1544 bytes of space for scratch work.
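  66. ; 2. The U and V planes must each be preceded by 4 readable bytes: on the last inner-loop pass of a row the look-ahead chroma load reads the 4 bytes just before the row start (read only).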
  67. ;
  ; Assembler set-up for this module: case-sensitive symbols, hand-written
  ; prologue, macro-driven epilogue, Pentium instruction set, and the two
  ; segments (MMXCODE1 / MMXDATA1) that the rest of the file re-opens.
  68. OPTION CASEMAP:NONE ; keep identifier case as written (symbols are case-sensitive)
  69. OPTION PROLOGUE:None ; no assembler-generated prologue; the PROC in this file saves registers by hand
  70. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro ; `ret` inside a PROC expands through this macro (not defined in this file; presumably supplied by an include -- confirm)
  71. .586 ; enable Pentium-level instructions
  72. .xlist ; suppress listing output while the includes are read
  73. include iammx.inc
  74. include memmodel.inc
  75. .list ; resume listing output
  76. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE'
  77. MMXCODE1 ENDS
  78. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
  79. MMXDATA1 ENDS
  80. ;------------------------------------------------------------
  81. PQ equ PD ; PD is not defined in this file (presumably from an include); PQ is not referenced in the code visible here
  82. ;------------------------------------------------------------
  83. ;=============================================================================
  ; Constant tables used by the converter.  Nothing is allocated here: every
  ; symbol is either an external reference or an EQU alias of one.
  ; Although declared DWORD, these symbols are used as memory operands of MMX
  ; instructions in MMX_YUV12ToCLUT8ZoomBy2 (e.g. "psubb mm3, convert_to_sign").
  ; NOTE(review): that implies each symbol labels at least 8 bytes of data in
  ; its defining module -- confirm against cxm1281.asm.
  84. MMXDATA1 SEGMENT
  85. ALIGN 8
  86. EXTRN convert_to_sign : DWORD ; Defined in cxm1281.asm
  87. EXTRN V2_U0low_bound : DWORD
  88. EXTRN V2_U0high_bound : DWORD
  89. EXTRN U2_V0low_bound : DWORD
  90. EXTRN U2_V0high_bound : DWORD
  91. EXTRN U_low_value : DWORD
  92. EXTRN V_low_value : DWORD
  93. EXTRN Y0_low : DWORD
  94. EXTRN Y1_low : DWORD
  95. EXTRN clean_MSB_mask : DWORD
  96. EXTRN saturate_to_Y_high: DWORD
  97. EXTRN return_from_Y_high: DWORD
  ; Per-output-line luma corrections are aliases into Y0_low / Y1_low:
  ; lines 2 and 3 use the tables at offset 0, lines 0 and 1 use offset +8.
  ; NOTE(review): the +8 aliases assume Y0_low and Y1_low are each followed by
  ; a second 8-byte entry -- confirm in the defining module.
  98. Y0_correct EQU Y1_low+8
  99. Y1_correct EQU Y0_low+8
  100. Y2_correct EQU Y1_low
  101. Y3_correct EQU Y0_low
  ; The "high" chroma values are the same addresses as the "low" ones, so the
  ; low-bound and high-bound compare results are masked with the same constant.
  102. U_high_value EQU U_low_value
  103. V_high_value EQU V_low_value
  104. MMXDATA1 ENDS
  105. MMXCODE1 SEGMENT
  106. MMX_YUV12ToCLUT8ZoomBy2 PROC DIST LANG PUBLIC,
  107. AYPlane: DWORD,
  108. AVPlane: DWORD,
  109. AUPlane: DWORD,
  110. AFrameWidth: DWORD,
  111. AFrameHeight: DWORD,
  112. AYPitch: DWORD,
  113. AVPitch: DWORD,
  114. AAspectAdjustmentCnt: DWORD,
  115. AColorConvertedFrame: DWORD,
  116. ADCIOffset: DWORD,
  117. ACCOffsetToLine0: DWORD,
  118. ACCOPitch: DWORD,
  119. ACCType: DWORD
  ;----------------------------------------------------------------------
  ; MMX_YUV12ToCLUT8ZoomBy2
  ;
  ; Converts a YUV12 frame to CLUT8 while doubling it in both directions.
  ;
  ; Outer loop (NextFourLines): consumes two Y rows (esi = even row,
  ;   edi = odd row) and one U row / one V row, and writes four output
  ;   rows (eax = Output0, CCOLine1, CCOLine2, CCOLine3).
  ; Inner loop (prepare_next4x8): per pass loads 8 bytes from each Y row
  ;   and 4 bytes each of U and V, and stores 16 bytes to each of the
  ;   four output rows.  ebx is the chroma byte offset; it starts at
  ;   FrameWidth/2 - 4 and steps down by 4 until the subtraction borrows.
  ;
  ; Arguments are read through ebp (argument_base = ebp + 16, i.e. past
  ; the four pushed registers; the first argument is at argument_base+4)
  ; and only during set-up, because ebp is reused as a data pointer inside
  ; the loops.  The single chroma pitch (AVPitch) advances the V cursor;
  ; the U cursor is always derived as V cursor + DistanceFromVToU.
  ; CCType is given a name below but is not read anywhere in this routine.
  ;
  ; Stack: esi, edi, ebp, ebx are pushed; esp is then lowered by
  ; LocalFrameSize and rounded down to a 32-byte boundary.  The pre-
  ; alignment esp is kept in StashESP and restored at "done".
  ;
  ; NOTE(review): both loops are bottom-tested, so one 4-line / one
  ;   8-pel pass always executes before FrameHeight / FrameWidth are checked.
  ; NOTE(review): no EMMS is issued in this routine before returning;
  ;   presumably the caller (or the epilogue macro) does so -- confirm.
  ; NOTE(review): lines holding only ";" look like markers for unpaired
  ;   issue slots in the hand scheduling -- instruction order is
  ;   presumably performance-sensitive; do not reorder casually.
  ;----------------------------------------------------------------------
  120. LocalFrameSize = 56
  121. RegisterStorageSize = 16
  122. argument_base EQU ebp + RegisterStorageSize
  123. local_base EQU esp
  124. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  125. ; Arguments (valid only while ebp still holds the post-push esp):
  126. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  127. YPlane EQU argument_base + 4
  128. VPlane EQU argument_base + 8
  129. UPlane EQU argument_base + 12
  130. FrameWidth EQU argument_base + 16
  131. FrameHeight EQU argument_base + 20
  132. YPitch EQU argument_base + 24
  133. ChromaPitch EQU argument_base + 28
  134. AspectAdjustmentCount EQU argument_base + 32
  135. ColorConvertedFrame EQU argument_base + 36
  136. DCIOffset EQU argument_base + 40
  137. CCOffsetToLine0 EQU argument_base + 44
  138. CCOPitch EQU argument_base + 48
  139. CCType EQU argument_base + 52
  140. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  141. ; Locals (on local stack frame)
  142. ; (local_base is aligned at cache-line boundary in the prologue)
  143. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  144. localVPlane EQU local_base + 0
  145. localFrameWidth EQU local_base + 4
  146. localYPitch EQU local_base + 8
  147. localChromaPitch EQU local_base + 12
  148. localAspectAdjustmentCount EQU local_base + 16
  149. localCCOPitch EQU local_base + 20
  150. CCOCursor EQU local_base + 24
  151. YLimit EQU local_base + 28
  152. DistanceFromVToU EQU local_base + 32
  153. AspectCount EQU local_base + 36
  154. CCOLine1 EQU local_base + 40
  155. CCOLine2 EQU local_base + 44
  156. CCOLine3 EQU local_base + 48
  157. StashESP EQU local_base + 52
  ; Hand-written prologue: save registers, build the aligned local frame.
  158. push esi
  159. push edi
  160. push ebp
  161. push ebx
  162. mov ebp, esp
  163. sub esp, LocalFrameSize
  164. and esp, -32 ; align at cache line boundary
  165. mov [StashESP], ebp ; remember the unaligned esp for the epilogue
  166. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  167. ; Save some parameters on local stack frame
  168. ; (copied so they stay reachable through esp once ebp is reused)
  169. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  170. mov ebx, [VPlane]
  171. ;
  172. mov [localVPlane], ebx
  173. mov ebx, [FrameWidth]
  174. mov [localFrameWidth], ebx
  175. mov ebx, [YPitch]
  176. mov [localYPitch], ebx
  177. mov ebx, [ChromaPitch]
  178. mov [localChromaPitch], ebx
  179. mov ebx, [AspectAdjustmentCount]
  180. mov [localAspectAdjustmentCount], ebx
  181. mov ebx, [CCOPitch]
  182. mov [localCCOPitch], ebx
  183. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  184. ; Set-up rest of the local stack frame
  185. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  186. mov ebx, [localVPlane]
  187. mov ecx, [UPlane]
  188. sub ecx, ebx ; ecx = UPlane - VPlane
  189. mov eax, [ColorConvertedFrame]
  190. mov [DistanceFromVToU], ecx
  191. ;
  192. add eax, [DCIOffset]
  193. ;
  194. add eax, [CCOffsetToLine0] ; eax = address of output line 0
  195. ;
  196. mov [CCOCursor], eax
  197. mov edx, [FrameHeight]
  198. mov ecx, [localYPitch]
  199. ;
  200. imul edx, ecx ; edx = FrameHeight * YPitch
  201. ;
  202. mov edi, [localCCOPitch] ; NOTE(review): edi is overwritten below before it is read
  203. mov esi, [YPlane] ; Fetch cursor over luma plane.
  204. mov [CCOCursor], eax ; NOTE(review): repeats the store above (eax unchanged since)
  205. add edx, esi
  206. mov [YLimit], edx ; YLimit = YPlane + FrameHeight * YPitch
  207. mov edx, [localAspectAdjustmentCount]
  208. mov [AspectCount], edx
  209. mov edi, esi
  210. mov ebx, [localFrameWidth]
  211. mov eax, [CCOCursor] ; CCOLine0 (eax already holds this value)
  212. sar ebx, 1
  213. sub ebx, 4 ; counter starts from maxvalue-4, and in last iteration it equals 0
  214. mov ecx, eax
  215. ;
  216. add edi, [localYPitch] ; edi = odd Y line cursor
  217. ;
  218. add ecx, [localCCOPitch]
  219. mov [localFrameWidth], ebx ; localFrameWidth now holds the initial inner-loop counter
  220. mov [CCOLine1], ecx
  221. mov ebx, [localCCOPitch]
  222. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  223. ; in each outer loop iteration, 4 lines of output are done.
  224. ; in each inner loop iteration block 4x16 of output is done.
  225. ; main task of outer loop is to prepare pointers for inner loop
  226. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  227. ; Arguments should not be referenced beyond this point
  228. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  229. NextFourLines:
  230. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  231. ; eax : CCOLine0
  232. ; ebx : CCOPitch
  233. ; ecx : CCOLine1
  234. ; edx : available
  235. ; esi : Cursor over even Y line
  236. ; edi : Cursor over odd Y line
  237. ; ebp : available
  238. ; prepare output pointers : CCOLine1, CCOLine2, CCOLine3
  ; AspectCount is decremented by 2 twice per pass.  Each time it is not
  ; greater than zero it is reloaded by adding localAspectAdjustmentCount
  ; and the next output row pointer is made equal to the previous one, so
  ; that row is written on top of its predecessor.
  239. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  240. mov ebp, [AspectCount]
  241. ;
  242. sub ebp, 2
  243. jg continue1 ; jump if it still>0
  244. add ebp, [localAspectAdjustmentCount]
  245. mov ecx, eax ; Output1 will overwrite Output0 line
  246. mov [CCOLine1], ecx
  247. continue1:
  248. lea edx, [ecx+ebx] ; CCOLine2
  249. sub ebp, 2
  250. mov [CCOLine2], edx ; (mov leaves the flags from the sub above intact)
  251. jg continue2 ; jump if it still>0
  252. add ebp, [localAspectAdjustmentCount]
  253. xor ebx, ebx ; Output3 will overwrite Output2 line
  254. continue2:
  255. mov [AspectCount], ebp
  256. lea ebp, [edx+ebx] ; CCOLine3 = CCOLine2 + CCOPitch (or + 0 when collapsed)
  257. mov [CCOLine3], ebp
  258. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  259. ; Inner loop does 4x16 block of output points (2x8 of input points)
  260. ; Register Usage
  261. ; eax : cursor over Output
  262. ; ebx : counter
  263. ; ecx : cursor over Output1,2,3
  264. ; edx : Cursor over V line
  265. ; esi : Cursor over even Y line
  266. ; edi : Cursor over odd Y line
  267. ; ebp : Cursor over U line.
  ; The first U/V load and the first unpack of U are done here, ahead of the
  ; loop label; the loop tail performs the same two loads and unpack for the
  ; following pass.
  268. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  269. mov ebp, [localVPlane]
  270. mov ebx, [localFrameWidth]
  271. mov edx, ebp
  272. add ebp, [DistanceFromVToU] ; Cursor over U line.
  273. movdt mm3, [ebp+ebx] ; read 4 U points
  274. ;
  275. movdt mm2, [edx+ebx] ; read 4 V points
  276. punpcklbw mm3, mm3 ; u3:u3:u2:u2|u1:u1:u0:u0
  277. prepare_next4x8:
  ; FIRST uv-QWORD: chroma points 0 and 1, luma pels 0..3 of both Y rows.
  278. psubb mm3, convert_to_sign
  279. punpcklbw mm2, mm2 ; v3:v3:v2:v2|v1:v1:v0:v0
  280. psubb mm2, convert_to_sign
  281. movq mm4, mm3 ; keep all four U points for the second half
  282. movdt mm7, [esi+2*ebx] ; read even Y line
  283. punpcklwd mm3, mm3 ; u1:u1:u1:u1|u0:u0:u0:u0
  284. mov ecx, [CCOLine1]
  285. movq mm1, mm3
  286. pcmpgtb mm3, V2_U0low_bound
  287. punpcklbw mm7, mm7 ; y3:y3:y2:y2|y1:y1:y0:y0
  288. pand mm3, U_low_value
  289. movq mm5, mm7
  290. psubusb mm7, Y0_correct
  291. movq mm6, mm2 ; keep all four V points for the second half
  292. pcmpgtb mm1, V2_U0high_bound
  293. punpcklwd mm2, mm2 ; v1:v1:v1:v1|v0:v0:v0:v0
  294. pand mm1, U_high_value
  295. psrlq mm7, 3
  296. pand mm7, clean_MSB_mask
  297. movq mm0, mm2
  298. pcmpgtb mm2, U2_V0low_bound
  299. ;
  300. pcmpgtb mm0, U2_V0high_bound
  301. paddb mm3, mm1
  302. pand mm2, V_low_value
  303. pand mm0, V_high_value
  304. paddusb mm7, saturate_to_Y_high
  305. paddb mm3, mm2
  306. psubusb mm7, return_from_Y_high ; Y impact on line0
  307. paddd mm3, mm0 ; common U,V impact on line 0
  308. psubusb mm5, Y1_correct
  309. paddb mm7, mm3 ; final value of line 0
  310. movq mm0, mm3 ; u31:u21:u11:u01|u30:u20:u10:u00
  311. psrlq mm5, 3
  312. pand mm5, clean_MSB_mask
  313. psrld mm0, 16 ; : :u31:u21| : :u30:u20
  314. paddusb mm5, saturate_to_Y_high
  315. pslld mm3, 16 ; u11:u01: : |u10:u00: :
  316. psubusb mm5, return_from_Y_high ; Y impact on line 1
  317. por mm0, mm3 ; u11:u01:u31:u21|u10:u00:u30:u20
  318. movdt mm3, [edi+2*ebx] ; odd Y line
  319. paddb mm5, mm0 ; final value of line 1
  320. punpcklbw mm3, mm3 ; y3:y3:y2:y2|y1:y1:y0:y0
  321. movq mm2, mm0 ; u11:u01:u31:u21|u10:u00:u30:u20
  322. movq [ecx+4*ebx], mm5 ; write Output1 line
  323. movq mm1, mm3
  324. movq [eax+4*ebx], mm7 ; write Output0 line
  325. psrlw mm0, 8 ; :u11: :u31| :u10: :u30
  326. psubusb mm1, Y3_correct
  327. psllw mm2, 8 ; u01: :u21: |u00: :u20:
  328. psubusb mm3, Y2_correct
  329. psrlq mm1, 3
  330. pand mm1, clean_MSB_mask
  331. por mm0, mm2 ; u01:u11:u21:u31|u00:u10:u20:u30
  332. paddusb mm1, saturate_to_Y_high
  333. psrlq mm3, 3
  334. psubusb mm1, return_from_Y_high ; Y impact on line 3
  335. movq mm5, mm0 ; u01:u11:u21:u31|u00:u10:u20:u30
  336. pand mm3, clean_MSB_mask
  337. paddb mm1, mm0 ; final value of line 3
  338. paddusb mm3, saturate_to_Y_high
  339. psrld mm5, 16
  340. psubusb mm3, return_from_Y_high ; Y impact on line 2
  341. pslld mm0, 16
  342. mov ecx, [CCOLine3]
  343. por mm5, mm0 ; u21:u31:u01:u11|u20:u30:u00:u10
  344. movdt mm2, [esi+2*ebx+4] ; read next 4 pels of the even Y line
  345. paddb mm5, mm3 ; final value of line 2
  346. movq [ecx+4*ebx], mm1 ; write Output3 line
  347. punpckhwd mm4, mm4 ; u3:u3:u3:u3|u2:u2:u2:u2
  348. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  349. ; start next 4x8 block of output
  350. ; SECOND uv-QWORD (chroma points 2 and 3, luma pels 4..7)
  351. ; mm6, mm4 are live
  352. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  353. mov ecx, [CCOLine2]
  354. movq mm3, mm4
  355. pcmpgtb mm4, V2_U0low_bound
  356. punpckhwd mm6,mm6
  357. movq [ecx+4*ebx], mm5 ; write Output2 line
  358. movq mm7, mm6
  359. pand mm4, U_low_value
  360. punpcklbw mm2, mm2 ; y3:y3:y2:y2|y1:y1:y0:y0
  361. pcmpgtb mm3, V2_U0high_bound
  362. movq mm5, mm2
  363. pand mm3, U_high_value
  364. ;
  365. pcmpgtb mm6, U2_V0low_bound
  366. paddb mm4, mm3
  367. pand mm6, V_low_value
  368. ;
  369. pcmpgtb mm7, U2_V0high_bound
  370. paddb mm4, mm6
  371. pand mm7, V_high_value
  372. ;
  373. psubusb mm2, Y0_correct
  374. paddd mm4, mm7
  375. psubusb mm5, Y1_correct
  376. psrlq mm2, 3
  377. pand mm2, clean_MSB_mask
  378. movq mm3, mm4 ; u31:u21:u11:u01|u30:u20:u10:u00
  379. paddusb mm2, saturate_to_Y_high
  380. pslld mm3, 16 ; u11:u01: : |u10:u00: :
  381. psubusb mm2, return_from_Y_high
  382. psrlq mm5, 3
  383. pand mm5, clean_MSB_mask
  384. paddb mm2, mm4 ; mm4=u31:u21:u11:u01|u30:u20:u10:u00, where u stands for the united U and V impacts
  385. paddusb mm5, saturate_to_Y_high
  386. psrld mm4, 16 ; : :u31:u21| : :u30:u20
  387. psubusb mm5, return_from_Y_high
  388. por mm4, mm3 ; u11:u01:u31:u21|u10:u00:u30:u20
  389. paddb mm5, mm4
  390. mov ecx, [CCOLine1]
  391. movdt mm0, [edi+2*ebx+4] ; read next 4 pels of the odd Y line
  392. movq mm7, mm4 ; u11:u01:u31:u21|u10:u00:u30:u20
  393. movq [ecx+4*ebx+8], mm5 ; write Output1 line
  394. punpcklbw mm0, mm0 ; y3:y3:y2:y2|y1:y1:y0:y0
  395. movq [eax+4*ebx+8], mm2 ; write Output0 line
  396. movq mm1, mm0
  397. psubusb mm1, Y2_correct
  398. psrlw mm4, 8 ; :u11: :u31| :u10: :u30
  399. psubusb mm0, Y3_correct
  400. psrlq mm1, 3
  401. pand mm1, clean_MSB_mask
  402. psllw mm7, 8 ; u01: :u21: |u00: :u20:
  403. paddusb mm1, saturate_to_Y_high
  404. por mm4, mm7 ; u01:u11:u21:u31|u00:u10:u20:u30
  405. psubusb mm1, return_from_Y_high
  406. psrlq mm0, 3
  407. pand mm0, clean_MSB_mask
  408. movq mm5, mm4 ; u01:u11:u21:u31|u00:u10:u20:u30
  409. paddusb mm0, saturate_to_Y_high
  410. psrld mm5, 16
  411. psubusb mm0, return_from_Y_high
  412. ;
  413. paddb mm0, mm4
  414. mov ecx, [CCOLine3]
  415. movdt mm3, [ebp+ebx-4] ; read next 4 U points (on the last pass ebx = 0, so this reads 4 bytes before the U row start)
  416. pslld mm4, 16
  417. movq [ecx+4*ebx+8], mm0 ; write Output3 line
  418. por mm5, mm4 ; u21:u31:u01:u11|u20:u30:u00:u10
  419. paddb mm5, mm1
  420. mov ecx, [CCOLine2]
  421. movdt mm2, [edx+ebx-4] ; read next 4 V points (same look-behind as the U load on the last pass)
  422. punpcklbw mm3, mm3 ; u3:u3:u2:u2|u1:u1:u0:u0
  423. movq [ecx+4*ebx+8], mm5 ; write Output2 line
  424. ;
  425. sub ebx, 4
  426. jae prepare_next4x8 ; loop until the subtraction borrows (the pass with ebx = 0 is the last)
  427. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  428. ; Advance to the next pair of Y rows.  ebp is used as scratch for YPitch
  428. ; here; it does NOT point at the arguments, and none are referenced.
  429. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  430. mov ebx, [localCCOPitch] ; restore ebx = CCOPitch for the next outer pass
  431. mov ecx, [CCOLine3]
  432. mov ebp, [localYPitch]
  433. mov edx, [localVPlane]
  434. lea eax, [ecx+ebx] ; next Output0 = old Output3 + CCOPitch
  435. lea ecx, [ecx+2*ebx] ; next Output1 = old Output3 + 2* CCOPitch
  436. add edx, [localChromaPitch]
  437. mov [CCOLine1], ecx
  438. lea esi, [esi+2*ebp] ; even Y line cursor advances two Y rows
  439. lea edi, [edi+2*ebp] ; odd Y line cursor advances two Y rows
  440. mov [localVPlane], edx ; V row cursor advanced by one chroma pitch
  441. cmp esi, [YLimit]
  442. jb NextFourLines ; unsigned compare: continue while the even Y cursor is below YLimit
  443. done:
  ; Epilogue: restore the pre-alignment esp, then pop in reverse push order.
  444. mov esp, [StashESP]
  445. pop ebx
  446. pop ebp
  447. pop edi
  448. pop esi
  449. ret
  450. MMX_YUV12ToCLUT8ZoomBy2 ENDP
  451. MMXCODE1 ENDS
  452. END