Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

495 lines
17 KiB

  1. ;-------------------------------------------------------------------------
  2. ; INTEL Corporation Proprietary Information
  3. ;
  4. ; This listing is supplied under the terms of a license
  5. ; agreement with INTEL Corporation and may not be copied
  6. ; nor disclosed except in accordance with the terms of
  7. ; that agreement.
  8. ;
  9. ; Copyright (c) 1996 Intel Corporation.
  10. ; All Rights Reserved.
  11. ;
  12. ;-------------------------------------------------------------------------
  13. ;-------------------------------------------------------------------------
  14. ;// $Header: S:\h26x\src\dec\cx51281.asv
  15. ;//
  16. ;// $Log: S:\h26x\src\dec\cxm1281.asv $
  17. ;//
  18. ;// Rev 1.7 25 Jul 1996 13:47:58 AGUPTA2
  19. ;// Fixed blockiness problem; dither matrices were not created properly.
  20. ;//
  21. ;// Rev 1.6 14 Jun 1996 16:28:24 AGUPTA2
  22. ;// Cosmetic changes to adhere to common coding convention.
  23. ;//
  24. ;// Rev 1.5 13 May 1996 11:01:34 AGUPTA2
  25. ;// Final drop from IDC.
  26. ;//
  27. ;// Rev 1.1 20 Mar 1996 11:19:24 RMCKENZX
  28. ;// March 96 version.
  29. ;//
  30. ;// Rev 1.2 01 Feb 1996 10:45:58 vladip
  31. ;// Reduced number of locals, DataSegment changed to PARA
  32. ;//
  33. ;// Rev 1.1 29 Jan 1996 18:53:40 vladip
  34. ;//
  35. ;// IFDEF TIMING is added
  36. ;//
  37. ;// Rev 1.0 29 Jan 1996 17:28:06 vladip
  38. ;// Initial mmx version.
  39. ;//
  40. ;-------------------------------------------------------------------------
  41. ;
  42. ; +---------- Color convertor.
  43. ; |+--------- For both H261 and H263.
  44. ; ||+-------- MMx Version.
  45. ; |||++------ Convert from YUV12.
  46. ; |||||+----- Convert to CLUT8.
  47. ; ||||||+---- Zoom by one, i.e. non-zoom.
  48. ; |||||||
  49. ; cxm1281 -- This function performs YUV12 to CLUT8 color conversion for H26x.
  50. ; It dithers among 9 chroma points and 26 luma points, mapping the
  51. ; 8 bit luma pels into the 26 luma points by clamping the ends and
  52. ; stepping the luma by 8.
  53. ;
  54. ; Color convertor is not destructive.
  55. ; Requirement:
  56. ; U and V plane SHOULD be followed by 4 bytes (for read only)
  57. ; Y plane SHOULD be followed by 8 bytes (for read only)
  58. OPTION CASEMAP:NONE ; symbol names are case sensitive
  59. OPTION PROLOGUE:None ; no generated prologue: the PROC below pushes registers and builds its frame by hand
  60. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro ; 'ret' is expanded by this macro; not defined in this file (presumably in memmodel.inc -- TODO confirm)
  61. .586 ; enable the Pentium instruction set
  62. .xlist ; keep the include files out of the listing
  63. include iammx.inc ; presumably supplies the MMX mnemonics used below (movq, movdt, ...) -- TODO confirm
  64. include memmodel.inc ; presumably supplies DIST / LANG used on the PROC line -- TODO confirm
  65. .list ; listing back on
  66. MMXCODE1 SEGMENT PARA USE32 PUBLIC 'CODE' ; declare the code segment attributes once (paragraph aligned, 32-bit); reopened below
  67. MMXCODE1 ENDS
  68. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA' ; declare the data segment attributes once (paragraph aligned, 32-bit); reopened below
  69. MMXDATA1 ENDS
  70. MMXDATA1 SEGMENT
  71. ALIGN 8 ; qword-align the tables below; they are all read with movq
  72. PUBLIC Y0_low
  73. PUBLIC Y1_low
  74. PUBLIC U_low_value
  75. PUBLIC V_low_value
  76. PUBLIC U2_V0high_bound
  77. PUBLIC U2_V0low_bound
  78. PUBLIC V2_U0high_bound
  79. PUBLIC V2_U0low_bound
  80. PUBLIC return_from_Y_high
  81. PUBLIC saturate_to_Y_high
  82. PUBLIC clean_MSB_mask
  83. PUBLIC convert_to_sign
  84. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  85. ; U,V,Y impacts are calculated as follows:
  86. ; 0 U < 64h
  87. ; U impact 1ah 64h <= U < 84h
  88. ; 34h U >= 84h
  89. ; (U_low_value 1ah is added once for each bound exceeded, hence 1ah + 1ah = 34h)
  90. ; 0 V < 64h
  91. ; V impact 4eh 64h <= V < 84h
  92. ; 9ch V >= 84h
  93. ; (V_low_value 4eh is added once for each bound exceeded, hence 4eh + 4eh = 9ch)
  94. ; 0 Y < 1bh
  95. ; Y impact Y/8 1bh <= Y < ebh
  96. ; 19h Y >= ebh
  97. ; and the dither pattern is added to the input Y,U,V values and is a
  98. ; 4X4 matrix as defined below:
  99. ; U
  100. ; 10h 8 18h 0
  101. ; 18h 0 10h 8
  102. ; 8 10h 0 18h
  103. ; 0 18h 8 10h
  104. ; V
  105. ; 8 10h 0 18h
  106. ; 0 18h 8 10h
  107. ; 10h 8 18h 0
  108. ; 18h 0 10h 8
  109. ; Y
  110. ; 4 2 6 0
  111. ; 6 0 4 2
  112. ; 2 4 0 6
  113. ; 0 6 2 4
  114. ; Note the following equalities in dither matrices which will explain funny
  115. ; data declarations below:
  116. ; U0=V2
  117. ; U1=V3
  118. ; U2=V0
  119. ; U3=V1
  120. ; The code reads each table as label[eax] with eax = 0 or 8, so label[8] of a one-qword
  121. ; table is the first qword of the table that follows it. Do not reorder or pad these tables.
  122. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  123. V2_U0low_bound DWORD 0f3ebfbe3h, 0f3ebfbe3h ; 746c7c64746c7c64 - 8080808080808080 - 0101010101010101 (every *_bound byte is threshold-1: pcmpgtb tests >, not >=)
  124. U2_V0low_bound DWORD 0ebf3e3fbh, 0ebf3e3fbh, ; 6c74647c6c74647c - 8080808080808080 - 0101010101010101
  125. 0f3ebfbe3h, 0f3ebfbe3h ; 746c7c64746c7c64 - 8080808080808080 - 0101010101010101
  126. U3_V1low_bound DWORD 0e3fbebf3h, 0e3fbebf3h ; 647c6c74647c6c74 - 8080808080808080 - 0101010101010101
  127. V3_U1low_bound DWORD 0fbe3f3ebh, 0fbe3f3ebh, ; 7c64746c7c64746c - 8080808080808080 - 0101010101010101
  128. 0e3fbebf3h, 0e3fbebf3h ; 647c6c74647c6c74 - 8080808080808080 - 0101010101010101
  129. V2_U0high_bound DWORD 0130b1b03h, 0130b1b03h ; 948c9c84948c9c84 - 8080808080808080 - 0101010101010101
  130. U2_V0high_bound DWORD 00b13031bh, 00b13031bh, ; 8c94849c8c94849c - 8080808080808080 - 0101010101010101
  131. 0130b1b03h, 0130b1b03h ; 948c9c84948c9c84 - 8080808080808080 - 0101010101010101
  132. U3_V1high_bound DWORD 0031b0b13h, 0031b0b13h ; 849c8c94849c8c94 - 8080808080808080 - 0101010101010101
  133. V3_U1high_bound DWORD 01b03130bh, 01b03130bh, ; 9c84948c9c84948c - 8080808080808080 - 0101010101010101
  134. 0031b0b13h, 0031b0b13h ; 849c8c94849c8c94 - 8080808080808080 - 0101010101010101
  135. U_low_value DWORD 01a1a1a1ah, 01a1a1a1ah ; per-byte U step; kept in mm6 by the code
  136. V_low_value DWORD 04e4e4e4eh, 04e4e4e4eh ; per-byte V step; kept in mm7 by the code
  137. convert_to_sign DWORD 080808080h, 080808080h ; subtracted from U/V bytes so that the signed pcmpgtb can be used on them
  138. ; Y0_low,Y1_low are arrays of two qwords each, indexed by 0 or 8; they are subtracted from Y with unsigned saturation
  139. Y0_low DWORD 01719151bh, 01719151bh, ; 1b1b1b1b1b1b1b1b - 0402060004020600 ; for line%4=0
  140. 019171b15h, 019171b15h ; 1b1b1b1b1b1b1b1b - 0204000602040006 ; for line%4=2
  141. Y1_low DWORD 0151b1719h, 0151b1719h, ; 1b1b1b1b1b1b1b1b - 0600040206000402 ; for line%4=1
  142. 01b151917h, 01b151917h ; 1b1b1b1b1b1b1b1b - 0006020400060204 ; for line%4=3
  143. clean_MSB_mask DWORD 01f1f1f1fh, 01f1f1f1fh ; applied after 'psrlq 3' to clear the 3 bits shifted into each byte from its neighbour
  144. saturate_to_Y_high DWORD 0e6e6e6e6h, 0e6e6e6e6h ; ffh-19h: a saturating add of this clamps the scaled luma to 19h
  145. return_from_Y_high DWORD 0dcdcdcdch, 0dcdcdcdch ; ffh-19h-0ah: a saturating subtract of this undoes the e6h bias and leaves +0ah
  146. MMXDATA1 ENDS
  147. MMXCODE1 SEGMENT
  148. MMX_YUV12ToCLUT8 PROC DIST LANG PUBLIC, ; converts a YUV12 frame to dithered CLUT8, two lines per pass, 8 pels per loop iteration
  149. AYPlane: DWORD, ; start of the Y plane
  150. AVPlane: DWORD, ; start of the V plane
  151. AUPlane: DWORD, ; start of the U plane
  152. AFrameWidth: DWORD, ; width in pels; the loop consumes 8 pels at a time (presumably a multiple of 8 -- TODO confirm)
  153. AFrameHeight: DWORD, ; height in lines; lines are processed in even/odd pairs
  154. AYPitch: DWORD, ; byte distance between successive Y lines
  155. AVPitch: DWORD, ; byte distance between successive chroma lines; applied to both the U and the V cursor
  156. AAspectAdjustmentCnt: DWORD, ; 0 = never drop a line, 1 = return without converting, else reload value of the line-drop counter
  157. AColorConvertedFrame: DWORD, ; base of the output buffer
  158. ADCIOffset: DWORD, ; added to the output base
  159. ACCOffsetToLine0: DWORD, ; added to the output base
  160. ACCOPitch: DWORD, ; byte distance between successive output lines
  161. ACCType: DWORD ; not referenced by the code in this procedure
  162. LocalFrameSize = 108 ; bytes of locals, offsets 0..107 from local_base
  163. RegisterStorageSize = 16 ; esi, edi, ebp and ebx are pushed by the prologue
  164. argument_base EQU ebp + RegisterStorageSize ; ebp is esp after the four pushes, so +4 from here is the first argument
  165. local_base EQU esp
  166. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  167. ; Arguments (addressed through ebp; only valid while ebp holds the frame value):
  168. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  169. YPlane EQU argument_base + 4
  170. VPlane EQU argument_base + 8
  171. UPlane EQU argument_base + 12
  172. FrameWidth EQU argument_base + 16 ; this slot is overwritten with -(FrameWidth/2) before the main loop
  173. FrameHeight EQU argument_base + 20
  174. YPitch EQU argument_base + 24
  175. ChromaPitch EQU argument_base + 28
  176. AspectAdjustmentCount EQU argument_base + 32
  177. ColorConvertedFrame EQU argument_base + 36
  178. DCIOffset EQU argument_base + 40
  179. CCOffsetToLine0 EQU argument_base + 44
  180. CCOPitch EQU argument_base + 48
  181. CCType EQU argument_base + 52
  182. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  183. ; Locals (on local stack frame)
  184. ; (local_base is aligned at cache-line boundary in the prologue)
  185. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  186. tmpV2_U0low_bound EQU local_base + 0 ; qword
  187. tmpU2_V0low_bound EQU local_base + 8 ; qword
  188. tmpU3_V1low_bound EQU local_base + 16 ; qword
  189. tmpV3_U1low_bound EQU local_base + 24 ; qword
  190. tmpV2_U0high_bound EQU local_base + 32 ; qword
  191. tmpU2_V0high_bound EQU local_base + 40 ; qword
  192. tmpU3_V1high_bound EQU local_base + 48 ; qword
  193. tmpV3_U1high_bound EQU local_base + 56 ; qword
  194. tmpY0_low EQU local_base + 64 ; qword
  195. tmpY1_low EQU local_base + 72 ; qword
  196. tmpBlockParity EQU local_base + 80 ; 0 or 8: byte offset used to index the constant tables
  197. YLimit EQU local_base + 84 ; upper bound for the odd-line Y cursor
  198. AspectCount EQU local_base + 88 ; running line-drop counter
  199. tmpYCursorEven EQU local_base + 92 ; end of the current even Y line
  200. tmpYCursorOdd EQU local_base + 96 ; end of the current odd Y line
  201. tmpCCOPitch EQU local_base + 100 ; CCOPitch, or 0 when one line of the pair is dropped
  202. StashESP EQU local_base + 104 ; frame value (esp before the locals were allocated)
  203. U_low EQU mm6
  204. V_low EQU mm7
  205. U_high EQU U_low ; same register: the high-bound step equals the low-bound step
  206. V_high EQU V_low ; same register: the high-bound step equals the low-bound step
  207. push esi
  208. push edi
  209. push ebp
  210. push ebx
  211. mov ebp, esp ; frame value used by the argument EQUs
  212. sub esp, LocalFrameSize
  213. and esp, -32 ; align at cache line boundary
  214. mov [StashESP], ebp ; needed to restore ebp after each line pair and esp at finish
  215. mov ecx, [YPitch]
  216. mov edx, [FrameHeight]
  217. mov ebx, [FrameWidth]
  218. ;
  219. imul edx, ecx ; edx = FrameHeight * YPitch
  220. ;
  221. mov eax, [YPlane]
  222. add edx, eax ; edx is relative to YPlane
  223. add eax, ebx ; Points to end of Y even line
  224. ;
  225. mov [tmpYCursorEven], eax
  226. add eax, ecx ; add YPitch
  227. mov [tmpYCursorOdd], eax
  228. lea edx, [edx+2*ebx] ; YPlane + FrameHeight*YPitch + 2*FrameWidth: bound tested against the odd-line Y cursor
  229. mov [YLimit], edx
  230. mov esi, [VPlane]
  231. mov edx, [UPlane]
  232. mov eax, [ColorConvertedFrame]
  233. add eax, [DCIOffset]
  234. ;
  235. add eax, [CCOffsetToLine0] ; eax = start of the first output line
  236. sar ebx, 1 ; ebx = FrameWidth/2 = chroma samples per line
  237. add esi, ebx ; esi = end of the first V line
  238. add edx, ebx ; edx = end of the first U line
  239. lea edi, [eax+2*ebx] ; CCOCursor
  240. mov ecx, [AspectAdjustmentCount]
  241. mov [AspectCount], ecx ; NOTE(review): redundant, overwritten by the store after non_zero_AspectCount
  242. test ecx, ecx ; if AspectCount=0 we should not drop any lines
  243. jnz non_zero_AspectCount
  244. dec ecx ; 0 becomes 0ffffffffh
  245. non_zero_AspectCount:
  246. mov [AspectCount], ecx
  247. cmp ecx, 1
  248. jbe finish ; only reachable with a count of 1 (0 was replaced above): nothing is converted
  249. ;
  250. neg ebx ; ebx = -(FrameWidth/2)
  251. ;
  252. mov [FrameWidth], ebx ; the argument slot now holds the negative loop start index
  253. ;
  254. movq mm6, U_low_value ; store some frequently used values in registers
  255. ;
  256. movq mm7, V_low_value
  257. xor eax, eax
  258. mov [tmpBlockParity], eax ; start with table offset 0
  259. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  260. ; Register Usage:
  261. ;
  262. ; esi -- points to the end of V Line
  263. ; edx -- points to the end of U Line.
  264. ; edi -- points to the end of even line of output.
  265. ; ebp -- points to the end of odd line of output.
  266. ;
  267. ; ecx -- points to the end of even/odd Y Line
  268. ; eax -- 8*(line&2) == 0, on line%4=0,1
  269. ; == 8, on line%4=2,3
  270. ; in the loop, eax points to the end of even Y line
  271. ; ebx -- Number of points we haven't done yet (multiplied by -0.5)
  272. ;
  273. ;
  274. ; Noise matrix is of size 4x4, so we have different noise values in even
  275. ; pair of lines, and in odd pair of lines. But in our loop we are doing 2
  276. ; lines. So here we are preparing constants for next two lines. This code
  277. ; is done each time we are starting to convert next pair of lines.
  278. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  279. PrepareNext2Lines:
  280. mov eax, [tmpBlockParity] ; 0 or 8: selects the first or second qword of each table
  281. ;
  282. ;constants for odd line
  283. movq mm0, V3_U1low_bound[eax]
  284. ;
  285. movq mm1, V3_U1high_bound[eax]
  286. ;
  287. movq mm2, U3_V1low_bound[eax]
  288. ;
  289. movq mm3, U3_V1high_bound[eax]
  290. ;
  291. movq [tmpV3_U1low_bound], mm0
  292. ;
  293. movq [tmpV3_U1high_bound], mm1
  294. ;
  295. movq [tmpU3_V1low_bound], mm2
  296. ;
  297. movq [tmpU3_V1high_bound], mm3
  298. ;
  299. ;
  300. ;constants for even line
  301. ;
  302. movq mm0, V2_U0low_bound[eax] ; with eax=8 this reads the first qword of the following table
  303. ;
  304. movq mm1, V2_U0high_bound[eax] ; with eax=8 this reads the first qword of the following table
  305. ;
  306. movq mm2, U2_V0low_bound[eax]
  307. ;
  308. movq mm3, U2_V0high_bound[eax]
  309. ;
  310. movq [tmpV2_U0low_bound], mm0
  311. ;
  312. movq [tmpV2_U0high_bound], mm1
  313. ;
  314. movq [tmpU2_V0low_bound], mm2
  315. ;
  316. movq [tmpU2_V0high_bound], mm3
  317. ;
  318. ;
  319. ; Constants for Y values
  320. ;
  321. movq mm4, Y0_low[eax]
  322. ;
  323. movq mm5, Y1_low[eax]
  324. ;
  325. xor eax, 8 ; toggle the table offset for the next pair of lines
  326. mov [tmpBlockParity], eax
  327. movq [tmpY0_low], mm4
  328. ;
  329. movq [tmpY1_low], mm5
  330. ;
  331. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  332. ; if AspectCount<=2 we should skip a line. In this case we are still doing two
  333. ; lines, but output pointers are the same, so we are just overwriting the line
  334. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  335. mov eax, [CCOPitch]
  336. mov ebx, [AspectCount]
  337. xor ecx, ecx
  338. sub ebx, 2 ; the flags of this sub drive both the 'ja' and the 'jnz' below (mov and lea leave flags alone)
  339. mov [tmpCCOPitch], eax
  340. ja continue ; counter still above zero: keep both lines
  341. mov eax, [AspectAdjustmentCount]
  342. mov [tmpCCOPitch], ecx ; 0
  343. lea ebx, [ebx+eax] ; calculate new AspectCount
  344. jnz continue ; skipping even line: the counter underflowed, so the odd line overwrites the even line in the output
  345. ;
  346. ;skip_odd_line (the counter reached exactly 0)
  347. ;
  348. mov eax, [tmpYCursorEven]
  349. ;
  350. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  351. ; set odd constants to be equal to even_constants
  352. ; Odd line will be performed as even (mm0-mm4 still hold the even-line constants loaded above)
  353. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  354. movq [tmpV3_U1low_bound], mm0
  355. ;
  356. movq [tmpV3_U1high_bound], mm1
  357. ;
  358. movq [tmpU3_V1low_bound], mm2
  359. ;
  360. movq [tmpU3_V1high_bound], mm3
  361. ;
  362. movq [tmpY1_low], mm4
  363. ;
  364. mov [tmpYCursorOdd], eax ; the odd pass re-reads the even Y line
  365. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  366. ; when we got here, we already did all preparations.
  367. ; we are entering the main loop, which starts at the do_next_2x8_block label
  368. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  369. continue:
  370. mov [AspectCount], ebx
  371. mov ebx, [FrameWidth] ; -(FrameWidth/2); must be read before ebp is repurposed below
  372. mov ebp, edi ; from here to the end of the loop the argument EQUs are unusable
  373. ;
  374. add ebp, [tmpCCOPitch] ; ebp points to the end of odd line
  375. mov eax, [tmpYCursorEven]
  376. mov ecx, [tmpYCursorOdd]
  377. ;
  378. movdt mm0, [edx+ebx] ; 0:0:0:0|u3:u2:u1:u0 unsigned
  379. ;
  380. movdt mm2, [esi+ebx] ; 0:0:0:0|v3:v2:v1:v0 unsigned
  381. punpcklbw mm0, mm0 ; u3:u3:u2:u2|u1:u1:u0:u0 unsigned
  382. psubb mm0, convert_to_sign ; u3:u3:u2:u2|u1:u1:u0:u0 signed
  383. punpcklbw mm2, mm2 ; v3:v3:v2:v2|v1:v1:v0:v0 unsigned
  384. movq mm4, [eax+2*ebx] ; y7|..|y0
  385. ;
  386. movq mm1, mm0 ; u3:u3:u2:u2|u1:u1:u0:u0
  387. ;
  388. do_next_2x8_block:
  389. psubb mm2, convert_to_sign ; v3:v3:v2:v2|v1:v1:v0:v0 signed
  390. movq mm5, mm1 ; u3:u3:u2:u2|u1:u1:u0:u0
  391. pcmpgtb mm0, [tmpV2_U0low_bound] ; even line: ffh where U is above the low bound
  392. movq mm3, mm2 ; keep a copy of signed V
  393. pcmpgtb mm1, [tmpV2_U0high_bound] ; even line: ffh where U is above the high bound
  394. pand mm0, U_low ; 1ah where above the low bound
  395. psubusb mm4, [tmpY0_low] ; even Y minus (1bh - dither), saturating at 0
  396. pand mm1, U_high ; 1ah where above the high bound
  397. pcmpgtb mm2, [tmpU2_V0low_bound] ; even line: ffh where V is above the low bound
  398. psrlq mm4, 3 ; divide by 8 (the shift spans the whole qword)
  399. pand mm4, clean_MSB_mask ; drop the bits shifted in from the neighbouring byte
  400. pand mm2, V_low ; 4eh where above the low bound
  401. paddusb mm4, saturate_to_Y_high ; saturates at ffh for values of 19h and above
  402. paddb mm0, mm1 ; U03:U03:U02:U02|U01:U01:U00:U00
  403. psubusb mm4, return_from_Y_high ; leaves min(value, 19h) + 0ah
  404. movq mm1, mm5 ; second copy of U for the odd-line high-bound test
  405. pcmpgtb mm5, [tmpV3_U1low_bound] ; odd line: ffh where U is above the low bound
  406. paddd mm0, mm2 ; even line: U impact + low part of V impact
  407. pcmpgtb mm1, [tmpV3_U1high_bound] ; odd line: ffh where U is above the high bound
  408. pand mm5, U_low
  409. paddd mm0, mm4 ; even line: add the Y impact
  410. movq mm2, mm3 ; signed V again, for the odd line
  411. pcmpgtb mm3, [tmpU2_V0high_bound] ; even line: ffh where V is above the high bound
  412. pand mm1, U_high
  413. movq mm4, [ecx+2*ebx] ; read next 8 Y points from odd line
  414. paddb mm5, mm1 ; u impact on odd line
  415. psubusb mm4, [tmpY1_low] ; odd Y minus (1bh - dither), saturating at 0
  416. movq mm1, mm2 ; copy of signed V for the odd-line high-bound test
  417. pcmpgtb mm2, [tmpU3_V1low_bound] ; odd line: ffh where V is above the low bound
  418. psrlq mm4, 3 ; divide by 8
  419. pand mm4, clean_MSB_mask
  420. pand mm2, V_low
  421. paddusb mm4, saturate_to_Y_high
  422. paddd mm5, mm2 ; odd line: U impact + low part of V impact
  423. psubusb mm4, return_from_Y_high ; leaves min(value, 19h) + 0ah
  424. pand mm3, V_high ; 4eh where V is above the high bound (even line)
  425. pcmpgtb mm1, [tmpU3_V1high_bound] ; odd line: ffh where V is above the high bound
  426. paddb mm3, mm0 ; mm3 = 8 finished output bytes for the even line
  427. movdt mm0, [edx+ebx+4] ; read next 4 U points
  428. pand mm1, V_high
  429. movdt mm2, [esi+ebx+4] ; read next 4 V points
  430. paddd mm5, mm4 ; odd line: add the Y impact
  431. movq mm4, [eax+2*ebx+8] ; read next 8 Y points from even line
  432. paddb mm5, mm1 ; mm5 = 8 finished output bytes for the odd line
  433. psubb mm0, convert_to_sign ; next U made signed before it is doubled up
  434. punpcklbw mm2, mm2 ; v3:v3:v2:v2|v1:v1:v0:v0
  435. movq [edi+2*ebx], mm3 ; write even line
  436. punpcklbw mm0, mm0 ; u3:u3:u2:u2|u1:u1:u0:u0
  437. movq [ebp+2*ebx], mm5 ; write odd line
  438. movq mm1, mm0 ; u3:u3:u2:u2|u1:u1:u0:u0
  439. add ebx, 4 ; 4 chroma samples = 8 pels done
  440. jl do_next_2x8_block ; ebx counts up from -(FrameWidth/2) towards 0
  441. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  442. ; update pointers to input and output buffers, to point to the next lines
  443. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  444. mov ebp, [StashESP] ; ebp is the frame value again, so the argument EQUs are usable
  445. mov eax, [tmpYCursorEven]
  446. mov ecx, [YPitch]
  447. add edi, [CCOPitch] ; go to the end of next line
  448. add edi, [tmpCCOPitch] ; skip odd line (adds 0 when a line of this pair was dropped)
  449. lea eax, [eax+2*ecx] ; even Y cursor advances by two Y lines
  450. mov [tmpYCursorEven], eax
  451. add eax, [YPitch]
  452. mov [tmpYCursorOdd], eax
  453. add esi, [ChromaPitch] ; next V line
  454. mov ecx, [YLimit] ; Done with last line?
  455. add edx, [ChromaPitch] ; next U line
  456. cmp eax, ecx
  457. jb PrepareNext2Lines ; unsigned compare of the odd-line cursor against YLimit
  458. finish:
  459. mov esp, [StashESP] ; drop the aligned local frame; NOTE(review): no EMMS is executed here, presumably left to the caller -- confirm
  460. ;
  461. pop ebx
  462. pop ebp
  463. pop edi
  464. pop esi
  465. ret ; expanded through the OPTION EPILOGUE macro named at the top of the file
  466. MMX_YUV12ToCLUT8 ENDP
  467. MMXCODE1 ENDS
  468. END