Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

515 lines
15 KiB

  1. ;*************************************************************************
  2. ;** INTEL Corporation Proprietary Information
  3. ;**
  4. ;** This listing is supplied under the terms of a license
  5. ;** agreement with INTEL Corporation and may not be copied
  6. ;** nor disclosed except in accordance with the terms of
  7. ;** that agreement.
  8. ;**
  9. ;** Copyright (c) 1995 Intel Corporation.
  10. ;** All Rights Reserved.
  11. ;**
  12. ;*************************************************************************
  13. ;//
  14. ;//
  15. ;////////////////////////////////////////////////////////////////////////////
  16. ; yuv12enc -- This function performs "color conversion" in the H26X decoder for
  17. ; consumption by the H26X encoder. This entails reformatting the decoder's
  18. ; YVU data into the shape required by the encoder - including YUV order. It
  19. ; also converts the pels to 7 bits by shifting every byte right by one.
  20. ; $Header: S:\h26x\src\dec\yuv12enc.asv 1.5 30 Oct 1996 14:31:00 mbodart $
  21. ; $Log: S:\h26x\src\dec\yuv12enc.asv $
  22. ;//
  23. ;// Rev 1.5 30 Oct 1996 14:31:00 mbodart
  24. ;// Re-checking in changes originally made by Atul, but lost when the server
  25. ;// ran out of disk space during a PVCS operation. Atul's original log msg:
  26. ;//
  27. ;// Removed AGI in IA code. Added MMX code but it is not ready for prime-time.
  28. ;//
  29. ;// Rev 1.4 08 Mar 1996 15:11:10 AGUPTA2
  30. ;// Removed segment register override when compiling for WIN32.
  31. ;// Should speed-up this routine substantially.
  32. ;//
  33. ;
  ; ---- Assembler options, include files and segment/ASSUME setup ----
  ; Automatic prologue generation is turned off; epilogues are produced by the
  ; ReturnAndRelieveEpilogueMacro macro, which is not defined in this file
  ; (presumably it comes from one of the include files -- TODO confirm).
  34. OPTION PROLOGUE:None
  35. OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
  36. include locals.inc
  37. include decconst.inc
  38. include iammx.inc
  ; Non-WIN32 builds only: give the data segment a default name unless the
  ; build has already defined DSEGNAME.
  39. IFNDEF DSEGNAME
  40. IFNDEF WIN32
  41. DSEGNAME TEXTEQU <DataH26x_YUV12ForEnc>
  42. ENDIF
  43. ENDIF
  ; WIN32: pull in memmodel.inc (listing suppressed) and open .DATA.
  ; Otherwise: open the named data segment; memmodel.inc is included later,
  ; after that segment has been closed.
  44. IFDEF WIN32
  45. .xlist
  46. include memmodel.inc
  47. .list
  48. .DATA
  49. ELSE
  50. DSEGNAME SEGMENT WORD PUBLIC 'DATA'
  51. ENDIF
  52. ; any data would go here
  ; Non-WIN32: close the data segment, then include memmodel.inc.
  53. IFNDEF WIN32
  54. DSEGNAME ENDS
  55. .xlist
  56. include memmodel.inc
  57. .list
  58. ENDIF
  ; Non-WIN32 builds only: default code segment name unless SEGNAME is
  ; already defined.
  59. IFNDEF SEGNAME
  60. IFNDEF WIN32
  61. SEGNAME TEXTEQU <_CODE32>
  62. ENDIF
  63. ENDIF
  ; Open the code section: .CODE for WIN32, a 32-bit paragraph-aligned public
  ; code segment otherwise (closed again at the end of the file).
  64. ifdef WIN32
  65. .CODE
  66. else
  67. SEGNAME SEGMENT PARA PUBLIC USE32 'CODE'
  68. endif
  ; WIN32: every segment register is assumed FLAT, so no segment overrides
  ; are emitted. Non-WIN32: only CS is assumed (the code segment); DS/ES/FS/GS
  ; are assumed Nothing.
  69. ifdef WIN32
  70. ASSUME cs : FLAT
  71. ASSUME ds : FLAT
  72. ASSUME es : FLAT
  73. ASSUME fs : FLAT
  74. ASSUME gs : FLAT
  75. ASSUME ss : FLAT
  76. else
  77. ASSUME CS : SEGNAME
  78. ASSUME DS : Nothing
  79. ASSUME ES : Nothing
  80. ASSUME FS : Nothing
  81. ASSUME GS : Nothing
  82. endif
  83. ; void FAR ASM_CALLTYPE H26x_YUV12ForEnc (
  84. ; U8 FAR * InstanceBase,
  85. ; X32 YPlane,
  86. ; X32 VPlane,
  87. ; X32 UPlane,
  88. ; UN FrameWidth,
  89. ; UN FrameHeight,
  90. ; UN Pitch,
  91. ; U8 FAR * ColorConvertedFrame, // encoder's buffers.
  92. ; X32 YOutputPlane,
  93. ; X32 VOutputPlane,
  94. ; X32 UOutputPlane)
  95. ;
  96. ; YPlane, VPlane, and UPlane are offsets from InstanceBase; YOutputPlane,
  97. ; VOutputPlane, and UOutputPlane are offsets from ColorConvertedFrame. In
  98. ; 16-bit Microsoft Windows (tm), space in this segment is used for local
  99. ; variables and tables. In 32-bit variants of Microsoft Windows (tm), the local
  100. ; variables are on the stack, while the tables are in the one and only data segment.
  ; ---------------------------------------------------------------------------
  ; H26x_YUV12ForEnc -- scalar (non-MMX) version.
  ; Copies the Y plane, then the V plane, then the U plane from the input
  ; buffers to the output buffers, shifting every byte right by one bit so
  ; that each pel becomes a 7-bit value.
  ;
  ; Stack arguments (offsets from esp, after the four register pushes, are
  ; given by the equates below):
  ;   InstanceBase         base address added to YPlane/VPlane/UPlane
  ;   YPlane/VPlane/UPlane input plane offsets; rewritten in place on the
  ;                        stack as absolute addresses
  ;   FrameWidth           luma width in pels. The Y loop handles 16 pels per
  ;                        pass and the chroma loops 8 pels of FrameWidth/2,
  ;                        so a positive multiple of 16 is assumed; otherwise
  ;                        the last pass stores below the start of the row.
  ;   FrameHeight          luma row count; chroma loops run FrameHeight/2
  ;                        rows. The row loops test after the body, so a
  ;                        height below 2 is not handled.
  ;   Pitch                row stride, used for input and output, luma and
  ;                        chroma alike
  ;   ColorConvertedFrame  base address added to the three output offsets
  ;   YOutputPlane/VOutputPlane/UOutputPlane
  ;                        output plane offsets; rewritten in place as well
  ;
  ; Registers: esi, edi, ebp and ebx are saved and restored; eax, ecx, edx
  ; and the flags are clobbered.
  ; NOTE(review): Lebp/Lecx/Lesi/Lebx/Ledi, PD, DIST, LANG and rturn are not
  ; defined in this file. From their use here the Lxxx macros presumably load
  ; the named register from [esp+name], PD presumably means DWORD PTR, and
  ; rturn presumably returns through the epilogue macro -- TODO confirm
  ; against the include files.
  ; ---------------------------------------------------------------------------
  101. PUBLIC H26x_YUV12ForEnc
  102. ; due to the need for the ebp reg, these parameter declarations aren't used,
  103. ; they are here so the assembler knows how many bytes to relieve from the stack
  104. H26x_YUV12ForEnc proc DIST LANG PUBLIC,
  105. AInstanceBase: DWORD,
  106. AYPlane: DWORD,
  107. AVPlane: DWORD,
  108. AUPlane: DWORD,
  109. AFrameWidth: DWORD,
  110. AFrameHeight: DWORD,
  111. APitch: DWORD,
  112. AColorConvertedFrame: DWORD,
  113. AYOutputPlane: DWORD,
  114. AVOutputPLane: DWORD,
  115. AUOutputPLane: DWORD
  ; No stack locals; 16 bytes hold the four registers pushed below.
  116. LocalFrameSize = 0
  117. RegisterStorageSize = 16
  118. ; Arguments:
  ; esp-relative offsets of the arguments once the registers are pushed; the
  ; extra 4 in the first entry skips the return address.
  119. InstanceBase = LocalFrameSize + RegisterStorageSize + 4
  120. YPlane = LocalFrameSize + RegisterStorageSize + 8
  121. VPlane = LocalFrameSize + RegisterStorageSize + 12
  122. UPlane = LocalFrameSize + RegisterStorageSize + 16
  123. FrameWidth = LocalFrameSize + RegisterStorageSize + 20
  124. FrameHeight = LocalFrameSize + RegisterStorageSize + 24
  125. Pitch = LocalFrameSize + RegisterStorageSize + 28
  126. ColorConvertedFrame = LocalFrameSize + RegisterStorageSize + 32
  127. YOutputPlane = LocalFrameSize + RegisterStorageSize + 36
  128. VOutputPlane = LocalFrameSize + RegisterStorageSize + 40
  129. UOutputPlane = LocalFrameSize + RegisterStorageSize + 44
  130. EndOfArgList = LocalFrameSize + RegisterStorageSize + 48
  131. LCL EQU <esp+>
  ; Manual prologue: save the registers this routine must preserve.
  132. push esi
  133. push edi
  134. push ebp
  135. push ebx
  136. sub esp,LocalFrameSize
  ; Turn the six plane offsets into addresses, in place in the argument area.
  137. mov eax,PD [esp+InstanceBase]
  138. add PD [esp+YPlane],eax
  139. add PD [esp+VPlane],eax
  140. add PD [esp+UPlane],eax
  141. mov eax,PD [esp+ColorConvertedFrame]
  142. add PD [esp+YOutputPlane],eax
  143. add PD [esp+VOutputPlane],eax
  144. add PD [esp+UOutputPlane],eax
  145. ; We copy 16 pels in one iteration of the inner loop
  146. ; Register usage:
  147. ; edi -- Y plane output cursor
  148. ; esi -- Y plane input cursor
  149. ; ebp -- Count down Y plane height
  150. ; ecx -- Count down Y plane width
  151. ; ebx -- Y plane input pitch
  152. ; eax,edx -- scratch
  153. Lebp FrameHeight
  154. Lecx FrameWidth
  155. Lesi YPlane
  156. Lebx Pitch
  157. Ledi YOutputPlane
  ; Each row is walked from its right-hand end towards its start (ecx counts
  ; down). eax/edx always hold the next 8 input pels before YLoop is entered.
  158. YLoopHeader:
  159. mov eax, PD [esi+ecx-8] ; prime eax/edx with the last 8 pels of the row
  160. mov edx, PD [esi+ecx-4]
  161. ALIGN 4
  162. YLoop:
  163. shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
  164. and edx, 0FEFEFEFEH ; clear bit 0 of each byte so the shift cannot carry it into the next pel
  165. shr edx, 1 ; edx now holds four 7-bit pels
  166. and eax, 07F7F7F7Fh ; clear bit 7 of each byte (it received the neighbouring pel's bit 0)
  167. mov PD [edi+ecx-8], eax
  168. mov PD [edi+ecx-4], edx
  169. ; NEXT 8 PELS
  170. mov eax, PD [esi+ecx-8-8] ; second group of 8 pels for this pass
  171. mov edx, PD [esi+ecx-4-8] ; loaded early: this avoids AGI
  172. shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
  173. and edx, 0FEFEFEFEH ; clear bit 0 of each byte before shifting
  174. shr edx, 1
  175. and eax, 07F7F7F7Fh ; clear bit 7 of each byte after shifting
  176. mov PD [edi+ecx-8-8], eax
  177. mov PD [edi+ecx-4-8], edx
  ; On the final pass of a row (ecx == 16) the two loads below read the
  ; 8 bytes just below the row start; those values are reloaded or dropped.
  178. mov eax, PD [esi+ecx-8-16] ; speculatively load next 8 pels
  179. mov edx, PD [esi+ecx-4-16] ; for next iteration
  180. sub ecx, 16
  181. jg YLoop
  ; Row finished: reset the column counter and advance both cursors by Pitch.
  182. Lecx FrameWidth
  183. add esi, ebx
  184. add edi, ebx
  185. dec ebp
  186. jne YLoopHeader
  187. ; We copy 8 pels in one iteration of the inner loop
  188. ; Register usage:
  189. ; edi -- V plane output cursor
  190. ; esi -- V plane input cursor
  191. ; ebp -- Count down V plane height
  192. ; ecx -- Count down V plane width
  193. ; ebx -- Pitch
  194. ; eax,edx -- scratch
  ; Chroma planes are FrameWidth/2 by FrameHeight/2; ebx still holds Pitch
  ; from the Y section.
  195. Lebp FrameHeight
  196. Lecx FrameWidth
  197. sar ecx,1 ; ecx = chroma width
  198. Lesi VPlane
  199. sar ebp,1 ; ebp = chroma height
  200. Ledi VOutputPlane
  201. ALIGN 4
  202. VLoopHeader:
  203. mov eax, PD [esi+ecx-8] ; prime eax/edx with the last 8 pels of the row
  204. mov edx, PD [esi+ecx-4]
  205. VLoop:
  206. shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
  207. and edx, 0FEFEFEFEH ; clear bit 0 of each byte before shifting
  208. shr edx, 1
  209. and eax, 07F7F7F7Fh ; clear bit 7 of each byte after shifting
  210. mov PD [edi+ecx-8], eax
  211. mov PD [edi+ecx-4], edx
  212. mov eax, PD [esi+ecx-8-8] ; speculatively load next 8 pels
  213. mov edx, PD [esi+ecx-4-8] ; this avoids AGI
  214. sub ecx, 8
  215. jg VLoop
  ; Next chroma row: ecx is rebuilt as FrameWidth/2, cursors advance by Pitch.
  216. Lecx FrameWidth
  217. add esi,ebx
  218. shr ecx,1
  219. add edi,ebx
  220. dec ebp
  221. jne VLoopHeader
  222. ; We copy 8 pels in one iteration of the inner loop
  223. ; Register usage:
  224. ; edi -- U plane output cursor
  225. ; esi -- U plane input cursor
  226. ; ebp -- Count down U plane height
  227. ; ecx -- Count down U plane width
  228. ; ebx -- Pitch
  229. ; eax,edx -- scratch
  230. Lebp FrameHeight
  231. Lecx FrameWidth
  232. sar ecx,1 ; ecx = chroma width
  233. Lesi UPlane
  234. sar ebp,1 ; ebp = chroma height
  235. Ledi UOutputPlane
  236. ALIGN 4
  237. ULoopHeader:
  238. mov eax,PD [esi+ecx-8] ; prime eax/edx with the last 8 pels of the row
  239. mov edx,PD [esi+ecx-4]
  240. ULoop:
  241. shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
  242. and edx, 0FEFEFEFEH ; clear bit 0 of each byte before shifting
  243. shr edx, 1
  244. and eax, 07F7F7F7Fh ; clear bit 7 of each byte after shifting
  245. mov PD [edi+ecx-8], eax
  246. mov PD [edi+ecx-4], edx
  247. mov eax, PD [esi+ecx-8-8] ; speculative load for the next pass
  248. mov edx, PD [esi+ecx-4-8]
  249. sub ecx, 8
  250. jg ULoop
  251. Lecx FrameWidth
  252. add esi, ebx
  253. shr ecx, 1
  254. add edi, ebx
  255. dec ebp
  256. jne ULoopHeader
  ; Manual epilogue: release locals and restore registers in reverse order.
  257. add esp,LocalFrameSize
  258. pop ebx
  259. pop ebp
  260. pop edi
  261. pop esi
  262. rturn
  263. H26x_YUV12ForEnc endp
  ; Everything from here to the matching ENDIF is assembled only when H263P
  ; is defined.
  264. IFDEF H263P
  ; First declaration fixes the segment attributes (paragraph aligned, 32-bit,
  ; public, class 'DATA'); the segment is then reopened to emit the constants.
  265. MMXDATA1 SEGMENT PARA USE32 PUBLIC 'DATA'
  266. MMXDATA1 ENDS
  267. MMXDATA1 SEGMENT
  268. ALIGN 8
  ; 8-byte mask with 0FEh in every byte: ANDing with it clears bit 0 of each
  ; packed byte.
  269. CLEAR_LOW_BIT_MASK LABEL DWORD
  270. DWORD 0FEFEFEFEH, 0FEFEFEFEH
  ; 8-byte mask with 07Fh in every byte: ANDing with it clears bit 7 of each
  ; packed byte.
  271. CLEAR_HIGH_BIT_MASK LABEL DWORD
  272. DWORD 07F7F7F7FH, 07F7F7F7FH
  273. MMXDATA1 ENDS
  ; ---------------------------------------------------------------------------
  ; MMX_H26x_YUV12ForEnc -- MMX version.
  ; Copies the Y, V and U planes from the input buffers to the output buffers,
  ; shifting every byte right by one bit so that each pel becomes a 7-bit
  ; value. The Y plane is processed with MMX, two rows and 16 pels per row in
  ; each inner-loop pass; the V and U planes use scalar loops, 8 pels per pass.
  ;
  ; Stack arguments (offsets from esp, after the four register pushes, are
  ; given by the equates below):
  ;   InstanceBase         base address added to YPlane/VPlane/UPlane
  ;   YPlane/VPlane/UPlane input plane offsets; rewritten in place on the
  ;                        stack as absolute addresses
  ;   FrameWidth           luma width in pels; a positive multiple of 16 is
  ;                        assumed, otherwise the last pass of a row stores
  ;                        below the start of the row
  ;   FrameHeight          luma row count. The Y loop consumes two rows per
  ;                        pass (FrameHeight/2 passes), so an even height of
  ;                        at least 2 is assumed.
  ;   Pitch                row stride, used for input and output, luma and
  ;                        chroma alike
  ;   ColorConvertedFrame  base address added to the three output offsets
  ;   YOutputPlane/VOutputPlane/UOutputPlane
  ;                        output plane offsets; rewritten in place as well
  ;
  ; Registers: esi, edi, ebp and ebx are saved and restored; eax, ecx, edx,
  ; the flags and mm0-mm3, mm6, mm7 are clobbered. emms is executed before
  ; returning so the caller can use x87 floating point again.
  ;
  ; Per-byte shift technique: either clear bit 0 of every byte and then shift
  ; the quadword right by one (low-bit mask BEFORE psrlq), or shift first and
  ; then clear bit 7 of every byte (high-bit mask AFTER psrlq). The mask must
  ; match the order, or a neighbouring pel's bit 0 ends up in bit 7.
  ;
  ; NOTE(review): Lebp/Lecx/Lesi/Lebx/Ledi, PD, DIST, LANG and rturn are not
  ; defined in this file. From their use here the Lxxx macros presumably load
  ; the named register from [esp+name], PD presumably means DWORD PTR, and
  ; rturn presumably returns through the epilogue macro -- TODO confirm
  ; against the include files.
  ; ---------------------------------------------------------------------------
  274. PUBLIC MMX_H26x_YUV12ForEnc
  275. ; due to the need for the ebp reg, these parameter declarations aren't used,
  276. ; they are here so the assembler knows how many bytes to relieve from the stack
  277. MMX_H26x_YUV12ForEnc proc DIST LANG PUBLIC,
  278. AInstanceBase: DWORD,
  279. AYPlane: DWORD,
  280. AVPlane: DWORD,
  281. AUPlane: DWORD,
  282. AFrameWidth: DWORD,
  283. AFrameHeight: DWORD,
  284. APitch: DWORD,
  285. AColorConvertedFrame: DWORD,
  286. AYOutputPlane: DWORD,
  287. AVOutputPLane: DWORD,
  288. AUOutputPLane: DWORD
  ; No stack locals; 16 bytes hold the four registers pushed below.
  289. LocalFrameSize = 0
  290. RegisterStorageSize = 16
  291. ; Arguments:
  ; esp-relative offsets of the arguments once the registers are pushed; the
  ; extra 4 in the first entry skips the return address.
  292. InstanceBase = LocalFrameSize + RegisterStorageSize + 4
  293. YPlane = LocalFrameSize + RegisterStorageSize + 8
  294. VPlane = LocalFrameSize + RegisterStorageSize + 12
  295. UPlane = LocalFrameSize + RegisterStorageSize + 16
  296. FrameWidth = LocalFrameSize + RegisterStorageSize + 20
  297. FrameHeight = LocalFrameSize + RegisterStorageSize + 24
  298. Pitch = LocalFrameSize + RegisterStorageSize + 28
  299. ColorConvertedFrame = LocalFrameSize + RegisterStorageSize + 32
  300. YOutputPlane = LocalFrameSize + RegisterStorageSize + 36
  301. VOutputPlane = LocalFrameSize + RegisterStorageSize + 40
  302. UOutputPlane = LocalFrameSize + RegisterStorageSize + 44
  303. EndOfArgList = LocalFrameSize + RegisterStorageSize + 48
  304. LCL EQU <esp+>
  ; mm6 holds the 0FEh-per-byte mask, mm7 the 07Fh-per-byte mask (loaded below).
  305. CLEAR_LOW_BIT EQU mm6
  306. CLEAR_HIGH_BIT EQU mm7
  ; Manual prologue: save the registers this routine must preserve.
  307. push esi
  308. push edi
  309. push ebp
  310. push ebx
  311. sub esp,LocalFrameSize
  ; Turn the six plane offsets into addresses, in place in the argument area.
  312. mov eax,PD [esp+InstanceBase]
  313. add PD [esp+YPlane],eax
  314. add PD [esp+VPlane],eax
  315. add PD [esp+UPlane],eax
  316. mov eax,PD [esp+ColorConvertedFrame]
  317. add PD [esp+YOutputPlane],eax
  318. add PD [esp+VOutputPlane],eax
  319. add PD [esp+UOutputPlane],eax
  320. ; We copy 16 pels of two lines in one iteration of the inner loop
  321. ; Register usage:
  322. ; edi -- Y plane output cursor (line 0)
  323. ; edx -- Y plane output cursor (line 1)
  324. ; esi -- Y plane input cursor (line 0)
  325. ; eax -- Y plane input cursor (line 1)
  326. ; ebp -- Count down Y plane height / 2
  327. ; ecx -- Count down Y plane width
  328. ; ebx -- Y plane input pitch
  329. Lebp FrameHeight
  330. Lebx Pitch
  331. Lesi YPlane
  332. Lecx FrameWidth
  333. Ledi YOutputPlane
  334. lea eax, [esi + ebx] ; line 1 of input
  335. movq mm6, CLEAR_LOW_BIT_MASK
  336. lea edx, [edi + ebx] ; line 1 of output
  337. movq mm7, CLEAR_HIGH_BIT_MASK
  338. shr ebp, 1 ; two lines in one iteration
  ; Tags in the trailing comments: first digit = line (0/1) of the pair,
  ; second digit = which 8-pel half of the 16-pel group (0 = lower address).
  ; mm0/mm1 are preloaded with line 0 data before YLoop is entered, and mm0 is
  ; already shifted; mm1 is shifted inside the loop.
  339. YLoopHeader:
  340. movq mm0, [esi+ecx-16] ;00
  341. ;
  342. movq mm1, [esi+ecx-8] ;01
  343. psrlq mm0, 1 ;00 Shift packed pel by 1 to convert to 7-bit
  344. YLoop:
  345. movq mm2, [eax+ecx-16] ;10
  346. pand mm0, CLEAR_HIGH_BIT ;00 and to get rid of high bit
  347. movq mm3, [eax+ecx-8] ;11
  348. psrlq mm1, 1 ;01 shifted first, so the high bit must be masked afterwards
  349. movq [edi+ecx-16], mm0 ;00
  ; mm1 has already been shifted, so bit 7 of each byte now holds the
  ; neighbouring pel's bit 0. It has to be cleared with the high-bit mask; the
  ; low-bit mask would keep that stray bit and throw away a valid one.
  350. pand mm1, CLEAR_HIGH_BIT ;01 and to get rid of high bit
  351. movq mm0, [esi+ecx-16-16] ; speculatively load next 8 pels
  352. psrlq mm2, 1 ;10 Shift packed pel by 1 to convert to 7-bit
  353. movq [edi+ecx-8 ], mm1 ;01
  354. pand mm2, CLEAR_HIGH_BIT ;10 and to get rid of high bit
  ; On the final pass of a row (ecx == 16) the speculative loads into mm0 and
  ; mm1 read the 16 bytes just below the row start; the values are discarded.
  355. movq mm1, [esi+ecx-8 -16] ; for next iteration
  356. pand mm3, CLEAR_LOW_BIT ;11 and to get rid of low bit (mask BEFORE the shift)
  357. movq [edx+ecx-16], mm2 ;10
  358. psrlq mm3, 1 ;11
  359. psrlq mm0, 1 ;00 Shift packed pel by 1 to convert to 7-bit
  360. ;
  361. movq [edx+ecx-8 ], mm3 ;11
  362. sub ecx, 16
  363. jg YLoop
  ; Row pair finished: reset the column counter and advance all four cursors
  ; by two rows.
  364. Lecx FrameWidth
  365. lea esi, [esi + 2*ebx]
  366. lea edi, [edi + 2*ebx]
  367. lea eax, [eax + 2*ebx] ; line 1 of input
  368. lea edx, [edx + 2*ebx] ; line 1 of output
  369. dec ebp
  370. jne YLoopHeader
  371. ; We copy 8 pels in one iteration of the inner loop
  372. ; Register usage:
  373. ; edi -- V plane output cursor
  374. ; esi -- V plane input cursor
  375. ; ebp -- Count down V plane height
  376. ; ecx -- Count down V plane width
  377. ; ebx -- Pitch
  378. ; eax,edx -- scratch
  ; Chroma planes are FrameWidth/2 by FrameHeight/2; ebx still holds Pitch
  ; from the Y section. No MMX registers are used from here on.
  379. Lebp FrameHeight
  380. Lecx FrameWidth
  381. sar ecx,1 ; ecx = chroma width
  382. Lesi VPlane
  383. sar ebp,1 ; ebp = chroma height
  384. Ledi VOutputPlane
  385. ALIGN 4
  386. VLoopHeader:
  387. mov eax, PD [esi+ecx-8] ; prime eax/edx with the last 8 pels of the row
  388. mov edx, PD [esi+ecx-4]
  389. VLoop:
  390. shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
  391. and edx, 0FEFEFEFEH ; clear bit 0 of each byte before shifting
  392. shr edx, 1
  393. and eax, 07F7F7F7Fh ; clear bit 7 of each byte after shifting
  394. mov PD [edi+ecx-8], eax
  395. mov PD [edi+ecx-4], edx
  396. mov eax, PD [esi+ecx-8-8] ; speculatively load next 8 pels
  397. mov edx, PD [esi+ecx-4-8] ; this avoids AGI
  398. sub ecx, 8
  399. jg VLoop
  ; Next chroma row: ecx is rebuilt as FrameWidth/2, cursors advance by Pitch.
  400. Lecx FrameWidth
  401. add esi,ebx
  402. shr ecx,1
  403. add edi,ebx
  404. dec ebp
  405. jne VLoopHeader
  406. ; We copy 8 pels in one iteration of the inner loop
  407. ; Register usage:
  408. ; edi -- U plane output cursor
  409. ; esi -- U plane input cursor
  410. ; ebp -- Count down U plane height
  411. ; ecx -- Count down U plane width
  412. ; ebx -- Pitch
  413. ; eax,edx -- scratch
  414. Lebp FrameHeight
  415. Lecx FrameWidth
  416. sar ecx,1 ; ecx = chroma width
  417. Lesi UPlane
  418. sar ebp,1 ; ebp = chroma height
  419. Ledi UOutputPlane
  420. ALIGN 4
  421. ULoopHeader:
  422. mov eax,PD [esi+ecx-8] ; prime eax/edx with the last 8 pels of the row
  423. mov edx,PD [esi+ecx-4]
  424. ULoop:
  425. shr eax, 1 ; Shift packed pel by 1 to convert to 7-bit
  426. and edx, 0FEFEFEFEH ; clear bit 0 of each byte before shifting
  427. shr edx, 1
  428. and eax, 07F7F7F7Fh ; clear bit 7 of each byte after shifting
  429. mov PD [edi+ecx-8], eax
  430. mov PD [edi+ecx-4], edx
  431. mov eax, PD [esi+ecx-8-8] ; speculative load for the next pass
  432. mov edx, PD [esi+ecx-4-8]
  433. sub ecx, 8
  434. jg ULoop
  435. Lecx FrameWidth
  436. add esi, ebx
  437. shr ecx, 1
  438. add edi, ebx
  439. dec ebp
  440. jne ULoopHeader
  ; The Y loop left the MMX registers in use; clear the MMX state so that any
  ; x87 floating-point code executed by the caller sees an empty FPU tag word.
  emms
  ; Manual epilogue: release locals and restore registers in reverse order.
  441. add esp,LocalFrameSize
  442. pop ebx
  443. pop ebp
  444. pop edi
  445. pop esi
  446. rturn
  447. MMX_H26x_YUV12ForEnc endp
  448. ENDIF ;H263P -- end of the conditionally assembled MMX data and procedure
  ; Non-WIN32 builds opened an explicit code segment near the top of the file;
  ; close it here. WIN32 builds used .CODE and have nothing to close.
  449. IFNDEF WIN32
  450. SEGNAME ENDS
  451. ENDIF
  452. END