Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

548 lines
16 KiB

  1. /* *************************************************************************
  2. ** INTEL Corporation Proprietary Information
  3. **
  4. ** This listing is supplied under the terms of a license
  5. ** agreement with INTEL Corporation and may not be copied
  6. ** nor disclosed except in accordance with the terms of
  7. ** that agreement.
  8. **
  9. ** Copyright (c) 1995 Intel Corporation.
  10. ** All Rights Reserved.
  11. **
  12. ** *************************************************************************
  13. */
  14. //////////////////////////////////////////////////////////////////////////
  15. // $Author: AGUPTA2 $
  16. // $Date: 22 Mar 1996 17:23:16 $
  17. // $Archive: S:\h26x\src\dec\d3bvriq.cpv $
  18. // $Header: S:\h26x\src\dec\d3bvriq.cpv 1.7 22 Mar 1996 17:23:16 AGUPTA2 $
  19. // $Log: S:\h26x\src\dec\d3bvriq.cpv $
  20. //
  21. // Rev 1.7 22 Mar 1996 17:23:16 AGUPTA2
  22. // Minor interface change to accomodate MMX rtns. Now the interface is the
  23. // same for MMX and IA.
  24. //
  25. // Rev 1.6 08 Mar 1996 16:46:10 AGUPTA2
  26. // Added pragma code_seg.
  27. //
  28. //
  29. // Rev 1.5 15 Feb 1996 14:54:08 RMCKENZX
  30. // Gutted and re-wrote routine, optimizing for performance
  31. // for the p5. Added clamping to -2048...+2047 to escape code
  32. // portion.
  33. //
  34. // Rev 1.4 27 Dec 1995 14:36:00 RMCKENZX
  35. // Added copyright notice
  36. //
  37. // Rev 1.3 09 Dec 1995 17:35:20 RMCKENZX
  38. // Re-checked in module to support decoder re-architecture (thru PB frames)
  39. //
  40. // Rev 1.0 27 Nov 1995 14:36:46 CZHU
  41. // Initial revision.
  42. //
  43. // Rev 1.28 03 Nov 1995 16:28:50 CZHU
  44. // Cleaning up and added more comments
  45. //
  46. // Rev 1.27 31 Oct 1995 10:27:20 CZHU
  47. // Added error checking for total run value.
  48. //
  49. // Rev 1.26 19 Sep 1995 10:45:12 CZHU
  50. //
  51. // Improved pairing and cleaned up
  52. //
  53. // Rev 1.25 18 Sep 1995 10:20:28 CZHU
  54. // Fixed bugs in handling escape codes for INTER blocks w.r.t. run.
  55. //
  56. // Rev 1.24 15 Sep 1995 09:35:30 CZHU
  57. // fixed bugs in run cumulation for inter
  58. //
  59. // Rev 1.23 14 Sep 1995 10:13:32 CZHU
  60. //
  61. // Initialize cumulated run for the INTER blocks.
  62. //
  63. // Rev 1.22 12 Sep 1995 17:36:06 AKASAI
  64. //
  65. // Fixed bug in addressing to Intermediate when changed from writing
  66. // BYTES to DWORDS. Inter Butterfly only had the problem.
  67. //
  68. // Rev 1.21 12 Sep 1995 13:37:58 AKASAI
  69. // Added Butterfly Inter code. Also added optimizations to pre-fetch
  70. // accumulators and "output" cache lines.
  71. //
  72. // Rev 1.20 11 Sep 1995 16:41:32 CZHU
  73. // Adjust target block address: write to Target if INTRA, write to tempory sto
  74. //
  75. // Rev 1.19 11 Sep 1995 14:30:32 CZHU
  76. // Seperate Butterfly for inter and intra, put place holder for inter blocks
  77. //
  78. // Rev 1.18 08 Sep 1995 11:49:00 CZHU
  79. // Added support for P frames, fixed bugs related to INTRADC's presence.
  80. //
  81. // Rev 1.17 28 Aug 1995 14:51:22 CZHU
  82. // Improve pairing and clean up
  83. //
  84. // Rev 1.16 24 Aug 1995 15:36:24 CZHU
  85. //
  86. // Fixed bugs handling the escape code followed by 22bits fixed length code
  87. //
  88. // Rev 1.15 23 Aug 1995 14:53:32 AKASAI
  89. // Changed butterfly writes to increment by bytes and take a PITCH.
  90. //
  91. // Rev 1.14 23 Aug 1995 11:58:46 CZHU
  92. // Added signed extended inverse quant before calling idct. and others
  93. //
  94. // Rev 1.13 22 Aug 1995 17:38:28 CZHU
  95. // Calls the idct accumulation for each symbol and butterfly at the end.
  96. //
  97. // Rev 1.12 21 Aug 1995 14:39:58 CZHU
  98. //
  99. // Added IDCT initialization code and stubs for accumulation and butterfly.
  100. // Also added register saving and restoration before and after accumulation
  101. //
  102. // Rev 1.11 18 Aug 1995 17:03:32 CZHU
  103. // Added comments and clean up for integration with IDCT
  104. //
  105. // Rev 1.10 18 Aug 1995 15:01:52 CZHU
  106. // Fixed bugs in handling escape codes using byte oriented reading approach
  107. //
  108. // Rev 1.9 16 Aug 1995 14:24:22 CZHU
  109. // Bug fixes for the integration with bitstream parsing. Also changed from DWO
  110. // reading to byte oriented reading.
  111. //
  112. // Rev 1.8 15 Aug 1995 15:07:42 CZHU
  113. // Fixed the stack so that the parameters have been passed in correctly.
  114. //
  115. // Rev 1.7 14 Aug 1995 16:39:02 DBRUCKS
  116. // changed pPBlock to pCurBlock
  117. //
  118. // Rev 1.6 11 Aug 1995 16:08:12 CZHU
  119. // removed local varables in C
  120. //
  121. // Rev 1.5 11 Aug 1995 15:51:26 CZHU
  122. //
  123. // Readjust local varables on the stack. Clear ECX upfront.
  124. //
  125. // Rev 1.4 11 Aug 1995 15:14:32 DBRUCKS
  126. // variable name changes
  127. //
  128. // Rev 1.3 11 Aug 1995 13:37:26 CZHU
  129. //
  130. // Adjust to the joint optimation of IDCT, IQ, RLE, and ZZ.
  131. // Also added place holders for IDCT.
  132. //
  133. // Rev 1.2 11 Aug 1995 10:30:26 CZHU
  134. // Changed the functions parameters, and added codes to short-curcuit IDCT bef
  135. //
  136. // Rev 1.1 03 Aug 1995 14:39:04 CZHU
  137. //
  138. // further optimization.
  139. //
  140. // Rev 1.0 02 Aug 1995 15:20:02 CZHU
  141. // Initial revision.
  142. //
  143. // Rev 1.1 02 Aug 1995 10:21:12 CZHU
  144. // Added asm codes for VLD of TCOEFF, inverse quantization, run-length decode.
  145. //
  146. //--------------------------------------------------------------------------
  147. //
  148. // d3xbvriq.cpp
  149. //
  150. // Description:
  151. // This routine performs run length decoding and inverse quantization
  152. // of transform coefficients for one block.
  153. // MMx version.
  154. //
  155. // Routines:
  156. // VLD_RLD_IQ_Block
  157. //
  158. // Inputs (dwords pushed onto stack by caller; "arg n" is the C argument position):
  159. // lpBlockAction (arg 1) Pointer to block action stream for current block.
  160. //
  161. // lpSrc (arg 2) The input bitstream.
  162. //
  163. // uBitsread (arg 3) Number of bits already read.
  164. //
  165. // pIQ_INDEX (arg 5) Pointer to output coefficients and indices.
  166. //
  167. // pN (arg 4) Pointer to number of coefficients read.
  168. //
  169. // Returns:
  170. // 0 on bit stream error, otherwise total number of bits read
  171. // (including number read prior to call).
  172. //
  173. // Note:
  174. // The structure of gTAB_TCOEFF_MAJOR is as follows:
  175. // bits name: description
  176. // ---- ----- -----------
  177. // 25-18 bits: number of bitstream bits used
  178. // 17 last: flag for last coefficient
  179. // 16-9 run: number of preceding 0 coefficients plus 1
  180. // 8-2 level: absolute value of coefficient
  181. // 1 sign: sign of coefficient
  182. // 0 hit: 1 = major table miss, 0 = major table hit
  183. //
  184. // The structure of gTAB_TCOEFF_MINOR is the same, right shifted by 1 bit.
  185. // A gTAB_TCOEFF_MAJOR value of 00000001h indicates the escape code.
  186. //
  187. //--------------------------------------------------------------------------
  188. #include "precomp.h"
  // ---------------------------------------------------------------------
  // Stack slots used by VLD_RLD_IQ_Block.
  //
  // All offsets are relative to esp AFTER the prologue has executed
  // "push ebp / push esi / push edi / push ebx" (in that order):
  //
  //     esp+0     saved ebx   (pushed last)
  //     esp+4     saved edi
  //     esp+8     saved esi
  //     esp+12    saved ebp   (pushed first)
  //     esp+16    return address
  //     esp+20..  the five dword arguments, first argument lowest
  //
  // The routine has no frame of its own: each local lives in an argument
  // slot, so a P_ name and an L_ name may refer to the same address.
  // ---------------------------------------------------------------------

  // incoming arguments
  #define P_BlockAction   esp+20      // arg 1; slot doubles as L_Quantizer
  #define P_src           esp+24      // arg 2; slot doubles as L_Quantizer64
  #define P_bits          esp+28      // arg 3; slot doubles as L_Bits
  #define P_num           esp+32      // arg 4; no local is overlaid on this slot
  #define P_dst           esp+36      // arg 5; slot doubles as L_CumRun

  // locals overlaid on the argument slots
  #define L_Quantizer     esp+20      // quantizer             (over P_BlockAction)
  #define L_Quantizer64   esp+24      // 64 * quantizer        (over P_src)
  #define L_Bits          esp+28      // running bit offset    (over P_bits)
  #define L_CumRun        esp+36      // cumulative run        (over P_dst)
  206. #pragma code_seg("IACODE1")
  207. extern "C" __declspec(naked)
  208. U32 VLD_RLD_IQ_Block(T_BlkAction *lpBlockAction,
  209. U8 *lpSrc,
  210. U32 uBitsread,
  211. U32 *pN,
  212. U32 *pIQ_INDEX)
  213. {
  214. __asm {
  215. // save registers
  216. push ebp
  217. push esi
  218. push edi
  219. push ebx
  220. //
  221. // initialize
  222. // make sure we read in the P_src and P_dst pointers before we
  223. // overwrite them with L_Quantizer64 and L_CumRun.
  224. //
  225. // Output Registers:
  226. // dl = block type ([P_BlockAction])
  227. // esi = bitstream source pointer (P_src)
  228. // edi = coefficient destination pointer (P_dst)
  229. // ebp = coefficent counter (init to 0)
  230. //
  231. // Locals initialized on Stack: (these overwrite indicated input parameters)
  232. // local var clobbers initial value
  233. // ---------------------------------------------------
  234. // L_Quantizer P_BlockAction input quantizer
  235. // L_Quantizer64 P_src 64 * input quantizer
  236. // L_CumRun P_dst -1
  237. //
  238. xor ebp, ebp // init coefficient counter to 0
  239. xor eax, eax // zero eax for quantizer & coef. counter
  240. mov ecx, [P_BlockAction] // ecx = block action pointer
  241. mov ebx, -1 // beginning cumulative run value
  242. mov esi, [P_src] // esi = bitstream source pointer
  243. mov edi, [P_dst] // edi = coefficient pointer
  244. mov al, [ecx+3] // al = Quantizer
  245. mov [L_CumRun], ebx // init cumulative run to -1
  246. mov [L_Quantizer], eax // save original quantizer
  247. mov dl, [ecx] // block type in dl
  248. shl eax, 6 // 64 * Quantizer
  249. mov ecx, [L_Bits] // ecx = L_Bits
  250. mov ebx, ecx // ebx = L_Bits
  251. mov [L_Quantizer64], eax // save 64*Quantizer for this block
  252. shr ebx, 3 // offset for input
  253. and ecx, 7 // shift value
  254. cmp dl, 1 // check the block type for INTRA
  255. ja get_next_coefficient // if type 2 or larger, no INTRADC
  256. //
  257. // Decode INTRADC
  258. //
  259. // uses dword load & bitswap to achieve big endian ordering.
  260. // prior codes prepares ebx, cl, and dl as follows:
  261. // ebx = L_Bits>>3
  262. // cl = L_Bits&7
  263. // dl = BlockType (0=INTRA_DC, 1=INTRA, 2=INTER, etc.)
  264. //
  265. mov eax, [esi+ebx] // *** PROBABLE MALALIGNMENT ***
  266. inc ebp // one coefficient decoded
  267. bswap eax // big endian order
  268. // *** NOT PAIRABLE ***
  269. shl eax, cl // left justify bitstream buffer
  270. // *** NOT PAIRABLE ***
  271. // *** 4 CYCLES ***
  272. shr eax, 21 // top 11 bits to the bottom
  273. mov ecx, [L_Bits] // ecx = L_Bits
  274. and eax, 07f8h // mask last 3 bits
  275. add ecx, 8 // bits used += 8 for INTRADC
  276. cmp eax, 07f8h // check for 11111111 codeword
  277. jne skipa
  278. mov eax, 0400h // 11111111 decodes to 400h = 1024
  279. skipa:
  280. mov [L_Bits], ecx // update bits used
  281. xor ebx, ebx
  282. mov [L_CumRun], ebx // save total run (starts with zero)
  283. mov [edi], eax // save decoded DC coefficient
  284. mov [edi+4], ebx // save 0 index
  285. mov ebx, ecx // ebx = L_Bits
  286. shr ebx, 3 // offset for input
  287. add edi, 8 // update coefficient pointer
  288. // check for last
  289. test dl, dl // check for INTRA-DC (block type=0)
  290. jz finish // if only the INTRADC present
  291. //
  292. // Get Next Coefficient
  293. //
  294. // prior codes prepares ebx and ecx as follows:
  295. // ebx = L_Bits>>3
  296. // ecx = L_Bits
  297. //
  298. get_next_coefficient:
  299. // use dword load & bitswap to achieve big endian ordering
  300. mov eax, [esi+ebx] // *** PROBABLE MALALIGNMENT ***
  301. and ecx, 7 // shift value
  302. bswap eax // big endian order
  303. // *** NOT PAIRABLE ***
  304. shl eax, cl // left justify buffer
  305. // *** NOT PAIRABLE ***
  306. // *** 4 CYCLES ***
  307. // do table lookups
  308. mov ebx, eax // ebx for major table
  309. mov ecx, eax // ecx for minor table
  310. shr ebx, 24 // major table lookup
  311. shr ecx, 17 // minor table lookup in bits with garbage
  312. mov ebx, [gTAB_TCOEFF_MAJOR+4*ebx] // get the major table value
  313. // ** AGI **
  314. shr ebx, 1 // test major hit ?
  315. jnc skipb // if hit major
  316. and ecx, 0ffch // mask off garbage for minor table
  317. test ebx, ebx // escape code value was 0x00000001
  318. jz escape_code // handle escape by major table.
  319. mov ebx, [gTAB_TCOEFF_MINOR+ecx] // use minor table
  320. //
  321. // input is ebx = event. See function header for the meaning of its fields
  322. // now we decode the event, extracting the run, value, last.
  323. // The table value moves to ecx and is shifted downward as portions
  324. // are extracted to ebx.
  325. //
  326. skipb:
  327. mov ecx, ebx // ecx = table value
  328. and ebx, 0ffh // ebx = 2*abs(level) + sign
  329. shr ecx, 8 // run to bottom
  330. mov edx, [L_Quantizer64] // edx = 64*quant
  331. // ** PREFIX DELAY **
  332. // ** AGI **
  333. mov ax, [gTAB_INVERSE_Q+edx+2*ebx] // ax = dequantized value (I16)
  334. mov ebx, ecx // ebx = table value
  335. shl eax, 16 // shift value until sign bit is on top
  336. and ebx, 0ffh // ebx = run + 1
  337. sar eax, 16 // arithmetic shift extends value's sign
  338. mov edx, [L_CumRun] // edx = (old) cumulative run
  339. add edx, ebx // cumulative run += run + 1
  340. mov [edi], eax // save coefficient's signed value
  341. cmp edx, 03fh // check run for bitstream error
  342. jg error
  343. mov [L_CumRun], edx // update the cumulative run
  344. inc ebp // increment number of coefficients read
  345. // ** AGI **
  346. mov edx, [gTAB_ZZ_RUN+4*edx] // edx = index of the current coefficient
  347. mov ebx, ecx // ebx: bit 8 = last flag
  348. mov [edi+4], edx // save coefficient's index
  349. add edi, 8 // increment coefficient pointer
  350. shr ecx, 9 // ecx = bits decoded
  351. mov edx, [L_Bits] // edx = L_Bits
  352. add ecx, edx // L_Bits += bits decoded
  353. mov edx, ebx // ebx: bit 8 = last flag
  354. mov [L_Bits], ecx // update L_Bits
  355. mov ebx, ecx // ebx = L_Bits
  356. shr ebx, 3 // offset for bitstream load
  357. test edx, 100h // check for last
  358. jz get_next_coefficient
  359. finish:
  360. mov ecx, [P_num] // pointer to number of coeffients read
  361. mov eax, [L_Bits] // return total bits used
  362. pop ebx
  363. pop edi
  364. mov [ecx], ebp // store number of coefficients read
  365. pop esi
  366. pop ebp
  367. ret
  368. //
  369. // process escape code separately
  370. //
  371. // we have the following 4 cases to compute the reconstructed value
  372. // depending on the sign of L=level and the parity of Q=quantizer:
  373. //
  374. // L pos L neg
  375. // Q even 2QL+(Q-1) 2QL-(Q-1)
  376. // Q odd 2QL+(Q) 2QL-(Q)
  377. //
  378. // The Q or Q-1 term is formed by adding Q to its parity bit
  379. // and then subtracting 1.
  380. // The + or - on this term is gotten by anding the term with a
  381. // mask (=0 or =-1) formed from the sign bit of Q*L,
  382. // doubling the result, then subtracting it from the term.
  383. // This will negate the term when L is negative and leave
  384. // it unchanged when L is positive.
  385. //
  386. // Register usages:
  387. // eax starts with bitstream, later L, finally result
  388. // ebx starts with Q, later is the Q or Q-1 term
  389. // ecx startw with mask, later 2*term
  390. // edx bitstream
  391. //
  392. escape_code:
  393. mov edx, eax // edx = bitstream buffer
  394. shl eax, 14 // signed 8-bit level to top
  395. sar eax, 24 // eax = L (signed level)
  396. mov ebx, [L_Quantizer]
  397. test eax, 7fh // test for invalid codes
  398. jz error
  399. imul eax, ebx // eax = Q*L
  400. // *** NOT PAIRABLE ***
  401. // *** 10 cycles ***
  402. dec ebx // term = Q-1
  403. mov ecx, eax // mask = QL
  404. or ebx, 1 // term = Q-1 if Q even, else = Q
  405. sar ecx, 31 // mask = -1 if L neg, else = 0
  406. xor ebx, ecx // term = ~Q[-1] if L neg, else = Q[-1]
  407. add eax, eax // result = 2*Q*L
  408. sub ebx, ecx // term = -(Q[-1]) if L neg, else = Q[-1]
  409. mov ecx, edx // bitstream to ecx to get run
  410. add eax, ebx // result = 2QL +- Q[-1]
  411. // now clip to -2048 ... +2047 (12 bits: 0xfffff800 <= res <= 0x000007ff)
  412. cmp eax, -2048
  413. jge skip1
  414. mov eax, -2048
  415. jmp skip2
  416. skip1:
  417. cmp eax, +2047
  418. jle skip2
  419. mov eax, 2047
  420. skip2:
  421. // update run and compute index
  422. shr ecx, 18 // run to bottom
  423. mov ebx, [L_CumRun] // ebx = old total run
  424. and ecx, 3fh // mask off bottom 6 bits for run
  425. inc ebx // old run ++
  426. add ebx, ecx // ebx = new cumulative run
  427. mov [edi], eax // save coefficient's signed value
  428. cmp ebx, 03fh // check run for bitstream error
  429. jg error
  430. mov [L_CumRun], ebx // update the cumulative run
  431. mov ecx, [L_Bits] // ebx = number of bits used
  432. mov ebx, [gTAB_ZZ_RUN+4*ebx] // ebx = index of the current coefficient
  433. add ecx, 22 // escape code uses 22 bits
  434. mov [edi+4], ebx // save coefficient's index
  435. add edi, 8 // increment coefficient pointer
  436. mov [L_Bits], ecx // update number of bits used
  437. mov ebx, ecx // ebx = L_Bits
  438. shr ebx, 3 // offset for bitstream load
  439. inc ebp // increment number of coefficients read
  440. test edx, 01000000h // check last bit
  441. jz get_next_coefficient
  442. jmp finish
  443. error:
  444. pop ebx
  445. pop edi
  446. pop esi
  447. pop ebp
  448. xor eax, eax // zero bits used indicates ERROR
  449. ret
  450. }
  451. }
  452. #pragma code_seg()