Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1515 lines
72 KiB

  1. /* *************************************************************************
  2. ** INTEL Corporation Proprietary Information
  3. **
  4. ** This listing is supplied under the terms of a license
  5. ** agreement with INTEL Corporation and may not be copied
  6. ** nor disclosed except in accordance with the terms of
  7. ** that agreement.
  8. **
  9. ** Copyright (c) 1995-1996 Intel Corporation.
  10. ** All Rights Reserved.
  11. **
  12. ** *************************************************************************
  13. */
  14. //////////////////////////////////////////////////////////////////////////
  15. // $Author: MBODART $
  16. // $Date: 05 Aug 1996 11:03:52 $
  17. // $Archive: S:\h26x\src\dec\d1idct.cpv $
  18. // $Header: S:\h26x\src\dec\d1idct.cpv 1.0 05 Aug 1996 11:03:52 MBODART $
  19. // $Log: S:\h26x\src\dec\d1idct.cpv $
  20. //
  21. // Rev 1.0 05 Aug 1996 11:03:52 MBODART
  22. // Initial revision.
  23. //
  24. // Started from d3idct.cpp
  25. //
  26. // Rev 1.8 08 Mar 1996 16:46:20 AGUPTA2
  27. // Added pragma code_seg. Rolled the initialization code. Got rid of most
  28. // of 32-bit displacements in instructions. Aligned frequently executed loops
  29. // at 4-byte boundary. Made changes to reflect new size of MapMatrix. Removed
  30. // nop instructions. Deleted code that prefetches output lines in case of
  31. // INTRA blocks. Use ClampTbl instead of ClipPixIntra. Do not clip output
  32. // of INTER blocks; clipping is done in dxblkadd().
  33. //
  34. //
  35. //Block level decoding for H.261 decoder
  36. #include "precomp.h"
  37. /////////////////////////////////////////////////////////////////////////
  38. // Decode each non-empty block
  39. // Input: lpInst: decoder instance,
  40. // lpSrc: input bitstream,
  41. // lpBlockAction:
  42. // the pointer to the block action stream structure
  43. // bitsread: number of bits already in the buffer,
  44. /////////////////////////////////////////////////////////////////////////
  45. // local variable definitions
  46. #define FRAMEPOINTER esp // locals are addressed relative to esp; code reached via "call" adds 4 to skip the return address
  47. #define L_BITSUSED FRAMEPOINTER + 0 // offset 0, 4 bytes
  48. #define L_ACCUM L_BITSUSED + 4 // offset 4, IDCT accumulators, 64 DWORDs (256 bytes)
  49. #define L_DESTBLOCK L_ACCUM + 256 // offset 260, output buffer pointer (pIntraBuf or pInterBuf)
  50. #define L_NO_COEFF L_DESTBLOCK + 4 // offset 264, remaining coefficient count, saved across the call to idct_acc
  51. #define L_PRODUCT L_NO_COEFF + 4 // offset 268, product[] scratch, 20 DWORDs (80 bytes)
  52. #define L_LOOPCOUNTER L_PRODUCT + 80 // offset 348, coefficient index in idct_acc, loop count in the butterfly
  53. #define L_INPUT_INTER L_LOOPCOUNTER + 4 // offset 352, copy of pInterBuf
  54. #define L_esi L_INPUT_INTER + 4 // offset 356, copy of pIQ_INDEX
  55. #define L_DESTBLOCK_1 L_esi + 4 // offset 360, akk
  56. #define L_DESTBLOCK_2 L_DESTBLOCK_1 + 4 // offset 364, akk
  57. #ifdef PTEL_WORK_AROUND
  58. #define L_COEFFCOUNT L_DESTBLOCK_2 + 4 // offset 368, akk; follows L_DESTBLOCK_2 so the two slots do not share storage
  59. #define L_COEFFVALUE L_COEFFCOUNT + 4 // offset 372, akk
  60. #endif
  61. #define L_END_OF_FRAME FRAMEPOINTER + 512 // first byte past the local frame
  62. #define LOCALSIZE ((512+3)&~3) // frame size in bytes, rounded up to a multiple of 4
  63. ////////////////////////////////////////////////////////////////////////////////
  64. // Input:
  65. // pIQ_INDEX, pointer to pointer for Inverse quantization and index
  66. // for the current block.
  67. // No_Coeff, a 32-bit number indicating the block type, etc.
  68. // 0--63, inter block, number of coeff
  69. // 64--127 64+ intra block, number of coeff
  70. // pIntraBuf, Buffer pointer for intra blocks.
  71. //
  72. // pInterBuf, Buffer pointer for inter blocks.
  73. //
  74. //
  75. // return:
  76. //
  77. //////////////////////////////////////////////////////////////////////////////////
  78. #pragma code_seg("IACODE2")
  79. __declspec(naked)
  80. U32 DecodeBlock_IDCT ( U32 pIQ_INDEX,
  81. U32 No_Coeff,
  82. U32 pIntraBuf,
  83. U32 pInterBuf)
  84. {
  85. __asm
  86. {
  87. push ebp // save callers frame pointer
  88. mov ebp, esp // make parameters accessible
  89. push esi // assumed preserved
  90. push edi
  91. push ebx
  92. sub esp, LOCALSIZE // reserve local storage
  93. mov eax, pInterBuf
  94. lea edi, [L_ACCUM+128]
  95. mov [L_INPUT_INTER], eax
  96. ;add edi, 128 // Adjust offset to save code space
  97. mov edx, No_Coeff
  98. ;
  99. ////////////////////////////////////////////////////////////////////////
  100. // Initialize accumulators for IDCT
  101. // ROUNDER was pre-computed.
  102. //
  103. // C code:
  104. //
  105. // for (x=0; x<16; x++)
  106. // acc[x] = rounder;
  107. // for (x=16; x<64; x++)
  108. // acc[x] = 0L;
  109. //
  110. mov esi, [edi-128] ; pre-fetch accumulators
  111. mov ebx, [edi-96] ; pre-fetch
  112. mov esi, [edi-64] ; pre-fetch more
  113. mov ebx, [edi-32] ; pre-fetch more
  114. mov esi, [edi] ; pre-fetch more
  115. mov ebx, [edi+32] ; pre-fetch more
  116. mov esi, [edi+64] ; pre-fetch more
  117. mov ebx, [edi+96] ; pre-fetch more
  118. xor esi, esi
  119. sub edi, 128
  120. mov eax, ROUNDER
  121. mov ebx, 64
  122. loop_for_init:
  123. mov [edi], eax
  124. mov [edi+4], eax
  125. mov [edi+ebx], esi
  126. mov [edi+ebx+4], esi
  127. mov [edi+ebx+8], esi
  128. mov [edi+ebx+12], esi
  129. mov [edi+ebx+16], esi
  130. mov [edi+ebx+20], esi
  131. add edi, 8
  132. add ebx, 16
  133. cmp ebx, 192
  134. jl loop_for_init
  135. //end of IDCT init.
  136. #ifdef PTEL_WORK_AROUND
  137. mov [L_COEFFCOUNT], esi // zero out coefficient counter
  138. mov [L_COEFFVALUE], esi // zero out coefficient value
  139. #endif
  140. cmp edx, 65
  141. jg intra_block
  142. mov ebx, pInterBuf
  143. jmp pre_acc_loop
  144. intra_block:
  145. mov ebx, pIntraBuf
  146. sub edx, 65
  147. // register:
  148. // ebp: loop counter
  149. // ebx: inverse quant
  150. // ecx: index [0,63]
  151. pre_acc_loop:
  152. mov esi, pIQ_INDEX
  153. mov [L_DESTBLOCK], ebx
  154. mov [L_esi], esi
  155. ALIGN 4
  156. acc_loop:
  157. mov ebx,[esi+edx*8-8] //Invserse Quant
  158. mov ecx,[esi+edx*8-4] //Coeff index
  159. mov [L_NO_COEFF], edx
  160. call idct_acc
  161. mov esi, [L_esi]
  162. mov edx, [L_NO_COEFF]
  163. dec edx
  164. jnz acc_loop
  165. mov edx, [L_DESTBLOCK]
  166. mov ecx, [L_INPUT_INTER]
  167. cmp edx, ecx
  168. jnz call_intra_bfly
  169. call idct_bfly_inter
  170. add esp, LOCALSIZE // free locals
  171. add eax, edi
  172. pop ebx
  173. pop edi
  174. pop esi
  175. pop ebp
  176. ret
  177. call_intra_bfly:
  178. call idct_bfly_intra
  179. add esp, LOCALSIZE // free locals
  180. add eax, edi
  181. pop ebx
  182. pop edi
  183. pop esi
  184. pop ebp
  185. ret
  186. ///////////////////////////////////////////////////////////////
  187. // This "subroutine" idct_acc performs the accumulator phase of
  188. // the fmidct.
  189. //
  190. // assume parameter passed in by registers
  191. // ebx, inversed quantized value, input
  192. // ecx, index [0,63]
  193. //
  194. // C code:
  195. //
  196. // for (i=0; i<NUM_ELEM; i++) // Loop through each input
  197. // {
  198. // if (input[i])
  199. // {
  200. // pNKernel = &NKernel[i]; // initialize kernel pointer
  201. // totalU = pNKernel->totalUnique;
  202. // for (x=0; x<totalU; x++) // compute positive and negative products
  203. // {
  204. // product[x] = input[i] * pNKernel->coeff[x];
  205. // product[x+totalU] = -product[x];
  206. // }
  207. // // Loop through each entry in the output matrix
  208. // acc[pNKernel->PClass] += product[ pNKernel->matrix[0] ];
  209. // acc[1+pNKernel->PClass] += product[ pNKernel->matrix[1] ];
  210. // acc[2+pNKernel->PClass] += product[ pNKernel->matrix[2] ];
  211. // acc[3+pNKernel->PClass] += product[ pNKernel->matrix[3] ];
  212. // acc[4+pNKernel->PClass] += product[ pNKernel->matrix[4] ];
  213. // acc[5+pNKernel->PClass] += product[ pNKernel->matrix[5] ];
  214. // acc[6+pNKernel->PClass] += product[ pNKernel->matrix[6] ];
  215. // acc[7+pNKernel->PClass] += product[ pNKernel->matrix[7] ];
  216. // acc[8+pNKernel->PClass] += product[ pNKernel->matrix[8] ];
  217. // acc[9+pNKernel->PClass] += product[ pNKernel->matrix[9] ];
  218. // acc[10+pNKernel->PClass] += product[ pNKernel->matrix[10] ];
  219. // acc[11+pNKernel->PClass] += product[ pNKernel->matrix[11] ];
  220. // acc[12+pNKernel->PClass] += product[ pNKernel->matrix[12] ];
  221. // acc[13+pNKernel->PClass] += product[ pNKernel->matrix[13] ];
  222. // acc[14+pNKernel->PClass] += product[ pNKernel->matrix[14] ];
  223. // acc[15+pNKernel->PClass] += product[ pNKernel->matrix[15] ];
  224. // }
  225. // }
  226. ///////////////////////////////////////////////////////////////
  227. // assume parameter passed in by registers
  228. // ebx, inverse quant
  229. // ecx, index [0,63]
  230. idct_acc:
  231. ; For every non-zero coefficient:
  232. ; LoopCounter, on local stack, has index
  233. ; ecx = index (0-63)
  234. ; ebx = non-zero input
  235. ; Note i = index
  236. ;
  237. #ifdef PTEL_WORK_AROUND
  238. mov edx, [L_COEFFCOUNT+4] ; get coefficient counter
  239. mov [L_COEFFVALUE+4], ebx ; store coefficient value
  240. inc edx
  241. ;
  242. mov [L_COEFFCOUNT+4], edx ; store updated coefficient counter
  243. ;
  244. #endif
  245. and ecx, 03fh ; Chad added to prevent GPF
  246. xor edx, edx ; zero out for byte read, use as dword
  247. mov [L_LOOPCOUNTER+4], ecx ; Store Loop counter
  248. mov esi, ecx ; move index to esi
  249. lea eax, Unique ; eax = Address of Unique[0]
  250. mov ebp, ecx ; move index to ebp
  251. shl esi, 3 ; index*8
  252. add ecx, ecx ; index*2
  253. add esi, ecx ; index*10
  254. lea ecx, KernelCoeff ; get KernelCoeff[0][0]
  255. lea edi, [L_PRODUCT+4] ; edi = address of product[0]
  256. mov dl, [eax+ebp] ; get Unique[i]
  257. lea esi, [ecx+4*esi] ; address of KernelCoeff[i][0]
  258. mov ebp, edx ; ebp = Unique[i]
  259. lea eax, [edi+edx*4] ; eax = address of product[totalU]
  260. ;nop
  261. ; ----------------------------------------------------------------------
  262. ; Register usage
  263. ; eax = addr of product[Unique[i]]
  264. ; ebx = input[i]
  265. ; ecx = 0, -product[x]
  266. ; edx = KernelCoeff[i][x], product[x]= KernelCoeff[i][x] * input[i]
  267. ; ebp = x
  268. ; edi = addr of product[0]
  269. ; esi = addr of KernelCoeff[i][x]
  270. ALIGN 4
  271. loop_for_x:
  272. xor ecx, ecx
  273. mov edx, [esi+ebp*4-4] ; read KernelCoeff[i][x]
  274. imul edx, ebx ; KernelCoeff[i][x] * input[i]
  275. mov [edi+ebp*4-4], edx ; product[x] = result of imul
  276. sub ecx, edx
  277. mov [eax+ebp*4-4], ecx ; product[totalU+x] = -product[x]
  278. dec ebp ; decrement x
  279. jnz loop_for_x
  280. ; ----------------------------------------------------------------------
  281. ; Register usage
  282. ; eax = MapMatrix[i][0-15]
  283. ; ebx = address of PClass[0], accum[PClass[i]]
  284. ; ecx = LoopCounter, addr of MapMatrix[i][0]
  285. ; edx = [0-15]+PClass[i], accum[[0-15]+PClass[i]]
  286. ; ebp = product[MapMatrix[i][0-15]]
  287. ; edi = addr of product[0]
  288. ; esi = address of accum[0], address of accum[PClass[i]]
  289. mov ecx, [L_LOOPCOUNTER+4] ; get i
  290. and ecx, 03fh ; Chad added to prevent GPF
  291. lea ebx, PClass ; get addr of PClass[0]
  292. mov esi, ecx ; save i in esi
  293. shl ecx, 4 ; i*16
  294. lea eax, MapMatrix ; get addr of MapMatrix[0][0]
  295. xor edx, edx
  296. nop
  297. mov dl, [ebx+esi] ; get PClass[i]
  298. lea ecx, [eax+ecx] ; get addr of MapMatrix[i][0]
  299. shl edx, 2 ; PClass[i]*4
  300. lea esi, [L_ACCUM+4] ; get addr of accum[0]
  301. ; ----------------------------------------------------------------------
  302. xor eax, eax
  303. add esi, edx ; esi = address of accum[PClass[i]]
  304. mov al, [ecx] ; get MapMatrix[i][0]
  305. nop
  306. ;nop
  307. mov ebx, [esi] ; get accum[PClass[i]]
  308. nop
  309. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][0]]
  310. mov al, [ecx+1] ; get MapMatrix[i][1]
  311. add ebx, ebp ; accum[PClass[i]] += product[
  312. ; MapMatrix[i][0]]
  313. mov edx, [esi+4] ; get accum[1+PClass[i]]
  314. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][1]]
  315. mov al, [ecx+2] ; get MapMatrix[i][2]
  316. add edx, ebp ; accum[1+PClass[i]] += product[
  317. ; MapMatrix[i][1]]
  318. mov [esi], ebx ; store accum[PClass[i]] += product[
  319. ; MapMatrix[i][0]]
  320. mov [esi+4], edx ; store accum[1+PClass[i]] +=
  321. ; product[MapMatrix[i][1]]
  322. mov ebx, [esi+8] ; get accum[2+PClass[i]]
  323. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][2]]
  324. mov al, [ecx+3] ; get MapMatrix[i][3]
  325. add ebx, ebp ; accum[2+PClass[i]] += product[
  326. ; MapMatrix[i][2]]
  327. mov edx, [esi+12] ; get accum[3+PClass[i]]
  328. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][3]]
  329. mov al, [ecx+4] ; get MapMatrix[i][4]
  330. add edx, ebp ; accum[3+PClass[i]] += product[
  331. ; MapMatrix[i][3]]
  332. mov [esi+8], ebx ; store accum[2+PClass[i]] +=
  333. ; product[MapMatrix[i][2]]
  334. mov [esi+12], edx ; store accum[3+PClass[i]] +=
  335. ; product[MapMatrix[i][3]]
  336. ; ----------------------------------------------------------------------
  337. mov ebx, [esi+16] ; get accum[4+PClass[i]]
  338. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][4]]
  339. mov al, [ecx+5] ; get MapMatrix[i][5]
  340. add ebx, ebp ; accum[4+PClass[i]] += product[
  341. ; MapMatrix[i][4]]
  342. mov edx, [esi+20] ; get accum[5+PClass[i]]
  343. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][5]]
  344. mov al, [ecx+6] ; get MapMatrix[i][6]
  345. add edx, ebp ; accum[5+pNkernel->PClass] += product[
  346. ; MapMatrix[i][5]]
  347. mov [esi+16], ebx ; store accum[4+PClass[i]] +=
  348. ; product[MapMatrix[i][4]]
  349. mov [esi+20], edx ; store accum[5+PClass[i]] +=
  350. ; product[MapMatrix[i][5]]
  351. mov ebx, [esi+24] ; get accum[6+PClass[i]]
  352. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][6]]
  353. mov al, [ecx+7] ; get MapMatrix[i][7]
  354. add ebx, ebp
  355. mov edx, [esi+28] ; get accum[7+PClass[i]]
  356. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][7]]
  357. mov al, [ecx+8] ; get MapMatrix[i][8]
  358. add edx, ebp ; accum[7+PClass[i]] += product[
  359. ; MapMatrix[i][7]]
  360. mov [esi+24], ebx ; store accum[6+PClass[i]] +=
  361. ; product[MapMatrix[i][6]]
  362. mov [esi+28], edx ; store accum[7+PClass[i]] +=
  363. ; product[MapMatrix[i][7]]
  364. ; ----------------------------------------------------------------------
  365. mov ebx, [esi+32] ; get accum[8+PClass[i]]
  366. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][8]]
  367. mov al, [ecx+9] ; get MapMatrix[i][9]
  368. add ebx, ebp ; accum[8+PClass[i]] += product[
  369. ; MapMatrix[i][8]]
  370. mov edx, [esi+36] ; get accum[9+PClass[i]]
  371. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][9]]
  372. mov al, [ecx+10] ; get MapMatrix[i][10]
  373. add edx, ebp ; accum[9+pNkernel->PClass] += product[
  374. ; MapMatrix[i][9]]
  375. mov [esi+32], ebx ; store accum[8+PClass[i]] +=
  376. ; product[MapMatrix[i][8]]
  377. mov [esi+36], edx ; store accum[9+PClass[i]] +=
  378. ; product[MapMatrix[i][9]]
  379. mov ebx, [esi+40] ; get accum[10+PClass[i]]
  380. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][10]]
  381. mov al, [ecx+11] ; get MapMatrix[i][11]
  382. add ebx, ebp
  383. mov edx, [esi+44] ; get accum[11+PClass[i]]
  384. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][11]]
  385. ; product[MapMatrix[i][11]]
  386. mov al, [ecx+12] ; get MapMatrix[i][12]
  387. add edx, ebp ; accum[11+PClass[i]] += product[
  388. ; MapMatrix[i][11]]
  389. mov [esi+40], ebx ; store accum[10+PClass[i]] +=
  390. ; product[MapMatrix[i][10]]
  391. mov [esi+44], edx ; store accum[11+PClass[i]] +=
  392. ; product[MapMatrix[i][11]]
  393. ; ----------------------------------------------------------------------
  394. mov ebx, [esi+48] ; get accum[12+PClass[i]]
  395. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][12]]
  396. mov al, [ecx+13] ; get MapMatrix[i][13]
  397. add ebx, ebp ; accum[12+PClass[i]] += product[
  398. ; MapMatrix[i][12]]
  399. mov edx, [esi+52] ; get accum[13+PClass[i]]
  400. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][13]]
  401. mov al, [ecx+14] ; get MapMatrix[i][14]
  402. add edx, ebp ; accum[13+pNkernel->PClass] += product[
  403. ; MapMatrix[i][13]]
  404. mov [esi+48], ebx ; store accum[PClass[i]] += product[
  405. ; MapMatrix[i][13]]
  406. mov [esi+52], edx ; store accum[13+PClass[i]] +=
  407. ; product[MapMatrix[i][13]]
  408. mov ebx, [esi+56] ; get accum[14+PClass[i]]
  409. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][14]]
  410. mov al, [ecx+15] ; get MapMatrix[i][15]
  411. add ebx, ebp
  412. mov edx, [esi+60] ; get accum[15+PClass[i]]
  413. mov ebp, [edi+eax*4] ; get product[MapMatrix[i][15]]
  414. mov [esi+56], ebx ; store accum[14+PClass[i]] +=
  415. ; product[MapMatrix[i][14]]
  416. add edx, ebp ; accum[15+PClass[i]] += product[
  417. ; MapMatrix[i][15]]
  418. mov [esi+60], edx ; store accum[15+PClass[i]] +=
  419. ; product[MapMatrix[i][15]]
  420. ret
  421. //////////////////////////////////////////////////////////////////////
  422. // This "subroutine" idct_bfly_intra performs the butterfly phase of
  423. // the fmidct for intra blocks.
  424. //
  425. // assume parameters passed in by registers
  426. //
  427. // C code:
  428. //
  429. // Upper Left Quadrant
  430. // Upper Right Quadrant
  431. // Lower Left Quadrant
  432. // Lower Right Quadrant
  433. //
  434. // lOut[0][0] = CLIP_INTRA[acc[0]+acc[16] + acc[32]+acc[48]];
  435. // lOut[0][7] = CLIP_INTRA[acc[0]+acc[16] - (acc[32]+acc[48])];
  436. // lOut[7][0] = CLIP_INTRA[(acc[0]-acc[16]) + (acc[32]-acc[48])];
  437. // lOut[7][7] = CLIP_INTRA[(acc[0]-acc[16]) - (acc[32]-acc[48])];
  438. //
  439. // lOut[0][1] = CLIP_INTRA[acc[1]+acc[17] + acc[33]+acc[49]];
  440. // lOut[0][6] = CLIP_INTRA[acc[1]+acc[17] - (acc[33]+acc[49])];
  441. // lOut[7][1] = CLIP_INTRA[(acc[1]-acc[17]) + (acc[33]-acc[49])];
  442. // lOut[7][6] = CLIP_INTRA[(acc[1]-acc[17]) - (acc[33]-acc[49])];
  443. //
  444. // lOut[0][2] = CLIP_INTRA[acc[2]+acc[18] + acc[34]+acc[50]];
  445. // lOut[0][5] = CLIP_INTRA[acc[2]+acc[18] - (acc[34]+acc[50])];
  446. // lOut[7][2] = CLIP_INTRA[(acc[2]-acc[18]) + (acc[34]-acc[50])];
  447. // lOut[7][5] = CLIP_INTRA[(acc[2]-acc[18]) - (acc[34]-acc[50])];
  448. //
  449. // lOut[0][3] = CLIP_INTRA[acc[3]+acc[19] + acc[35]+acc[51]];
  450. // lOut[0][4] = CLIP_INTRA[acc[3]+acc[19] - (acc[35]+acc[51])];
  451. // lOut[7][3] = CLIP_INTRA[(acc[3]-acc[19]) + (acc[35]-acc[51])];
  452. // lOut[7][4] = CLIP_INTRA[(acc[3]-acc[19]) - (acc[35]-acc[51])];
  453. //
  454. //
  455. // lOut[1][0] = CLIP_INTRA[acc[4]+acc[20] + acc[36]+acc[52]];
  456. // lOut[1][7] = CLIP_INTRA[acc[4]+acc[20] - (acc[36]+acc[52])];
  457. // lOut[6][0] = CLIP_INTRA[(acc[4]-acc[20]) + (acc[36]-acc[52])];
  458. // lOut[6][7] = CLIP_INTRA[(acc[4]-acc[20]) - (acc[36]-acc[52])];
  459. //
  460. // lOut[1][1] = CLIP_INTRA[acc[5]+acc[21] + acc[37]+acc[53]];
  461. // lOut[1][6] = CLIP_INTRA[acc[5]+acc[21] - (acc[37]+acc[53])];
  462. // lOut[6][1] = CLIP_INTRA[(acc[5]-acc[21]) + (acc[37]-acc[53])];
  463. // lOut[6][6] = CLIP_INTRA[(acc[5]-acc[21]) - (acc[37]-acc[53])];
  464. //
  465. // lOut[1][2] = CLIP_INTRA[acc[6]+acc[22] + acc[38]+acc[54]];
  466. // lOut[1][5] = CLIP_INTRA[acc[6]+acc[22] - (acc[38]+acc[54])];
  467. // lOut[6][2] = CLIP_INTRA[(acc[6]-acc[22]) + (acc[38]-acc[54])];
  468. // lOut[6][5] = CLIP_INTRA[(acc[6]-acc[22]) - (acc[38]-acc[54])];
  469. //
  470. // lOut[1][3] = CLIP_INTRA[acc[7]+acc[23] + acc[39]+acc[55]];
  471. // lOut[1][4] = CLIP_INTRA[acc[7]+acc[23] - (acc[39]+acc[55])];
  472. // lOut[6][3] = CLIP_INTRA[(acc[7]-acc[23]) + (acc[39]-acc[55])];
  473. // lOut[6][4] = CLIP_INTRA[(acc[7]-acc[23]) - (acc[39]-acc[55])];
  474. //
  475. //
  476. // lOut[2][0] = CLIP_INTRA[acc[8]+acc[24] + acc[40]+acc[56]];
  477. // lOut[2][7] = CLIP_INTRA[acc[8]+acc[24] - (acc[40]+acc[56])];
  478. // lOut[5][0] = CLIP_INTRA[(acc[8]-acc[24]) + (acc[40]-acc[56])];
  479. // lOut[5][7] = CLIP_INTRA[(acc[8]-acc[24]) - (acc[40]-acc[56])];
  480. //
  481. // lOut[2][1] = CLIP_INTRA[acc[9]+acc[25] + acc[41]+acc[57]];
  482. // lOut[2][6] = CLIP_INTRA[acc[9]+acc[25] - (acc[41]+acc[57])];
  483. // lOut[5][1] = CLIP_INTRA[(acc[9]-acc[25]) + (acc[41]-acc[57])];
  484. // lOut[5][6] = CLIP_INTRA[(acc[9]-acc[25]) - (acc[41]-acc[57])];
  485. //
  486. // lOut[2][2] = CLIP_INTRA[acc[10]+acc[26] + acc[42]+acc[58]];
  487. // lOut[2][5] = CLIP_INTRA[acc[10]+acc[26] - (acc[42]+acc[58])];
  488. // lOut[5][2] = CLIP_INTRA[(acc[10]-acc[26]) + (acc[42]-acc[58])];
  489. // lOut[5][5] = CLIP_INTRA[(acc[10]-acc[26]) - (acc[42]-acc[58])];
  490. //
  491. // lOut[2][3] = CLIP_INTRA[acc[11]+acc[27] + acc[43]+acc[59]];
  492. // lOut[2][4] = CLIP_INTRA[acc[11]+acc[27] - (acc[43]+acc[59])];
  493. // lOut[5][3] = CLIP_INTRA[(acc[11]-acc[27]) + (acc[43]-acc[59])];
  494. // lOut[5][4] = CLIP_INTRA[(acc[11]-acc[27]) - (acc[43]-acc[59])];
  495. //
  496. //
  497. // lOut[3][0] = CLIP_INTRA[acc[12]+acc[28] + acc[44]+acc[60]];
  498. // lOut[3][7] = CLIP_INTRA[acc[12]+acc[28] - (acc[44]+acc[60])];
  499. // lOut[4][0] = CLIP_INTRA[(acc[12]-acc[28]) + (acc[44]-acc[60])];
  500. // lOut[4][7] = CLIP_INTRA[(acc[12]-acc[28]) - (acc[44]-acc[60])];
  501. //
  502. // lOut[3][1] = CLIP_INTRA[acc[13]+acc[29] + acc[45]+acc[61]];
  503. // lOut[3][6] = CLIP_INTRA[acc[13]+acc[29] - (acc[45]+acc[61])];
  504. // lOut[4][1] = CLIP_INTRA[(acc[13]-acc[29]) + (acc[45]-acc[61])];
  505. // lOut[4][6] = CLIP_INTRA[(acc[13]-acc[29]) - (acc[45]-acc[61])];
  506. //
  507. // lOut[3][2] = CLIP_INTRA[acc[14]+acc[30] + acc[46]+acc[62]];
  508. // lOut[3][5] = CLIP_INTRA[acc[14]+acc[30] - (acc[46]+acc[62])];
  509. // lOut[4][2] = CLIP_INTRA[(acc[14]-acc[30]) + (acc[46]-acc[62])];
  510. // lOut[4][5] = CLIP_INTRA[(acc[14]-acc[30]) - (acc[46]-acc[62])];
  511. //
  512. // lOut[3][3] = CLIP_INTRA[acc[15]+acc[31] + acc[47]+acc[63]];
  513. // lOut[3][4] = CLIP_INTRA[acc[15]+acc[31] - (acc[47]+acc[63])];
  514. // lOut[4][3] = CLIP_INTRA[(acc[15]-acc[31]) + (acc[47]-acc[63])];
  515. // lOut[4][4] = CLIP_INTRA[(acc[15]-acc[31]) - (acc[47]-acc[63])];
  516. //
  517. ; ----------------------------------------------------------------------
  518. ////////////////////////////////////////////////////////////////////////////
  519. //assume parameters passed in by registers
  520. idct_bfly_intra:
  521. ; ----------------------------------------------------------------------
  522. ; INTRA ONLY Butterfly and clamp
  523. ; Uses all registers.
  524. ; Uses all accumulators[64], accum
  525. ; Uses ClipPixIntra[2048] of BYTES, ClipPixIntra
  526. ; Writes to Output matrix of BYTES, OutputCoeff
  527. ;
  528. ; Process 4 outputs per group, 0-7, 8-15
  529. ; 0
  530. mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of OutputCoeff
  531. lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
  532. mov [L_DESTBLOCK_1+4], edi
  533. mov edx, 2 ; just loop 2 times
  534. mov [L_DESTBLOCK_2+4], edi
  535. ALIGN 4
  536. loop_intra_bfly:
  537. mov [L_LOOPCOUNTER+4], edx ; Store local loop counter
  538. nop
  539. mov eax, [esi-128] ; get acc[0]
  540. mov ebx, [esi-64] ; get acc[16]
  541. mov ebp, [esi] ; get acc[32]
  542. mov edx, [esi+64] ; get acc[48]
  543. lea ecx, [eax+ebx] ; acc[0]+acc[16]
  544. sub eax, ebx ; acc[0]-acc[16]
  545. lea ebx, [ebp+edx] ; acc[32]+acc[48]
  546. sub ebp, edx ; acc[32]-acc[48]
  547. mov edx, [edi] ; pre-fetch output cache line 0
  548. mov edi, [edi+7*PITCH] ; pre-fetch output cache line 7
  549. ;mov esi, [edi+7*PITCH] ; pre-fetch output cache line 7
  550. lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
  551. sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
  552. lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
  553. sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
  554. sar edx, SCALER ; tmp1 >> 13
  555. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  556. sar ecx, SCALER ; tmp2 >> 13
  557. ;lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
  558. mov edi, [L_DESTBLOCK_1+4]
  559. sar ebx, SCALER ; tmp3 >> 13
  560. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  561. sar eax, SCALER ; tmp4 >> 13
  562. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  563. mov BYTE PTR [edi], dl ; output[0][0] = tmp1
  564. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  565. mov BYTE PTR [edi+7], cl ; output[0][7] = tmp2
  566. mov edi, [L_DESTBLOCK_2+4]
  567. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  568. nop
  569. mov BYTE PTR [edi+7*PITCH], bl ; output[7][0] = tmp3
  570. mov ebx, [esi-60] ; get acc[17]
  571. ; -------------------------------------------------------------------------
  572. ; 1
  573. mov BYTE PTR [edi+7*PITCH+7], al; output[7][7] = tmp4
  574. mov eax, [esi-124] ; get acc[1]
  575. mov ebp, [esi+4] ; get acc[33]
  576. mov edx, [esi+68] ; get acc[49]
  577. lea ecx, [eax+ebx] ; acc[1]+acc[17]
  578. sub eax, ebx ; acc[1]-acc[17]
  579. lea ebx, [ebp+edx] ; acc[33]+acc[49]
  580. sub ebp, edx ; acc[33]-acc[49]
  581. ;nop
  582. ;nop
  583. lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
  584. sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
  585. lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
  586. sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
  587. sar edx, SCALER ; tmp1 >> 13
  588. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  589. sar ecx, SCALER ; tmp2 >> 13
  590. ;nop
  591. mov edi, [L_DESTBLOCK_1+4]
  592. sar ebx, SCALER ; tmp3 >> 13
  593. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  594. sar eax, SCALER ; tmp4 >> 13
  595. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  596. mov BYTE PTR [edi+1], dl ; output[0][1] = tmp1
  597. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  598. mov BYTE PTR [edi+6], cl ; output[0][6] = tmp2
  599. mov edi, [L_DESTBLOCK_2+4]
  600. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  601. nop
  602. mov BYTE PTR [edi+7*PITCH+1], bl ; output[7][1] = tmp3
  603. mov ebx, [esi-56] ; get acc[18]
  604. ; -------------------------------------------------------------------------
  605. ; 2
  606. mov BYTE PTR [edi+7*PITCH+6], al ; output[7][6] = tmp4
  607. mov eax, [esi-120] ; get acc[2]
  608. mov ebp, [esi+8] ; get acc[34]
  609. mov edx, [esi+72] ; get acc[50]
  610. lea ecx, [eax+ebx] ; acc[2]+acc[18]
  611. sub eax, ebx ; acc[2]-acc[18]
  612. lea ebx, [ebp+edx] ; acc[34]+acc[50]
  613. sub ebp, edx ; acc[34]-acc[50]
  614. ;nop
  615. ;nop
  616. lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
  617. sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
  618. lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
  619. sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
  620. sar edx, SCALER ; tmp1 >> 13
  621. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  622. sar ecx, SCALER ; tmp2 >> 13
  623. ;nop
  624. mov edi, [L_DESTBLOCK_1+4]
  625. sar ebx, SCALER ; tmp3 >> 13
  626. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  627. sar eax, SCALER ; tmp4 >> 13
  628. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  629. mov BYTE PTR [edi+2], dl ; output[0][2] = tmp1
  630. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  631. mov BYTE PTR [edi+5], cl ; output[0][5] = tmp2
  632. mov edi, [L_DESTBLOCK_2+4]
  633. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  634. nop
  635. mov BYTE PTR [edi+7*PITCH+2], bl ; output[7][2] = tmp3
  636. mov ebx, [esi-52] ; get acc[19]
  637. ; -------------------------------------------------------------------------
  638. ; 3
  639. mov BYTE PTR [edi+7*PITCH+5], al ; output[7][5] = tmp4
  640. mov eax, [esi-116] ; get acc[3]
  641. mov ebp, [esi+12] ; get acc[35]
  642. mov edx, [esi+76] ; get acc[51]
  643. lea ecx, [eax+ebx] ; acc[3]+acc[19]
  644. sub eax, ebx ; acc[3]-acc[19]
  645. lea ebx, [ebp+edx] ; acc[35]+acc[51]
  646. sub ebp, edx ; acc[35]-acc[51]
  647. ;nop
  648. ;nop
  649. lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
  650. sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
  651. lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
  652. sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
  653. sar edx, SCALER ; tmp1 >> 13
  654. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  655. sar ecx, SCALER ; tmp2 >> 13
  656. ;nop
  657. mov edi, [L_DESTBLOCK_1+4]
  658. sar ebx, SCALER ; tmp3 >> 13
  659. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  660. sar eax, SCALER ; tmp4 >> 13
  661. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  662. mov BYTE PTR [edi+3], dl ; output[0][3] = tmp1
  663. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  664. mov BYTE PTR [edi+4], cl ; output[0][4] = tmp2
  665. mov edi, [L_DESTBLOCK_2+4]
  666. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  667. nop
  668. mov BYTE PTR [edi+7*PITCH+3], bl ; output[7][3] = tmp3
  669. mov ebx, [esi-48] ; get acc[20]
  670. ; -------------------------------------------------------------------------
  671. ; 4
  672. mov BYTE PTR [edi+7*PITCH+4], al ; output[7][4] = tmp4
  673. mov eax, [esi-112] ; get acc[4]
  674. mov ebp, [esi+16] ; get acc[36]
  675. mov edx, [esi+80] ; get acc[52]
  676. lea ecx, [eax+ebx] ; acc[4]+acc[20]
  677. sub eax, ebx ; acc[4]-acc[20]
  678. lea ebx, [ebp+edx] ; acc[36]+acc[52]
  679. sub ebp, edx ; acc[36]-acc[52]
  680. mov edx, [edi+PITCH] ; pre-fetch output cache line 1
  681. mov edi, [edi+6*PITCH] ; pre-fetch output cache line 6
  682. ;mov esi, [edi+6*PITCH] ; pre-fetch output cache line 6
  683. lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
  684. sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
  685. lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
  686. sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
  687. sar edx, SCALER ; tmp1 >> 13
  688. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  689. sar ecx, SCALER ; tmp2 >> 13
  690. ;lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
  691. mov edi, [L_DESTBLOCK_1+4]
  692. sar ebx, SCALER ; tmp3 >> 13
  693. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  694. sar eax, SCALER ; tmp4 >> 13
  695. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  696. mov BYTE PTR [edi+PITCH], dl ; output[1][0] = tmp1
  697. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  698. mov BYTE PTR [edi+PITCH+7], cl ; output[1][7] = tmp2
  699. mov edi, [L_DESTBLOCK_2+4]
  700. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  701. nop
  702. mov BYTE PTR [edi+6*PITCH], bl ; output[6][0] = tmp3
  703. mov ebx, [esi-44] ; get acc[21]
  704. ; -------------------------------------------------------------------------
  705. ; 5
  706. mov BYTE PTR [edi+6*PITCH+7], al ; output[6][7] = tmp4
  707. mov eax, [esi-108] ; get acc[5]
  708. mov ebp, [esi+20] ; get acc[37]
  709. mov edx, [esi+84] ; get acc[53]
  710. lea ecx, [eax+ebx] ; acc[5]+acc[21]
  711. sub eax, ebx ; acc[5]-acc[21]
  712. lea ebx, [ebp+edx] ; acc[37]+acc[53]
  713. sub ebp, edx ; acc[37]-acc[53]
  714. ;nop
  715. ;nop
  716. lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
  717. sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
  718. lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
  719. sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
  720. sar edx, SCALER ; tmp1 >> 13
  721. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  722. sar ecx, SCALER ; tmp2 >> 13
  723. ;nop
  724. mov edi, [L_DESTBLOCK_1+4]
  725. sar ebx, SCALER ; tmp3 >> 13
  726. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  727. sar eax, SCALER ; tmp4 >> 13
  728. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  729. mov BYTE PTR [edi+PITCH+1], dl ; output[1][1] = tmp1
  730. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  731. mov BYTE PTR [edi+PITCH+6], cl ; output[1][6] = tmp2
  732. mov edi, [L_DESTBLOCK_2+4]
  733. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  734. nop
  735. mov BYTE PTR [edi+6*PITCH+1], bl ; output[6][1] = tmp3
  736. mov ebx, [esi-40] ; get acc[22]
  737. ; -------------------------------------------------------------------------
  738. ; 6
  739. mov BYTE PTR [edi+6*PITCH+6], al ; output[6][6] = tmp4
  740. mov eax, [esi-104] ; get acc[6]
  741. mov ebp, [esi+24] ; get acc[38]
  742. mov edx, [esi+88] ; get acc[54]
  743. lea ecx, [eax+ebx] ; acc[6]+acc[22]
  744. sub eax, ebx ; acc[6]-acc[22]
  745. lea ebx, [ebp+edx] ; acc[38]+acc[54]
  746. sub ebp, edx ; acc[38]-acc[54]
  747. ;nop
  748. ;nop
  749. lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
  750. sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
  751. lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
  752. sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
  753. sar edx, SCALER ; tmp1 >> 13
  754. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  755. sar ecx, SCALER ; tmp2 >> 13
  756. ;nop
  757. mov edi, [L_DESTBLOCK_1+4]
  758. sar ebx, SCALER ; tmp3 >> 13
  759. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  760. sar eax, SCALER ; tmp4 >> 13
  761. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  762. mov BYTE PTR [edi+PITCH+2], dl ; output[1][2] = tmp1
  763. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  764. mov BYTE PTR [edi+PITCH+5], cl ; output[1][5] = tmp2
  765. mov edi, [L_DESTBLOCK_2+4]
  766. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  767. nop
  768. mov BYTE PTR [edi+6*PITCH+2], bl ; output[6][2] = tmp3
  769. mov ebx, [esi-36] ; get acc[23]
  770. ; -------------------------------------------------------------------------
  771. ; 7
  772. mov BYTE PTR [edi+6*PITCH+5], al ; output[6][5] = tmp4
  773. mov eax, [esi-100] ; get acc[7]
  774. mov ebp, [esi+28] ; get acc[39]
  775. mov edx, [esi+92] ; get acc[55]
  776. lea ecx, [eax+ebx] ; acc[7]+acc[23]
  777. sub eax, ebx ; acc[7]-acc[23]
  778. lea ebx, [ebp+edx] ; acc[39]+acc[55]
  779. sub ebp, edx ; acc[39]-acc[55]
  780. ;nop
  781. ;nop
  782. lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
  783. sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
  784. lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
  785. sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
  786. sar edx, SCALER ; tmp1 >> 13
  787. lea ebp, ClipPixIntra ; ecx gets Base addr of ClipPixIntra
  788. sar ecx, SCALER ; tmp2 >> 13
  789. ;nop
  790. mov edi, [L_DESTBLOCK_1+4]
  791. sar ebx, SCALER ; tmp3 >> 13
  792. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  793. sar eax, SCALER ; tmp4 >> 13
  794. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  795. mov BYTE PTR [edi+PITCH+3], dl ; output[1][3] = tmp1
  796. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  797. mov BYTE PTR [edi+PITCH+4], cl ; output[1][4] = tmp2
  798. mov edi, [L_DESTBLOCK_2+4]
  799. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  800. nop
  801. mov BYTE PTR [edi+6*PITCH+3], bl ; output[6][3] = tmp3
  802. mov edx, [L_LOOPCOUNTER+4] ; fetch local loop counter
  803. mov BYTE PTR [edi+6*PITCH+4], al ; output[6][4] = tmp4
  804. add edi, 2*PITCH
  805. add esi, 32 ; add 32 to esi for second pass
  806. mov [L_DESTBLOCK_1+4], edi
  807. sub edi, 4*PITCH
  808. dec edx
  809. mov [L_DESTBLOCK_2+4], edi
  810. jnz loop_intra_bfly
  811. ret
  812. //////////////////////////////////////////////////////////////////////
  813. // This "subroutine" idct_bfly_inter performs the butterfly phase of
  814. // the fmidct for inter blocks.
  815. //
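  816. // Inputs are read from the L_ACCUM and L_DESTBLOCK locals (plus L_COEFFCOUNT and L_COEFFVALUE under PTEL_WORK_AROUND), not from registers as the original note "assume parameters passed in by registers" said; every register is loaded before it is used.
  817. //
  818. // C code (the assembly additionally shifts each sum right by SCALER before the clamp-table lookup, and lOut corresponds to the DWORD Intermediate matrix):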
  819. //
  820. // Upper Left Quadrant
  821. // Upper Right Quadrant
  822. // Lower Left Quadrant
  823. // Lower Right Quadrant
  824. //
  825. // lOut[0][0] = CLIP_INTER[acc[0]+acc[16] + acc[32]+acc[48]];
  826. // lOut[0][7] = CLIP_INTER[acc[0]+acc[16] - (acc[32]+acc[48])];
  827. // lOut[7][0] = CLIP_INTER[(acc[0]-acc[16]) + (acc[32]-acc[48])];
  828. // lOut[7][7] = CLIP_INTER[(acc[0]-acc[16]) - (acc[32]-acc[48])];
  829. //
  830. // lOut[0][1] = CLIP_INTER[acc[1]+acc[17] + acc[33]+acc[49]];
  831. // lOut[0][6] = CLIP_INTER[acc[1]+acc[17] - (acc[33]+acc[49])];
  832. // lOut[7][1] = CLIP_INTER[(acc[1]-acc[17]) + (acc[33]-acc[49])];
  833. // lOut[7][6] = CLIP_INTER[(acc[1]-acc[17]) - (acc[33]-acc[49])];
  834. //
  835. // lOut[0][2] = CLIP_INTER[acc[2]+acc[18] + acc[34]+acc[50]];
  836. // lOut[0][5] = CLIP_INTER[acc[2]+acc[18] - (acc[34]+acc[50])];
  837. // lOut[7][2] = CLIP_INTER[(acc[2]-acc[18]) + (acc[34]-acc[50])];
  838. // lOut[7][5] = CLIP_INTER[(acc[2]-acc[18]) - (acc[34]-acc[50])];
  839. //
  840. // lOut[0][3] = CLIP_INTER[acc[3]+acc[19] + acc[35]+acc[51]];
  841. // lOut[0][4] = CLIP_INTER[acc[3]+acc[19] - (acc[35]+acc[51])];
  842. // lOut[7][3] = CLIP_INTER[(acc[3]-acc[19]) + (acc[35]-acc[51])];
  843. // lOut[7][4] = CLIP_INTER[(acc[3]-acc[19]) - (acc[35]-acc[51])];
  844. //
  845. //
  846. // lOut[1][0] = CLIP_INTER[acc[4]+acc[20] + acc[36]+acc[52]];
  847. // lOut[1][7] = CLIP_INTER[acc[4]+acc[20] - (acc[36]+acc[52])];
  848. // lOut[6][0] = CLIP_INTER[(acc[4]-acc[20]) + (acc[36]-acc[52])];
  849. // lOut[6][7] = CLIP_INTER[(acc[4]-acc[20]) - (acc[36]-acc[52])];
  850. //
  851. // lOut[1][1] = CLIP_INTER[acc[5]+acc[21] + acc[37]+acc[53]];
  852. // lOut[1][6] = CLIP_INTER[acc[5]+acc[21] - (acc[37]+acc[53])];
  853. // lOut[6][1] = CLIP_INTER[(acc[5]-acc[21]) + (acc[37]-acc[53])];
  854. // lOut[6][6] = CLIP_INTER[(acc[5]-acc[21]) - (acc[37]-acc[53])];
  855. //
  856. // lOut[1][2] = CLIP_INTER[acc[6]+acc[22] + acc[38]+acc[54]];
  857. // lOut[1][5] = CLIP_INTER[acc[6]+acc[22] - (acc[38]+acc[54])];
  858. // lOut[6][2] = CLIP_INTER[(acc[6]-acc[22]) + (acc[38]-acc[54])];
  859. // lOut[6][5] = CLIP_INTER[(acc[6]-acc[22]) - (acc[38]-acc[54])];
  860. //
  861. // lOut[1][3] = CLIP_INTER[acc[7]+acc[23] + acc[39]+acc[55]];
  862. // lOut[1][4] = CLIP_INTER[acc[7]+acc[23] - (acc[39]+acc[55])];
  863. // lOut[6][3] = CLIP_INTER[(acc[7]-acc[23]) + (acc[39]-acc[55])];
  864. // lOut[6][4] = CLIP_INTER[(acc[7]-acc[23]) - (acc[39]-acc[55])];
  865. //
  866. //
  867. // lOut[2][0] = CLIP_INTER[acc[8]+acc[24] + acc[40]+acc[56]];
  868. // lOut[2][7] = CLIP_INTER[acc[8]+acc[24] - (acc[40]+acc[56])];
  869. // lOut[5][0] = CLIP_INTER[(acc[8]-acc[24]) + (acc[40]-acc[56])];
  870. // lOut[5][7] = CLIP_INTER[(acc[8]-acc[24]) - (acc[40]-acc[56])];
  871. //
  872. // lOut[2][1] = CLIP_INTER[acc[9]+acc[25] + acc[41]+acc[57]];
  873. // lOut[2][6] = CLIP_INTER[acc[9]+acc[25] - (acc[41]+acc[57])];
  874. // lOut[5][1] = CLIP_INTER[(acc[9]-acc[25]) + (acc[41]-acc[57])];
  875. // lOut[5][6] = CLIP_INTER[(acc[9]-acc[25]) - (acc[41]-acc[57])];
  876. //
  877. // lOut[2][2] = CLIP_INTER[acc[10]+acc[26] + acc[42]+acc[58]];
  878. // lOut[2][5] = CLIP_INTER[acc[10]+acc[26] - (acc[42]+acc[58])];
  879. // lOut[5][2] = CLIP_INTER[(acc[10]-acc[26]) + (acc[42]-acc[58])];
  880. // lOut[5][5] = CLIP_INTER[(acc[10]-acc[26]) - (acc[42]-acc[58])];
  881. //
  882. // lOut[2][3] = CLIP_INTER[acc[11]+acc[27] + acc[43]+acc[59]];
  883. // lOut[2][4] = CLIP_INTER[acc[11]+acc[27] - (acc[43]+acc[59])];
  884. // lOut[5][3] = CLIP_INTER[(acc[11]-acc[27]) + (acc[43]-acc[59])];
  885. // lOut[5][4] = CLIP_INTER[(acc[11]-acc[27]) - (acc[43]-acc[59])];
  886. //
  887. //
  888. // lOut[3][0] = CLIP_INTER[acc[12]+acc[28] + acc[44]+acc[60]];
  889. // lOut[3][7] = CLIP_INTER[acc[12]+acc[28] - (acc[44]+acc[60])];
  890. // lOut[4][0] = CLIP_INTER[(acc[12]-acc[28]) + (acc[44]-acc[60])];
  891. // lOut[4][7] = CLIP_INTER[(acc[12]-acc[28]) - (acc[44]-acc[60])];
  892. //
  893. // lOut[3][1] = CLIP_INTER[acc[13]+acc[29] + acc[45]+acc[61]];
  894. // lOut[3][6] = CLIP_INTER[acc[13]+acc[29] - (acc[45]+acc[61])];
  895. // lOut[4][1] = CLIP_INTER[(acc[13]-acc[29]) + (acc[45]-acc[61])];
  896. // lOut[4][6] = CLIP_INTER[(acc[13]-acc[29]) - (acc[45]-acc[61])];
  897. //
  898. // lOut[3][2] = CLIP_INTER[acc[14]+acc[30] + acc[46]+acc[62]];
  899. // lOut[3][5] = CLIP_INTER[acc[14]+acc[30] - (acc[46]+acc[62])];
  900. // lOut[4][2] = CLIP_INTER[(acc[14]-acc[30]) + (acc[46]-acc[62])];
  901. // lOut[4][5] = CLIP_INTER[(acc[14]-acc[30]) - (acc[46]-acc[62])];
  902. //
  903. // lOut[3][3] = CLIP_INTER[acc[15]+acc[31] + acc[47]+acc[63]];
  904. // lOut[3][4] = CLIP_INTER[acc[15]+acc[31] - (acc[47]+acc[63])];
  905. // lOut[4][3] = CLIP_INTER[(acc[15]-acc[31]) + (acc[47]-acc[63])];
  906. // lOut[4][4] = CLIP_INTER[(acc[15]-acc[31]) - (acc[47]-acc[63])];
  907. //
  908. ////////////////////////////////////////////////////////////////////////////
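  909. // Inputs come from the L_ACCUM / L_DESTBLOCK locals rather than from registers (the original note said "assume parameters passed in by registers").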
  910. idct_bfly_inter:
  911. ; ----------------------------------------------------------------------
  912. ; INTER ONLY Butterfly and clamp: combines the 64 accumulators into the 8x8 Intermediate matrix
  913. ; Uses all registers (eax, ebx, ecx, edx, esi, edi and ebp are overwritten).
  914. ; Uses all accumulators[64], accum, addressed through L_ACCUM+4
  915. ; Uses the clamp table ClipPixInter, indexed as DWORDS ([ClipPixInter + value*4]); table size is not visible here
  916. ; Writes to Intermediate matrix [8][8] of DWORDS, Intermediate, whose base address is read from L_DESTBLOCK+4
  917. ;
  918. ; Process 4 outputs per group, 0-15
  919. ; 0
  920. #ifdef PTEL_WORK_AROUND
  921. mov eax, [L_COEFFCOUNT+4] ; get coefficient counter
  922. mov ebx, [L_COEFFVALUE+4] ; get coefficient value
  923. cmp eax, 1 ; compare counter with 1 (signed compare)
  924. jg Normal_Process ; if greater than 1 jump to normal process
  925. cmp ebx, 3 ; counter <= 1 here: test the value against +3
  926. jz Zero_Output ; if value == 3 zero output
  927. cmp ebx, -3 ; ... and against -3
  928. jnz Normal_Process ; if value != -3 Process as usual
  929. Zero_Output:
  930. ////////////////////////////////////////////////////////////////////////
  931. // Zero out intermediate matrix [8][8] of DWORDS
  932. //
  933. // C code:
  934. //
  935. // for (x=0; x<8; x++)
  936. // for (y=0; y<8; y++)
  937. // Intermediate[x][y] = 0L;
  938. //
  939. mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of Intermediate
  940. xor eax, eax ; eax = 0, the value written to every element
  941. mov ebx, 8 ; ebx = row counter: 8 iterations of 32 bytes each
  942. ; Each iteration below clears one row of 8 DWORDs.
  943. ALIGN 4
  944. loop_for_reinit:
  945. mov [edi], eax ; row element 0 = 0
  946. mov [edi+4], eax ; row element 1 = 0
  947. mov [edi+8], eax ; row element 2 = 0
  948. mov [edi+12], eax ; row element 3 = 0
  949. mov [edi+16], eax ; row element 4 = 0
  950. mov [edi+20], eax ; row element 5 = 0
  951. mov [edi+24], eax ; row element 6 = 0
  952. mov [edi+28], eax ; row element 7 = 0
  953. add edi, 32 ; advance to the next 32-byte row
  954. dec ebx ; one row done
  955. jnz loop_for_reinit ; loop until all 8 rows are cleared
  956. ret ; zero-output path returns without running the butterfly
  957. Normal_Process:
  958. #endif
  959. lea esi, [L_ACCUM+128+4] ; get addr of accum[32]
  960. mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of Intermediate
  961. add edi, 128 ; edi = base + 128 bytes (row 4 of 32-byte rows); rows are then addressed as edi-128 .. edi+96
  962. nop ; filler instruction (presumably for scheduling/alignment - not verifiable here)
  963. mov eax, [esi-128] ; get acc[0]
  964. mov ebx, [esi-64] ; get acc[16]
  965. mov ebp, [esi] ; get acc[32]
  966. mov edx, [esi+64] ; get acc[48]
  967. lea ecx, [eax+ebx] ; acc[0]+acc[16]
  968. sub eax, ebx ; acc[0]-acc[16]
  969. lea ebx, [ebp+edx] ; acc[32]+acc[48]
  970. sub ebp, edx ; acc[32]-acc[48]
  971. mov edx, [edi-128] ; pre-fetch output cache line 0 (loaded value is discarded; edx is overwritten below)
  972. mov esi, [edi+96] ; pre-fetch output cache line 7 (clobbers esi; it is reloaded below)
  973. lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
  974. sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
  975. lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
  976. sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
  977. sar edx, SCALER ; tmp1 >> 13
  978. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  979. sar ecx, SCALER ; tmp2 >> 13
  980. lea esi, [L_ACCUM+128+4] ; reload addr of accum[32] (esi was clobbered by the pre-fetch)
  981. sar ebx, SCALER ; tmp3 >> 13
  982. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  983. sar eax, SCALER ; tmp4 >> 13
  984. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  985. mov DWORD PTR [edi-128], edx ; Intermediate[0][0] = tmp1
  986. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  987. mov DWORD PTR [edi-128+7*4], ecx ; Intermediate[0][7] = tmp2
  988. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  989. mov DWORD PTR [edi+96], ebx ; Intermediate[7][0] = tmp3
  990. mov ebx, [esi-60] ; get acc[17] (first load of the next group, issued early)
  991. ; -------------------------------------------------------------------------
  992. ; 1 (the first store below completes group 0)
  993. mov DWORD PTR [edi+96+7*4], eax ; Intermediate[7][7] = tmp4
  994. mov eax, [esi-124] ; get acc[1]
  995. mov ebp, [esi+4] ; get acc[33]
  996. mov edx, [esi+68] ; get acc[49]
  997. lea ecx, [eax+ebx] ; acc[1]+acc[17]
  998. sub eax, ebx ; acc[1]-acc[17]
  999. lea ebx, [ebp+edx] ; acc[33]+acc[49]
  1000. sub ebp, edx ; acc[33]-acc[49]
  1001. ;nop
  1002. ;nop
  1003. lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
  1004. sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
  1005. lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
  1006. sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
  1007. sar edx, SCALER ; tmp1 >> 13
  1008. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1009. sar ecx, SCALER ; tmp2 >> 13
  1010. nop
  1011. sar ebx, SCALER ; tmp3 >> 13
  1012. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1013. sar eax, SCALER ; tmp4 >> 13
  1014. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1015. mov DWORD PTR [edi-128+1*4], edx ; Intermediate[0][1] = tmp1
  1016. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1017. mov DWORD PTR [edi-128+6*4], ecx ; Intermediate[0][6] = tmp2
  1018. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1019. mov DWORD PTR [edi+96+1*4], ebx ; Intermediate[7][1] = tmp3
  1020. mov ebx, [esi-56] ; get acc[18]
  1021. ; -------------------------------------------------------------------------
  1022. ; 2
  1023. mov DWORD PTR [edi+96+6*4], eax ; Intermediate[7][6] = tmp4
  1024. mov eax, [esi-120] ; get acc[2]
  1025. mov ebp, [esi+8] ; get acc[34]
  1026. mov edx, [esi+72] ; get acc[50]
  1027. lea ecx, [eax+ebx] ; acc[2]+acc[18]
  1028. sub eax, ebx ; acc[2]-acc[18]
  1029. lea ebx, [ebp+edx] ; acc[34]+acc[50]
  1030. sub ebp, edx ; acc[34]-acc[50]
  1031. ;nop
  1032. ;nop
  1033. lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
  1034. sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
  1035. lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
  1036. sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
  1037. sar edx, SCALER ; tmp1 >> 13
  1038. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1039. sar ecx, SCALER ; tmp2 >> 13
  1040. nop
  1041. sar ebx, SCALER ; tmp3 >> 13
  1042. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1043. sar eax, SCALER ; tmp4 >> 13
  1044. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1045. mov DWORD PTR [edi-128+2*4], edx ; Intermediate[0][2] = tmp1
  1046. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1047. mov DWORD PTR [edi-128+5*4], ecx ; Intermediate[0][5] = tmp2
  1048. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1049. mov DWORD PTR [edi+96+2*4], ebx ; Intermediate[7][2] = tmp3
  1050. mov ebx, [esi-52] ; get acc[19]
  1051. ; -------------------------------------------------------------------------
  1052. ; 3
  1053. mov DWORD PTR [edi+96+5*4], eax ; Intermediate[7][5] = tmp4
  1054. mov eax, [esi-116] ; get acc[3]
  1055. mov ebp, [esi+12] ; get acc[35]
  1056. mov edx, [esi+76] ; get acc[51]
  1057. lea ecx, [eax+ebx] ; acc[3]+acc[19]
  1058. sub eax, ebx ; acc[3]-acc[19]
  1059. lea ebx, [ebp+edx] ; acc[35]+acc[51]
  1060. sub ebp, edx ; acc[35]-acc[51]
  1061. ;nop
  1062. ;nop
  1063. lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
  1064. sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
  1065. lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
  1066. sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
  1067. sar edx, SCALER ; tmp1 >> 13
  1068. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1069. sar ecx, SCALER ; tmp2 >> 13
  1070. nop
  1071. sar ebx, SCALER ; tmp3 >> 13
  1072. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1073. sar eax, SCALER ; tmp4 >> 13
  1074. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1075. mov DWORD PTR [edi-128+3*4], edx ; Intermediate[0][3] = tmp1
  1076. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1077. mov DWORD PTR [edi-128+4*4], ecx ; Intermediate[0][4] = tmp2
  1078. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1079. mov DWORD PTR [edi+96+3*4], ebx ; Intermediate[7][3] = tmp3
  1080. mov ebx, [esi-48] ; get acc[20]
  1081. ; -------------------------------------------------------------------------
  1082. ; 4 (groups 4-7 fill Intermediate rows 1 and 6; the first store below completes group 3, with eax = its tmp4 and ebx = acc[20] already loaded)
  1083. mov DWORD PTR [edi+96+4*4], eax ; Intermediate[7][4] = tmp4
  1084. mov eax, [esi-112] ; get acc[4]
  1085. mov ebp, [esi+16] ; get acc[36]
  1086. mov edx, [esi+80] ; get acc[52]
  1087. lea ecx, [eax+ebx] ; acc[4]+acc[20]
  1088. sub eax, ebx ; acc[4]-acc[20]
  1089. lea ebx, [ebp+edx] ; acc[36]+acc[52]
  1090. sub ebp, edx ; acc[36]-acc[52]
  1091. mov edx, [edi-96] ; pre-fetch output cache line 1 (loaded value is discarded; edx is overwritten below)
  1092. mov esi, [edi+64] ; pre-fetch output cache line 6 (clobbers esi; it is reloaded below)
  1093. lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
  1094. sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
  1095. lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
  1096. sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
  1097. sar edx, SCALER ; tmp1 >> 13
  1098. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1099. sar ecx, SCALER ; tmp2 >> 13
  1100. lea esi, [L_ACCUM+128+4] ; reload addr of accum[32] (esi was clobbered by the pre-fetch)
  1101. sar ebx, SCALER ; tmp3 >> 13
  1102. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1103. sar eax, SCALER ; tmp4 >> 13
  1104. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1105. mov DWORD PTR [edi-96], edx ; Intermediate[1][0] = tmp1
  1106. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1107. mov DWORD PTR [edi-96+7*4], ecx ; Intermediate[1][7] = tmp2
  1108. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1109. mov DWORD PTR [edi+64], ebx ; Intermediate[6][0] = tmp3
  1110. mov ebx, [esi-44] ; get acc[21]
  1111. ; -------------------------------------------------------------------------
  1112. ; 5
  1113. mov DWORD PTR [edi+64+7*4], eax ; Intermediate[6][7] = tmp4
  1114. mov eax, [esi-108] ; get acc[5]
  1115. mov ebp, [esi+20] ; get acc[37]
  1116. mov edx, [esi+84] ; get acc[53]
  1117. lea ecx, [eax+ebx] ; acc[5]+acc[21]
  1118. sub eax, ebx ; acc[5]-acc[21]
  1119. lea ebx, [ebp+edx] ; acc[37]+acc[53]
  1120. sub ebp, edx ; acc[37]-acc[53]
  1121. ;nop
  1122. ;nop
  1123. lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
  1124. sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
  1125. lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
  1126. sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
  1127. sar edx, SCALER ; tmp1 >> 13
  1128. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1129. sar ecx, SCALER ; tmp2 >> 13
  1130. nop
  1131. sar ebx, SCALER ; tmp3 >> 13
  1132. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1133. sar eax, SCALER ; tmp4 >> 13
  1134. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1135. mov DWORD PTR [edi-96+1*4], edx ; Intermediate[1][1] = tmp1
  1136. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1137. mov DWORD PTR [edi-96+6*4], ecx ; Intermediate[1][6] = tmp2
  1138. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1139. mov DWORD PTR [edi+64+1*4], ebx ; Intermediate[6][1] = tmp3
  1140. mov ebx, [esi-40] ; get acc[22]
  1141. ; -------------------------------------------------------------------------
  1142. ; 6
  1143. mov DWORD PTR [edi+64+6*4], eax ; Intermediate[6][6] = tmp4
  1144. mov eax, [esi-104] ; get acc[6]
  1145. mov ebp, [esi+24] ; get acc[38]
  1146. mov edx, [esi+88] ; get acc[54]
  1147. lea ecx, [eax+ebx] ; acc[6]+acc[22]
  1148. sub eax, ebx ; acc[6]-acc[22]
  1149. lea ebx, [ebp+edx] ; acc[38]+acc[54]
  1150. sub ebp, edx ; acc[38]-acc[54]
  1151. ;nop
  1152. ;nop
  1153. lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
  1154. sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
  1155. lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
  1156. sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
  1157. sar edx, SCALER ; tmp1 >> 13
  1158. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1159. sar ecx, SCALER ; tmp2 >> 13
  1160. nop
  1161. sar ebx, SCALER ; tmp3 >> 13
  1162. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1163. sar eax, SCALER ; tmp4 >> 13
  1164. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1165. mov DWORD PTR [edi-96+2*4], edx ; Intermediate[1][2] = tmp1
  1166. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1167. mov DWORD PTR [edi-96+5*4], ecx ; Intermediate[1][5] = tmp2
  1168. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1169. mov DWORD PTR [edi+64+2*4], ebx ; Intermediate[6][2] = tmp3
  1170. mov ebx, [esi-36] ; get acc[23]
  1171. ; -------------------------------------------------------------------------
  1172. ; 7
  1173. mov DWORD PTR [edi+64+5*4], eax ; Intermediate[6][5] = tmp4
  1174. mov eax, [esi-100] ; get acc[7]
  1175. mov ebp, [esi+28] ; get acc[39]
  1176. mov edx, [esi+92] ; get acc[55]
  1177. lea ecx, [eax+ebx] ; acc[7]+acc[23]
  1178. sub eax, ebx ; acc[7]-acc[23]
  1179. lea ebx, [ebp+edx] ; acc[39]+acc[55]
  1180. sub ebp, edx ; acc[39]-acc[55]
  1181. ;nop
  1182. ;nop
  1183. lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
  1184. sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
  1185. lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
  1186. sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
  1187. sar edx, SCALER ; tmp1 >> 13
  1188. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1189. sar ecx, SCALER ; tmp2 >> 13
  1190. nop
  1191. sar ebx, SCALER ; tmp3 >> 13
  1192. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1193. sar eax, SCALER ; tmp4 >> 13
  1194. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1195. mov DWORD PTR [edi-96+3*4], edx ; Intermediate[1][3] = tmp1
  1196. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1197. mov DWORD PTR [edi-96+4*4], ecx ; Intermediate[1][4] = tmp2
  1198. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1199. mov DWORD PTR [edi+64+3*4], ebx ; Intermediate[6][3] = tmp3
  1200. mov ebx, [esi-32] ; get acc[24]
  1201. ; -------------------------------------------------------------------------
  1202. ; 8 (groups 8-11 fill Intermediate rows 2 and 5; the first store below completes group 7, with eax = its tmp4 and ebx = acc[24] already loaded)
  1203. mov DWORD PTR [edi+64+4*4], eax ; Intermediate[6][4] = tmp4
  1204. mov eax, [esi-96] ; get acc[8]
  1205. mov ebp, [esi+32] ; get acc[40]
  1206. mov edx, [esi+96] ; get acc[56]
  1207. lea ecx, [eax+ebx] ; acc[8]+acc[24]
  1208. sub eax, ebx ; acc[8]-acc[24]
  1209. lea ebx, [ebp+edx] ; acc[40]+acc[56]
  1210. sub ebp, edx ; acc[40]-acc[56]
  1211. mov edx, [edi-64] ; pre-fetch output cache line 2 (loaded value is discarded; edx is overwritten below)
  1212. mov esi, [edi+32] ; pre-fetch output cache line 5 (clobbers esi; it is reloaded below)
  1213. lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56]
  1214. sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56])
  1215. lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56])
  1216. sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56])
  1217. sar edx, SCALER ; tmp1 >> 13
  1218. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1219. sar ecx, SCALER ; tmp2 >> 13
  1220. lea esi, [L_ACCUM+128+4] ; reload addr of accum[32] (esi was clobbered by the pre-fetch)
  1221. sar ebx, SCALER ; tmp3 >> 13
  1222. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1223. sar eax, SCALER ; tmp4 >> 13
  1224. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1225. mov DWORD PTR [edi-64], edx ; Intermediate[2][0] = tmp1
  1226. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1227. mov DWORD PTR [edi-64+7*4], ecx ; Intermediate[2][7] = tmp2
  1228. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1229. mov DWORD PTR [edi+32], ebx ; Intermediate[5][0] = tmp3
  1230. mov ebx, [esi-28] ; get acc[25]
  1231. ; -------------------------------------------------------------------------
  1232. ; 9
  1233. mov DWORD PTR [edi+32+7*4], eax ; Intermediate[5][7] = tmp4
  1234. mov eax, [esi-92] ; get acc[9]
  1235. mov ebp, [esi+36] ; get acc[41]
  1236. mov edx, [esi+100] ; get acc[57]
  1237. lea ecx, [eax+ebx] ; acc[9]+acc[25]
  1238. sub eax, ebx ; acc[9]-acc[25]
  1239. lea ebx, [ebp+edx] ; acc[41]+acc[57]
  1240. sub ebp, edx ; acc[41]-acc[57]
  1241. ;nop
  1242. ;nop
  1243. lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57]
  1244. sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57])
  1245. lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57])
  1246. sub eax, ebp ; tmp4 = acc[9]-acc[25] - (acc[41]-acc[57])
  1247. sar edx, SCALER ; tmp1 >> 13
  1248. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1249. sar ecx, SCALER ; tmp2 >> 13
  1250. nop
  1251. sar ebx, SCALER ; tmp3 >> 13
  1252. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1253. sar eax, SCALER ; tmp4 >> 13
  1254. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1255. mov DWORD PTR [edi-64+1*4], edx ; Intermediate[2][1] = tmp1
  1256. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1257. mov DWORD PTR [edi-64+6*4], ecx ; Intermediate[2][6] = tmp2
  1258. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1259. mov DWORD PTR [edi+32+1*4], ebx ; Intermediate[5][1] = tmp3
  1260. mov ebx, [esi-24] ; get acc[26]
  1261. ; -------------------------------------------------------------------------
  1262. ; 10
  1263. mov DWORD PTR [edi+32+6*4], eax ; Intermediate[5][6] = tmp4
  1264. mov eax, [esi-88] ; get acc[10]
  1265. mov ebp, [esi+40] ; get acc[42]
  1266. mov edx, [esi+104] ; get acc[58]
  1267. lea ecx, [eax+ebx] ; acc[10]+acc[26]
  1268. sub eax, ebx ; acc[10]-acc[26]
  1269. lea ebx, [ebp+edx] ; acc[42]+acc[58]
  1270. sub ebp, edx ; acc[42]-acc[58]
  1271. ;nop
  1272. ;nop
  1273. lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58]
  1274. sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58])
  1275. lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58])
  1276. sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58])
  1277. sar edx, SCALER ; tmp1 >> 13
  1278. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1279. sar ecx, SCALER ; tmp2 >> 13
  1280. nop
  1281. sar ebx, SCALER ; tmp3 >> 13
  1282. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1283. sar eax, SCALER ; tmp4 >> 13
  1284. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1285. mov DWORD PTR [edi-64+2*4], edx ; Intermediate[2][2] = tmp1
  1286. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1287. mov DWORD PTR [edi-64+5*4], ecx ; Intermediate[2][5] = tmp2
  1288. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1289. mov DWORD PTR [edi+32+2*4], ebx ; Intermediate[5][2] = tmp3
  1290. mov ebx, [esi-20] ; get acc[27]
  1291. ; -------------------------------------------------------------------------
  1292. ; 11
  1293. mov DWORD PTR [edi+32+5*4], eax ; Intermediate[5][5] = tmp4
  1294. mov eax, [esi-84] ; get acc[11]
  1295. mov ebp, [esi+44] ; get acc[43]
  1296. mov edx, [esi+108] ; get acc[59]
  1297. lea ecx, [eax+ebx] ; acc[11]+acc[27]
  1298. sub eax, ebx ; acc[11]-acc[27]
  1299. lea ebx, [ebp+edx] ; acc[43]+acc[59]
  1300. sub ebp, edx ; acc[43]-acc[59]
  1301. ;nop
  1302. ;nop
  1303. lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59]
  1304. sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59])
  1305. lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59])
  1306. sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59])
  1307. sar edx, SCALER ; tmp1 >> 13
  1308. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1309. sar ecx, SCALER ; tmp2 >> 13
  1310. nop
  1311. sar ebx, SCALER ; tmp3 >> 13
  1312. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1313. sar eax, SCALER ; tmp4 >> 13
  1314. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1315. mov DWORD PTR [edi-64+3*4], edx ; Intermediate[2][3] = tmp1
  1316. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1317. mov DWORD PTR [edi-64+4*4], ecx ; Intermediate[2][4] = tmp2
  1318. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1319. mov DWORD PTR [edi+32+3*4], ebx ; Intermediate[5][3] = tmp3
  1320. mov ebx, [esi-16] ; get acc[28]
  1321. ; -------------------------------------------------------------------------
  1322. ; 12 (groups 12-15 fill Intermediate rows 3 and 4; the first store below completes group 11, with eax = its tmp4 and ebx = acc[28] already loaded)
  1323. mov DWORD PTR [edi+32+4*4], eax ; Intermediate[5][4] = tmp4
  1324. mov eax, [esi-80] ; get acc[12]
  1325. mov ebp, [esi+48] ; get acc[44]
  1326. mov edx, [esi+112] ; get acc[60]
  1327. lea ecx, [eax+ebx] ; acc[12]+acc[28]
  1328. sub eax, ebx ; acc[12]-acc[28]
  1329. lea ebx, [ebp+edx] ; acc[44]+acc[60]
  1330. sub ebp, edx ; acc[44]-acc[60]
  1331. mov edx, [edi-32] ; pre-fetch output cache line 3 (loaded value is discarded; edx is overwritten below)
  1332. mov esi, [edi] ; pre-fetch output cache line 4 (clobbers esi; it is reloaded below)
  1333. lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60]
  1334. sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60])
  1335. lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60])
  1336. sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60])
  1337. sar edx, SCALER ; tmp1 >> 13
  1338. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1339. sar ecx, SCALER ; tmp2 >> 13
  1340. lea esi, [L_ACCUM+128+4] ; reload addr of accum[32] (esi was clobbered by the pre-fetch)
  1341. sar ebx, SCALER ; tmp3 >> 13
  1342. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1343. sar eax, SCALER ; tmp4 >> 13
  1344. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1345. mov DWORD PTR [edi-32], edx ; Intermediate[3][0] = tmp1
  1346. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1347. mov DWORD PTR [edi-32+7*4], ecx ; Intermediate[3][7] = tmp2
  1348. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1349. mov DWORD PTR [edi], ebx ; Intermediate[4][0] = tmp3
  1350. mov ebx, [esi-12] ; get acc[29]
  1351. ; -------------------------------------------------------------------------
  1352. ; 13
  1353. mov DWORD PTR [edi+7*4], eax ; Intermediate[4][7] = tmp4
  1354. mov eax, [esi-76] ; get acc[13]
  1355. mov ebp, [esi+52] ; get acc[45]
  1356. mov edx, [esi+116] ; get acc[61]
  1357. lea ecx, [eax+ebx] ; acc[13]+acc[29]
  1358. sub eax, ebx ; acc[13]-acc[29]
  1359. lea ebx, [ebp+edx] ; acc[45]+acc[61]
  1360. sub ebp, edx ; acc[45]-acc[61]
  1361. ;nop
  1362. ;nop
  1363. lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61]
  1364. sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61])
  1365. lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61])
  1366. sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61])
  1367. sar edx, SCALER ; tmp1 >> 13
  1368. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1369. sar ecx, SCALER ; tmp2 >> 13
  1370. nop
  1371. sar ebx, SCALER ; tmp3 >> 13
  1372. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1373. sar eax, SCALER ; tmp4 >> 13
  1374. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1375. mov DWORD PTR [edi-32+1*4], edx ; Intermediate[3][1] = tmp1
  1376. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1377. mov DWORD PTR [edi-32+6*4], ecx ; Intermediate[3][6] = tmp2
  1378. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1379. mov DWORD PTR [edi+1*4], ebx ; Intermediate[4][1] = tmp3
  1380. mov ebx, [esi-8] ; get acc[30]
  1381. ; -------------------------------------------------------------------------
  1382. ; 14
  1383. mov DWORD PTR [edi+6*4], eax ; Intermediate[4][6] = tmp4
  1384. mov eax, [esi-72] ; get acc[14]
  1385. mov ebp, [esi+56] ; get acc[46]
  1386. mov edx, [esi+120] ; get acc[62]
  1387. lea ecx, [eax+ebx] ; acc[14]+acc[30]
  1388. sub eax, ebx ; acc[14]-acc[30]
  1389. lea ebx, [ebp+edx] ; acc[46]+acc[62]
  1390. sub ebp, edx ; acc[46]-acc[62]
  1391. ;nop
  1392. ;nop
  1393. lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62]
  1394. sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62])
  1395. lea ebx, [eax+ebp] ; tmp3 = acc[14]-acc[30] + (acc[46]-acc[62])
  1396. sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62])
  1397. sar edx, SCALER ; tmp1 >> 13
  1398. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1399. sar ecx, SCALER ; tmp2 >> 13
  1400. nop
  1401. sar ebx, SCALER ; tmp3 >> 13
  1402. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1403. sar eax, SCALER ; tmp4 >> 13
  1404. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1405. mov DWORD PTR [edi-32+2*4], edx ; Intermediate[3][2] = tmp1
  1406. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1407. mov DWORD PTR [edi-32+5*4], ecx ; Intermediate[3][5] = tmp2
  1408. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1409. mov DWORD PTR [edi+2*4], ebx ; Intermediate[4][2] = tmp3
  1410. mov ebx, [esi-4] ; get acc[31]
  1411. ; -------------------------------------------------------------------------
  1412. ; 15
  1413. mov DWORD PTR [edi+5*4], eax ; Intermediate[4][5] = tmp4
  1414. mov eax, [esi-68] ; get acc[15]
  1415. mov ebp, [esi+60] ; get acc[47]
  1416. mov edx, [esi+124] ; get acc[63]
  1417. lea ecx, [eax+ebx] ; acc[15]+acc[31]
  1418. sub eax, ebx ; acc[15]-acc[31]
  1419. lea ebx, [ebp+edx] ; acc[47]+acc[63]
  1420. sub ebp, edx ; acc[47]-acc[63]
  1421. ;nop
  1422. ;nop
  1423. lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63]
  1424. sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63])
  1425. lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63])
  1426. sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63])
  1427. sar edx, SCALER ; tmp1 >> 13
  1428. lea ebp, ClipPixInter ; ebp gets Base addr of ClipPixInter
  1429. sar ecx, SCALER ; tmp2 >> 13
  1430. nop
  1431. sar ebx, SCALER ; tmp3 >> 13
  1432. mov edx, [ebp+edx*4] ; tmp1 = ClipPixInter[tmp1]
  1433. sar eax, SCALER ; tmp4 >> 13
  1434. mov ecx, [ebp+ecx*4] ; tmp2 = ClipPixInter[tmp2]
  1435. mov DWORD PTR [edi-32+3*4], edx ; Intermediate[3][3] = tmp1
  1436. mov ebx, [ebp+ebx*4] ; tmp3 = ClipPixInter[tmp3]
  1437. mov DWORD PTR [edi-32+4*4], ecx ; Intermediate[3][4] = tmp2
  1438. mov eax, [ebp+eax*4] ; tmp4 = ClipPixInter[tmp4]
  1439. mov DWORD PTR [edi+3*4], ebx ; Intermediate[4][3] = tmp3
  1440. mov DWORD PTR [edi+4*4], eax ; Intermediate[4][4] = tmp4
  1441. ret ; all 64 Intermediate entries written; return to the caller
  1442. } //end of asm
  1443. }
  1444. #pragma code_seg()