Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1267 lines
67 KiB

  1. // version 003; everything except 1) segment
  2. //
  3. /* *************************************************************************
  4. ** INTEL Corporation Proprietary Information
  5. **
  6. ** This listing is supplied under the terms of a license
  7. ** agreement with INTEL Corporation and may not be copied
  8. ** nor disclosed except in accordance with the terms of
  9. ** that agreement.
  10. **
  11. ** Copyright (c) 1995 Intel Corporation.
  12. ** All Rights Reserved.
  13. **
  14. ** *************************************************************************
  15. */
  16. //////////////////////////////////////////////////////////////////////////
  17. // $Author: AGUPTA2 $
  18. // $Date: 25 Oct 1996 13:32:28 $
  19. // $Archive: S:\h26x\src\dec\d3idct.cpv $
  20. // $Header: S:\h26x\src\dec\d3idct.cpv 1.11 25 Oct 1996 13:32:28 AGUPTA2 $
  21. // $Log: S:\h26x\src\dec\d3idct.cpv $
  22. //
  23. // Rev 1.11 25 Oct 1996 13:32:28 AGUPTA2
  24. // Re-scheduled butterfly code; re-arranged local var declarations.
  25. //
  26. // Rev 1.10 30 Aug 1996 08:39:56 KLILLEVO
  27. // added C version of block edge filter, and changed the bias in
  28. // ClampTbl[] from 128 to CLAMP_BIAS (defined to 128)
  29. // The C version of the block edge filter takes up way too much CPU time
  30. // relative to the rest of the decode time (4 ms for QCIF and 16 ms
  31. // for CIF on a P120, so this needs to be coded in assembly)
  32. //
  33. // Rev 1.9 17 Jul 1996 15:33:18 AGUPTA2
  34. // Increased the size of clamping table ClampTbl to 128+256+128.
  35. //
  36. // Rev 1.8 08 Mar 1996 16:46:20 AGUPTA2
  37. // Added pragma code_seg. Rolled the initialization code. Got rid of most
  38. // of 32-bit displacements in instructions. Aligned frequently executed loops
  39. // at 4-byte boundary. Made changes to reflect new size of MapMatrix. Removed
  40. // nop instructions. Deleted code that prefetches output lines in case of
  41. // INTRA blocks. Use ClampTbl instead of ClipPixIntra. Do not clip output
  42. // of INTER blocks; clipping is done in dxblkadd().
  43. //
  44. //
  45. // Rev 1.7 27 Dec 1995 14:36:06 RMCKENZX
  46. // Added copyright notice
  47. //
  48. // Rev 1.6 09 Dec 1995 17:33:20 RMCKENZX
  49. // Re-checked in module to support decoder re-architecture (thru PB Frames)
  50. //
  51. // Rev 1.4 30 Nov 1995 18:02:14 CZHU
  52. // Save and restore register before and after idct_acc
  53. //
  54. // Rev 1.1 27 Nov 1995 13:13:28 CZHU
  55. //
  56. //
  57. // Rev 1.0 27 Nov 1995 13:08:24 CZHU
  58. // Initial revision.
  59. //
  60. //Block level decoding for H.26x decoder
  61. #include "precomp.h"
  62. /////////////////////////////////////////////////////////////////////////
  63. // Decode each non-empty block
  64. // Input: lpInst: decoder instance,
  65. // lpSrc: input bitstream,
  66. // lpBlockAction:
  67. // the pointer to the block action stream structure
  68. // bitsread: number of bits in the buffer already,
  69. /////////////////////////////////////////////////////////////////////////
  70. // Local variable layout: each L_* name expands to esp plus a byte offset into the LOCALSIZE-byte area the prologue reserves
  71. #define FRAMEPOINTER esp
  72. //////////////////////////////////////////////////////////////
  73. // L_ACCUM MUST BE LAST 256 BYTES OF A PAGE (it sits at offset 128 and spans 64*4 bytes, ending at LOCALSIZE)
  74. /////////////////////////////////////////////////////////////
  75. #define L_PRODUCT FRAMEPOINTER + 0 // offset 0: 20 DWORDs, per-coefficient products written by idct_acc
  76. #define L_INPUT_INTER L_PRODUCT + 20*4 // offset 80: DWORD, copy of pInterBuf
  77. #define L_esi L_INPUT_INTER + 1*4 // offset 84: DWORD, copy of pIQ_INDEX
  78. #define L_NO_COEFF L_esi + 1*4 // offset 88: DWORD, coefficient loop counter saved across call idct_acc
  79. #define L_DESTBLOCK L_NO_COEFF + 1*4 // offset 92: DWORD, output buffer selected for this block
  80. #define L_LOOPCOUNTER L_DESTBLOCK + 1*4 // offset 96: DWORD, masked coefficient index saved inside idct_acc
  81. #define L_STASHESP L_LOOPCOUNTER + 1*4 // offset 100: DWORD, esp as it was before page alignment
  82. #define L_dummy L_STASHESP + 1*4 // offset 104: 6 DWORDS, not referenced in the visible code; places L_ACCUM at offset 128
  83. #define L_ACCUM L_dummy + 6*4 // offset 128: 64 DWORD accumulators
  84. #define LOCALSIZE (96*4) // 96 DWORDS;multiple of cache line size
  85. ////////////////////////////////////////////////////////////////////////////////
  86. // Input:
  87. // pIQ_INDEX, address of the coefficient list for the current block. The code below reads it as
  88. // 8-byte entries (DWORD inverse-quantized value, DWORD index), from the last entry down to entry 0.
  89. // No_Coeff, A 32 bit number that encodes block type and coefficient count.
  90. // As coded below: a value <= 65 takes the inter path and is used directly as the entry count;
  91. // a value > 65 takes the intra path with No_Coeff-65 entries. NOTE(review): this comment previously said 0--63 inter, 64--127 intra; confirm against the caller. The loop body runs before the count is tested, so a count of 0 is not handled.
  92. // pIntraBuf, Buffer pointer for intra blocks.
  93. //
  94. // pInterBuf, Buffer pointer for inter blocks.
  95. //
  96. //
  97. // return:
  98. // eax + edi, using whatever values idct_bfly_inter / idct_bfly_intra leave in those registers (see the add before each ret).
  99. //////////////////////////////////////////////////////////////////////////////////
  100. #pragma code_seg("IACODE2")
  101. __declspec(naked)
  102. U32 DecodeBlock_IDCT ( U32 pIQ_INDEX,
  103. U32 No_Coeff,
  104. U32 pIntraBuf,
  105. U32 pInterBuf)
  106. {
  107. __asm
  108. {
  109. ////////////////////////////////////////////////////////////////
  110. // DON'T CHANGE LOCAL DECLARATIONS OR STACK POINTER ADJUSTMENT
  111. // CODE WITHOUT TALKING TO ATUL
  112. ////////////////////////////////////////////////////////////////
  113. push ebp // save caller's frame pointer
  114. mov ebp, esp // make parameters accessible
  115. push esi // esi, edi and ebx are saved here and popped before each ret
  116. push edi
  117. push ebx
  118. mov eax, pInterBuf // eax = pInterBuf, stored to L_INPUT_INTER once the locals exist
  119. mov edx, esp // Save old ESP in edx
  120. and esp, -4096 // round esp down to a 4096-byte page boundary
  121. xor esi, esi // loop init: esi = 0, the value stored to accum[16..63]
  122. sub esp, LOCALSIZE // locals become the last 96 DWORDS of the page below that boundary
  123. lea edi, [L_ACCUM] // edi = address of accum[0]
  124. mov ebx, 64 // loop init: byte offset from edi to the region being zeroed
  125. mov [L_STASHESP], edx // Save old esp
  126. mov edx, No_Coeff // edx = No_Coeff, tested after the init loop
  127. mov [L_INPUT_INTER], eax // keep pInterBuf in an esp-relative slot; idct_acc overwrites ebp
  128. mov eax, ROUNDER // loop init: value stored to accum[0..15]
  129. ;
  130. /////////////////////////////////////////////////////////////////
  131. // There is no point in pre-loading the cache. That is because
  132. // after the first block it is likely to be in the cache.
  133. // Init loop: 8 passes; each stores 2 ROUNDER DWORDs and 6 zero DWORDs, giving accum[0..15] = ROUNDER and accum[16..63] = 0.
  134. loop_for_init:
  135. mov [edi], eax // next two of accum[0..15] = ROUNDER
  136. mov [edi+4], eax
  137. mov [edi+ebx], esi // next six of accum[16..63] = 0
  138. mov [edi+ebx+4], esi
  139. mov [edi+ebx+8], esi
  140. mov [edi+ebx+12], esi
  141. mov [edi+ebx+16], esi
  142. mov [edi+ebx+20], esi
  143. add edi, 8 // ROUNDER pointer advances 8 bytes per pass
  144. add ebx, 16 // edi+ebx therefore advances 24 bytes per pass
  145. cmp ebx, 192 // ebx runs 64, 80, ... 176
  146. jl loop_for_init
  147. /////////////////////////////////////////////////////////////////////
  148. // end of new init code
  149. //end of IDCT init.
  150. cmp edx, 65 // signed test of No_Coeff against 65
  151. jg intra_block // greater than 65: intra path
  152. mov ebx, pInterBuf // inter path: destination is pInterBuf, edx used as the count unchanged
  153. jmp pre_acc_loop
  154. intra_block:
  155. mov ebx, pIntraBuf // intra path: destination is pIntraBuf
  156. sub edx, 65 // count = No_Coeff - 65
  157. // registers in acc_loop:
  158. // edx: loop counter (entries still to process), kept in L_NO_COEFF across the call
  159. // ebx: inverse quant value passed to idct_acc
  160. // ecx: index [0,63] passed to idct_acc
  161. pre_acc_loop:
  162. mov esi, pIQ_INDEX // last parameter read; later code uses only esp-relative locals
  163. mov [L_DESTBLOCK], ebx // remember which output buffer was selected
  164. mov [L_esi], esi // idct_acc overwrites esi, so keep the list pointer in a local
  165. ALIGN 4
  166. acc_loop:
  167. mov ebx,[esi+edx*8-8] //Inverse Quant: first DWORD of entry edx-1
  168. mov ecx,[esi+edx*8-4] //Coeff index: second DWORD of entry edx-1
  169. mov [L_NO_COEFF], edx // idct_acc overwrites edx
  170. call idct_acc
  171. mov esi, [L_esi] // reload list pointer
  172. mov edx, [L_NO_COEFF] // reload counter
  173. dec edx
  174. jnz acc_loop // continue until entry 0 has been processed
  175. mov edx, [L_DESTBLOCK]
  176. mov ecx, [L_INPUT_INTER]
  177. cmp edx, ecx // destination equal to pInterBuf selects the inter butterfly
  178. jnz call_intra_bfly
  179. call idct_bfly_inter
  180. mov esp, [L_STASHESP] // free locals
  181. add eax, edi // return value = eax + edi as left by idct_bfly_inter
  182. pop ebx
  183. pop edi
  184. pop esi
  185. pop ebp
  186. ret
  187. call_intra_bfly:
  188. call idct_bfly_intra
  189. mov esp, [L_STASHESP] // free locals
  190. add eax, edi // return value = eax + edi as left by idct_bfly_intra
  191. pop ebx
  192. pop edi
  193. pop esi
  194. pop ebp
  195. ret
  196. ///////////////////////////////////////////////////////////////
  197. // idct_acc: adds one coefficient's contribution to 16 consecutive accumulators, starting at accum[PClass[i]].
  198. // In: ebx = coefficient value, ecx = coefficient index (masked to [0,63] on entry).
  199. // Reached by call, so esp is 4 lower than where the L_* offsets were defined; every L_* reference below adds 4 to compensate.
  200. idct_acc:
  201. ; For every non-zero coefficient:
  202. ; LoopCounter, on local stack, has index
  203. ; ecx = index (0-63)
  204. ; ebx = non-zero input
  205. ; Note i = index
  206. ; Overwrites eax, ebx, ecx, edx, esi, edi and ebp; nothing is saved or restored here.
  207. and ecx, 03fh ; Chad added to prevent GPF (keeps the index within 0..63)
  208. mov [L_LOOPCOUNTER+4], ecx ; Store Loop counter (the masked index i)
  209. xor edx, edx ; zero out for byte read, use as dword
  210. mov esi, ecx ; move index to esi
  211. lea eax, Unique ; eax = Address of Unique[0]
  212. mov ebp, ecx ; move index to ebp
  213. shl esi, 3 ; index*8
  214. add ecx, ecx ; index*2
  215. add esi, ecx ; index*10
  216. lea ecx, KernelCoeff ; get KernelCoeff[0][0]
  217. lea edi, [L_PRODUCT+4] ; edi = address of product[0]
  218. mov dl, [eax+ebp] ; get Unique[i]
  219. lea esi, [ecx+4*esi] ; address of KernelCoeff[i][0] (rows are 10 DWORDs apart)
  220. mov ebp, edx ; ebp = Unique[i]
  221. lea eax, [edi+edx*4] ; eax = address of product[Unique[i]]
  222. nop
  223. ; ----------------------------------------------------------------------
  224. ; Register usage
  225. ; eax = addr of product[Unique[i]]
  226. ; ebx = input[i]
  227. ; ecx = 0, -product[x]
  228. ; edx = KernelCoeff[i][x], product[x]= KernelCoeff[i][x] * input[i]
  229. ; ebp = x, counting down from Unique[i] to 1; the element touched is x-1
  230. ; edi = addr of product[0]
  231. ; esi = addr of KernelCoeff[i][x]
  232. ALIGN 4
  233. loop_for_x:
  234. xor ecx, ecx
  235. mov edx, [esi+ebp*4-4] ; read KernelCoeff[i][x-1]
  236. imul edx, ebx ; KernelCoeff[i][x-1] * input[i]
  237. mov [edi+ebp*4-4], edx ; product[x-1] = result of imul
  238. sub ecx, edx ; ecx = -product[x-1]
  239. mov [eax+ebp*4-4], ecx ; product[Unique[i]+x-1] = -product[x-1]
  240. dec ebp ; decrement x
  241. jnz loop_for_x
  242. ; ----------------------------------------------------------------------
  243. ; Register usage
  244. ; eax = MapMatrix[0][0]
  245. ; ebx = PClass[0], accum[xxx]
  246. ; ecx = LoopCounter, addr of MapMatrix[i][0]
  247. ; edx = product[0], accum[PClass[i][0-15]]
  248. ; ebp = addr of accum[0], product[MapMatrix[i][0-15]]
  249. ; edi = addr of product[0]
  250. ; esi = PClass[i], address of accum[PClass[i]]
  251. mov ecx, [L_LOOPCOUNTER+4] ; get i
  252. and ecx, 0ffh ; Chad added to prevent GPF (the stored value was already masked with 03fh above)
  253. lea ebx, PClass ; get addr of PClass[0]
  254. mov esi, ecx ; esi = i, used to index PClass
  255. shl ecx, 4 ; i*16: MapMatrix rows are 16 bytes
  256. lea eax, MapMatrix ; get addr of MapMatrix[0][0]
  257. xor edx, edx ; clear edx before the byte load into dl
  258. nop
  259. mov dl, [ebx+esi] ; get PClass[i]
  260. lea ecx, [eax+1*ecx] ; get addr of MapMatrix[i][0]
  261. shl edx, 2 ; PClass[i]*4 = byte offset into accum
  262. lea esi, [L_ACCUM+4] ; get addr of accum[0]
  263. ; ----------------------------------------------------------------------
  264. xor eax, eax ; clear eax so each byte loaded into al is a zero-extended product index
  265. add esi, edx ; esi = address of accum[PClass[i]]
  266. mov al, [ecx] ; get MapMatrix[i][0]
  267. mov ebx, [esi] ; get accum[PClass[i]]
  268. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[0]]
  269. mov al, [ecx+1] ; get pNKernel->matrix[1]
  270. add ebx, ebp ; accum[pNKernel->PClass] += product[
  271. ; pNKernel->matrix[0]]
  272. mov edx, [esi+4] ; get accum[1+pNKernel->PClass]
  273. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[1]]
  274. mov al, [ecx+2] ; get pNKernel->matrix[2]
  275. add edx, ebp ; accum[1+pNkernel->PClass] += product[
  276. ; pNKernel->matrix[1]]
  277. mov [esi], ebx ; store accum[pNKernel->PClass] += product[
  278. ; pNKernel->matrix[0]]
  279. mov [esi+4], edx ; store accum[1+pNKernel->PClass] +=
  280. ; product[pNKernel->matrix[1]]
  281. mov ebx, [esi+8] ; get accum[2+pNKernel->PClass]
  282. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[2]]
  283. mov al, [ecx+3] ; get pNKernel->matrix[3]
  284. add ebx, ebp ; accum[2+pNKernel->PClass] += product[
  285. ; pNKernel->matrix[2]]
  286. mov edx, [esi+12] ; get accum[3+pNKernel->PClass]
  287. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[3]]
  288. mov al, [ecx+4] ; get pNKernel->matrix[4]
  289. add edx, ebp ; accum[3+pNKernel->PClass] += product[
  290. ; pNKernel->matrix[3]]
  291. mov [esi+8], ebx ; store accum[2+pNKernel->PClass] +=
  292. ; product[pNKernel->matrix[2]]
  293. mov [esi+12], edx ; store accum[3+pNKernel->PClass] +=
  294. ; product[pNKernel->matrix[3]]
  295. ; ----------------------------------------------------------------------
  296. mov ebx, [esi+16] ; get accum[4+pNKernel->PClass]
  297. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[4]]
  298. mov al, [ecx+5] ; get pNKernel->matrix[5]
  299. add ebx, ebp ; accum[4+pNKernel->PClass] += product[
  300. ; pNKernel->matrix[4]]
  301. mov edx, [esi+20] ; get accum[5+pNKernel->PClass]
  302. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[5]]
  303. mov al, [ecx+6] ; get pNKernel->matrix[6]
  304. add edx, ebp ; accum[5+pNkernel->PClass] += product[
  305. ; pNKernel->matrix[5]]
  306. mov [esi+16], ebx ; store accum[4+pNKernel->PClass] +=
  307. ; product[pNKernel->matrix[4]]
  308. mov [esi+20], edx ; store accum[5+pNKernel->PClass] +=
  309. ; product[pNKernel->matrix[5]]
  310. mov ebx, [esi+24] ; get accum[6+pNKernel->PClass]
  311. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[6]]
  312. mov al, [ecx+7] ; get pNKernel->matrix[7]
  313. add ebx, ebp ; accum[6+pNKernel->PClass] += product[pNKernel->matrix[6]]
  314. mov edx, [esi+28] ; get accum[7+pNKernel->PClass]
  315. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[7]]
  316. mov al, [ecx+8] ; get pNKernel->matrix[8]
  317. add edx, ebp ; accum[7+pNKernel->PClass] += product[
  318. ; pNKernel->matrix[7]]
  319. mov [esi+24], ebx ; store accum[6+pNKernel->PClass] +=
  320. ; product[pNKernel->matrix[6]]
  321. mov [esi+28], edx ; store accum[7+pNKernel->PClass] +=
  322. ; product[pNKernel->matrix[7]]
  323. ; ----------------------------------------------------------------------
  324. mov ebx, [esi+32] ; get accum[8+pNKernel->PClass]
  325. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[8]]
  326. mov al, [ecx+9] ; get pNKernel->matrix[9]
  327. add ebx, ebp ; accum[8+pNKernel->PClass] += product[
  328. ; pNKernel->matrix[8]]
  329. mov edx, [esi+36] ; get accum[9+pNKernel->PClass]
  330. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[9]]
  331. mov al, [ecx+10] ; get pNKernel->matrix[10]
  332. add edx, ebp ; accum[9+pNkernel->PClass] += product[
  333. ; pNKernel->matrix[9]]
  334. mov [esi+32], ebx ; store accum[8+pNKernel->PClass] +=
  335. ; product[pNKernel->matrix[8]]
  336. mov [esi+36], edx ; store accum[9+pNKernel->PClass] +=
  337. ; product[pNKernel->matrix[9]]
  338. mov ebx, [esi+40] ; get accum[10+pNKernel->PClass]
  339. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[10]]
  340. mov al, [ecx+11] ; get pNKernel->matrix[11]
  341. add ebx, ebp ; accum[10+pNKernel->PClass] += product[pNKernel->matrix[10]]
  342. mov edx, [esi+44] ; get accum[11+pNKernel->PClass]
  343. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[11]]
  344. ; (al is replaced with matrix[12] only after the read above)
  345. mov al, [ecx+12] ; get pNKernel->matrix[12]
  346. add edx, ebp ; accum[11+pNKernel->PClass] += product[
  347. ; pNKernel->matrix[11]]
  348. mov [esi+40], ebx ; store accum[10+pNKernel->PClass] +=
  349. ; product[pNKernel->matrix[10]]
  350. mov [esi+44], edx ; store accum[11+pNKernel->PClass] +=
  351. ; product[pNKernel->matrix[11]]
  352. ; ----------------------------------------------------------------------
  353. mov ebx, [esi+48] ; get accum[12+pNKernel->PClass]
  354. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[12]]
  355. mov al, [ecx+13] ; get pNKernel->matrix[13]
  356. add ebx, ebp ; accum[12+pNKernel->PClass] += product[
  357. ; pNKernel->matrix[12]]
  358. mov edx, [esi+52] ; get accum[13+pNKernel->PClass]
  359. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[13]]
  360. mov al, [ecx+14] ; get pNKernel->matrix[14]
  361. add edx, ebp ; accum[13+pNkernel->PClass] += product[
  362. ; pNKernel->matrix[13]]
  363. mov [esi+48], ebx ; store accum[12+pNKernel->PClass] += product[
  364. ; pNKernel->matrix[12]]
  365. mov [esi+52], edx ; store accum[13+pNKernel->PClass] +=
  366. ; product[pNKernel->matrix[13]]
  367. mov ebx, [esi+56] ; get accum[14+pNKernel->PClass]
  368. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[14]]
  369. mov al, [ecx+15] ; get pNKernel->matrix[15]
  370. add ebx, ebp ; accum[14+pNKernel->PClass] += product[pNKernel->matrix[14]]
  371. mov edx, [esi+60] ; get accum[15+pNKernel->PClass]
  372. mov ebp, [edi+eax*4] ; get product[pNKernel->matrix[15]]
  373. mov [esi+56], ebx ; store accum[14+pNKernel->PClass] +=
  374. ; product[pNKernel->matrix[14]]
  375. add edx, ebp ; accum[15+pNKernel->PClass] += product[
  376. ; pNKernel->matrix[15]]
  377. mov [esi+60], edx ; store accum[15+pNKernel->PClass] +=
  378. ; product[pNKernel->matrix[15]]
  379. ret
  380. ////////////////////////////////////////////////////////////////////////////
  381. //assume parameters passed in by registers
  382. idct_bfly_intra:
  383. ; ----------------------------------------------------------------------
  384. ; INTRA ONLY Butterfly and clamp
  385. ; Uses all registers.
  386. ; Uses all accumulators[64], accum
  387. ; Uses ClipPixIntra[2048] of DWORDS, ClipPixIntra
  388. ; Writes to Output matrix of BYTES, OutputCoeff
  389. ;
  390. ; Process 4 outputs per group, 0-15
  391. ; 0
  392. lea esi, [L_ACCUM+4] ; get addr of accum[0]
  393. mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of OutputCoeff
  394. add esi, 128
  395. nop
  396. mov eax, [esi-128] ; get acc[0]
  397. mov ebx, [esi+64-128] ; get acc[16]
  398. mov ebp, [esi+128-128] ; get acc[32]
  399. mov edx, [esi+192-128] ; get acc[48]
  400. lea ecx, [eax+ebx] ; acc[0]+acc[16]
  401. sub eax, ebx ; acc[0]-acc[16]
  402. lea ebx, [ebp+edx] ; acc[32]+acc[48]
  403. sub ebp, edx ; acc[32]-acc[48]
  404. lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
  405. sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
  406. lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
  407. sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
  408. sar edx, SCALER ; tmp1 >> 13
  409. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  410. sar ecx, SCALER ; tmp2 >> 13
  411. ;lea esi, [L_ACCUM+4] ; get addr of accum[0]
  412. sar ebx, SCALER ; tmp3 >> 13
  413. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  414. sar eax, SCALER ; tmp4 >> 13
  415. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  416. mov BYTE PTR [edi], dl ; output[0][0] = tmp1
  417. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  418. mov BYTE PTR [edi+7], cl ; output[0][7] = tmp2
  419. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  420. mov BYTE PTR [edi+7*PITCH], bl ; output[7][0] = tmp3
  421. mov ebx, [esi+68-128] ; get acc[17]
  422. ; -------------------------------------------------------------------------
  423. ; 1
  424. mov BYTE PTR [edi+7*PITCH+7], al ; output[7][7] = tmp4
  425. mov eax, [esi+4-128] ; get acc[1]
  426. mov ebp, [esi+132-128] ; get acc[33]
  427. mov edx, [esi+196-128] ; get acc[49]
  428. lea ecx, [eax+ebx] ; acc[1]+acc[17]
  429. sub eax, ebx ; acc[1]-acc[17]
  430. lea ebx, [ebp+edx] ; acc[33]+acc[49]
  431. sub ebp, edx ; acc[33]-acc[49]
  432. lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
  433. sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
  434. lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
  435. sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
  436. sar edx, SCALER ; tmp1 >> 13
  437. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  438. sar ecx, SCALER ; tmp2 >> 13
  439. ;
  440. sar ebx, SCALER ; tmp3 >> 13
  441. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  442. sar eax, SCALER ; tmp4 >> 13
  443. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  444. mov BYTE PTR [edi+1], dl ; output[0][1] = tmp1
  445. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  446. mov BYTE PTR [edi+6], cl ; output[0][6] = tmp2
  447. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  448. mov BYTE PTR [edi+7*PITCH+1], bl ; output[7][1] = tmp3
  449. mov ebx, [esi+72-128] ; get acc[18]
  450. ; -------------------------------------------------------------------------
  451. ; 2
  452. mov BYTE PTR [edi+7*PITCH+6], al ; output[7][6] = tmp4
  453. mov eax, [esi+8-128] ; get acc[2]
  454. mov ebp, [esi+136-128] ; get acc[34]
  455. mov edx, [esi+200-128] ; get acc[50]
  456. lea ecx, [eax+ebx] ; acc[2]+acc[18]
  457. sub eax, ebx ; acc[2]-acc[18]
  458. lea ebx, [ebp+edx] ; acc[34]+acc[50]
  459. sub ebp, edx ; acc[34]-acc[50]
  460. nop
  461. nop
  462. lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
  463. sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
  464. lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
  465. sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
  466. sar edx, SCALER ; tmp1 >> 13
  467. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  468. sar ecx, SCALER ; tmp2 >> 13
  469. ;
  470. sar ebx, SCALER ; tmp3 >> 13
  471. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  472. sar eax, SCALER ; tmp4 >> 13
  473. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  474. mov BYTE PTR [edi+2], dl ; output[0][2] = tmp1
  475. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  476. mov BYTE PTR [edi+5], cl ; output[0][5] = tmp2
  477. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  478. mov BYTE PTR [edi+7*PITCH+2], bl ; output[7][2] = tmp3
  479. mov ebx, [esi+76-128] ; get acc[19]
  480. ; -------------------------------------------------------------------------
  481. ; 3
  482. mov BYTE PTR [edi+7*PITCH+5], al ; output[7][5] = tmp4
  483. mov eax, [esi+12-128] ; get acc[3]
  484. mov ebp, [esi+140-128] ; get acc[35]
  485. mov edx, [esi+204-128] ; get acc[51]
  486. lea ecx, [eax+ebx] ; acc[3]+acc[19]
  487. sub eax, ebx ; acc[3]-acc[19]
  488. lea ebx, [ebp+edx] ; acc[35]+acc[51]
  489. sub ebp, edx ; acc[35]-acc[51]
  490. lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
  491. sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
  492. lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
  493. sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
  494. sar edx, SCALER ; tmp1 >> 13
  495. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  496. sar ecx, SCALER ; tmp2 >> 13
  497. ;
  498. sar ebx, SCALER ; tmp3 >> 13
  499. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  500. sar eax, SCALER ; tmp4 >> 13
  501. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  502. mov BYTE PTR [edi+3], dl ; output[0][3] = tmp1
  503. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  504. mov BYTE PTR [edi+4], cl ; output[0][4] = tmp2
  505. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  506. mov BYTE PTR [edi+7*PITCH+3], bl ; output[7][3] = tmp3
  507. mov ebx, [esi+80-128] ; get acc[20]
  508. ; -------------------------------------------------------------------------
  509. ; 4
  510. mov BYTE PTR [edi+7*PITCH+4], al ; output[7][4] = tmp4
  511. mov eax, [esi+16-128] ; get acc[4]
  512. mov ebp, [esi+144-128] ; get acc[36]
  513. mov edx, [esi+208-128] ; get acc[52]
  514. lea ecx, [eax+ebx] ; acc[4]+acc[20]
  515. sub eax, ebx ; acc[4]-acc[20]
  516. lea ebx, [ebp+edx] ; acc[36]+acc[52]
  517. sub ebp, edx ; acc[36]-acc[52]
  518. lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
  519. sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
  520. lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
  521. sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
  522. sar edx, SCALER ; tmp1 >> 13
  523. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  524. sar ecx, SCALER ; tmp2 >> 13
  525. ;lea esi, [L_ACCUM+4] ; get addr of accum[0]
  526. sar ebx, SCALER ; tmp3 >> 13
  527. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  528. sar eax, SCALER ; tmp4 >> 13
  529. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  530. mov BYTE PTR [edi+PITCH], dl ; output[1][0] = tmp1
  531. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  532. mov BYTE PTR [edi+PITCH+7], cl ; output[1][7] = tmp2
  533. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  534. mov BYTE PTR [edi+6*PITCH], bl ; output[6][0] = tmp3
  535. mov ebx, [esi+84-128] ; get acc[21]
  536. ; -------------------------------------------------------------------------
  537. ; 5
  538. mov BYTE PTR [edi+6*PITCH+7], al ; output[6][7] = tmp4
  539. mov eax, [esi+20-128] ; get acc[5]
  540. mov ebp, [esi+148-128] ; get acc[37]
  541. mov edx, [esi+212-128] ; get acc[53]
  542. lea ecx, [eax+ebx] ; acc[5]+acc[21]
  543. sub eax, ebx ; acc[5]-acc[21]
  544. lea ebx, [ebp+edx] ; acc[37]+acc[53]
  545. sub ebp, edx ; acc[37]-acc[53]
  546. lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
  547. sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
  548. lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
  549. sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
  550. sar edx, SCALER ; tmp1 >> 13
  551. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  552. sar ecx, SCALER ; tmp2 >> 13
  553. ;
  554. sar ebx, SCALER ; tmp3 >> 13
  555. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  556. sar eax, SCALER ; tmp4 >> 13
  557. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  558. mov BYTE PTR [edi+PITCH+1], dl ; output[1][1] = tmp1
  559. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  560. mov BYTE PTR [edi+PITCH+6], cl ; output[1][6] = tmp2
  561. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  562. mov BYTE PTR [edi+6*PITCH+1], bl ; output[6][1] = tmp3
  563. mov ebx, [esi+88-128] ; get acc[22]
  564. ; -------------------------------------------------------------------------
  565. ; 6
  566. mov BYTE PTR [edi+6*PITCH+6], al ; output[6][6] = tmp4
  567. mov eax, [esi+24-128] ; get acc[6]
  568. mov ebp, [esi+152-128] ; get acc[38]
  569. mov edx, [esi+216-128] ; get acc[54]
  570. lea ecx, [eax+ebx] ; acc[6]+acc[22]
  571. sub eax, ebx ; acc[6]-acc[22]
  572. lea ebx, [ebp+edx] ; acc[38]+acc[54]
  573. sub ebp, edx ; acc[38]-acc[54]
  574. lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
  575. sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
  576. lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
  577. sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
  578. sar edx, SCALER ; tmp1 >> 13
  579. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  580. sar ecx, SCALER ; tmp2 >> 13
  581. ;
  582. sar ebx, SCALER ; tmp3 >> 13
  583. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  584. sar eax, SCALER ; tmp4 >> 13
  585. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  586. mov BYTE PTR [edi+PITCH+2], dl ; output[1][2] = tmp1
  587. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  588. mov BYTE PTR [edi+PITCH+5], cl ; output[1][5] = tmp2
  589. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  590. mov BYTE PTR [edi+6*PITCH+2], bl ; output[6][2] = tmp3
  591. mov ebx, [esi+92-128] ; get acc[23]
  592. ; -------------------------------------------------------------------------
  593. ; 7
  594. mov BYTE PTR [edi+6*PITCH+5], al ; output[6][5] = tmp4
  595. mov eax, [esi+28-128] ; get acc[7]
  596. mov ebp, [esi+156-128] ; get acc[39]
  597. mov edx, [esi+220-128] ; get acc[55]
  598. lea ecx, [eax+ebx] ; acc[7]+acc[23]
  599. sub eax, ebx ; acc[7]-acc[23]
  600. lea ebx, [ebp+edx] ; acc[39]+acc[55]
  601. sub ebp, edx ; acc[39]-acc[55]
  602. lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
  603. sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
  604. lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
  605. sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
  606. sar edx, SCALER ; tmp1 >> 13
  607. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  608. sar ecx, SCALER ; tmp2 >> 13
  609. ;
  610. sar ebx, SCALER ; tmp3 >> 13
  611. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  612. sar eax, SCALER ; tmp4 >> 13
  613. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  614. mov BYTE PTR [edi+PITCH+3], dl ; output[1][3] = tmp1
  615. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  616. mov BYTE PTR [edi+PITCH+4], cl ; output[1][4] = tmp2
  617. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  618. mov BYTE PTR [edi+6*PITCH+3], bl ; output[6][3] = tmp3
  619. mov ebx, [esi+96-128] ; get acc[24]
  620. ; -------------------------------------------------------------------------
  621. ; 8
  622. mov BYTE PTR [edi+6*PITCH+4], al ; output[6][4] = tmp4
  623. mov eax, [esi+32-128] ; get acc[8]
  624. mov ebp, [esi+160-128] ; get acc[40]
  625. mov edx, [esi+224-128] ; get acc[56]
  626. lea ecx, [eax+ebx] ; acc[8]+acc[24]
  627. sub eax, ebx ; acc[8]-acc[24]
  628. lea ebx, [ebp+edx] ; acc[40]+acc[56]
  629. sub ebp, edx ; acc[40]-acc[56]
  630. lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56]
  631. sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56])
  632. lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56])
  633. sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56])
  634. sar edx, SCALER ; tmp1 >> 13
  635. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  636. sar ecx, SCALER ; tmp2 >> 13
  637. ;lea esi, [L_ACCUM+4] ; get addr of accum[0]
  638. sar ebx, SCALER ; tmp3 >> 13
  639. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  640. sar eax, SCALER ; tmp4 >> 13
  641. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  642. mov BYTE PTR [edi+2*PITCH], dl ; output[2][0] = tmp1
  643. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  644. mov BYTE PTR [edi+2*PITCH+7], cl ; output[2][7] = tmp2
  645. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  646. mov BYTE PTR [edi+5*PITCH], bl ; output[5][0] = tmp3
  647. mov ebx, [esi+100-128] ; get acc[25]
  648. ; -------------------------------------------------------------------------
  649. ; 9
  650. mov BYTE PTR [edi+5*PITCH+7], al ; output[5][7] = tmp4
  651. mov eax, [esi+36-128] ; get acc[9]
  652. mov ebp, [esi+164-128] ; get acc[41]
  653. mov edx, [esi+228-128] ; get acc[57]
  654. lea ecx, [eax+ebx] ; acc[9]+acc[25]
  655. sub eax, ebx ; acc[9]-acc[25]
  656. lea ebx, [ebp+edx] ; acc[41]+acc[57]
  657. sub ebp, edx ; acc[41]-acc[57]
  658. lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57]
  659. sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57])
  660. lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57])
  661. sub eax, ebp ; tmp4 = acc[9]-acc[25] - (acc[41]-acc[57])
  662. sar edx, SCALER ; tmp1 >> 13
  663. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  664. sar ecx, SCALER ; tmp2 >> 13
  665. ;
  666. sar ebx, SCALER ; tmp3 >> 13
  667. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  668. sar eax, SCALER ; tmp4 >> 13
  669. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  670. mov BYTE PTR [edi+2*PITCH+1], dl ; output[2][1] = tmp1
  671. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  672. mov BYTE PTR [edi+2*PITCH+6], cl ; output[2][6] = tmp2
  673. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  674. mov BYTE PTR [edi+5*PITCH+1], bl ; output[5][1] = tmp3
  675. mov ebx, [esi+104-128] ; get acc[26]
  676. ; -------------------------------------------------------------------------
  677. ; 10
  678. mov BYTE PTR [edi+5*PITCH+6], al ; output[5][6] = tmp4
  679. mov eax, [esi+40-128] ; get acc[10]
  680. mov ebp, [esi+168-128] ; get acc[42]
  681. mov edx, [esi+232-128] ; get acc[58]
  682. lea ecx, [eax+ebx] ; acc[10]+acc[26]
  683. sub eax, ebx ; acc[10]-acc[26]
  684. lea ebx, [ebp+edx] ; acc[42]+acc[58]
  685. sub ebp, edx ; acc[42]-acc[58]
  686. lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58]
  687. sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58])
  688. lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58])
  689. sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58])
  690. sar edx, SCALER ; tmp1 >> 13
  691. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  692. sar ecx, SCALER ; tmp2 >> 13
  693. ;
  694. sar ebx, SCALER ; tmp3 >> 13
  695. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  696. sar eax, SCALER ; tmp4 >> 13
  697. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  698. mov BYTE PTR [edi+2*PITCH+2], dl ; output[2][2] = tmp1
  699. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  700. mov BYTE PTR [edi+2*PITCH+5], cl ; output[2][5] = tmp2
  701. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  702. mov BYTE PTR [edi+5*PITCH+2], bl ; output[5][2] = tmp3
  703. mov ebx, [esi+108-128] ; get acc[27]
  704. ; -------------------------------------------------------------------------
  705. ; 11
  706. mov BYTE PTR [edi+5*PITCH+5], al ; output[5][5] = tmp4
  707. mov eax, [esi+44-128] ; get acc[11]
  708. mov ebp, [esi+172-128] ; get acc[43]
  709. mov edx, [esi+236-128] ; get acc[59]
  710. lea ecx, [eax+ebx] ; acc[11]+acc[27]
  711. sub eax, ebx ; acc[11]-acc[27]
  712. lea ebx, [ebp+edx] ; acc[43]+acc[59]
  713. sub ebp, edx ; acc[43]-acc[59]
  714. lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59]
  715. sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59])
  716. lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59])
  717. sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59])
  718. sar edx, SCALER ; tmp1 >> 13
  719. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  720. sar ecx, SCALER ; tmp2 >> 13
  721. ;
  722. sar ebx, SCALER ; tmp3 >> 13
  723. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  724. sar eax, SCALER ; tmp4 >> 13
  725. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  726. mov BYTE PTR [edi+2*PITCH+3], dl ; output[2][3] = tmp1
  727. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  728. mov BYTE PTR [edi+2*PITCH+4], cl ; output[2][4] = tmp2
  729. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  730. mov BYTE PTR [edi+5*PITCH+3], bl ; output[5][3] = tmp3
  731. mov ebx, [esi+112-128] ; get acc[28]
  732. ; -------------------------------------------------------------------------
  733. ; 12
  734. mov BYTE PTR [edi+5*PITCH+4], al ; output[5][4] = tmp4
  735. mov eax, [esi+48-128] ; get acc[12]
  736. mov ebp, [esi+176-128] ; get acc[44]
  737. mov edx, [esi+240-128] ; get acc[60]
  738. lea ecx, [eax+ebx] ; acc[12]+acc[28]
  739. sub eax, ebx ; acc[12]-acc[28]
  740. lea ebx, [ebp+edx] ; acc[44]+acc[60]
  741. sub ebp, edx ; acc[44]-acc[60]
  742. lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60]
  743. sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60])
  744. lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60])
  745. sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60])
  746. sar edx, SCALER ; tmp1 >> 13
  747. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  748. sar ecx, SCALER ; tmp2 >> 13
  749. ;lea esi, [L_ACCUM+4] ; get addr of accum[0]
  750. sar ebx, SCALER ; tmp3 >> 13
  751. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  752. sar eax, SCALER ; tmp4 >> 13
  753. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  754. mov BYTE PTR [edi+3*PITCH], dl ; output[3][0] = tmp1
  755. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  756. mov BYTE PTR [edi+3*PITCH+7], cl ; output[3][7] = tmp2
  757. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  758. mov BYTE PTR [edi+4*PITCH], bl ; output[4][0] = tmp3
  759. mov ebx, [esi+116-128] ; get acc[29]
  760. ; -------------------------------------------------------------------------
  761. ; 13
  762. mov BYTE PTR [edi+4*PITCH+7], al ; output[4][7] = tmp4
  763. mov eax, [esi+52-128] ; get acc[13]
  764. mov ebp, [esi+180-128] ; get acc[45]
  765. mov edx, [esi+244-128] ; get acc[61]
  766. lea ecx, [eax+ebx] ; acc[13]+acc[29]
  767. sub eax, ebx ; acc[13]-acc[29]
  768. lea ebx, [ebp+edx] ; acc[45]+acc[61]
  769. sub ebp, edx ; acc[45]-acc[61]
  770. lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61]
  771. sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61])
  772. lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61])
  773. sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61])
  774. sar edx, SCALER ; tmp1 >> 13
  775. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  776. sar ecx, SCALER ; tmp2 >> 13
  777. ;
  778. sar ebx, SCALER ; tmp3 >> 13
  779. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  780. sar eax, SCALER ; tmp4 >> 13
  781. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  782. mov BYTE PTR [edi+3*PITCH+1], dl ; output[3][1] = tmp1
  783. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  784. mov BYTE PTR [edi+3*PITCH+6], cl ; output[3][6] = tmp2
  785. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  786. mov BYTE PTR [edi+4*PITCH+1], bl ; output[4][1] = tmp3
  787. mov ebx, [esi+120-128] ; get acc[30]
  788. ; -------------------------------------------------------------------------
  789. ; 14
  790. mov BYTE PTR [edi+4*PITCH+6], al ; output[4][6] = tmp4
  791. mov eax, [esi+56-128] ; get acc[14]
  792. mov ebp, [esi+184-128] ; get acc[46]
  793. mov edx, [esi+248-128] ; get acc[62]
  794. lea ecx, [eax+ebx] ; acc[14]+acc[30]
  795. sub eax, ebx ; acc[14]-acc[30]
  796. lea ebx, [ebp+edx] ; acc[46]+acc[62]
  797. sub ebp, edx ; acc[46]-acc[62]
  798. lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62]
  799. sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62])
  800. lea ebx, [eax+ebp] ; tmp3 = acc[14]-acc[30] + (acc[46]-acc[62])
  801. sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62])
  802. sar edx, SCALER ; tmp1 >> 13
  803. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  804. sar ecx, SCALER ; tmp2 >> 13
  805. ;
  806. sar ebx, SCALER ; tmp3 >> 13
  807. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  808. sar eax, SCALER ; tmp4 >> 13
  809. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  810. mov BYTE PTR [edi+3*PITCH+2], dl ; output[3][2] = tmp1
  811. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  812. mov BYTE PTR [edi+3*PITCH+5], cl ; output[3][5] = tmp2
  813. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  814. mov BYTE PTR [edi+4*PITCH+2], bl ; output[4][2] = tmp3
  815. mov ebx, [esi+124-128] ; get acc[31]
  816. ; -------------------------------------------------------------------------
  817. ; 15
  818. mov BYTE PTR [edi+4*PITCH+5], al ; output[4][5] = tmp4
  819. mov eax, [esi+60-128] ; get acc[15]
  820. mov ebp, [esi+188-128] ; get acc[47]
  821. mov edx, [esi+252-128] ; get acc[63]
  822. lea ecx, [eax+ebx] ; acc[15]+acc[31]
  823. sub eax, ebx ; acc[15]-acc[31]
  824. lea ebx, [ebp+edx] ; acc[47]+acc[63]
  825. sub ebp, edx ; acc[47]-acc[63]
  826. lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63]
  827. sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63])
  828. lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63])
  829. sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63])
  830. sar edx, SCALER ; tmp1 >> 13
  831. lea ebp, ClampTbl-1024+CLAMP_BIAS ; ecx gets Base addr of ClipPixIntra
  832. sar ecx, SCALER ; tmp2 >> 13
  833. ;
  834. sar ebx, SCALER ; tmp3 >> 13
  835. mov dl, [ebp+edx] ; tmp1 = ClipPixIntra[tmp1]
  836. sar eax, SCALER ; tmp4 >> 13
  837. mov cl, [ebp+ecx] ; tmp2 = ClipPixIntra[tmp2]
  838. mov BYTE PTR [edi+3*PITCH+3], dl ; output[3][3] = tmp1
  839. mov bl, [ebp+ebx] ; tmp3 = ClipPixIntra[tmp3]
  840. mov BYTE PTR [edi+3*PITCH+4], cl ; output[3][4] = tmp2
  841. mov al, [ebp+eax] ; tmp4 = ClipPixIntra[tmp4]
  842. mov BYTE PTR [edi+4*PITCH+3], bl ; output[4][3] = tmp3
  843. mov BYTE PTR [edi+4*PITCH+4], al ; output[4][4] = tmp4
  844. ret
  845. ////////////////////////////////////////////////////////////////////////////
  846. //assume parameters passed in by registers
  847. idct_bfly_inter:
  848. ; ----------------------------------------------------------------------
  849. ; INTER ONLY butterfly (no clamp): acc[64] DWORDs -> Intermediate[8][8] DWORDs, each result shifted (sar) by SCALER
  850. ; Uses all registers: eax, ebx, ecx, edx, esi, edi, ebp are overwritten and not saved or restored in this routine.
  851. ; Reads all accumulators[64], accum, through esi = address of [L_ACCUM+4] biased by +128
  852. ; Does not reference ClampTbl / ClipPixIntra: the results are stored as full DWORDs, unclamped
  853. ; Writes to Intermediate matrix [8][8] of DWORDS (32-byte rows) through edi = pointer loaded from [L_DESTBLOCK+4], biased by +128
  854. ; NOTE: Code assumes that Intermediate and accumulator arrays are aligned at cache-line boundary
  855. ; Groups overlap: acc[k+17] for the next group is loaded into ebx before its marker, and the tmp4 store of group k follows that marker
  856. ; Process 4 outputs per group k = 0-15, with r = k/4, c = k%4, a = acc[k], b = acc[k+16], p = acc[k+32], q = acc[k+48]:
  857. ;   [r][c] = (a+b)+(p+q), [r][7-c] = (a+b)-(p+q), [7-r][c] = (a-b)+(p-q), [7-r][7-c] = (a-b)-(p-q), each >> SCALER
  858. ; 0: acc[0], acc[16], acc[32], acc[48] -> Intermediate[0][0], [0][7], [7][0], [7][7]
  859. mov edi, [L_DESTBLOCK+4] ; edi gets Base addr of Intermediate (NOTE(review): the +4 presumably skips the return address of the call - confirm)
  860. lea esi, [L_ACCUM+4+128] ; get addr of accum[0] biased by 128, so that all acc displacements span -128..+124
  861. add edi, 128 ; bias edi by 128 as well: all Intermediate displacements below also span -128..+124
  862. nop ; filler; NOTE(review): presumably kept for instruction pairing or alignment - confirm
  863. mov ebx, [esi+64-128] ; get acc[16]
  864. mov eax, [esi-128] ; get acc[0] bank conflict
  865. ; mov edx, [edi-128] ; pre-fetch line 0; 4 to avoid bank conflict
  866. ; mov ecx, [edi+1*32-128+4] ; pre-fetch line 1
  867. ; mov edx, [edi+2*32-128] ; pre-fetch line 2
  868. ; mov ecx, [edi+3*32-128+4] ; pre-fetch line 3
  869. ; mov edx, [edi+4*32-128] ; pre-fetch line 4
  870. ; mov ecx, [edi+5*32-128+4] ; pre-fetch line 5
  871. ; mov edx, [edi+6*32-128] ; pre-fetch line 6
  872. ; mov ecx, [edi+7*32-128+4] ; pre-fetch line 7
  873. mov ebp, [esi+128-128] ; get acc[32]
  874. lea ecx, [eax+ebx] ; acc[0]+acc[16]
  875. mov edx, [esi+192-128] ; get acc[48]
  876. sub eax, ebx ; acc[0]-acc[16]
  877. lea ebx, [ebp+edx] ; acc[32]+acc[48]
  878. sub ebp, edx ; acc[32]-acc[48]
  879. lea edx, [ecx+ebx] ; tmp1 = acc[0]+acc[16] + acc[32]+acc[48]
  880. sub ecx, ebx ; tmp2 = acc[0]+acc[16] - (acc[32]+acc[48])
  881. sar edx, SCALER ; tmp1 >> 13
  882. lea ebx, [eax+ebp] ; tmp3 = acc[0]-acc[16] + (acc[32]-acc[48])
  883. sar ecx, SCALER ; tmp2 >> 13
  884. sub eax, ebp ; tmp4 = acc[0]-acc[16] - (acc[32]-acc[48])
  885. sar ebx, SCALER ; tmp3 >> 13
  886. mov DWORD PTR [edi-128], edx ; Intermediate[0][0] = tmp1
  887. sar eax, SCALER ; tmp4 >> 13
  888. mov DWORD PTR [edi+7*4-128], ecx ; Intermediate[0][7] = tmp2
  889. mov DWORD PTR [edi+7*32-128], ebx ; Intermediate[7][0] = tmp3
  890. mov ebx, [esi+68-128] ; get acc[17]
  891. ; -------------------------------------------------------------------------
  892. ; 1: acc[1], acc[17], acc[33], acc[49] -> Intermediate[0][1], [0][6], [7][1], [7][6]
  893. mov DWORD PTR [edi+7*32+7*4-128], eax ; Intermediate[7][7] = tmp4
  894. mov eax, [esi+4-128] ; get acc[1]
  895. mov ebp, [esi+132-128] ; get acc[33]
  896. lea ecx, [eax+ebx] ; acc[1]+acc[17]
  897. mov edx, [esi+196-128] ; get acc[49]
  898. sub eax, ebx ; acc[1]-acc[17]
  899. lea ebx, [ebp+edx] ; acc[33]+acc[49]
  900. sub ebp, edx ; acc[33]-acc[49]
  901. lea edx, [ecx+ebx] ; tmp1 = acc[1]+acc[17] + acc[33]+acc[49]
  902. sub ecx, ebx ; tmp2 = acc[1]+acc[17] - (acc[33]+acc[49])
  903. sar edx, SCALER ; tmp1 >> 13
  904. lea ebx, [eax+ebp] ; tmp3 = acc[1]-acc[17] + (acc[33]-acc[49])
  905. sar ecx, SCALER ; tmp2 >> 13
  906. sub eax, ebp ; tmp4 = acc[1]-acc[17] - (acc[33]-acc[49])
  907. sar ebx, SCALER ; tmp3 >> 13
  908. mov DWORD PTR [edi+1*4-128], edx ; Intermediate[0][1] = tmp1
  909. sar eax, SCALER ; tmp4 >> 13
  910. mov DWORD PTR [edi+6*4-128], ecx ; Intermediate[0][6] = tmp2
  911. mov DWORD PTR [edi+7*32+1*4-128], ebx ; Intermediate[7][1] = tmp3
  912. mov ebx, [esi+72-128] ; get acc[18]
  913. ; -------------------------------------------------------------------------
  914. ; 2: acc[2], acc[18], acc[34], acc[50] -> Intermediate[0][2], [0][5], [7][2], [7][5]
  915. mov DWORD PTR [edi+7*32+6*4-128], eax ; Intermediate[7][6] = tmp4
  916. mov eax, [esi+8-128] ; get acc[2]
  917. mov ebp, [esi+136-128] ; get acc[34]
  918. lea ecx, [eax+ebx] ; acc[2]+acc[18]
  919. mov edx, [esi+200-128] ; get acc[50]
  920. sub eax, ebx ; acc[2]-acc[18]
  921. lea ebx, [ebp+edx] ; acc[34]+acc[50]
  922. sub ebp, edx ; acc[34]-acc[50]
  923. lea edx, [ecx+ebx] ; tmp1 = acc[2]+acc[18] + acc[34]+acc[50]
  924. sub ecx, ebx ; tmp2 = acc[2]+acc[18] - (acc[34]+acc[50])
  925. sar edx, SCALER ; tmp1 >> 13
  926. lea ebx, [eax+ebp] ; tmp3 = acc[2]-acc[18] + (acc[34]-acc[50])
  927. sar ecx, SCALER ; tmp2 >> 13
  928. sub eax, ebp ; tmp4 = acc[2]-acc[18] - (acc[34]-acc[50])
  929. sar ebx, SCALER ; tmp3 >> 13
  930. mov DWORD PTR [edi+2*4-128], edx ; Intermediate[0][2] = tmp1
  931. sar eax, SCALER ; tmp4 >> 13
  932. mov DWORD PTR [edi+5*4-128], ecx ; Intermediate[0][5] = tmp2
  933. mov DWORD PTR [edi+7*32+2*4-128], ebx ; Intermediate[7][2] = tmp3
  934. mov ebx, [esi+76-128] ; get acc[19]
  935. ; -------------------------------------------------------------------------
  936. ; 3: acc[3], acc[19], acc[35], acc[51] -> Intermediate[0][3], [0][4], [7][3], [7][4]
  937. mov DWORD PTR [edi+7*32+5*4-128], eax ; Intermediate[7][5] = tmp4
  938. mov eax, [esi+12-128] ; get acc[3]
  939. mov ebp, [esi+140-128] ; get acc[35]
  940. lea ecx, [eax+ebx] ; acc[3]+acc[19]
  941. mov edx, [esi+204-128] ; get acc[51]
  942. sub eax, ebx ; acc[3]-acc[19]
  943. lea ebx, [ebp+edx] ; acc[35]+acc[51]
  944. sub ebp, edx ; acc[35]-acc[51]
  945. lea edx, [ecx+ebx] ; tmp1 = acc[3]+acc[19] + acc[35]+acc[51]
  946. sub ecx, ebx ; tmp2 = acc[3]+acc[19] - (acc[35]+acc[51])
  947. sar edx, SCALER ; tmp1 >> 13
  948. lea ebx, [eax+ebp] ; tmp3 = acc[3]-acc[19] + (acc[35]-acc[51])
  949. sar ecx, SCALER ; tmp2 >> 13
  950. sub eax, ebp ; tmp4 = acc[3]-acc[19] - (acc[35]-acc[51])
  951. sar ebx, SCALER ; tmp3 >> 13
  952. mov DWORD PTR [edi+3*4-128], edx ; Intermediate[0][3] = tmp1
  953. sar eax, SCALER ; tmp4 >> 13
  954. mov DWORD PTR [edi+4*4-128], ecx ; Intermediate[0][4] = tmp2
  955. mov DWORD PTR [edi+7*32+3*4-128], ebx ; Intermediate[7][3] = tmp3
  956. mov ebx, [esi+80-128] ; get acc[20]
  957. ; -------------------------------------------------------------------------
  958. ; 4: acc[4], acc[20], acc[36], acc[52] -> Intermediate[1][0], [1][7], [6][0], [6][7]
  959. mov DWORD PTR [edi+7*32+4*4-128], eax ; Intermediate[7][4] = tmp4
  960. mov eax, [esi+16-128] ; get acc[4]
  961. mov ebp, [esi+144-128] ; get acc[36]
  962. lea ecx, [eax+ebx] ; acc[4]+acc[20]
  963. mov edx, [esi+208-128] ; get acc[52]
  964. sub eax, ebx ; acc[4]-acc[20]
  965. lea ebx, [ebp+edx] ; acc[36]+acc[52]
  966. sub ebp, edx ; acc[36]-acc[52]
  967. lea edx, [ecx+ebx] ; tmp1 = acc[4]+acc[20] + acc[36]+acc[52]
  968. sub ecx, ebx ; tmp2 = acc[4]+acc[20] - (acc[36]+acc[52])
  969. sar edx, SCALER ; tmp1 >> 13
  970. lea ebx, [eax+ebp] ; tmp3 = acc[4]-acc[20] + (acc[36]-acc[52])
  971. sar ecx, SCALER ; tmp2 >> 13
  972. sub eax, ebp ; tmp4 = acc[4]-acc[20] - (acc[36]-acc[52])
  973. sar ebx, SCALER ; tmp3 >> 13
  974. mov DWORD PTR [edi+32-128], edx ; Intermediate[1][0] = tmp1
  975. sar eax, SCALER ; tmp4 >> 13
  976. mov DWORD PTR [edi+32+7*4-128], ecx ; Intermediate[1][7] = tmp2
  977. mov DWORD PTR [edi+6*32-128], ebx ; Intermediate[6][0] = tmp3
  978. mov ebx, [esi+84-128] ; get acc[21]
  979. ; -------------------------------------------------------------------------
  980. ; 5: acc[5], acc[21], acc[37], acc[53] -> Intermediate[1][1], [1][6], [6][1], [6][6]
  981. mov DWORD PTR [edi+6*32+7*4-128], eax ; Intermediate[6][7] = tmp4
  982. mov eax, [esi+20-128] ; get acc[5]
  983. mov ebp, [esi+148-128] ; get acc[37]
  984. lea ecx, [eax+ebx] ; acc[5]+acc[21]
  985. mov edx, [esi+212-128] ; get acc[53]
  986. sub eax, ebx ; acc[5]-acc[21]
  987. lea ebx, [ebp+edx] ; acc[37]+acc[53]
  988. sub ebp, edx ; acc[37]-acc[53]
  989. lea edx, [ecx+ebx] ; tmp1 = acc[5]+acc[21] + acc[37]+acc[53]
  990. sub ecx, ebx ; tmp2 = acc[5]+acc[21] - (acc[37]+acc[53])
  991. sar edx, SCALER ; tmp1 >> 13
  992. lea ebx, [eax+ebp] ; tmp3 = acc[5]-acc[21] + (acc[37]-acc[53])
  993. sar ecx, SCALER ; tmp2 >> 13
  994. sub eax, ebp ; tmp4 = acc[5]-acc[21] - (acc[37]-acc[53])
  995. sar ebx, SCALER ; tmp3 >> 13
  996. mov DWORD PTR [edi+32+1*4-128], edx ; Intermediate[1][1] = tmp1
  997. sar eax, SCALER ; tmp4 >> 13
  998. mov DWORD PTR [edi+32+6*4-128], ecx ; Intermediate[1][6] = tmp2
  999. mov DWORD PTR [edi+6*32+1*4-128], ebx ; Intermediate[6][1] = tmp3
  1000. mov ebx, [esi+88-128] ; get acc[22]
  1001. ; -------------------------------------------------------------------------
  1002. ; 6: acc[6], acc[22], acc[38], acc[54] -> Intermediate[1][2], [1][5], [6][2], [6][5]
  1003. mov DWORD PTR [edi+6*32+6*4-128], eax ; Intermediate[6][6] = tmp4
  1004. mov eax, [esi+24-128] ; get acc[6] Bank conflict
  1005. mov ebp, [esi+152-128] ; get acc[38]
  1006. lea ecx, [eax+ebx] ; acc[6]+acc[22]
  1007. mov edx, [esi+216-128] ; get acc[54]
  1008. sub eax, ebx ; acc[6]-acc[22]
  1009. lea ebx, [ebp+edx] ; acc[38]+acc[54]
  1010. sub ebp, edx ; acc[38]-acc[54]
  1011. lea edx, [ecx+ebx] ; tmp1 = acc[6]+acc[22] + acc[38]+acc[54]
  1012. sub ecx, ebx ; tmp2 = acc[6]+acc[22] - (acc[38]+acc[54])
  1013. sar edx, SCALER ; tmp1 >> 13
  1014. lea ebx, [eax+ebp] ; tmp3 = acc[6]-acc[22] + (acc[38]-acc[54])
  1015. sar ecx, SCALER ; tmp2 >> 13
  1016. sub eax, ebp ; tmp4 = acc[6]-acc[22] - (acc[38]-acc[54])
  1017. sar ebx, SCALER ; tmp3 >> 13
  1018. mov DWORD PTR [edi+32+2*4-128], edx ; Intermediate[1][2] = tmp1
  1019. sar eax, SCALER ; tmp4 >> 13
  1020. mov DWORD PTR [edi+32+5*4-128], ecx ; Intermediate[1][5] = tmp2
  1021. mov DWORD PTR [edi+6*32+2*4-128], ebx ; Intermediate[6][2] = tmp3
  1022. mov ebx, [esi+92-128] ; get acc[23]
  1023. ; -------------------------------------------------------------------------
  1024. ; 7: acc[7], acc[23], acc[39], acc[55] -> Intermediate[1][3], [1][4], [6][3], [6][4]
  1025. mov DWORD PTR [edi+6*32+5*4-128], eax ; Intermediate[6][5] = tmp4
  1026. mov eax, [esi+28-128] ; get acc[7]
  1027. mov ebp, [esi+156-128] ; get acc[39]
  1028. lea ecx, [eax+ebx] ; acc[7]+acc[23]
  1029. mov edx, [esi+220-128] ; get acc[55]
  1030. sub eax, ebx ; acc[7]-acc[23]
  1031. lea ebx, [ebp+edx] ; acc[39]+acc[55]
  1032. sub ebp, edx ; acc[39]-acc[55]
  1033. lea edx, [ecx+ebx] ; tmp1 = acc[7]+acc[23] + acc[39]+acc[55]
  1034. sub ecx, ebx ; tmp2 = acc[7]+acc[23] - (acc[39]+acc[55])
  1035. sar edx, SCALER ; tmp1 >> 13
  1036. lea ebx, [eax+ebp] ; tmp3 = acc[7]-acc[23] + (acc[39]-acc[55])
  1037. sar ecx, SCALER ; tmp2 >> 13
  1038. sub eax, ebp ; tmp4 = acc[7]-acc[23] - (acc[39]-acc[55])
  1039. sar ebx, SCALER ; tmp3 >> 13
  1040. mov DWORD PTR [edi+32+3*4-128], edx ; Intermediate[1][3] = tmp1
  1041. sar eax, SCALER ; tmp4 >> 13
  1042. mov DWORD PTR [edi+32+4*4-128], ecx ; Intermediate[1][4] = tmp2
  1043. mov DWORD PTR [edi+6*32+3*4-128], ebx ; Intermediate[6][3] = tmp3
  1044. mov ebx, [esi+96-128] ; get acc[24]
  1045. ; -------------------------------------------------------------------------
  1046. ; 8: acc[8], acc[24], acc[40], acc[56] -> Intermediate[2][0], [2][7], [5][0], [5][7]
  1047. mov DWORD PTR [edi+6*32+4*4-128], eax ; Intermediate[6][4] = tmp4
  1048. mov eax, [esi+32-128] ; get acc[8]
  1049. mov ebp, [esi+160-128] ; get acc[40]
  1050. lea ecx, [eax+ebx] ; acc[8]+acc[24]
  1051. mov edx, [esi+224-128] ; get acc[56]
  1052. sub eax, ebx ; acc[8]-acc[24]
  1053. lea ebx, [ebp+edx] ; acc[40]+acc[56]
  1054. sub ebp, edx ; acc[40]-acc[56]
  1055. lea edx, [ecx+ebx] ; tmp1 = acc[8]+acc[24] + acc[40]+acc[56]
  1056. sub ecx, ebx ; tmp2 = acc[8]+acc[24] - (acc[40]+acc[56])
  1057. sar edx, SCALER ; tmp1 >> 13
  1058. lea ebx, [eax+ebp] ; tmp3 = acc[8]-acc[24] + (acc[40]-acc[56])
  1059. sar ecx, SCALER ; tmp2 >> 13
  1060. sub eax, ebp ; tmp4 = acc[8]-acc[24] - (acc[40]-acc[56])
  1061. sar ebx, SCALER ; tmp3 >> 13
  1062. mov DWORD PTR [edi+2*32-128], edx ; Intermediate[2][0] = tmp1
  1063. sar eax, SCALER ; tmp4 >> 13
  1064. mov DWORD PTR [edi+2*32+7*4-128], ecx ; Intermediate[2][7] = tmp2
  1065. mov DWORD PTR [edi+5*32-128], ebx ; Intermediate[5][0] = tmp3
  1066. mov ebx, [esi+100-128] ; get acc[25]
  1067. ; -------------------------------------------------------------------------
  1068. ; 9: acc[9], acc[25], acc[41], acc[57] -> Intermediate[2][1], [2][6], [5][1], [5][6]
  1069. mov DWORD PTR [edi+5*32+7*4-128], eax ; Intermediate[5][7] = tmp4
  1070. mov eax, [esi+36-128] ; get acc[9]
  1071. mov ebp, [esi+164-128] ; get acc[41]
  1072. lea ecx, [eax+ebx] ; acc[9]+acc[25]
  1073. mov edx, [esi+228-128] ; get acc[57]
  1074. sub eax, ebx ; acc[9]-acc[25]
  1075. lea ebx, [ebp+edx] ; acc[41]+acc[57]
  1076. sub ebp, edx ; acc[41]-acc[57]
  1077. lea edx, [ecx+ebx] ; tmp1 = acc[9]+acc[25] + acc[41]+acc[57]
  1078. sub ecx, ebx ; tmp2 = acc[9]+acc[25] - (acc[41]+acc[57])
  1079. sar edx, SCALER ; tmp1 >> 13
  1080. lea ebx, [eax+ebp] ; tmp3 = acc[9]-acc[25] + (acc[41]-acc[57])
  1081. sar ecx, SCALER ; tmp2 >> 13
  1082. sub eax, ebp ; tmp4 = acc[9]-acc[25] - (acc[41]-acc[57])
  1083. sar ebx, SCALER ; tmp3 >> 13
  1084. mov DWORD PTR [edi+2*32+1*4-128], edx ; Intermediate[2][1] = tmp1
  1085. sar eax, SCALER ; tmp4 >> 13
  1086. mov DWORD PTR [edi+2*32+6*4-128], ecx ; Intermediate[2][6] = tmp2
  1087. mov DWORD PTR [edi+5*32+1*4-128], ebx ; Intermediate[5][1] = tmp3
  1088. mov ebx, [esi+104-128] ; get acc[26]
  1089. ; -------------------------------------------------------------------------
  1090. ; 10: acc[10], acc[26], acc[42], acc[58] -> Intermediate[2][2], [2][5], [5][2], [5][5]
  1091. mov DWORD PTR [edi+5*32+6*4-128], eax ; Intermediate[5][6] = tmp4
  1092. mov eax, [esi+40-128] ; get acc[10]
  1093. mov ebp, [esi+168-128] ; get acc[42]
  1094. lea ecx, [eax+ebx] ; acc[10]+acc[26]
  1095. mov edx, [esi+232-128] ; get acc[58]
  1096. sub eax, ebx ; acc[10]-acc[26]
  1097. lea ebx, [ebp+edx] ; acc[42]+acc[58]
  1098. sub ebp, edx ; acc[42]-acc[58]
  1099. lea edx, [ecx+ebx] ; tmp1 = acc[10]+acc[26] + acc[42]+acc[58]
  1100. sub ecx, ebx ; tmp2 = acc[10]+acc[26] - (acc[42]+acc[58])
  1101. sar edx, SCALER ; tmp1 >> 13
  1102. lea ebx, [eax+ebp] ; tmp3 = acc[10]-acc[26] + (acc[42]-acc[58])
  1103. sar ecx, SCALER ; tmp2 >> 13
  1104. sub eax, ebp ; tmp4 = acc[10]-acc[26] - (acc[42]-acc[58])
  1105. sar ebx, SCALER ; tmp3 >> 13
  1106. mov DWORD PTR [edi+2*32+2*4-128], edx ; Intermediate[2][2] = tmp1
  1107. sar eax, SCALER ; tmp4 >> 13
  1108. mov DWORD PTR [edi+2*32+5*4-128], ecx ; Intermediate[2][5] = tmp2
  1109. mov DWORD PTR [edi+5*32+2*4-128], ebx ; Intermediate[5][2] = tmp3
  1110. mov ebx, [esi+108-128] ; get acc[27]
  1111. ; -------------------------------------------------------------------------
  1112. ; 11: acc[11], acc[27], acc[43], acc[59] -> Intermediate[2][3], [2][4], [5][3], [5][4]
  1113. mov DWORD PTR [edi+5*32+5*4-128], eax ; Intermediate[5][5] = tmp4
  1114. mov eax, [esi+44-128] ; get acc[11]
  1115. mov ebp, [esi+172-128] ; get acc[43]
  1116. lea ecx, [eax+ebx] ; acc[11]+acc[27]
  1117. mov edx, [esi+236-128] ; get acc[59]
  1118. sub eax, ebx ; acc[11]-acc[27]
  1119. lea ebx, [ebp+edx] ; acc[43]+acc[59]
  1120. sub ebp, edx ; acc[43]-acc[59]
  1121. lea edx, [ecx+ebx] ; tmp1 = acc[11]+acc[27] + acc[43]+acc[59]
  1122. sub ecx, ebx ; tmp2 = acc[11]+acc[27] - (acc[43]+acc[59])
  1123. sar edx, SCALER ; tmp1 >> 13
  1124. lea ebx, [eax+ebp] ; tmp3 = acc[11]-acc[27] + (acc[43]-acc[59])
  1125. sar ecx, SCALER ; tmp2 >> 13
  1126. sub eax, ebp ; tmp4 = acc[11]-acc[27] - (acc[43]-acc[59])
  1127. sar ebx, SCALER ; tmp3 >> 13
  1128. mov DWORD PTR [edi+2*32+3*4-128], edx ; Intermediate[2][3] = tmp1
  1129. sar eax, SCALER ; tmp4 >> 13
  1130. mov DWORD PTR [edi+2*32+4*4-128], ecx ; Intermediate[2][4] = tmp2
  1131. mov DWORD PTR [edi+5*32+3*4-128], ebx ; Intermediate[5][3] = tmp3
  1132. mov ebx, [esi+112-128] ; get acc[28]
  1133. ; -------------------------------------------------------------------------
  1134. ; 12: acc[12], acc[28], acc[44], acc[60] -> Intermediate[3][0], [3][7], [4][0], [4][7]
  1135. mov DWORD PTR [edi+5*32+4*4-128], eax ; Intermediate[5][4] = tmp4
  1136. mov eax, [esi+48-128] ; get acc[12] Bank conflict
  1137. mov ebp, [esi+176-128] ; get acc[44]
  1138. lea ecx, [eax+ebx] ; acc[12]+acc[28]
  1139. mov edx, [esi+240-128] ; get acc[60]
  1140. sub eax, ebx ; acc[12]-acc[28]
  1141. lea ebx, [ebp+edx] ; acc[44]+acc[60]
  1142. sub ebp, edx ; acc[44]-acc[60]
  1143. lea edx, [ecx+ebx] ; tmp1 = acc[12]+acc[28] + acc[44]+acc[60]
  1144. sub ecx, ebx ; tmp2 = acc[12]+acc[28] - (acc[44]+acc[60])
  1145. sar edx, SCALER ; tmp1 >> 13
  1146. lea ebx, [eax+ebp] ; tmp3 = acc[12]-acc[28] + (acc[44]-acc[60])
  1147. sar ecx, SCALER ; tmp2 >> 13
  1148. sub eax, ebp ; tmp4 = acc[12]-acc[28] - (acc[44]-acc[60])
  1149. sar ebx, SCALER ; tmp3 >> 13
  1150. mov DWORD PTR [edi+3*32-128], edx ; Intermediate[3][0] = tmp1
  1151. sar eax, SCALER ; tmp4 >> 13
  1152. mov DWORD PTR [edi+3*32+7*4-128], ecx ; Intermediate[3][7] = tmp2
  1153. mov DWORD PTR [edi+4*32-128], ebx ; Intermediate[4][0] = tmp3
  1154. mov ebx, [esi+116-128] ; get acc[29]
  1155. ; -------------------------------------------------------------------------
  1156. ; 13: acc[13], acc[29], acc[45], acc[61] -> Intermediate[3][1], [3][6], [4][1], [4][6]
  1157. mov DWORD PTR [edi+4*32+7*4-128], eax ; Intermediate[4][7] = tmp4
  1158. mov eax, [esi+52-128] ; get acc[13]
  1159. mov ebp, [esi+180-128] ; get acc[45]
  1160. lea ecx, [eax+ebx] ; acc[13]+acc[29]
  1161. mov edx, [esi+244-128] ; get acc[61]
  1162. sub eax, ebx ; acc[13]-acc[29]
  1163. lea ebx, [ebp+edx] ; acc[45]+acc[61]
  1164. sub ebp, edx ; acc[45]-acc[61]
  1165. lea edx, [ecx+ebx] ; tmp1 = acc[13]+acc[29] + acc[45]+acc[61]
  1166. sub ecx, ebx ; tmp2 = acc[13]+acc[29] - (acc[45]+acc[61])
  1167. sar edx, SCALER ; tmp1 >> 13
  1168. lea ebx, [eax+ebp] ; tmp3 = acc[13]-acc[29] + (acc[45]-acc[61])
  1169. sar ecx, SCALER ; tmp2 >> 13
  1170. sub eax, ebp ; tmp4 = acc[13]-acc[29] - (acc[45]-acc[61])
  1171. sar ebx, SCALER ; tmp3 >> 13
  1172. mov DWORD PTR [edi+3*32+1*4-128], edx ; Intermediate[3][1] = tmp1
  1173. sar eax, SCALER ; tmp4 >> 13
  1174. mov DWORD PTR [edi+3*32+6*4-128], ecx ; Intermediate[3][6] = tmp2
  1175. mov DWORD PTR [edi+4*32+1*4-128], ebx ; Intermediate[4][1] = tmp3
  1176. mov ebx, [esi+120-128] ; get acc[30]
  1177. ; -------------------------------------------------------------------------
  1178. ; 14: acc[14], acc[30], acc[46], acc[62] -> Intermediate[3][2], [3][5], [4][2], [4][5]
  1179. mov DWORD PTR [edi+4*32+6*4-128], eax ; Intermediate[4][6] = tmp4
  1180. mov eax, [esi+56-128] ; get acc[14] Bank conflict
  1181. mov ebp, [esi+184-128] ; get acc[46]
  1182. lea ecx, [eax+ebx] ; acc[14]+acc[30]
  1183. mov edx, [esi+248-128] ; get acc[62]
  1184. sub eax, ebx ; acc[14]-acc[30]
  1185. lea ebx, [ebp+edx] ; acc[46]+acc[62]
  1186. sub ebp, edx ; acc[46]-acc[62]
  1187. lea edx, [ecx+ebx] ; tmp1 = acc[14]+acc[30] + acc[46]+acc[62]
  1188. sub ecx, ebx ; tmp2 = acc[14]+acc[30] - (acc[46]+acc[62])
  1189. sar edx, SCALER ; tmp1 >> 13
  1190. lea ebx, [eax+ebp] ; tmp3 = acc[14]-acc[30] + (acc[46]-acc[62])
  1191. sar ecx, SCALER ; tmp2 >> 13
  1192. sub eax, ebp ; tmp4 = acc[14]-acc[30] - (acc[46]-acc[62])
  1193. sar ebx, SCALER ; tmp3 >> 13
  1194. mov DWORD PTR [edi+3*32+2*4-128], edx ; Intermediate[3][2] = tmp1
  1195. sar eax, SCALER ; tmp4 >> 13
  1196. mov DWORD PTR [edi+3*32+5*4-128], ecx ; Intermediate[3][5] = tmp2
  1197. mov DWORD PTR [edi+4*32+2*4-128], ebx ; Intermediate[4][2] = tmp3
  1198. mov ebx, [esi+124-128] ; get acc[31]
  1199. ; -------------------------------------------------------------------------
  1200. ; 15: acc[15], acc[31], acc[47], acc[63] -> Intermediate[3][3], [3][4], [4][3], [4][4]
  1201. mov DWORD PTR [edi+4*32+5*4-128], eax ; Intermediate[4][5] = tmp4
  1202. mov eax, [esi+60-128] ; get acc[15]
  1203. mov ebp, [esi+188-128] ; get acc[47]
  1204. lea ecx, [eax+ebx] ; acc[15]+acc[31]
  1205. mov edx, [esi+252-128] ; get acc[63]
  1206. sub eax, ebx ; acc[15]-acc[31]
  1207. lea ebx, [ebp+edx] ; acc[47]+acc[63]
  1208. sub ebp, edx ; acc[47]-acc[63]
  1209. lea edx, [ecx+ebx] ; tmp1 = acc[15]+acc[31] + acc[47]+acc[63]
  1210. sub ecx, ebx ; tmp2 = acc[15]+acc[31] - (acc[47]+acc[63])
  1211. sar edx, SCALER ; tmp1 >> 13
  1212. lea ebx, [eax+ebp] ; tmp3 = acc[15]-acc[31] + (acc[47]-acc[63])
  1213. sar ecx, SCALER ; tmp2 >> 13
  1214. sub eax, ebp ; tmp4 = acc[15]-acc[31] - (acc[47]-acc[63])
  1215. sar ebx, SCALER ; tmp3 >> 13
  1216. mov DWORD PTR [edi+3*32+3*4-128], edx ; Intermediate[3][3] = tmp1
  1217. sar eax, SCALER ; tmp4 >> 13
  1218. mov DWORD PTR [edi+3*32+4*4-128], ecx ; Intermediate[3][4] = tmp2
  1219. mov DWORD PTR [edi+4*32+3*4-128], ebx ; Intermediate[4][3] = tmp3
  1220. mov DWORD PTR [edi+4*32+4*4-128], eax ; Intermediate[4][4] = tmp4 (last of the 64 DWORD outputs)
  1221. ret ; all 16 groups stored; registers are left clobbered as described in the header
  1222. } //end of asm
  1223. }
  1224. #pragma code_seg()