Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

523 lines
15 KiB

  1. /* *************************************************************************
  2. ** INTEL Corporation Proprietary Information
  3. **
  4. ** This listing is supplied under the terms of a license
  5. ** agreement with INTEL Corporation and may not be copied
  6. ** nor disclosed except in accordance with the terms of
  7. ** that agreement.
  8. **
  9. ** Copyright (c) 1995 Intel Corporation.
  10. ** All Rights Reserved.
  11. **
  12. ** *************************************************************************
  13. */
  14. //////////////////////////////////////////////////////////////////////////
  15. // $Author: AGUPTA2 $
  16. // $Date: 08 Mar 1996 16:46:18 $
  17. // $Archive: S:\h26x\src\dec\d3halfmc.cpv $
  18. // $Header: S:\h26x\src\dec\d3halfmc.cpv 1.15 08 Mar 1996 16:46:18 AGUPTA2 $
  19. // $Log: S:\h26x\src\dec\d3halfmc.cpv $
  20. //
  21. // Rev 1.15 08 Mar 1996 16:46:18 AGUPTA2
  22. // Added pragma code_seg.
  23. //
  24. //
  25. // Rev 1.14 29 Jan 1996 17:53:56 RMCKENZX
  26. // Completely re-wrote all 3 routines. The loops no longer use pseudo
  27. // SIMD logic and have been tightened to 256, 169, and 169 cycles
  28. // for half-half, half-int, and int-half respectively.
  29. //
  30. // Rev 1.13 19 Jan 1996 17:40:36 RMCKENZX
  31. // fixed half-int so it will correctly round
  32. //
  33. // Rev 1.12 19 Jan 1996 13:29:32 RHAZRA
  34. // Fixed halfpixel prediction by bilinear interpolation in ASM code
  35. //
  36. // Rev 1.11 27 Dec 1995 14:36:06 RMCKENZX
  37. // Added copyright notice
  38. //
  39. // Rev 1.10 09 Oct 1995 09:43:36 CZHU
  40. // Fixed bug in (half,half) interpolation optimization
  41. //
  42. // Rev 1.9 08 Oct 1995 13:40:14 CZHU
  43. // Added C version of (half,half) and use it for now until we fix the bug
  44. // in the optimized version
  45. //
  46. // Rev 1.8 03 Oct 1995 15:06:30 CZHU
  47. //
  48. // Adding debug assistance
  49. //
  50. // Rev 1.7 28 Sep 1995 15:32:22 CZHU
  51. // Fixed bugs: mask off bits after shift
  52. //
  53. // Rev 1.6 26 Sep 1995 11:13:36 CZHU
  54. //
  55. // Adjust pitch back to normal, and changed UINT to U32
  56. //
  57. // Rev 1.5 25 Sep 1995 09:04:14 CZHU
  58. // Added and cleaned some comments
  59. //
  60. // Rev 1.4 22 Sep 1995 16:42:00 CZHU
  61. //
  62. // improve pairing
  63. //
  64. // Rev 1.3 22 Sep 1995 15:59:48 CZHU
  65. // finished first round of coding of half pel interpolation and tested
  66. // with the standalone program
  67. //
  68. // Rev 1.2 21 Sep 1995 16:56:28 CZHU
  69. // Unit tested (half, int) case
  70. //
  71. // Rev 1.1 21 Sep 1995 12:06:22 CZHU
  72. // More development
  73. //
  74. // Rev 1.0 20 Sep 1995 16:27:56 CZHU
  75. // Initial revision.
  76. //
  77. #include "precomp.h"
  78. #define FRAMEPOINTER esp
  79. // Interpolate_Half_Int interpolates the pels from the pRef block
  80. // and writes the result to pNewRef.
  81. // Assumes that the pRef area has been expanded.
  82. // Todo: Loop control and setup the stack for locals, CZHU, 9/20/95
  83. // preload output cache lines, 9/21
  84. // Cache preload is no longer needed, 9/21/95
  85. // Cycle count: 50*4 = 200 cycles (predates the Rev 1.14 rewrite; the revision log above gives 169 cycles for half-int)
  86. #pragma code_seg("IACODE2")
  87. __declspec(naked)
  88. void Interpolate_Half_Int (U32 pRef, U32 pNewRef)
  89. {
  90. __asm {
  91. push ebp
  92. push ebx
  93. push edi
  94. push esi
  95. mov esi, [esp+20] // pRef = esp + 4 pushes + ret
  96. mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef
  97. sub edi, PITCH // pre-decrement destination
  98. mov ebp, 8 // loop counter
  99. xor eax, eax // clear registers
  100. xor ebx, ebx
  101. xor ecx, ecx
  102. xor edx, edx
  103. //--------------------------------------------------------------------------//
  104. //
  105. // This loop is, basically, a 4 instruction, 2 cycle loop.
  106. // It is 3-folded, meaning that it works on 3 results per each
  107. // 2 cycle unit. It is 8-unrolled, meaning that it does 8 results
  108. // (one block's row) per loop iteration. The basic calculations
  109. // follow this pattern:
  110. //
  111. // pass-> 1 2 3
  112. // cycle
  113. // 1 load | | shift
  114. // -----------------------
  115. // 2 | add | store
  116. //
  117. // This assumes that the prior pell's value was loaded and
  118. // preserved from the prior result's calculation. Therefore
  119. // each result uses 2 registers -- one to load (and preserve)
  120. // the right-hand pell, and the other (overwriting the previous
  121. // result's stored pell value) to add into, shift, and store out
  122. // of. The add is accomplished with the lea instruction, allowing
  123. // a round bit to be added in without using a separate instruction.
  124. //
  125. // The preamble loads & adds for the first result, and loads
  126. // for the second. The body executes the basic pattern six times.
  127. // The postamble shifts and stores for the seventh result and
  128. // adds, shifts, and stores for the eighth.
  129. //
  130. // Timing:
  131. // 4 preamble (including bank conflict)
  132. // 12 body
  133. // 4 postamble
  134. // ----------------
  135. // 20 per loop
  136. // x 8 loops
  137. // ----------------
  138. // 160 subtotal
  139. // 6 initialize
  140. // 3 finalize
  141. // ================
  142. // 169 total cycles
  143. //--------------------------------------------------------------------------//
  144. main_loop:
  145. // preamble
  146. mov al, 0[esi]
  147. mov bl, 1[esi] // probable BANK CONFLICT
  148. mov dl, 0[edi] // heat the cache
  149. add edi, PITCH // increment destination at top
  150. lea eax, [1+eax+ebx] // use a regular add in the preamble
  151. mov cl, 2[esi]
  152. // body (6 pels)
  153. shr eax, 1
  154. mov dl, 3[esi]
  155. lea ebx, [ebx+ecx+1]
  156. mov 0[edi], al
  157. shr ebx, 1
  158. mov al, 4[esi]
  159. lea ecx, [ecx+edx+1]
  160. mov 1[edi], bl
  161. shr ecx, 1
  162. mov bl, 5[esi]
  163. lea edx, [edx+eax+1]
  164. mov 2[edi], cl
  165. shr edx, 1
  166. mov cl, 6[esi]
  167. lea eax, [eax+ebx+1]
  168. mov 3[edi], dl
  169. shr eax, 1
  170. mov dl, 7[esi]
  171. lea ebx, [ebx+ecx+1]
  172. mov 4[edi], al
  173. shr ebx, 1
  174. mov al, 8[esi]
  175. lea ecx, [ecx+edx+1]
  176. mov 5[edi], bl
  177. // postamble
  178. shr ecx, 1
  179. lea edx, [edx+eax+1]
  180. shr edx, 1
  181. mov 6[edi], cl
  182. add esi, PITCH // increment source pointer
  183. mov 7[edi], dl
  184. dec ebp // loop counter
  185. jne main_loop
  186. // restore registers and return
  187. pop esi
  188. pop edi
  189. pop ebx
  190. pop ebp
  191. ret
  192. } //end of asm
  193. }
  194. // end Interpolate_Half_Int()
  195. //--------------------------------------------------------------------------//
  196. __declspec(naked)
  197. void Interpolate_Int_Half (U32 pRef, U32 pNewRef)
  198. {
  199. __asm {
  200. push ebp
  201. push ebx
  202. push edi
  203. push esi
  204. mov esi, [esp+20] // pRef = esp + 4 pushes + ret
  205. mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef
  206. dec edi // pre-decrement destination
  207. mov ebp, 8 // loop counter
  208. xor eax, eax // clear registers
  209. xor ebx, ebx
  210. xor ecx, ecx
  211. xor edx, edx
  212. //--------------------------------------------------------------------------//
  213. //
  214. // This loop is, basically, a 4 instruction, 2 cycle loop.
  215. // It is 3-folded, meaning that it works on 3 results per each
  216. // 2 cycle unit. It is 8-unrolled, meaning that it does 8 results
  217. // (one block's row) per loop iteration. The basic calculations
  218. // follow this pattern:
  219. //
  220. // pass-> 1 2 3
  221. // cycle
  222. // 1 load | | shift
  223. // -----------------------
  224. // 2 | add | store
  225. //
  226. // This assumes that the prior pell's value was loaded and
  227. // preserved from the prior result's calculation. Therefore
  228. // each result uses 2 registers -- one to load (and preserve)
  229. // the right-hand pell, and the other (overwriting the previous
  230. // result's stored pell value) to add into, shift, and store out
  231. // of. The add is accomplished with the lea instruction, allowing
  232. // a round bit to be added in without using a separate instruction.
  233. //
  234. // The preamble loads & adds for the first result, and loads
  235. // for the second. The body executes the basic pattern six times.
  236. // The postamble shifts and stores for the seventh result and
  237. // adds, shifts, and stores for the eighth.
  238. //
  239. // Timing:
  240. // 4 preamble (including bank conflict)
  241. // 12 body
  242. // 4 postamble
  243. // ----------------
  244. // 20 per loop
  245. // x 8 loops
  246. // ----------------
  247. // 160 subtotal
  248. // 6 initialize
  249. // 3 finalize
  250. // ================
  251. // 169 total cycles
  252. //--------------------------------------------------------------------------//
  253. main_loop:
  254. // preamble
  255. mov al, [esi]
  256. mov bl, PITCH[esi] // probable BANK CONFLICT
  257. mov dl, [edi] // heat the cache
  258. inc edi // increment destination at top
  259. lea eax, [1+eax+ebx] // use a regular add in the preamble
  260. mov cl, [2*PITCH+esi]
  261. // body (6 pels)
  262. shr eax, 1
  263. mov dl, [3*PITCH+esi]
  264. lea ebx, [ebx+ecx+1]
  265. mov [edi], al
  266. shr ebx, 1
  267. mov al, [4*PITCH+esi]
  268. lea ecx, [ecx+edx+1]
  269. mov [PITCH+edi], bl
  270. shr ecx, 1
  271. mov bl, [5*PITCH+esi]
  272. lea edx, [edx+eax+1]
  273. mov [2*PITCH+edi], cl
  274. shr edx, 1
  275. mov cl, [6*PITCH+esi]
  276. lea eax, [eax+ebx+1]
  277. mov [3*PITCH+edi], dl
  278. shr eax, 1
  279. mov dl, [7*PITCH+esi]
  280. lea ebx, [ebx+ecx+1]
  281. mov [4*PITCH+edi], al
  282. shr ebx, 1
  283. mov al, [8*PITCH+esi]
  284. lea ecx, [ecx+edx+1]
  285. mov [5*PITCH+edi], bl
  286. // postamble
  287. shr ecx, 1
  288. lea edx, [edx+eax+1]
  289. shr edx, 1
  290. mov [6*PITCH+edi], cl
  291. inc esi // increment source pointer
  292. mov [7*PITCH+edi], dl
  293. dec ebp // loop counter
  294. jne main_loop
  295. // restore registers and return
  296. pop esi
  297. pop edi
  298. pop ebx
  299. pop ebp
  300. ret
  301. } // end of asm
  302. }
  303. // end Interpolate_Int_Half()
  304. //--------------------------------------------------------------------------//
  305. __declspec(naked)
  306. void Interpolate_Half_Half (U32 pRef, U32 pNewRef)
  307. {
  308. __asm {
  309. push ebp
  310. push ebx
  311. push edi
  312. push esi
  313. mov esi, [esp+20] // pRef = esp + 4 pushes + ret
  314. mov edi, [esp+24] // pNewRef = esp + 4 pushes + ret + pRef
  315. mov ebp, 8 // loop counter
  316. sub edi, PITCH // pre-decrement destination pointer
  317. xor ecx, ecx
  318. xor edx, edx
  319. //--------------------------------------------------------------------------//
  320. //
  321. // This loop is, basically, a 6 instruction, 3 cycle loop.
  322. // It is 3-folded, meaning that it works on 3 results per each
  323. // 3 cycle unit. It is 8-unrolled, meaning that it does 8 results
  324. // (one block's row) per loop iteration. The basic calculations
  325. // follow this pattern:
  326. //
  327. // pass-> 1 2 3
  328. // cycle
  329. // 1 load | add left |
  330. // ----------------------------
  331. // 2 load | | shift
  332. // ----------------------------
  333. // 3 | add all | store
  334. //
  335. // Five registers are used to preserve values from one pass to the next:
  336. // cl & dl hold the last two pell values
  337. // ebp or ebx holds the sum of the two left-hand pells + 1
  338. // eax holds the sum of all four pells
  339. // Both adds are accomplished with the lea instruction. For the sum
  340. // of the two left-hand pells, this allows a rounding bit to be added
  341. // in without using a separate instruction. For both sums it allows
  342. // the result to be placed into a register independent of the sources'.
  343. // Since the sum of the two left-hand pells is used twice, it is place
  344. // alternately into ebx and ebp.
  345. //
  346. // The preamble does two preliminary loads plus passes 1 & 2 for the
  347. // first result, and pass 1 for the second. The body executes the basic
  348. // pattern six times. The postamble does pass 3 for the
  349. // seventh result and passes 2 & 3 for the eighth.
  350. //
  351. // Due to the need for five registers, the loop counter is kept on
  352. // the stack.
  353. //
  354. // Timing:
  355. // 8 preamble
  356. // 18 body
  357. // 5 postamble
  358. // ----------------
  359. // 31 per loop
  360. // x 8 loops
  361. // ----------------
  362. // 248 subtotal
  363. // 5 initialize
  364. // 3 finalize
  365. // ================
  366. // 256 total cycles
  367. //--------------------------------------------------------------------------//
  368. main_loop:
  369. // preamble
  370. mov cl, [esi] // pell 0
  371. xor eax, eax
  372. mov al, [esi+PITCH] // pell 0
  373. xor ebx, ebx
  374. mov dl, [esi+1] // pell 1
  375. add eax, ecx // partial sum 0 sans round
  376. mov bl, [esi+PITCH+1] // pell 1
  377. inc eax // partial sum 0
  378. mov cl, [esi+2] // pell 2
  379. add ebx, edx // partial sum 1 sans round
  380. mov dl, [esi+PITCH+2] // pell 2
  381. inc ebx // partial sum 1
  382. add eax, ebx // full sum 0
  383. push ebp // save loop counter on stack
  384. mov ebp, [edi+PITCH] // heat the cache
  385. add edi, PITCH // increment dst. pointer at top of loop
  386. // body (x 6)
  387. lea ebp, [ecx+edx+1] // partial sum 2 with round
  388. mov cl, [esi+3] // pell 3
  389. shr eax, 2 // value 0
  390. mov dl, [esi+PITCH+3] // pell 3
  391. mov [edi], al // write value 0
  392. lea eax, [ebx+ebp] // full sum 1
  393. lea ebx, [ecx+edx+1] // partial sum 3 with round
  394. mov cl, [esi+4] // pell 4
  395. shr eax, 2 // value 1
  396. mov dl, [esi+PITCH+4] // pell 4
  397. mov [edi+1], al // write value 1
  398. lea eax, [ebx+ebp] // full sum 2
  399. lea ebp, [ecx+edx+1] // partial sum 4 with round
  400. mov cl, [esi+5] // pell 5
  401. shr eax, 2 // value 2
  402. mov dl, [esi+PITCH+5] // pell 5
  403. mov [edi+2], al // write value 2
  404. lea eax, [ebx+ebp] // full sum 3
  405. lea ebx, [ecx+edx+1] // partial sum 5 with round
  406. mov cl, [esi+6] // pell 6
  407. shr eax, 2 // value 3
  408. mov dl, [esi+PITCH+6] // pell 6
  409. mov [edi+3], al // write value 3
  410. lea eax, [ebx+ebp] // full sum 4
  411. lea ebp, [ecx+edx+1] // partial sum 6 with round
  412. mov cl, [esi+7] // pell 7
  413. shr eax, 2 // value 4
  414. mov dl, [esi+PITCH+7] // pell 7
  415. mov [edi+4], al // write value 4
  416. lea eax, [ebx+ebp] // full sum 5
  417. lea ebx, [ecx+edx+1] // partial sum 7 with round
  418. mov cl, [esi+8] // pell 8
  419. shr eax, 2 // value 5
  420. mov dl, [esi+PITCH+8] // pell 8
  421. mov [edi+5], al // write value 5
  422. lea eax, [ebx+ebp] // full sum 6
  423. // postamble
  424. shr eax, 2 // value 6
  425. lea ebp, [ecx+edx+1] // partial sum 8 with round
  426. mov [edi+6], al // write value 6
  427. add esi, PITCH // increment read pointer
  428. lea eax, [ebx+ebp] // full sum 7
  429. pop ebp // restore loop counter
  430. shr eax, 2 // value 7
  431. dec ebp // decrement loop counter
  432. mov [edi+7], al // write value 7
  433. jne main_loop // loop if not done
  434. // restore registers and return
  435. pop esi
  436. pop edi
  437. pop ebx
  438. pop ebp
  439. ret
  440. } //end of asm
  441. }
  442. #pragma code_seg()
  443. // end Interpolate_Half_Half()
  444. //--------------------------------------------------------------------------//
  445. /*
  446. void Interpolate_Half_Half_C (U32 pRef, U32 pNewRef)
  447. {
  448. U8 * pSrc = (U8 *) pRef;
  449. U8 * pDst = (U8 *) pNewRef;
  450. int i, j;
  451. for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
  452. for (j=0; j<8; j++)
  453. pDst[j] = (pSrc[j] + pSrc[j+1] + pSrc[PITCH+j] + pSrc[PITCH+j+1] + 2) >> 2;
  454. }
  455. void Interpolate_Int_Half_C (U32 pRef, U32 pNewRef)
  456. {
  457. U8 * pSrc = (U8 *) pRef;
  458. U8 * pDst = (U8 *) pNewRef;
  459. int i, j;
  460. for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
  461. for (j=0; j<8; j++)
  462. pDst[j] = (pSrc[j] + pSrc[PITCH+j] + 1) >> 1;
  463. }
  464. void Interpolate_Half_Int_C (U32 pRef, U32 pNewRef)
  465. {
  466. U8 * pSrc = (U8 *) pRef;
  467. U8 * pDst = (U8 *) pNewRef;
  468. int i, j;
  469. for (i=0; i<8; i++, pDst+=PITCH, pSrc+=PITCH)
  470. for (j=0; j<8; j++)
  471. pDst[j] = (pSrc[j] + pSrc[j+1] + 1) >> 1;
  472. }
  473. */