Leaked source code of windows server 2003
You can not select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
6.2 KiB

  1. /* *************************************************************************
  2. ** INTEL Corporation Proprietary Information
  3. **
  4. ** This listing is supplied under the terms of a license
  5. ** agreement with INTEL Corporation and may not be copied
  6. ** nor disclosed except in accordance with the terms of
  7. ** that agreement.
  8. **
  9. ** Copyright (c) 1995, 1996 Intel Corporation.
  10. ** All Rights Reserved.
  11. **
  12. ** *************************************************************************
  13. */
  14. // $Author: AKASAI $
  15. // $Date: 18 Mar 1996 09:30:48 $
  16. // $Archive: S:\h26x\src\dec\d1blkadd.cpv $
  17. // $Header: S:\h26x\src\dec\d1blkadd.cpv 1.0 18 Mar 1996 09:30:48 AKASAI $
  18. // $Log: S:\h26x\src\dec\d1blkadd.cpv $
  19. //
  20. // Rev 1.0 18 Mar 1996 09:30:48 AKASAI
  21. // Initial revision.
  22. //
  23. // Rev 1.4 22 Dec 1995 13:52:16 KMILLS
  24. //
  25. // added new copyright notice
  26. //
  27. // Rev 1.3 25 Sep 1995 09:03:36 CZHU
  28. // Added comments on cycle counts
  29. //
  30. // Rev 1.2 13 Sep 1995 08:46:44 AKASAI
  31. // Set loopcounter back to 8. Intermediate is 8x8 of DWORDS so TEMPPITCH4
  32. // should be 32 not 64.
  33. //
  34. // Rev 1.1 12 Sep 1995 18:19:20 CZHU
  35. //
  36. // Changed loop from 8 to 7 to start with.
  37. //
  38. // Rev 1.0 11 Sep 1995 16:52:20 CZHU
  39. // Initial revision.
  40. // -------------------------------------------------------------------------
  41. // This routine performs a block (8x8) addition.
  42. // output = clamp[reference + current]
  43. //
  44. // Input I32 *current (output of FMIDCT)
  45. // U8 *reference (Motion Compensated address of reference)
  46. // U8 *output (Output buffer)
  47. //
  48. // Assumption: reference and output use PITCH
  49. // current has some other pitch
  50. //
  51. // Registers used: eax, ebx, ecx, edx, esi, edi, ebp
  52. //
  53. // -------------------------------------------------------------------------
  54. #include "precomp.h"
  55. #define TEMPPITCH4 32 // byte stride between rows of the residual block in BlockAdd (8 DWORDs per row, see the Rev 1.2 note above)
  56. extern U8 ClipPixIntra[]; // clamp lookup table defined elsewhere; BlockAdd reads it at [1024 + sum]
  57. #define FRAMEPOINTER esp // base register BlockAdd uses to address its stack local
  58. #define L_LOOPCOUNTER FRAMEPOINTER + 0 // 4 byte: slot where BlockAdd saves its row counter
  59. #define LOCALSIZE 4 // bytes of stack reserved for locals in BlockAdd; keep aligned
  60. #pragma code_seg("IACODE2")
  // BlockAdd: adds an 8x8 block of 32-bit residual values to an 8x8 block of
  // reference bytes and stores the clamped bytes to the destination block.
  //   uResidual - address of the residual block; rows are TEMPPITCH4 bytes apart
  //   uRefBlock - address of the reference block; rows are PITCH bytes apart
  //   uDstBlock - address of the output block; rows are PITCH bytes apart
  // Each output byte is ClipPixIntra[1024 + reference byte + residual DWORD].
  // NOTE(review): the 1024 bias presumably lets negative sums index valid table
  // entries; confirm against the definition of ClipPixIntra.
  // The function is naked, so the asm below supplies the entire prologue,
  // epilogue and ret. ebp is reused as the residual pointer once the arguments
  // have been loaded, which is why the loop-counter local is addressed off esp.
  // The "n)" tags in the loop comments mark which pixel computation each
  // instruction belongs to; the computations are deliberately interleaved.
  // NOTE(review): presumably scheduled for instruction pairing on the target
  // CPU; do not reorder without re-checking register lifetimes.
  61. __declspec(naked)
  62. void BlockAdd (U32 uResidual, U32 uRefBlock, U32 uDstBlock)
  63. {
  64. __asm {
  65. push ebp ;// save callers frame pointer
  66. mov ebp,esp ;// make parameters accessible
  67. push esi ;// assumed preserved
  68. push edi
  69. push ebx
  70. sub esp,LOCALSIZE ;// reserve local storage
  71. mov edi, uDstBlock ;// edi gets Base addr of OutputBuffer
  72. mov esi, uRefBlock; ;// esi gets Base addr of Reference (read one byte per pixel)
  73. mov ebp, uResidual ;// ebp gets Base addr of Current/residual (read one DWORD per pixel); ebp is no longer the frame pointer after this
  74. mov ecx, 8 // row counter: the loop body runs 8 times
  75. xor eax, eax // clear eax so the first byte load into al is zero-extended
  76. // Cycle counts: 26 x 8 = 208 without cache miss
  77. // czhu, 9/25/95
  78. ALIGN 4
  79. loop_for_i: // one iteration processes one row of 8 pixels
  80. mov [L_LOOPCOUNTER], ecx ; save loop counter in temporary // ecx is reused as scratch below
  81. mov ebx, [ebp+8] ; 1) fetch current[i+2]
  82. mov al, BYTE PTR[esi+2] ; 1) fetch ref[i+2]
  83. xor ecx, ecx ; 2)
  84. mov cl, BYTE PTR[esi+3] ; 2) fetch ref[i+3]
  85. mov edx, [ebp+12] ; 2) fetch current[i+3]
  86. add eax, ebx ; 1) result2 = ref[i+2] + current[i+2]
  87. xor ebx, ebx ; 3)
  88. add ecx, edx ; 2) result3= ref[i+3] + current[i+3]
  89. mov bl, BYTE PTR[esi] ; 3) fetch ref[i]
  90. mov dl, ClipPixIntra[1024+eax] ; 1) fetch clamp[result2]
  91. mov eax, [ebp] ; 3) fetch current[i]
  92. add ebx, eax ; 3) result0 = ref[i] + current[i]
  93. xor eax, eax ; 4)
  94. mov dh, ClipPixIntra[1024+ecx] ; 2) fetch clamp[result3]
  95. mov al, [esi+1] ; 4) fetch ref[i+1]
  96. shl edx, 16 ; move 1st 2 results to high word // clamped results 2 and 3 become bytes 2 and 3 of the DWORD
  97. mov ecx, [ebp+4] ; 4) fetch current[i+1]
  98. mov dl, ClipPixIntra[1024+ebx] ; 3) fetch clamp[result0]
  99. add eax, ecx ; 4) result1 = ref[i+1] + current[i+1]
  100. xor ecx, ecx ; 4+1)
  101. mov ebx, [ebp+24] ; 4+1) fetch current[i+6]
  102. mov dh, ClipPixIntra[1024+eax] ; 4) fetch clamp[result1]
  103. mov cl, BYTE PTR[esi+6] ; 4+1) fetch ref[i+6]
  104. mov [edi], edx ; store 4 output pixels // bytes 0..3 of this output row
  105. xor eax, eax ; 4+2)
  106. mov al, BYTE PTR[esi+7] ; 4+2) fetch ref[i+7]
  107. mov edx, [ebp+28] ; 4+2) fetch current[i+7]
  108. add ecx, ebx ; 4+1) result6 = ref[i+6] + current[i+6]
  109. xor ebx, ebx ; 4+3)
  110. add eax, edx ; 4+2) result7= ref[i+7] + current[i+7]
  111. mov bl, BYTE PTR[esi+4] ; 4+3) fetch ref[i+4]
  112. mov dl, ClipPixIntra[1024+ecx] ; 4+1) fetch clamp[result6]
  113. mov ecx, [ebp+16] ; 4+3) fetch current[i+4]
  114. add ebx, ecx ; 4+3) result4 = ref[i+4] + current[i+4]
  115. xor ecx, ecx ; 4+4)
  116. mov dh, ClipPixIntra[1024+eax] ; 4+2) fetch clamp[result7]
  117. mov cl, [esi+5] ; 4+4) fetch ref[i+5]
  118. shl edx, 16 ; move 3rd 2 results to high word // clamped results 6 and 7 become bytes 2 and 3 of the second DWORD
  119. mov eax, [ebp+20] ; 4+4) fetch current[i+5]
  120. add ecx, eax ; 4+4) result5 = ref[i+5] + current[i+5]
  121. add esi, PITCH ; Update address of next line
  122. mov dl, ClipPixIntra[1024+ebx] ; 4+3) fetch clamp[result4]
  123. add ebp, TEMPPITCH4 ; Update address of current to next line
  124. mov dh, ClipPixIntra[1024+ecx] ; 4+4) fetch clamp[result5]
  125. mov ecx, [L_LOOPCOUNTER] ; get loop counter
  126. mov [edi+4], edx ; store 4 output pixels // bytes 4..7 of this output row
  127. add edi, PITCH ; Update address of output to next line
  128. xor eax, eax ; 1) // re-clear eax for the byte load at the top of the next iteration
  129. dec ecx
  130. jnz loop_for_i
  131. add esp,LOCALSIZE // free locals
  132. pop ebx
  133. pop edi
  134. pop esi
  135. pop ebp
  136. ret // ret with no operand: the three arguments are not popped here
  137. } //end of asm, BlockAdd
  138. }
  139. #pragma code_seg()