Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

849 lines
19 KiB

  1. /* *************************************************************************
  2. ** INTEL Corporation Proprietary Information
  3. **
  4. ** This listing is supplied under the terms of a license
  5. ** agreement with INTEL Corporation and may not be copied
  6. ** nor disclosed except in accordance with the terms of
  7. ** that agreement.
  8. **
  9. ** Copyright (c) 1995 Intel Corporation.
  10. ** All Rights Reserved.
  11. **
  12. ** *************************************************************************
  13. */
  14. #include "precomp.h"
  15. #if defined(H263P) || defined(USE_BILINEAR_MSH26X) // {
  16. //
  17. // For the P5 versions, the strategy is to compute the Y value for an odd RGB value
  18. // followed by computing the Y value for the corresponding even RGB value. The registers
  19. // are then set with the proper values to compute U and V values for the even RGB
  20. // value. This avoids repeating the shifting and masking needed to extract the Red,
  21. // Green and Blue components.
  22. //
  23. /*****************************************************************************
  24. *
  25. * H26X_CLUT8toYUV12()
  26. *
  27. * Convert from CLUT8 to YUV12 (YCrCb 4:2:0) and copy to destination memory
  28. * with pitch defined by the constant PITCH.
  29. *
  30. * This is needed to support the quickcam.
  31. */
  #if 0 // { 0
  /*
   * C reference version of the CLUT8 -> YUV12 converter. It is compiled out
   * (#if 0) and kept as a readable description of what the assembly routine
   * below computes.
   *
   * Each source dword holds four palette indices. Every index selects a
   * YUVPalette entry: all four Y values are written, while U and V are taken
   * only from indices 0 and 2 of each dword and only on even values of k.
   *
   * The loop counters, pnext, the pyprev/pyspace/pynext pointers, tm, t,
   * stretch, mark, LumaIters, BackTwoLines and the pitch adjustments are not
   * declared here; they are presumably supplied by C_RGB_COLOR_CONVERT_INIT
   * (defined outside this file section - confirm there).
   */

  // Pack the Y values of the four palette indices held in dword q.
  #define CLUT8_Y4(q) \
      (YUVPalette[(q)&0xFF].Yval | \
      ((YUVPalette[((q)>>8)&0xFF].Yval) << 8) | \
      ((YUVPalette[((q)>>16)&0xFF].Yval) << 16) | \
      ((YUVPalette[((q)>>24)].Yval) << 24))
  // Pack the U (resp. V) values of indices 0 and 2 of dword q into 16 bits.
  #define CLUT8_U2(q) \
      (YUVPalette[(q)&0xFF].Uval | ((YUVPalette[((q)>>16)&0xFF].Uval) << 8))
  #define CLUT8_V2(q) \
      (YUVPalette[(q)&0xFF].Vval | ((YUVPalette[((q)>>16)&0xFF].Vval) << 8))

  void C_H26X_CLUT8toYUV12(
      LPBITMAPINFOHEADER lpbiInput,
      WORD OutputWidth,
      WORD OutputHeight,
      U8 *lpInput,
      U8 *YPlane,
      U8 *UPlane,
      U8 *VPlane,
      const int pitch)
  {
      U32 tm1, tm2;
      C_RGB_COLOR_CONVERT_INIT

      // Self-assignment whose only purpose is to silence a compiler warning.
      t = t;

      // The palette can change from frame to frame and there is no
      // notification when it does, so the YUV palette is rebuilt on every
      // call. That is still cheaper than converting each pixel separately.
      Compute_YUVPalette(lpbiInput);

      for (j = 0; j < LumaIters; j++) {
          for (k = 0; k < mark; k++) {
              // Eight pixels (two source dwords) per pass.
              i = OutputWidth;
              while (i & ~0x7) {
                  tm1 = pnext[0];
                  *(U32 *)YPlane = CLUT8_Y4(tm1);
                  tm2 = pnext[1];
                  *(U32 *)(YPlane+4) = CLUT8_Y4(tm2);
                  if (!(k & 1)) {
                      // Chroma is produced on even lines only.
                      *(U32 *)UPlane = CLUT8_U2(tm1) | (CLUT8_U2(tm2) << 16);
                      *(U32 *)VPlane = CLUT8_V2(tm1) | (CLUT8_V2(tm2) << 16);
                      UPlane += 4;
                      VPlane += 4;
                  }
                  i -= 8;
                  YPlane += 8;
                  pnext += 2;
              }

              // One trailing group of four pixels, if the width has one.
              if (i & 0x4) {
                  tm = *pnext++;
                  *(U32 *)YPlane = CLUT8_Y4(tm);
                  YPlane += 4;
                  if (!(k & 1)) {
                      *(U16 *)UPlane = CLUT8_U2(tm);
                      *(U16 *)VPlane = CLUT8_V2(tm);
                      UPlane += 2;
                      VPlane += 2;
                  }
              }

              C_WIDTH_FILL

              // After the first line of every group but the first, fill the
              // line that was skipped with the byte-wise average of the lines
              // above and below it (each byte is halved before the add).
              if (stretch && !k && j) {
                  for (i = OutputWidth; i > 0; i -= 8) {
                      tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                      tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                      *pyspace++ = tm;
                      tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                      tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                      *pyspace++ = tm;
                  }
              }

              pnext += BackTwoLines;
              YPlane += byte_ypitch_adj;
              // The chroma pointers move on after even lines only.
              if (!(k & 1)) {
                  UPlane += byte_uvpitch_adj;
                  VPlane += byte_uvpitch_adj;
              }
          }

          // Leave one Y line empty; it is interpolated on the next pass.
          if (stretch) {
              pyprev = (U32 *)(YPlane - pitch);
              pyspace = (U32 *)YPlane;
              YPlane += pitch;
              pynext = (U32 *)YPlane;
          }
      }

      C_HEIGHT_FILL

      // The last skipped line has no line below it, so copy the one above.
      if (stretch) {
          for (i = OutputWidth; i > 0; i -= 4) {
              *pyspace++ = *pyprev++;
          }
      }
  } // end of H26X_CLUT8toYUV12()

  #undef CLUT8_Y4
  #undef CLUT8_U2
  #undef CLUT8_V2
  #endif // } 0
  131. __declspec(naked)
  132. void P5_H26X_CLUT8toYUV12(
  133. LPBITMAPINFOHEADER lpbiInput,
  134. WORD OutputWidth,
  135. WORD OutputHeight,
  136. U8 *lpInput,
  137. U8 *YPlane,
  138. U8 *UPlane,
  139. U8 *VPlane,
  140. const int pitch)
  141. {
  142. // Permanent (callee-save) registers - ebx, esi, edi, ebp
  143. // Temporary (caller-save) registers - eax, ecx, edx
  144. //
  145. // Stack frame layout
  146. // | pitch | +136
  147. // | VPlane | +132
  148. // | UPlane | +128
  149. // | YPlane | +124
  150. // | lpInput | +120
  151. // | OutputHeight | +116
  152. // | OutputWidth | +112
  153. // | lpbiInput | +108
  154. // ----------------------------
  155. // | return addr | +104
  156. // | saved ebp | +100
  157. // | saved ebx | + 96
  158. // | saved esi | + 92
  159. // | saved edi | + 88
  160. // | output_width | + 84
  161. // | pyprev | + 80
  162. // | pyspace | + 76
  163. // | pynext | + 72
  164. // | puvprev | + 68
  165. // | puvspace | + 64
  166. // | i | + 60
  167. // | j | + 56
  168. // | k | + 52
  169. // | BackTwoLines | + 48
  170. // | widthx16 | + 44
  171. // | heightx16 | + 40
  172. // | width_diff | + 36
  173. // | height_diff | + 32
  174. // | width_adj | + 28
  175. // | height_adj | + 24
  176. // | stretch | + 20
  177. // | aspect | + 16
  178. // | LumaIters | + 12
  179. // | mark | + 8
  180. // | byte_ypitch_adj | + 4
  181. // | byte_uvpitch_adj | + 0
  182. #define LOCALSIZE 88
  183. #define PITCH_PARM 136
  184. #define VPLANE 132
  185. #define UPLANE 128
  186. #define YPLANE 124
  187. #define LP_INPUT 120
  188. #define OUTPUT_HEIGHT_WORD 116
  189. #define OUTPUT_WIDTH_WORD 112
  190. #define LPBI_INPUT 108
  191. #define OUTPUT_WIDTH 84
  192. #define PYPREV 80
  193. #define PYSPACE 76
  194. #define PYNEXT 72
  195. #define PUVPREV 68
  196. #define PUVSPACE 64
  197. #define LOOP_I 60
  198. #define LOOP_J 56
  199. #define LOOP_K 52
  200. #define BACK_TWO_LINES 48
  201. #define WIDTHX16 44
  202. #define HEIGHTX16 40
  203. #define WIDTH_DIFF 36
  204. #define HEIGHT_DIFF 32
  205. #define WIDTH_ADJ 28
  206. #define HEIGHT_ADJ 24
  207. #define STRETCH 20
  208. #define ASPECT 16
  209. #define LUMA_ITERS 12
  210. #define MARK 8
  211. #define BYTE_YPITCH_ADJ 4
  212. #define BYTE_UVPITCH_ADJ 0
  213. _asm {
  214. push ebp
  215. push ebx
  216. push esi
  217. push edi
  218. sub esp, LOCALSIZE
  219. // int width_diff = 0
  220. // int height_diff = 0
  221. // int width_adj = 0
  222. // int height_adj = 0
  223. // int stretch = 0
  224. // int aspect = 0
  225. xor eax, eax
  226. mov [esp + WIDTH_DIFF], eax
  227. mov [esp + HEIGHT_DIFF], eax
  228. mov [esp + WIDTH_ADJ], eax
  229. mov [esp + HEIGHT_ADJ], eax
  230. mov [esp + STRETCH], eax
  231. mov [esp + ASPECT], eax
  232. // int LumaIters = 1
  233. inc eax
  234. mov [esp + LUMA_ITERS], eax
  235. // int mark = OutputHeight
  236. // int output_width = OutputWidth
  237. // int byte_ypitch_adj = pitch - OutputWidth
  238. // int byte_uvpitch_adj = pitch - (OutputWidth >> 1)
  239. xor ebx, ebx
  240. mov bx, [esp + OUTPUT_HEIGHT_WORD]
  241. mov [esp + MARK], ebx
  242. mov bx, [esp + OUTPUT_WIDTH_WORD]
  243. mov [esp + OUTPUT_WIDTH], ebx
  244. mov ecx, [esp + PITCH_PARM]
  245. mov edx, ecx
  246. sub ecx, ebx
  247. mov [esp + BYTE_YPITCH_ADJ], ecx
  248. shr ebx, 1
  249. sub edx, ebx
  250. mov [esp + BYTE_UVPITCH_ADJ], edx
  251. // if (lpbiInput->biHeight > OutputHeight)
  252. mov ebx, [esp + LPBI_INPUT]
  253. mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
  254. xor edx, edx
  255. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  256. cmp ecx, edx
  257. jle Lno_stretch
  258. // for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) LumaIters += 4
  259. xor ecx, ecx
  260. Lrepeat48:
  261. lea ecx, [ecx + 4]
  262. sub edx, 48
  263. jnz Lrepeat48
  264. mov [esp + LUMA_ITERS], ecx
  265. // aspect = LumaIters
  266. mov [esp + ASPECT], ecx
  267. // width_adj = (lpbiInput->biWidth - OutputWidth) >> 1
  268. // width_adj *= lpbiInput->biBitCount
  269. // width_adj >>= 3
  270. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  271. mov edx, [esp + OUTPUT_WIDTH]
  272. sub ecx, edx
  273. shr ecx, 1
  274. xor edx, edx
  275. mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
  276. imul ecx, edx
  277. shr ecx, 3
  278. mov [esp + WIDTH_ADJ], ecx
  279. // height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1
  280. mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
  281. xor edx, edx
  282. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  283. sub ecx, edx
  284. add ecx, [esp + ASPECT]
  285. shr ecx, 1
  286. mov [esp + HEIGHT_ADJ], ecx
  287. // stretch = 1
  288. // mark = 11
  289. mov ecx, 1
  290. mov edx, 11
  291. mov [esp + STRETCH], ecx
  292. mov [esp + MARK], edx
  293. jmp Lif_done
  294. Lno_stretch:
  295. // widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF
  296. // width_diff = widthx16 - OutputWidth
  297. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  298. add ecx, 00FH
  299. and ecx, 0FFFFFFF0H
  300. mov [esp + WIDTHX16], ecx
  301. mov edx, [esp + OUTPUT_WIDTH]
  302. sub ecx, edx
  303. mov [esp + WIDTH_DIFF], ecx
  304. // byte_ypitch_adj -= width_diff
  305. mov edx, [esp + BYTE_YPITCH_ADJ]
  306. sub edx, ecx
  307. mov [esp + BYTE_YPITCH_ADJ], edx
  308. // byte_uvpitch_adj -= (width_diff >> 1)
  309. mov edx, [esp + BYTE_UVPITCH_ADJ]
  310. shr ecx, 1
  311. sub edx, ecx
  312. mov [esp + BYTE_UVPITCH_ADJ], edx
  313. // heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF
  314. // height_diff = heightx16 - OutputHeight
  315. mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
  316. add ecx, 00FH
  317. and ecx, 0FFFFFFF0H
  318. mov [esp + HEIGHTX16], ecx
  319. xor edx, edx
  320. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  321. sub ecx, edx
  322. mov [esp + HEIGHT_DIFF], ecx
  323. Lif_done:
  324. // BackTwoLines = -(lpbiInput->biWidth + OutputWidth);
  325. // BackTwoLines *= lpbiInput->biBitCount
  326. // BackTwoLines >>= 3
  327. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  328. mov edx, [esp + OUTPUT_WIDTH]
  329. add ecx, edx
  330. neg ecx
  331. xor edx, edx
  332. mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
  333. imul ecx, edx
  334. sar ecx, 3
  335. mov [esp + BACK_TWO_LINES], ecx
  336. // pnext = (U32 *)(lpInput +
  337. // (((lpbiInput->biWidth * lpbiInput->biBitCount) >> 3)) *
  338. // ((OutputHeight - aspect - 1) + height_adj)) +
  339. // width_adj)
  340. // assign (esi, pnext)
  341. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  342. xor edx, edx
  343. mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
  344. imul ecx, edx
  345. shr ecx, 3
  346. xor edx, edx
  347. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  348. sub edx, [esp + ASPECT]
  349. dec edx
  350. add edx, [esp + HEIGHT_ADJ]
  351. imul ecx, edx
  352. add ecx, [esp + WIDTH_ADJ]
  353. add ecx, [esp + LP_INPUT]
  354. mov esi, ecx
  355. // Compute_YUVPalette(lpbiInput)
  356. mov eax, [esp + LPBI_INPUT]
  357. push eax
  358. call Compute_YUVPalette
  359. pop eax
  360. // assign (edi, YPlane)
  361. mov edi, [esp + YPLANE]
  362. // for (j = 0; j < LumaIters; j++)
  363. xor eax, eax
  364. mov [esp + LOOP_J], eax
  365. // for (k = 0; k < mark; k++)
  366. L4:
  367. xor eax, eax
  368. mov [esp + LOOP_K], eax
  369. // for (i = OutputWidth; i > 0; i -= 2, pnext += 4)
  370. L5:
  371. mov ebp, [esp + OUTPUT_WIDTH]
  372. // This jump is here to make sure the following loop starts on the U pipe
  373. jmp L6
  374. L6:
  375. // tm = *pnext
  376. // *(U32 *)YPlane =
  377. // YUVPalette[tm&0xFF].Yval |
  378. // ((YUVPalette[(tm>>8)&0xFF].Yval) << 8) |
  379. // ((YUVPalette[(tm>>16)&0xFF].Yval) << 16) |
  380. // ((YUVPalette[(tm>>24)].Yval) << 24)
  381. mov eax, [esi]
  382. nop
  383. mov ebx, eax
  384. mov ecx, eax
  385. shr ebx, 8
  386. mov edx, eax
  387. shr ecx, 16
  388. and eax, 0xFF
  389. shr edx, 24
  390. and ebx, 0xFF
  391. and ecx, 0xFF
  392. and edx, 0xFF
  393. mov al, [YUVPalette+eax*4].Yval
  394. mov bl, [YUVPalette+ebx*4].Yval
  395. shl ebx, 8
  396. mov cl, [YUVPalette+ecx*4].Yval
  397. shl ecx, 16
  398. mov dl, [YUVPalette+edx*4].Yval
  399. shl edx, 24
  400. or eax, ebx
  401. or eax, ecx
  402. mov ebx, [esp + LOOP_K]
  403. or eax, edx
  404. and ebx, 1
  405. mov [edi], eax
  406. jnz Lno_luma
  407. // tm = *pnext
  408. // *(U32 *)UPlane =
  409. // YUVPalette[tm&0xFF].Uval |
  410. // ((YUVPalette[(tm>>16)&0xFF].Uval) << 8)
  411. // *(U32 *)VPlane =
  412. // YUVPalette[tm&0xFF].Vval |
  413. // ((YUVPalette[(tm>>16)&0xFF].Vval) << 8)
  414. // UPlane +=2
  415. // VPlane += 2
  416. mov eax, [esi]
  417. nop
  418. mov ebx, eax
  419. and eax, 0xFF
  420. shr ebx, 16
  421. mov ecx, [esp + UPLANE]
  422. mov ax, [YUVPalette+eax*4].UVval
  423. and ebx, 0xFF
  424. mov edx, [esp + VPLANE]
  425. add ecx, 2
  426. mov bx, [YUVPalette+ebx*4].UVval
  427. add edx, 2
  428. mov [ecx - 2], al
  429. mov [esp + UPLANE], ecx
  430. mov [edx - 2], ah
  431. mov [esp + VPLANE], edx
  432. mov [ecx - 1], bl
  433. mov [edx - 1], bh
  434. Lno_luma:
  435. // pnext++
  436. // YPlane += 4
  437. lea esi, [esi + 4]
  438. lea edi, [edi + 4]
  439. sub ebp, 4
  440. jnz L6
  441. // Assembler version of C_WIDTH_DIFF
  442. // if (width_diff)
  443. mov eax, [esp + WIDTH_DIFF]
  444. mov edx, eax
  445. test eax, eax
  446. jz Lno_width_diff
  447. // tm = (*(YPlane-1)) << 24
  448. // tm |= (tm>>8) | (tm>>16) | (tm>>24)
  449. mov bl, [edi - 1]
  450. shl ebx, 24
  451. mov ecx, ebx
  452. shr ebx, 8
  453. or ecx, ebx
  454. shr ebx, 8
  455. or ecx, ebx
  456. shr ebx, 8
  457. or ecx, ebx
  458. // *(U32 *)YPlane = tm
  459. mov [edi], ecx
  460. // if ((width_diff-4) > 0)
  461. sub eax, 4
  462. jz Lupdate_YPlane
  463. // *(U32 *)(YPlane + 4) = tm
  464. mov [edi + 4], ecx
  465. sub eax, 4
  466. // if ((width_diff-8) > 0)
  467. jz Lupdate_YPlane
  468. // *(U32 *)(YPlane + 8) = tm
  469. mov [edi + 8], ecx
  470. Lupdate_YPlane:
  471. // YPlane += width_diff
  472. lea edi, [edi + edx]
  473. ///if (0 == (k&1))
  474. mov eax, [esp + LOOP_K]
  475. test eax, 1
  476. jnz Lno_width_diff
  477. // t8u = *(UPlane-1)
  478. // t8v = *(VPlane-1)
  479. // *UPlane++ = t8u
  480. // *UPlane++ = t8u
  481. // *VPlane++ = t8v
  482. // *VPlane++ = t8v
  483. mov ebp, edx
  484. mov eax, [esp + UPLANE]
  485. mov ebx, [esp + VPLANE]
  486. mov cl, [eax - 1]
  487. mov ch, [ebx - 1]
  488. mov [eax], cl
  489. mov [eax + 1], cl
  490. mov [ebx], ch
  491. mov [ebx + 1], ch
  492. // if ((width_diff-4) > 0)
  493. sub ebp, 4
  494. jz Lupdate_UVPlane
  495. // *UPlane++ = t8u
  496. // *UPlane++ = t8u
  497. // *VPlane++ = t8v
  498. // *VPlane++ = t8v
  499. mov [eax + 2], cl
  500. mov [eax + 3], cl
  501. mov [ebx + 2], ch
  502. mov [ebx + 3], ch
  503. // if ((width_diff-8) > 0)
  504. sub ebp, 4
  505. jz Lupdate_UVPlane
  506. // *UPlane++ = t8u
  507. // *UPlane++ = t8u
  508. // *VPlane++ = t8v
  509. // *VPlane++ = t8v
  510. mov [eax + 4], cl
  511. mov [eax + 5], cl
  512. mov [ebx + 4], ch
  513. mov [ebx + 5], ch
  514. Lupdate_UVPlane:
  515. shr edx, 1
  516. lea eax, [eax + edx]
  517. mov [esp + UPLANE], eax
  518. lea ebx, [ebx + edx]
  519. mov [esp + VPLANE], ebx
  520. Lno_width_diff:
  521. // if (stretch && (0 == k) && j)
  522. mov eax, [esp + STRETCH]
  523. test eax, eax
  524. jz L14
  525. mov eax, [esp + LOOP_K]
  526. test eax, eax
  527. jnz L14
  528. mov eax, [esp + LOOP_J]
  529. test eax, eax
  530. jz L14
  531. // spill YPlane ptr
  532. mov [esp + YPLANE], edi
  533. nop
  534. // for (i = OutputWidth; i > 0; i -= 8)
  535. // assign (ebx, pyprev)
  536. // assign (ecx, t)
  537. // assign (edx, pynext)
  538. // assign (edi, pyspace)
  539. // assign (ebp, i)
  540. // make sure offsets are such that there are no bank conflicts here
  541. mov ebx, [esp + PYPREV]
  542. mov edi, [esp + PYSPACE]
  543. mov edx, [esp + PYNEXT]
  544. mov ebp, [esp + OUTPUT_WIDTH]
  545. // t = (*pyprev++ & 0xFEFEFEFE) >> 1
  546. // t += (*pynext++ & 0xFEFEFEFE) >> 1
  547. // *pyspace++ = t
  548. // t = (*pyprev++ & 0xFEFEFEFE) >> 1
  549. // t += (*pynext++ & 0xFEFEFEFE) >> 1
  550. // *pyspace++ = t
  551. L15:
  552. // 1
  553. mov eax, [ebx]
  554. lea ebx, [ebx + 4]
  555. // 2
  556. mov ecx, [edx]
  557. lea edx, [edx + 4]
  558. // 3
  559. shr ecx, 1
  560. and eax, 0xFEFEFEFE
  561. // 4
  562. shr eax, 1
  563. and ecx, 0x7F7F7F7F
  564. // 5
  565. add eax, ecx
  566. mov ecx, [ebx]
  567. // 6
  568. shr ecx, 1
  569. mov [edi], eax
  570. // 7
  571. mov eax, [edx]
  572. and ecx, 0x7F7F7F7F
  573. // 8
  574. shr eax, 1
  575. lea edi, [edi + 4]
  576. // 9
  577. and eax, 0x7F7F7F7F
  578. lea ebx, [ebx + 4]
  579. // 10
  580. lea edx, [edx + 4]
  581. add eax, ecx
  582. // 11
  583. mov [edi], eax
  584. lea edi, [edi + 4]
  585. // 12
  586. sub ebp, 8
  587. jnz L15
  588. // kill (ebx, pyprev)
  589. // kill (ecx, t)
  590. // kill (edx, pynext)
  591. // kill (edi, pyspace)
  592. // kill (ebp, i)
  593. // restore YPlane
  594. mov edi, [esp + YPLANE]
  595. // pnext += BackTwoLines
  596. L14:
  597. add esi, [esp + BACK_TWO_LINES]
  598. // YPlane += byte_ypitch_adj;
  599. add edi, [esp + BYTE_YPITCH_ADJ]
  600. // if(0 == (k&1))
  601. mov eax, [esp + LOOP_K]
  602. and eax, 1
  603. jnz L16
  604. // UPlane += byte_uvpitch_adj;
  605. // VPlane += byte_uvpitch_adj;
  606. mov eax, [esp + BYTE_UVPITCH_ADJ]
  607. add [esp + UPLANE], eax
  608. add [esp + VPLANE], eax
  609. L16:
  610. inc DWORD PTR [esp + LOOP_K]
  611. mov eax, [esp + LOOP_K]
  612. cmp eax, [esp + MARK]
  613. jl L5
  614. // if (stretch)
  615. cmp DWORD PTR [esp + STRETCH], 0
  616. je L17
  617. // pyprev = YPlane - pitch
  618. mov eax, edi
  619. sub eax, [esp + PITCH_PARM]
  620. mov [esp + PYPREV], eax
  621. // pyspace = YPlane
  622. mov [esp + PYSPACE], edi
  623. // pynext = (YPlane += pitch)
  624. add edi, [esp + PITCH_PARM]
  625. mov [esp + PYNEXT], edi
  626. L17:
  627. inc DWORD PTR [esp + LOOP_J]
  628. mov eax, [esp + LOOP_J]
  629. cmp eax, [esp + LUMA_ITERS]
  630. jl L4
  631. // kill (esi, pnext)
  632. // kill (edi, YPlane)
  633. // ASM version of C_HEIGHT_FILL
  634. // if (height_diff)
  635. mov eax, [esp + HEIGHT_DIFF]
  636. test eax, eax
  637. jz Lno_height_diff
  638. // pyspace = (U32 *)YPlane
  639. mov esi, edi
  640. // pyprev = (U32 *)(YPlane - pitch)
  641. sub esi, [esp + PITCH_PARM]
  642. // for (j = height_diff; j > 0; j--)
  643. Lheight_yfill_loop:
  644. mov ebx, [esp + WIDTHX16]
  645. // for (i = widthx16; i>0; i -=4)
  646. Lheight_yfill_row:
  647. // *pyspace++ = *pyprev++
  648. mov ecx, [esi]
  649. lea esi, [esi + 4]
  650. mov [edi], ecx
  651. lea edi, [edi + 4]
  652. sub ebx, 4
  653. jnz Lheight_yfill_row
  654. // pyspace += word_ypitch_adj
  655. // pyprev += word_ypitch_adj
  656. add esi, [esp + BYTE_YPITCH_ADJ]
  657. add edi, [esp + BYTE_YPITCH_ADJ]
  658. dec eax
  659. jnz Lheight_yfill_loop
  660. mov eax, [esp + HEIGHT_DIFF]
  661. mov edi, [esp + UPLANE]
  662. // puvspace = (U32 *)UPlane
  663. mov esi, edi
  664. // puvprev = (U32 *)(UPlane - pitch)
  665. sub esi, [esp + PITCH_PARM]
  666. // for (j = height_diff; j > 0; j -= 2)
  667. Lheight_ufill_loop:
  668. mov ebx, [esp + WIDTHX16]
  669. // for (i = widthx16; i>0; i -= 8)
  670. Lheight_ufill_row:
  671. // *puvspace++ = *puvprev++
  672. mov ecx, [esi]
  673. mov [edi], ecx
  674. lea esi, [esi + 4]
  675. lea edi, [edi + 4]
  676. sub ebx, 8
  677. jnz Lheight_ufill_row
  678. // puvspace += word_uvpitch_adj
  679. // puvprev += word_uvpitch_adj
  680. add esi, [esp + BYTE_UVPITCH_ADJ]
  681. add edi, [esp + BYTE_UVPITCH_ADJ]
  682. sub eax, 2
  683. jnz Lheight_ufill_loop
  684. mov eax, [esp + HEIGHT_DIFF]
  685. mov edi, [esp + VPLANE]
  686. // puvspace = (U32 *)VPlane
  687. mov esi, edi
  688. // puvprev = (U32 *)(VPlane - pitch)
  689. sub esi, [esp + PITCH_PARM]
  690. // for (j = height_diff; j > 0; j -= 2)
  691. Lheight_vfill_loop:
  692. mov ebx, [esp + WIDTHX16]
  693. // for (i = widthx16; i>0; i -= 8)
  694. Lheight_vfill_row:
  695. // *puvspace++ = *puvprev++
  696. mov ecx, [esi]
  697. mov [edi], ecx
  698. lea esi, [esi + 4]
  699. lea edi, [edi + 4]
  700. sub ebx, 8
  701. jnz Lheight_vfill_row
  702. // puvspace += word_uvpitch_adj
  703. // puvprev += word_uvpitch_adj
  704. add esi, [esp + BYTE_UVPITCH_ADJ]
  705. add edi, [esp + BYTE_UVPITCH_ADJ]
  706. sub eax, 2
  707. jnz Lheight_vfill_loop
  708. Lno_height_diff:
  709. // if (stretch)
  710. mov esi, [esp + PYPREV]
  711. cmp DWORD PTR [esp + STRETCH], 0
  712. je L19
  713. // for (i = OutputWidth; i > 0; i -= 4)
  714. // assign (esi, pyprev)
  715. // assign (edi, pyspace)
  716. // assign (ebp, i)
  717. mov ebp, [esp + OUTPUT_WIDTH]
  718. mov edi, [esp + PYSPACE]
  719. L18:
  720. mov ecx, [esi]
  721. lea esi, [esi + 4]
  722. mov [edi], ecx
  723. lea edi, [edi + 4]
  724. sub ebp, 4
  725. jnz L18
  726. // kill (esi, pyprev)
  727. // kill (edi, pyspace)
  728. // kill (ebp, i)
  729. L19:
  730. add esp, LOCALSIZE
  731. pop edi
  732. pop esi
  733. pop ebx
  734. pop ebp
  735. ret
  736. }
  737. }
  738. #undef LOCALSIZE
  739. #undef PITCH_PARM
  740. #undef VPLANE
  741. #undef UPLANE
  742. #undef YPLANE
  743. #undef LP_INPUT
  744. #undef OUTPUT_HEIGHT_WORD
  745. #undef OUTPUT_WIDTH_WORD
  746. #undef LPBI_INPUT
  747. #undef OUTPUT_WIDTH
  748. #undef PYPREV
  749. #undef PYSPACE
  750. #undef PYNEXT
  751. #undef PUVPREV
  752. #undef PUVSPACE
  753. #undef LOOP_I
  754. #undef LOOP_J
  755. #undef LOOP_K
  756. #undef BACK_TWO_LINES
  757. #undef WIDTHX16
  758. #undef HEIGHTX16
  759. #undef WIDTH_DIFF
  760. #undef HEIGHT_DIFF
  761. #undef WIDTH_ADJ
  762. #undef HEIGHT_ADJ
  763. #undef STRETCH
  764. #undef ASPECT
  765. #undef LUMA_ITERS
  766. #undef MARK
  767. #undef BYTE_YPITCH_ADJ
  768. #undef BYTE_UVPITCH_ADJ
  769. #endif // } H263P