Leaked source code of Windows Server 2003.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

984 lines
22 KiB

/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
  14. #include "precomp.h"
  15. #if defined(H263P) || defined(USE_BILINEAR_MSH26X) // {
//
// For the P5 versions, the strategy is to compute the Y value for an odd RGB value
// followed by computing the Y value for the corresponding even RGB value. The registers
// are then set with the proper values to compute U and V values for the even RGB
// value. This avoids repeating the shifting and masking needed to extract the Red,
// Green and Blue components.
//
// Only the 555 version of RGB16 input color conversion is provided. To generate
// other versions, use the following table.
//
// number    shift       mask
//           B, G, R
// ------  -----------  ----------------
//  555     2, 3, 8      0x7C, 0x7C, 0x7C
//  664     3, 3, 9      0x78, 0x7E, 0x7E
//  565     2, 4, 9      0x7C, 0x7E, 0x7C
//  655     2, 3, 9      0x7C, 0x7C, 0x7E
//
// Only 555 falls under BI_RGB. The others are specified using the
// BI_BITFIELDS compression specification. For BI_BITFIELDS, call
// Build16bitModeID to get the actual bitfield number. This routine requires the
// three array elements in the bmiColors field of a BITMAPINFO object.
//
/*****************************************************************************
 *
 * H26X_BGR16555toYUV12()
 *
 * Convert from BGR16 (5:5:5) to YUV12 (YCrCb 4:2:0) and copy to destination
 * memory with pitch defined by the constant PITCH. The input pixels are
 * 16-bit words with the components packed B,G,R from the low bits up.
 *
 */
  48. #if 0 // { 0
  49. void C_H26X_BGR16555toYUV12(
  50. LPBITMAPINFOHEADER lpbiInput,
  51. WORD OutputWidth,
  52. WORD OutputHeight,
  53. U8 *lpInput,
  54. U8 *YPlane,
  55. U8 *UPlane,
  56. U8 *VPlane,
  57. const int pitch)
  58. {
  59. int t1, t2;
  60. int tm1, tm2;
  61. C_RGB_COLOR_CONVERT_INIT
  62. for ( j = 0; j < LumaIters; j++) {
  63. for (k = 0; k < mark; k++) {
  64. for (i = OutputWidth; i > 0; i-=4, YPlane+=4) {
  65. tm1 = *pnext++;
  66. t1 = (BYUV[(tm1<<2)&0x7C].YU +
  67. GYUV[(tm1>>3)&0x7C].YU +
  68. RYUV[(tm1>>8)&0x7C].YU);
  69. *(YPlane) = (U8)((t1>>SHIFT_WIDTH)+8);
  70. t = (BYUV[(tm1>>14)&0x7C].YU +
  71. GYUV[(tm1>>19)&0x7C].YU +
  72. RYUV[(tm1>>24)&0x7C].YU);
  73. *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
  74. tm2 = *pnext++;
  75. t2 = (BYUV[(tm2<<2)&0x7C].YU +
  76. GYUV[(tm2>>3)&0x7C].YU +
  77. RYUV[(tm2>>8)&0x7C].YU);
  78. *(YPlane+2) = (U8)((t2>>SHIFT_WIDTH)+8);
  79. t = (BYUV[(tm2>>14)&0x7C].YU +
  80. GYUV[(tm2>>19)&0x7C].YU +
  81. RYUV[(tm2>>24)&0x7C].YU);
  82. *(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);
  83. if (0 == (k&1)) {
  84. *(U16 *)UPlane = ((t1+0x40000000)>>24) | (((t2+0x40000000)>>16)&0xFF00);
  85. t1 = (RYUV[(tm1>>8)&0x7C].V +
  86. GYUV[(tm1>>3)&0x7C].V +
  87. BYUV[(tm1<<2)&0x7C].V);
  88. t2 = (RYUV[(tm2>>8)&0x7C].V +
  89. GYUV[(tm2>>3)&0x7C].V +
  90. BYUV[(tm2<<2)&0x7C].V);
  91. *(U16 *)VPlane = ((t1+0x4000)>>8) | ((t2+0x4000)&0xFF00);
  92. UPlane += 2; VPlane += 2;
  93. }
  94. }
  95. // The next two cases are mutually exclusive.
  96. // If there is a width_diff there cannot be a stretch and
  97. // if there is a stretch, there cannot be a width_diff.
  98. C_WIDTH_FILL
  99. if (stretch && (0 == k) && j) {
  100. for (i = OutputWidth; i > 0; i -= 8) {
  101. tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
  102. tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
  103. *pyspace++ = tm;
  104. tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
  105. tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
  106. *pyspace++ = tm;
  107. }
  108. }
  109. pnext += BackTwoLines;
  110. YPlane += byte_ypitch_adj;
  111. // Increment after even lines.
  112. if(0 == (k&1)) {
  113. UPlane += byte_uvpitch_adj;
  114. VPlane += byte_uvpitch_adj;
  115. }
  116. } // end of for k
  117. if (stretch) {
  118. pyprev = (U32 *)(YPlane - pitch);
  119. pyspace = (U32 *)YPlane;
  120. pynext = (U32 *)(YPlane += pitch);
  121. }
  122. } // end of for j
  123. // The next two cases are mutually exclusive.
  124. // If there is a height_diff there cannot be a stretch and
  125. // if there is a stretch, there cannot be a height_diff.
  126. C_HEIGHT_FILL
  127. if (stretch) {
  128. for (i = OutputWidth; i > 0; i -= 4) {
  129. *pyspace++ = *pyprev++;
  130. }
  131. }
  132. } // end of C_H26X_BGR55516toYUV12
  133. #endif // } 0
  134. __declspec(naked)
  135. void P5_H26X_BGR16555toYUV12(
  136. LPBITMAPINFOHEADER lpbiInput,
  137. WORD OutputWidth,
  138. WORD OutputHeight,
  139. U8 *lpInput,
  140. U8 *YPlane,
  141. U8 *UPlane,
  142. U8 *VPlane,
  143. const int pitch)
  144. {
  145. // Permanent (callee-save) registers - ebx, esi, edi, ebp
  146. // Temporary (caller-save) registers - eax, ecx, edx
  147. //
  148. // Stack frame layout
  149. // | pitch | +136
  150. // | VPlane | +132
  151. // | UPlane | +128
  152. // | YPlane | +124
  153. // | lpInput | +120
  154. // | OutputHeight | +116
  155. // | OutputWidth | +112
  156. // | lpbiInput | +108
  157. // ----------------------------
  158. // | return addr | +104
  159. // | saved ebp | +100
  160. // | saved ebx | + 96
  161. // | saved esi | + 92
  162. // | saved edi | + 88
  163. // | output_width | + 84
  164. // | pyprev | + 80
  165. // | pyspace | + 76
  166. // | pynext | + 72
  167. // | puvprev | + 68
  168. // | puvspace | + 64
  169. // | i | + 60
  170. // | j | + 56
  171. // | k | + 52
  172. // | BackTwoLines | + 48
  173. // | widthx16 | + 44
  174. // | heightx16 | + 40
  175. // | width_diff | + 36
  176. // | height_diff | + 32
  177. // | width_adj | + 28
  178. // | height_adj | + 24
  179. // | stretch | + 20
  180. // | aspect | + 16
  181. // | LumaIters | + 12
  182. // | mark | + 8
  183. // | byte_ypitch_adj | + 4
  184. // | byte_uvpitch_adj | + 0
  185. #define LOCALSIZE 88
  186. #define PITCH_PARM 136
  187. #define VPLANE 132
  188. #define UPLANE 128
  189. #define YPLANE 124
  190. #define LP_INPUT 120
  191. #define OUTPUT_HEIGHT_WORD 116
  192. #define OUTPUT_WIDTH_WORD 112
  193. #define LPBI_INPUT 108
  194. #define OUTPUT_WIDTH 84
  195. #define PYPREV 80
  196. #define PYSPACE 76
  197. #define PYNEXT 72
  198. #define PUVPREV 68
  199. #define PUVSPACE 64
  200. #define LOOP_I 60
  201. #define LOOP_J 56
  202. #define LOOP_K 52
  203. #define BACK_TWO_LINES 48
  204. #define WIDTHX16 44
  205. #define HEIGHTX16 40
  206. #define WIDTH_DIFF 36
  207. #define HEIGHT_DIFF 32
  208. #define WIDTH_ADJ 28
  209. #define HEIGHT_ADJ 24
  210. #define STRETCH 20
  211. #define ASPECT 16
  212. #define LUMA_ITERS 12
  213. #define MARK 8
  214. #define BYTE_YPITCH_ADJ 4
  215. #define BYTE_UVPITCH_ADJ 0
  216. _asm {
  217. push ebp
  218. push ebx
  219. push esi
  220. push edi
  221. sub esp, LOCALSIZE
  222. // int width_diff = 0
  223. // int height_diff = 0
  224. // int width_adj = 0
  225. // int height_adj = 0
  226. // int stretch = 0
  227. // int aspect = 0
  228. xor eax, eax
  229. mov [esp + WIDTH_DIFF], eax
  230. mov [esp + HEIGHT_DIFF], eax
  231. mov [esp + WIDTH_ADJ], eax
  232. mov [esp + HEIGHT_ADJ], eax
  233. mov [esp + STRETCH], eax
  234. mov [esp + ASPECT], eax
  235. // int LumaIters = 1
  236. inc eax
  237. mov [esp + LUMA_ITERS], eax
  238. // int mark = OutputHeight
  239. // int output_width = OutputWidth
  240. // int byte_ypitch_adj = pitch - OutputWidth
  241. // int byte_uvpitch_adj = pitch - (OutputWidth >> 1)
  242. xor ebx, ebx
  243. mov bx, [esp + OUTPUT_HEIGHT_WORD]
  244. mov [esp + MARK], ebx
  245. mov bx, [esp + OUTPUT_WIDTH_WORD]
  246. mov [esp + OUTPUT_WIDTH], ebx
  247. mov ecx, [esp + PITCH_PARM]
  248. mov edx, ecx
  249. sub ecx, ebx
  250. mov [esp + BYTE_YPITCH_ADJ], ecx
  251. shr ebx, 1
  252. sub edx, ebx
  253. mov [esp + BYTE_UVPITCH_ADJ], edx
  254. // if (lpbiInput->biHeight > OutputHeight)
  255. mov ebx, [esp + LPBI_INPUT]
  256. mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
  257. xor edx, edx
  258. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  259. cmp ecx, edx
  260. jle Lno_stretch
  261. // for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) LumaIters += 4
  262. xor ecx, ecx
  263. Lrepeat48:
  264. lea ecx, [ecx + 4]
  265. sub edx, 48
  266. jnz Lrepeat48
  267. mov [esp + LUMA_ITERS], ecx
  268. // aspect = LumaIters
  269. mov [esp + ASPECT], ecx
  270. // width_adj = (lpbiInput->biWidth - OutputWidth) >> 1
  271. // width_adj *= lpbiInput->biBitCount
  272. // width_adj >>= 3
  273. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  274. mov edx, [esp + OUTPUT_WIDTH]
  275. sub ecx, edx
  276. shr ecx, 1
  277. xor edx, edx
  278. mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
  279. imul ecx, edx
  280. shr ecx, 3
  281. mov [esp + WIDTH_ADJ], ecx
  282. // height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1
  283. mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
  284. xor edx, edx
  285. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  286. sub ecx, edx
  287. add ecx, [esp + ASPECT]
  288. shr ecx, 1
  289. mov [esp + HEIGHT_ADJ], ecx
  290. // stretch = 1
  291. // mark = 11
  292. mov ecx, 1
  293. mov edx, 11
  294. mov [esp + STRETCH], ecx
  295. mov [esp + MARK], edx
  296. jmp Lif_done
  297. Lno_stretch:
  298. // widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF
  299. // width_diff = widthx16 - OutputWidth
  300. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  301. add ecx, 00FH
  302. and ecx, 0FFFFFFF0H
  303. mov [esp + WIDTHX16], ecx
  304. mov edx, [esp + OUTPUT_WIDTH]
  305. sub ecx, edx
  306. mov [esp + WIDTH_DIFF], ecx
  307. // byte_ypitch_adj -= width_diff
  308. mov edx, [esp + BYTE_YPITCH_ADJ]
  309. sub edx, ecx
  310. mov [esp + BYTE_YPITCH_ADJ], edx
  311. // byte_uvpitch_adj -= (width_diff >> 1)
  312. mov edx, [esp + BYTE_UVPITCH_ADJ]
  313. shr ecx, 1
  314. sub edx, ecx
  315. mov [esp + BYTE_UVPITCH_ADJ], edx
  316. // heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF
  317. // height_diff = heightx16 - OutputHeight
  318. mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
  319. add ecx, 00FH
  320. and ecx, 0FFFFFFF0H
  321. mov [esp + HEIGHTX16], ecx
  322. xor edx, edx
  323. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  324. sub ecx, edx
  325. mov [esp + HEIGHT_DIFF], ecx
  326. Lif_done:
  327. // BackTwoLines = -(lpbiInput->biWidth + OutputWidth);
  328. // BackTwoLines *= lpbiInput->biBitCount
  329. // BackTwoLines >>= 3
  330. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  331. mov edx, [esp + OUTPUT_WIDTH]
  332. add ecx, edx
  333. neg ecx
  334. xor edx, edx
  335. mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
  336. imul ecx, edx
  337. sar ecx, 3
  338. mov [esp + BACK_TWO_LINES], ecx
  339. // pnext = (U32 *)(lpInput +
  340. // (((lpbiInput->biWidth * lpbiInput->biBitCount) >> 3)) *
  341. // ((OutputHeight - aspect - 1) + height_adj)) +
  342. // width_adj)
  343. // assign (esi, pnext)
  344. mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
  345. xor edx, edx
  346. mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
  347. imul ecx, edx
  348. shr ecx, 3
  349. xor edx, edx
  350. mov dx, [esp + OUTPUT_HEIGHT_WORD]
  351. sub edx, [esp + ASPECT]
  352. dec edx
  353. add edx, [esp + HEIGHT_ADJ]
  354. imul ecx, edx
  355. add ecx, [esp + WIDTH_ADJ]
  356. add ecx, [esp + LP_INPUT]
  357. mov esi, ecx
  358. // assign (edi, YPlane)
  359. mov edi, [esp + YPLANE]
  360. // for (j = 0; j < LumaIters; j++)
  361. xor eax, eax
  362. mov [esp + LOOP_J], eax
  363. // for (k = 0; k < mark; k++)
  364. L4:
  365. xor eax, eax
  366. mov [esp + LOOP_K], eax
  367. // for (i = OutputWidth; i > 0; i -= 2, pnext += 4)
  368. L5:
  369. mov eax, [esp + OUTPUT_WIDTH]
  370. // This jump is here to make sure the following loop starts on the U pipe
  371. jmp L6
  372. L6:
  373. // tm1 = pnext[0]
  374. // t = ( BYUV[(tm1>>14)&0x7C].YU +
  375. // GYUV[(tm1>>19)&0x7C].YU +
  376. // RYUV[(tm1>>24)&0x7C].YU )
  377. // *(YPlane+1) = (U8)((t>>8)+8)
  378. // t1 = ( BYUV[(tm1<< 2)&0x7C].YU +
  379. // GYUV[(tm1>> 8)&0x7C].YU +
  380. // RYUV[(tm1>>13)&0x7C].YU )
  381. // *YPlane = (U8)((t1>>8)+8)
  382. // assign(eax: B2/Y1/Y2/U)
  383. // assign(ebx: B1/V)
  384. // assign(ecx: G2/G1)
  385. // assign(edx: R2/R1)
  386. // assign(ebp: B1)
  387. // 1
  388. mov ebx, [esi]
  389. mov [esp + LOOP_I], eax
  390. // 2
  391. mov eax, ebx
  392. mov ecx, ebx
  393. // 3
  394. shr eax, 14
  395. mov edx, ebx
  396. // 4
  397. shr ecx, 19
  398. and eax, 0x7C
  399. // 5
  400. shr edx, 24
  401. and ecx, 0x7C
  402. // 6
  403. mov eax, [BYUV+eax*8].YU
  404. and edx, 0x7C
  405. // 7
  406. add eax, [GYUV+ecx*8].YU
  407. mov ecx, ebx
  408. // 8
  409. add eax, [RYUV+edx*8].YU
  410. mov edx, ebx
  411. // 9
  412. sar eax, 8
  413. and ebx, 0x1F
  414. // 10
  415. shl ebx, 2
  416. add eax, 8
  417. // 11
  418. shr ecx, 3
  419. mov [edi + 1], al
  420. // 12
  421. shr edx, 8
  422. and ecx, 0x7C
  423. // 13
  424. mov eax, [BYUV+ebx*8].YU
  425. and edx, 0x7C
  426. // 14
  427. add eax, [GYUV+ecx*8].YU
  428. mov ebp, ebx
  429. // 15
  430. add eax, [RYUV+edx*8].YU
  431. lea edi, [edi + 4]
  432. // 16
  433. sar eax, 8
  434. mov ebx, [esp + LOOP_K]
  435. // 17
  436. add eax, 8
  437. and ebx, 1
  438. // 18
  439. mov [edi - 4], al
  440. jnz L9a
  441. // At this point, ebp: B1, ecx: G1, edx: R1
  442. // *UPlane++ = (U8)((t1>>24)+64)
  443. // t = ( VBGR[(t>>13)&0x7C].VR +
  444. // VBGR[(t>> 8)&0x7C].VG +
  445. // VBGR[(t<< 2)&0x7C].VB )
  446. // *VPlane++ = (U8)((t>>8)+64)
  447. // 19
  448. mov ebx, [RYUV+edx*8].V
  449. mov edx, [esp + UPLANE]
  450. // 20
  451. sar eax, 16
  452. add ebx, [GYUV+ecx*8].V
  453. // 21
  454. add eax, 64
  455. add ebx, [BYUV+ebp*8].V
  456. // 22
  457. mov [edx], al
  458. inc edx
  459. // 23
  460. mov [esp + UPLANE], edx
  461. mov edx, [esp + VPLANE]
  462. // 24
  463. sar ebx, 8
  464. inc edx
  465. // 25
  466. add ebx, 64
  467. mov [esp + VPLANE], edx
  468. // 26
  469. mov [edx - 1], bl
  470. nop
  471. L9a:
  472. // tm2 = pnext[1]
  473. // t = ( BYUV[(tm2>>14)&0x7C].YU +
  474. // GYUV[(tm2>>19)&0x7C].YU +
  475. // RYUV[(tm2>>24)&0x7C].YU )
  476. // *(YPlane+1) = (U8)((t>>8)+8)
  477. // t2 = ( BYUV[(tm2<< 2)&0x7C].YU +
  478. // GYUV[(tm2>> 8)&0x7C].YU +
  479. // RYUV[(tm2>>13)&0x7C].YU )
  480. // *YPlane = (U8)((t2>>8)+8)
  481. // YPlane += 4
  482. // assign(eax: B2/Y1/Y2/U)
  483. // assign(ebx: B1/V)
  484. // assign(ecx: G2/G1)
  485. // assign(edx: R2/R1)
  486. // assign(ebp: B1)
  487. // 27
  488. mov eax, [esi + 4]
  489. lea esi, [esi + 8]
  490. // 28
  491. mov ebx, eax
  492. mov ecx, eax
  493. // 29
  494. shr eax, 14
  495. mov edx, ebx
  496. // 30
  497. shr ecx, 19
  498. and eax, 0x7C
  499. // 31
  500. shr edx, 24
  501. and ecx, 0x7C
  502. // 32
  503. mov eax, [BYUV+eax*8].YU
  504. and edx, 0x7C
  505. // 33
  506. add eax, [GYUV+ecx*8].YU
  507. mov ecx, ebx
  508. // 34
  509. add eax, [RYUV+edx*8].YU
  510. mov edx, ebx
  511. // 35
  512. sar eax, 8
  513. and ebx, 0x1F
  514. // 36
  515. shl ebx, 2
  516. add eax, 8
  517. // 37
  518. shr ecx, 3
  519. mov [edi - 1], al
  520. // 38
  521. shr edx, 8
  522. and ecx, 0x7C
  523. // 39
  524. mov eax, [BYUV+ebx*8].YU
  525. and edx, 0x7C
  526. // 40
  527. add eax, [GYUV+ecx*8].YU
  528. mov ebp, ebx
  529. // 41
  530. add eax, [RYUV+edx*8].YU
  531. nop
  532. // 42
  533. sar eax, 8
  534. mov ebx, [esp + LOOP_K]
  535. // 43
  536. add eax, 8
  537. and ebx, 1
  538. // 44
  539. mov [edi - 2], al
  540. jnz L9
  541. // At this point, ebp: B1, ecx: G1, edx: R1
  542. // *UPlane++ = (U8)((t2>>24)+64)
  543. // t = ( VBGR[(t>>13)&0x7C].VR +
  544. // VBGR[(t>> 8)&0x7C].VG +
  545. // VBGR[(t<< 2)&0x7C].VB )
  546. // *VPlane++ = (U8)((t>>8)+64)
  547. // 45
  548. mov ebx, [RYUV+edx*8].V
  549. mov edx, [esp + UPLANE]
  550. // 46
  551. sar eax, 16
  552. add ebx, [GYUV+ecx*8].V
  553. // 47
  554. add eax, 64
  555. add ebx, [BYUV+ebp*8].V
  556. // 48
  557. mov [edx], al
  558. inc edx
  559. // 49
  560. mov [esp + UPLANE], edx
  561. mov edx, [esp + VPLANE]
  562. // 50
  563. sar ebx, 8
  564. inc edx
  565. // 51
  566. add ebx, 64
  567. mov [esp + VPLANE], edx
  568. // 52
  569. mov [edx - 1], bl
  570. nop
  571. L9:
  572. // 53
  573. mov eax, [esp + LOOP_I]
  574. nop
  575. // 54
  576. sub eax, 4
  577. jnz L6
  578. // Assembler version of C_WIDTH_DIFF
  579. // if (width_diff)
  580. mov eax, [esp + WIDTH_DIFF]
  581. mov edx, eax
  582. test eax, eax
  583. jz Lno_width_diff
  584. // tm = (*(YPlane-1)) << 24
  585. // tm |= (tm>>8) | (tm>>16) | (tm>>24)
  586. mov bl, [edi - 1]
  587. shl ebx, 24
  588. mov ecx, ebx
  589. shr ebx, 8
  590. or ecx, ebx
  591. shr ebx, 8
  592. or ecx, ebx
  593. shr ebx, 8
  594. or ecx, ebx
  595. // *(U32 *)YPlane = tm
  596. mov [edi], ecx
  597. // if ((width_diff-4) > 0)
  598. sub eax, 4
  599. jz Lupdate_YPlane
  600. // *(U32 *)(YPlane + 4) = tm
  601. mov [edi + 4], ecx
  602. sub eax, 4
  603. // if ((width_diff-8) > 0)
  604. jz Lupdate_YPlane
  605. // *(U32 *)(YPlane + 8) = tm
  606. mov [edi + 8], ecx
  607. Lupdate_YPlane:
  608. // YPlane += width_diff
  609. lea edi, [edi + edx]
  610. ///if (0 == (k&1))
  611. mov eax, [esp + LOOP_K]
  612. test eax, 1
  613. jnz Lno_width_diff
  614. // t8u = *(UPlane-1)
  615. // t8v = *(VPlane-1)
  616. // *UPlane++ = t8u
  617. // *UPlane++ = t8u
  618. // *VPlane++ = t8v
  619. // *VPlane++ = t8v
  620. mov ebp, edx
  621. mov eax, [esp + UPLANE]
  622. mov ebx, [esp + VPLANE]
  623. mov cl, [eax - 1]
  624. mov ch, [ebx - 1]
  625. mov [eax], cl
  626. mov [eax + 1], cl
  627. mov [ebx], ch
  628. mov [ebx + 1], ch
  629. // if ((width_diff-4) > 0)
  630. sub ebp, 4
  631. jz Lupdate_UVPlane
  632. // *UPlane++ = t8u
  633. // *UPlane++ = t8u
  634. // *VPlane++ = t8v
  635. // *VPlane++ = t8v
  636. mov [eax + 2], cl
  637. mov [eax + 3], cl
  638. mov [ebx + 2], ch
  639. mov [ebx + 3], ch
  640. // if ((width_diff-8) > 0)
  641. sub ebp, 4
  642. jz Lupdate_UVPlane
  643. // *UPlane++ = t8u
  644. // *UPlane++ = t8u
  645. // *VPlane++ = t8v
  646. // *VPlane++ = t8v
  647. mov [eax + 4], cl
  648. mov [eax + 5], cl
  649. mov [ebx + 4], ch
  650. mov [ebx + 5], ch
  651. Lupdate_UVPlane:
  652. shr edx, 1
  653. lea eax, [eax + edx]
  654. mov [esp + UPLANE], eax
  655. lea ebx, [ebx + edx]
  656. mov [esp + VPLANE], ebx
  657. Lno_width_diff:
  658. // if (stretch && (0 == k) && j)
  659. mov eax, [esp + STRETCH]
  660. test eax, eax
  661. jz L14
  662. mov eax, [esp + LOOP_K]
  663. test eax, eax
  664. jnz L14
  665. mov eax, [esp + LOOP_J]
  666. test eax, eax
  667. jz L14
  668. // spill YPlane ptr
  669. mov [esp + YPLANE], edi
  670. nop
  671. // for (i = OutputWidth; i > 0; i -= 8)
  672. // assign (ebx, pyprev)
  673. // assign (ecx, t)
  674. // assign (edx, pynext)
  675. // assign (edi, pyspace)
  676. // assign (ebp, i)
  677. // make sure offsets are such that there are no bank conflicts here
  678. mov ebx, [esp + PYPREV]
  679. mov edi, [esp + PYSPACE]
  680. mov edx, [esp + PYNEXT]
  681. mov ebp, [esp + OUTPUT_WIDTH]
  682. // t = (*pyprev++ & 0xFEFEFEFE) >> 1
  683. // t += (*pynext++ & 0xFEFEFEFE) >> 1
  684. // *pyspace++ = t
  685. // t = (*pyprev++ & 0xFEFEFEFE) >> 1
  686. // t += (*pynext++ & 0xFEFEFEFE) >> 1
  687. // *pyspace++ = t
  688. L15:
  689. // 1
  690. mov eax, [ebx]
  691. lea ebx, [ebx + 4]
  692. // 2
  693. mov ecx, [edx]
  694. lea edx, [edx + 4]
  695. // 3
  696. shr ecx, 1
  697. and eax, 0xFEFEFEFE
  698. // 4
  699. shr eax, 1
  700. and ecx, 0x7F7F7F7F
  701. // 5
  702. add eax, ecx
  703. mov ecx, [ebx]
  704. // 6
  705. shr ecx, 1
  706. mov [edi], eax
  707. // 7
  708. mov eax, [edx]
  709. and ecx, 0x7F7F7F7F
  710. // 8
  711. shr eax, 1
  712. lea edi, [edi + 4]
  713. // 9
  714. and eax, 0x7F7F7F7F
  715. lea ebx, [ebx + 4]
  716. // 10
  717. lea edx, [edx + 4]
  718. add eax, ecx
  719. // 11
  720. mov [edi], eax
  721. lea edi, [edi + 4]
  722. // 12
  723. sub ebp, 8
  724. jnz L15
  725. // kill (ebx, pyprev)
  726. // kill (ecx, t)
  727. // kill (edx, pynext)
  728. // kill (edi, pyspace)
  729. // kill (ebp, i)
  730. // restore YPlane
  731. mov edi, [esp + YPLANE]
  732. // pnext += BackTwoLines
  733. L14:
  734. add esi, [esp + BACK_TWO_LINES]
  735. // YPlane += byte_ypitch_adj;
  736. add edi, [esp + BYTE_YPITCH_ADJ]
  737. // if(0 == (k&1))
  738. mov eax, [esp + LOOP_K]
  739. and eax, 1
  740. jnz L16
  741. // UPlane += byte_uvpitch_adj;
  742. // VPlane += byte_uvpitch_adj;
  743. mov eax, [esp + BYTE_UVPITCH_ADJ]
  744. add [esp + UPLANE], eax
  745. add [esp + VPLANE], eax
  746. L16:
  747. inc DWORD PTR [esp + LOOP_K]
  748. mov eax, [esp + LOOP_K]
  749. cmp eax, [esp + MARK]
  750. jl L5
  751. // if (stretch)
  752. cmp DWORD PTR [esp + STRETCH], 0
  753. je L17
  754. // pyprev = YPlane - pitch
  755. mov eax, edi
  756. sub eax, [esp + PITCH_PARM]
  757. mov [esp + PYPREV], eax
  758. // pyspace = YPlane
  759. mov [esp + PYSPACE], edi
  760. // pynext = (YPlane += pitch)
  761. add edi, [esp + PITCH_PARM]
  762. mov [esp + PYNEXT], edi
  763. L17:
  764. inc DWORD PTR [esp + LOOP_J]
  765. mov eax, [esp + LOOP_J]
  766. cmp eax, [esp + LUMA_ITERS]
  767. jl L4
  768. // kill (esi, pnext)
  769. // kill (edi, YPlane)
  770. // ASM version of C_HEIGHT_FILL
  771. // if (height_diff)
  772. mov eax, [esp + HEIGHT_DIFF]
  773. test eax, eax
  774. jz Lno_height_diff
  775. // pyspace = (U32 *)YPlane
  776. mov esi, edi
  777. // pyprev = (U32 *)(YPlane - pitch)
  778. sub esi, [esp + PITCH_PARM]
  779. // for (j = height_diff; j > 0; j--)
  780. Lheight_yfill_loop:
  781. mov ebx, [esp + WIDTHX16]
  782. // for (i = widthx16; i>0; i -=4)
  783. Lheight_yfill_row:
  784. // *pyspace++ = *pyprev++
  785. mov ecx, [esi]
  786. lea esi, [esi + 4]
  787. mov [edi], ecx
  788. lea edi, [edi + 4]
  789. sub ebx, 4
  790. jnz Lheight_yfill_row
  791. // pyspace += word_ypitch_adj
  792. // pyprev += word_ypitch_adj
  793. add esi, [esp + BYTE_YPITCH_ADJ]
  794. add edi, [esp + BYTE_YPITCH_ADJ]
  795. dec eax
  796. jnz Lheight_yfill_loop
  797. mov eax, [esp + HEIGHT_DIFF]
  798. mov edi, [esp + UPLANE]
  799. // puvspace = (U32 *)UPlane
  800. mov esi, edi
  801. // puvprev = (U32 *)(UPlane - pitch)
  802. sub esi, [esp + PITCH_PARM]
  803. // for (j = height_diff; j > 0; j -= 2)
  804. Lheight_ufill_loop:
  805. mov ebx, [esp + WIDTHX16]
  806. // for (i = widthx16; i>0; i -= 8)
  807. Lheight_ufill_row:
  808. // *puvspace++ = *puvprev++
  809. mov ecx, [esi]
  810. mov [edi], ecx
  811. lea esi, [esi + 4]
  812. lea edi, [edi + 4]
  813. sub ebx, 8
  814. jnz Lheight_ufill_row
  815. // puvspace += word_uvpitch_adj
  816. // puvprev += word_uvpitch_adj
  817. add esi, [esp + BYTE_UVPITCH_ADJ]
  818. add edi, [esp + BYTE_UVPITCH_ADJ]
  819. sub eax, 2
  820. jnz Lheight_ufill_loop
  821. mov eax, [esp + HEIGHT_DIFF]
  822. mov edi, [esp + VPLANE]
  823. // puvspace = (U32 *)VPlane
  824. mov esi, edi
  825. // puvprev = (U32 *)(VPlane - pitch)
  826. sub esi, [esp + PITCH_PARM]
  827. // for (j = height_diff; j > 0; j -= 2)
  828. Lheight_vfill_loop:
  829. mov ebx, [esp + WIDTHX16]
  830. // for (i = widthx16; i>0; i -= 8)
  831. Lheight_vfill_row:
  832. // *puvspace++ = *puvprev++
  833. mov ecx, [esi]
  834. mov [edi], ecx
  835. lea esi, [esi + 4]
  836. lea edi, [edi + 4]
  837. sub ebx, 8
  838. jnz Lheight_vfill_row
  839. // puvspace += word_uvpitch_adj
  840. // puvprev += word_uvpitch_adj
  841. add esi, [esp + BYTE_UVPITCH_ADJ]
  842. add edi, [esp + BYTE_UVPITCH_ADJ]
  843. sub eax, 2
  844. jnz Lheight_vfill_loop
  845. Lno_height_diff:
  846. // if (stretch)
  847. mov esi, [esp + PYPREV]
  848. cmp DWORD PTR [esp + STRETCH], 0
  849. je L19
  850. // for (i = OutputWidth; i > 0; i -= 4)
  851. // assign (esi, pyprev)
  852. // assign (edi, pyspace)
  853. // assign (ebp, i)
  854. mov ebp, [esp + OUTPUT_WIDTH]
  855. mov edi, [esp + PYSPACE]
  856. L18:
  857. mov ecx, [esi]
  858. lea esi, [esi + 4]
  859. mov [edi], ecx
  860. lea edi, [edi + 4]
  861. sub ebp, 4
  862. jnz L18
  863. // kill (esi, pyprev)
  864. // kill (edi, pyspace)
  865. // kill (ebp, i)
  866. L19:
  867. add esp, LOCALSIZE
  868. pop edi
  869. pop esi
  870. pop ebx
  871. pop ebp
  872. ret
  873. }
  874. }
  875. #undef LOCALSIZE
  876. #undef PITCH_PARM
  877. #undef VPLANE
  878. #undef UPLANE
  879. #undef YPLANE
  880. #undef LP_INPUT
  881. #undef OUTPUT_HEIGHT_WORD
  882. #undef OUTPUT_WIDTH_WORD
  883. #undef LPBI_INPUT
  884. #undef OUTPUT_WIDTH
  885. #undef PYPREV
  886. #undef PYSPACE
  887. #undef PYNEXT
  888. #undef PUVPREV
  889. #undef PUVSPACE
  890. #undef LOOP_I
  891. #undef LOOP_J
  892. #undef LOOP_K
  893. #undef BACK_TWO_LINES
  894. #undef WIDTHX16
  895. #undef HEIGHTX16
  896. #undef WIDTH_DIFF
  897. #undef HEIGHT_DIFF
  898. #undef WIDTH_ADJ
  899. #undef HEIGHT_ADJ
  900. #undef STRETCH
  901. #undef ASPECT
  902. #undef LUMA_ITERS
  903. #undef MARK
  904. #undef BYTE_YPITCH_ADJ
  905. #undef BYTE_UVPITCH_ADJ
  906. #endif // } H263P