/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/

#include "precomp.h"

#ifdef H263P // {

//
// For the P5 versions, the strategy is to compute the Y value for an odd RGB value
// followed by computing the Y value for the corresponding even RGB value. The registers
// are then set with the proper values to compute U and V values for the even RGB
// value. This avoids repeating the shifting and masking needed to extract the Red,
// Green and Blue components.
//

/*****************************************************************************
*
* H26X_BGR32toYUV12()
*
* Convert from BGR32 to YUV12 (YCrCb 4:2:0) and copy to destination memory
* with pitch defined by the constant PITCH. The input data is stored in
* the order B,G,R,B,G,R...
*
*/
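// For orientation, the table-driven conversion below corresponds roughly to the
// per-pixel arithmetic sketched here. This is an illustrative sketch only: the
// actual coefficients, scaling and rounding live in the BYUV/GYUV/RYUV tables
// defined elsewhere, and the BT.601-style constants used below are assumptions
// rather than values taken from this file.
#if 0 // { illustrative sketch, not part of the build
static void Sketch_BGR32toYCrCb(U32 bgr, U8 *y, U8 *u, U8 *v)
{
    int b = (bgr      ) & 0xFF;
    int g = (bgr >>  8) & 0xFF;
    int r = (bgr >> 16) & 0xFF;

    // Luma is produced for every pixel; chroma is only stored for one pixel of
    // each 2x2 block (4:2:0), which is what the (k&1) tests and the
    // UPlane/VPlane increments in the real code below arrange.
    *y = (U8)((( 66*r + 129*g +  25*b + 128) >> 8) + 16);
    if (u) *u = (U8)(((-38*r -  74*g + 112*b + 128) >> 8) + 128);
    if (v) *v = (U8)(((112*r -  94*g -  18*b + 128) >> 8) + 128);
}
#endif // } illustrative sketch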
#if 0 // { 0
void C_H26X_BGR32toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    WORD OutputWidth,
    WORD OutputHeight,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    const int pitch)
{
    int tm1, tm2;
    int t1, t2, t3, t4;

    C_RGB_COLOR_CONVERT_INIT

    // This assignment statement is here simply to avoid a warning message.
    t = t;

    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = OutputWidth; i > 0; i -= 4, YPlane += 4) {
                tm1 = *pnext++;
                t1 = (BYUV[(tm1>>1)&0x7F].YU +
                      GYUV[(tm1>>9)&0x7F].YU +
                      RYUV[(tm1>>17)&0x7F].YU);
                tm = *pnext++;
                t2 = (BYUV[(tm>>1)&0x7F].YU +
                      GYUV[(tm>>9)&0x7F].YU +
                      RYUV[(tm>>17)&0x7F].YU);
                tm2 = *pnext++;
                t3 = (BYUV[(tm2>>1)&0x7F].YU +
                      GYUV[(tm2>>9)&0x7F].YU +
                      RYUV[(tm2>>17)&0x7F].YU);
                tm = *pnext++;
                t4 = (BYUV[(tm>>1)&0x7F].YU +
                      GYUV[(tm>>9)&0x7F].YU +
                      RYUV[(tm>>17)&0x7F].YU);
                *(U32 *)YPlane =
                    (((t1+0x800)>>8)&0xFF) |
                    ((t2+0x800)&0xFF00) |
                    (((t3+0x800)<<8)&0xFF0000) |
                    (((t4+0x800)<<16)&0xFF000000);
                if (0 == (k&1)) {
                    *(U16 *)UPlane =
                        ((t1+0x40000000)>>24) |
                        (((t2+0x40000000)>>16)&0xFF00);
                    t1 = (BYUV[(tm1>>1)&0x7F].V +
                          GYUV[(tm1>>9)&0x7F].V +
                          RYUV[(tm1>>17)&0x7F].V);
                    t2 = (BYUV[(tm2>>1)&0x7F].V +
                          GYUV[(tm2>>9)&0x7F].V +
                          RYUV[(tm2>>17)&0x7F].V);
                    *(U16 *)VPlane =
                        ((t1+0x4000)>>8) |
                        ((t2+0x4000)&0xFF00);
                    UPlane += 2; VPlane += 2;
                }
            }
            // The next two cases are mutually exclusive.
            // If there is a width_diff there cannot be a stretch and
            // if there is a stretch, there cannot be a width_diff.
            C_WIDTH_FILL
            if (stretch && (0 == k) && j) {
                for (i = OutputWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }
            pnext += BackTwoLines;
            YPlane += byte_ypitch_adj;
            // Increment after even lines.
            if (0 == (k&1)) {
                UPlane += byte_uvpitch_adj;
                VPlane += byte_uvpitch_adj;
            }
        } // end of for k
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j
    // The next two cases are mutually exclusive.
    // If there is a height_diff there cannot be a stretch and
    // if there is a stretch, there cannot be a height_diff.
    C_HEIGHT_FILL
    if (stretch) {
        for (i = OutputWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of C_H26X_BGR32toYUV12()
#endif // } 0
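// The BYUV/GYUV/RYUV tables used above and in the assembly below are defined
// elsewhere in the codec. The sketch that follows only illustrates the shape
// implied by the code: 7-bit indices ((component >> 1) & 0x7F), 8 bytes per
// entry (the [table + index*8] addressing), a packed .YU member so a single
// add accumulates the Y and U sums at once, and a separate .V member. The
// struct name and helper here are assumptions, not taken from this file.
#if 0 // { illustrative sketch, not part of the build
typedef struct {
    U32 YU;   // packed Y (low half) and U (high half) contributions
    U32 V;    // V contribution
} YUVLOOKUP_SKETCH;

// Same expression as the C reference above, written against the sketched type.
static int Sketch_LumaFromBGR32(U32 pel, const YUVLOOKUP_SKETCH *RT,
                                const YUVLOOKUP_SKETCH *GT,
                                const YUVLOOKUP_SKETCH *BT)
{
    U32 t = BT[(pel >>  1) & 0x7F].YU +
            GT[(pel >>  9) & 0x7F].YU +
            RT[(pel >> 17) & 0x7F].YU;
    return (int)(((t + 0x800) >> 8) & 0xFF);
}
#endif // } illustrative sketch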
__declspec(naked)
void P5_H26X_BGR32toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    WORD OutputWidth,
    WORD OutputHeight,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
//  | pitch             | +136
//  | VPlane            | +132
//  | UPlane            | +128
//  | YPlane            | +124
//  | lpInput           | +120
//  | OutputHeight      | +116
//  | OutputWidth       | +112
//  | lpbiInput         | +108
//  ----------------------------
//  | return addr       | +104
//  | saved ebp         | +100
//  | saved ebx         | + 96
//  | saved esi         | + 92
//  | saved edi         | + 88
//  | output_width      | + 84
//  | pyprev            | + 80
//  | pyspace           | + 76
//  | pynext            | + 72
//  | puvprev           | + 68
//  | puvspace          | + 64
//  | i                 | + 60
//  | j                 | + 56
//  | k                 | + 52
//  | BackTwoLines      | + 48
//  | widthx16          | + 44
//  | heightx16         | + 40
//  | width_diff        | + 36
//  | height_diff       | + 32
//  | width_adj         | + 28
//  | height_adj        | + 24
//  | stretch           | + 20
//  | aspect            | + 16
//  | LumaIters         | + 12
//  | mark              | +  8
//  | byte_ypitch_adj   | +  4
//  | byte_uvpitch_adj  | +  0

#define LOCALSIZE           88

#define PITCH_PARM          136
#define VPLANE              132
#define UPLANE              128
#define YPLANE              124
#define LP_INPUT            120
#define OUTPUT_HEIGHT_WORD  116
#define OUTPUT_WIDTH_WORD   112
#define LPBI_INPUT          108

#define OUTPUT_WIDTH         84
#define PYPREV               80
#define PYSPACE              76
#define PYNEXT               72
#define PUVPREV              68
#define PUVSPACE             64
#define LOOP_I               60
#define LOOP_J               56
#define LOOP_K               52
#define BACK_TWO_LINES       48
#define WIDTHX16             44
#define HEIGHTX16            40
#define WIDTH_DIFF           36
#define HEIGHT_DIFF          32
#define WIDTH_ADJ            28
#define HEIGHT_ADJ           24
#define STRETCH              20
#define ASPECT               16
#define LUMA_ITERS           12
#define MARK                  8
#define BYTE_YPITCH_ADJ       4
#define BYTE_UVPITCH_ADJ      0
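// These offsets follow from the function being __declspec(naked): on entry the
// return address is at [esp], the hand-written prologue below pushes four
// callee-save registers (16 bytes) and reserves LOCALSIZE (88) bytes of locals,
// so the first argument ends up at esp + 88 + 16 + 4 = 108 (LPBI_INPUT), with
// each later argument one dword higher (the WORD arguments occupy full dword
// slots under the 32-bit calling convention).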
_asm {

    push ebp
    push ebx
    push esi
    push edi
    sub esp, LOCALSIZE

    // int width_diff = 0
    // int height_diff = 0
    // int width_adj = 0
    // int height_adj = 0
    // int stretch = 0
    // int aspect = 0
    xor eax, eax
    mov [esp + WIDTH_DIFF], eax
    mov [esp + HEIGHT_DIFF], eax
    mov [esp + WIDTH_ADJ], eax
    mov [esp + HEIGHT_ADJ], eax
    mov [esp + STRETCH], eax
    mov [esp + ASPECT], eax

    // int LumaIters = 1
    inc eax
    mov [esp + LUMA_ITERS], eax

    // int mark = OutputHeight
    // int output_width = OutputWidth
    // int byte_ypitch_adj = pitch - OutputWidth
    // int byte_uvpitch_adj = pitch - (OutputWidth >> 1)
    xor ebx, ebx
    mov bx, [esp + OUTPUT_HEIGHT_WORD]
    mov [esp + MARK], ebx
    mov bx, [esp + OUTPUT_WIDTH_WORD]
    mov [esp + OUTPUT_WIDTH], ebx
    mov ecx, [esp + PITCH_PARM]
    mov edx, ecx
    sub ecx, ebx
    mov [esp + BYTE_YPITCH_ADJ], ecx
    shr ebx, 1
    sub edx, ebx
    mov [esp + BYTE_UVPITCH_ADJ], edx
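    // After a converted row, YPlane has advanced OutputWidth bytes and
    // UPlane/VPlane have advanced OutputWidth/2 bytes, so adding these
    // adjustments moves each destination pointer to the start of its next
    // row in a surface whose stride is 'pitch'.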
    // if (lpbiInput->biHeight > OutputHeight)
    mov ebx, [esp + LPBI_INPUT]
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
    xor edx, edx
    mov dx, [esp + OUTPUT_HEIGHT_WORD]
    cmp ecx, edx
    jle Lno_stretch

    // for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) LumaIters += 4
    xor ecx, ecx
Lrepeat48:
    lea ecx, [ecx + 4]
    sub edx, 48
    jnz Lrepeat48
    mov [esp + LUMA_ITERS], ecx

    // aspect = LumaIters
    mov [esp + ASPECT], ecx

    // width_adj = (lpbiInput->biWidth - OutputWidth) >> 1
    // width_adj *= lpbiInput->biBitCount
    // width_adj >>= 3
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
    mov edx, [esp + OUTPUT_WIDTH]
    sub ecx, edx
    shr ecx, 1
    xor edx, edx
    mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
    imul ecx, edx
    shr ecx, 3
    mov [esp + WIDTH_ADJ], ecx

    // height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
    xor edx, edx
    mov dx, [esp + OUTPUT_HEIGHT_WORD]
    sub ecx, edx
    add ecx, [esp + ASPECT]
    shr ecx, 1
    mov [esp + HEIGHT_ADJ], ecx

    // stretch = 1
    // mark = 11
    mov ecx, 1
    mov edx, 11
    mov [esp + STRETCH], ecx
    mov [esp + MARK], edx
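    // In the stretch case, each outer (j) iteration below converts mark = 11
    // source rows and later inserts one interpolated row, so 11 input rows
    // become 12 output rows; LumaIters (OutputHeight/12 for the multiple-of-48
    // heights the loop above handles) makes the totals come out to OutputHeight.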
    jmp Lif_done

Lno_stretch:
    // widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF
    // width_diff = widthx16 - OutputWidth
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
    add ecx, 00FH
    and ecx, 0FFFFFFF0H
    mov [esp + WIDTHX16], ecx
    mov edx, [esp + OUTPUT_WIDTH]
    sub ecx, edx
    mov [esp + WIDTH_DIFF], ecx

    // byte_ypitch_adj -= width_diff
    mov edx, [esp + BYTE_YPITCH_ADJ]
    sub edx, ecx
    mov [esp + BYTE_YPITCH_ADJ], edx

    // byte_uvpitch_adj -= (width_diff >> 1)
    mov edx, [esp + BYTE_UVPITCH_ADJ]
    shr ecx, 1
    sub edx, ecx
    mov [esp + BYTE_UVPITCH_ADJ], edx

    // heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF
    // height_diff = heightx16 - OutputHeight
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
    add ecx, 00FH
    and ecx, 0FFFFFFF0H
    mov [esp + HEIGHTX16], ecx
    xor edx, edx
    mov dx, [esp + OUTPUT_HEIGHT_WORD]
    sub ecx, edx
    mov [esp + HEIGHT_DIFF], ecx

Lif_done:
    // BackTwoLines = -(lpbiInput->biWidth + OutputWidth);
    // BackTwoLines *= lpbiInput->biBitCount
    // BackTwoLines >>= 3
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
    mov edx, [esp + OUTPUT_WIDTH]
    add ecx, edx
    neg ecx
    xor edx, edx
    mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
    imul ecx, edx
    sar ecx, 3
    mov [esp + BACK_TWO_LINES], ecx

    // pnext = (U32 *)(lpInput +
    //                 (((lpbiInput->biWidth * lpbiInput->biBitCount) >> 3)) *
    //                  ((OutputHeight - aspect - 1) + height_adj)) +
    //                 width_adj)
    // assign (esi, pnext)
    mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
    xor edx, edx
    mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
    imul ecx, edx
    shr ecx, 3
    xor edx, edx
    mov dx, [esp + OUTPUT_HEIGHT_WORD]
    sub edx, [esp + ASPECT]
    dec edx
    add edx, [esp + HEIGHT_ADJ]
    imul ecx, edx
    add ecx, [esp + WIDTH_ADJ]
    add ecx, [esp + LP_INPUT]
    mov esi, ecx

    // assign (edi, YPlane)
    mov edi, [esp + YPLANE]

    // for (j = 0; j < LumaIters; j++)
    xor eax, eax
    mov [esp + LOOP_J], eax

    // for (k = 0; k < mark; k++)
L4:
    xor eax, eax
    mov [esp + LOOP_K], eax

    // for (i = OutputWidth; i > 0; i -= 4, pnext += 16)
L5:
    mov eax, [esp + OUTPUT_WIDTH]
    mov [esp + LOOP_I], eax
    // This jump is here to make sure the following loop starts in the U pipe
    jmp L6
L6:
    // ---------------------
    // |    | R1 | G1 | B1 |  pnext[0]
    // ---------------------
    // |    | R2 | G2 | B2 |  pnext[1]
    // ---------------------
    // |    | R3 | G3 | B3 |  pnext[2]
    // ---------------------
    // |    | R4 | G4 | B4 |  pnext[3]
    // ---------------------
    // t0 = pnext[0]
    // t1 = pnext[1]
    // t = ( BYUV[(t1>> 1)&0x7F].YU +
    //       GYUV[(t1>> 9)&0x7F].YU +
    //       RYUV[(t1>>17)&0x7F].YU )
    // *(YPlane+1) = ((t>>8)+8)
    // t = ( BYUV[(t0>> 1)&0x7F].YU +
    //       GYUV[(t0>> 9)&0x7F].YU +
    //       RYUV[(t0>>17)&0x7F].YU )
    // *YPlane = ((t>>8)+8)
    // assign(eax: B2,Y1,Y2,U)
    // assign(ebx: B1,V)
    // assign(ecx: G2,G1)
    // assign(edx: R2,R1)
    // assign(ebp: B1)
    // 1
    mov ebx, [esi]
    mov ecx, [esi + 4]
    // 2
    mov eax, ecx
    mov edx, ecx
    // 3
    shr eax, 1
    and ecx, 0xFE00
    // 4
    shr ecx, 9
    and eax, 0x7F
    // 5
    shr edx, 17
    nop
    // 6
    mov eax, [BYUV+eax*8].YU
    and edx, 0x7F
    // 7
    add eax, [GYUV+ecx*8].YU
    mov ecx, ebx
    // 8
    add eax, [RYUV+edx*8].YU
    mov edx, ebx
    // 9
    shr ebx, 1
    add eax, 0x800
    // 10
    sar eax, 8
    and ecx, 0xFE00
    // 11
    shr ecx, 9
    and ebx, 0x7F
    // 12
    shr edx, 17
    mov [edi + 1], al
    // 13
    mov eax, [BYUV+ebx*8].YU
    and edx, 0x7F
    // 14
    add eax, [GYUV+ecx*8].YU
    mov ebp, ebx
    // 15
    add eax, [RYUV+edx*8].YU
    nop
    // 16
    sar eax, 8
    mov ebx, [esp + LOOP_K]
    // 17
    add eax, 8
    and ebx, 1
    // 18
    mov [edi], al
    jnz L9

    // At this point, ebp: B1, ecx: G1, edx: R1
    // t0 = pnext[0]
    // *UPlane++ = ((t>>24)+64)
    // t = ( RYUV[(t0>>17)&0x7F].V +
    //       GYUV[(t0>> 9)&0x7F].V +
    //       BYUV[(t0>> 1)&0x7F].V )
    // *VPlane++ = ((t>>8)+64)
    // 19
    mov ebx, [RYUV+edx*8].V
    mov edx, [esp + UPLANE]
    // 20
    sar eax, 16
    add ebx, [GYUV+ecx*8].V
    // 21
    add eax, 64
    add ebx, [BYUV+ebp*8].V
    // 22
    mov [edx], al
    inc edx
    // 23
    mov [esp + UPLANE], edx
    mov edx, [esp + VPLANE]
    // 24
    sar ebx, 8
    inc edx
    // 25
    add ebx, 64
    mov [esp + VPLANE], edx
    // 26
    mov [edx - 1], bl
    nop
L9:
    // ---------------------
    // |    | R1 | G1 | B1 |  pnext[0]
    // ---------------------
    // |    | R2 | G2 | B2 |  pnext[1]
    // ---------------------
    // |    | R3 | G3 | B3 |  pnext[2]
    // ---------------------
    // |    | R4 | G4 | B4 |  pnext[3]
    // ---------------------
    // t2 = pnext[2]
    // t3 = pnext[3]
    // t = ( BYUV[(t3>> 1)&0x7F].YU +
    //       GYUV[(t3>> 9)&0x7F].YU +
    //       RYUV[(t3>>17)&0x7F].YU )
    // *(YPlane+3) = ((t>>8)+8)
    // t = ( BYUV[(t2>> 1)&0x7F].YU +
    //       GYUV[(t2>> 9)&0x7F].YU +
    //       RYUV[(t2>>17)&0x7F].YU )
    // *(YPlane+2) = ((t>>8)+8)
    // YPlane += 4
    // assign(eax: B4,Y3,Y4,U)
    // assign(ebx: B3,V)
    // assign(ecx: G4,G3)
    // assign(edx: R4,R3)
    // assign(ebp: B3)
    // 27
    mov ebx, [esi + 8]
    mov ecx, [esi + 12]
    // 28
    mov eax, ecx
    mov edx, ecx
    // 29
    shr eax, 1
    and ecx, 0xFE00
    // 30
    shr ecx, 9
    and eax, 0x7F
    // 31
    shr edx, 17
    nop
    // 32
    mov eax, [BYUV+eax*8].YU
    and edx, 0x7F
    // 33
    add eax, [GYUV+ecx*8].YU
    mov ecx, ebx
    // 34
    add eax, [RYUV+edx*8].YU
    mov edx, ebx
    // 35
    shr ebx, 1
    add eax, 0x800
    // 36
    sar eax, 8
    and ebx, 0x7F
    // 37
    shr ecx, 9
    mov [edi + 3], al
    // 38
    shr edx, 17
    and ecx, 0x7F
    // 39
    mov eax, [BYUV+ebx*8].YU
    and edx, 0x7F
    // 40
    add eax, [GYUV+ecx*8].YU
    mov ebp, ebx
    // 41
    add eax, [RYUV+edx*8].YU
    nop
    // 42
    sar eax, 8
    mov ebx, [esp + LOOP_K]
    // 43
    add eax, 8
    and ebx, 1
    // 44
    mov [edi + 2], al
    jnz L16
    // At this point, ebp: B3, ecx: G3, edx: R3
    // t2 = pnext[2]
    // *UPlane++ = ((t>>24)+64)
    // t = ( RYUV[(t2>>17)&0x7F].V +
    //       GYUV[(t2>> 9)&0x7F].V +
    //       BYUV[(t2>> 1)&0x7F].V )
    // *VPlane++ = ((t>>8)+64)
    // 45
    mov ebx, [RYUV+edx*8].V
    mov edx, [esp + UPLANE]
    // 46
    sar eax, 16
    add ebx, [GYUV+ecx*8].V
    // 47
    add eax, 64
    add ebx, [BYUV+ebp*8].V
    // 48
    mov [edx], al
    inc edx
    // 49
    mov [esp + UPLANE], edx
    mov edx, [esp + VPLANE]
    // 50
    sar ebx, 8
    inc edx
    // 51
    add ebx, 64
    mov [esp + VPLANE], edx
    // 52
    mov [edx - 1], bl
    nop
L16:
    // 53
    mov eax, [esp + LOOP_I]
    lea esi, [esi + 16]
    // 54
    sub eax, 4
    lea edi, [edi + 4]
    // 55
    mov [esp + LOOP_I], eax
    jnz L6

    // Assembler version of C_WIDTH_DIFF
    // if (width_diff)
    mov eax, [esp + WIDTH_DIFF]
    mov edx, eax
    test eax, eax
    jz Lno_width_diff

    // tm = (*(YPlane-1)) << 24
    // tm |= (tm>>8) | (tm>>16) | (tm>>24)
    mov bl, [edi - 1]
    shl ebx, 24
    mov ecx, ebx
    shr ebx, 8
    or ecx, ebx
    shr ebx, 8
    or ecx, ebx
    shr ebx, 8
    or ecx, ebx
    // *(U32 *)YPlane = tm
    mov [edi], ecx
    // if ((width_diff-4) > 0)
    sub eax, 4
    jz Lupdate_YPlane
    // *(U32 *)(YPlane + 4) = tm
    mov [edi + 4], ecx
    sub eax, 4
    // if ((width_diff-8) > 0)
    jz Lupdate_YPlane
    // *(U32 *)(YPlane + 8) = tm
    mov [edi + 8], ecx
Lupdate_YPlane:
    // YPlane += width_diff
    lea edi, [edi + edx]

    // if (0 == (k&1))
    mov eax, [esp + LOOP_K]
    test eax, 1
    jnz Lno_width_diff

    // t8u = *(UPlane-1)
    // t8v = *(VPlane-1)
    // *UPlane++ = t8u
    // *UPlane++ = t8u
    // *VPlane++ = t8v
    // *VPlane++ = t8v
    mov ebp, edx
    mov eax, [esp + UPLANE]
    mov ebx, [esp + VPLANE]
    mov cl, [eax - 1]
    mov ch, [ebx - 1]
    mov [eax], cl
    mov [eax + 1], cl
    mov [ebx], ch
    mov [ebx + 1], ch
    // if ((width_diff-4) > 0)
    sub ebp, 4
    jz Lupdate_UVPlane
    // *UPlane++ = t8u
    // *UPlane++ = t8u
    // *VPlane++ = t8v
    // *VPlane++ = t8v
    mov [eax + 2], cl
    mov [eax + 3], cl
    mov [ebx + 2], ch
    mov [ebx + 3], ch
    // if ((width_diff-8) > 0)
    sub ebp, 4
    jz Lupdate_UVPlane
    // *UPlane++ = t8u
    // *UPlane++ = t8u
    // *VPlane++ = t8v
    // *VPlane++ = t8v
    mov [eax + 4], cl
    mov [eax + 5], cl
    mov [ebx + 4], ch
    mov [ebx + 5], ch
Lupdate_UVPlane:
    shr edx, 1
    lea eax, [eax + edx]
    mov [esp + UPLANE], eax
    lea ebx, [ebx + edx]
    mov [esp + VPLANE], ebx
Lno_width_diff:

    // if (stretch && (0 == k) && j)
    mov eax, [esp + STRETCH]
    test eax, eax
    jz L21
    mov eax, [esp + LOOP_K]
    test eax, eax
    jnz L21
    mov eax, [esp + LOOP_J]
    test eax, eax
    jz L21

    // spill YPlane ptr
    mov [esp + YPLANE], edi
    nop

    // for (i = OutputWidth; i > 0; i -= 8)
    // assign (ebx, pyprev)
    // assign (ecx, t)
    // assign (edx, pynext)
    // assign (edi, pyspace)
    // assign (ebp, i)
    // make sure offsets are such that there are no bank conflicts here
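    // (On the Pentium, a pair of memory accesses can issue in the same cycle
    // only if they hit different banks of the data cache, which are selected
    // by low-order address bits; spacing the locals as above is presumably
    // what keeps these back-to-back loads in distinct banks.)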
    mov ebx, [esp + PYPREV]
    mov edi, [esp + PYSPACE]
    mov edx, [esp + PYNEXT]
    mov ebp, [esp + OUTPUT_WIDTH]

    // t = (*pyprev++ & 0xFEFEFEFE) >> 1
    // t += (*pynext++ & 0xFEFEFEFE) >> 1
    // *pyspace++ = t
    // t = (*pyprev++ & 0xFEFEFEFE) >> 1
    // t += (*pynext++ & 0xFEFEFEFE) >> 1
    // *pyspace++ = t
L22:
    // 1
    mov eax, [ebx]
    lea ebx, [ebx + 4]
    // 2
    mov ecx, [edx]
    lea edx, [edx + 4]
    // 3
    shr ecx, 1
    and eax, 0xFEFEFEFE
    // 4
    shr eax, 1
    and ecx, 0x7F7F7F7F
    // 5
    add eax, ecx
    mov ecx, [ebx]
    // 6
    shr ecx, 1
    mov [edi], eax
    // 7
    mov eax, [edx]
    and ecx, 0x7F7F7F7F
    // 8
    shr eax, 1
    lea edi, [edi + 4]
    // 9
    and eax, 0x7F7F7F7F
    lea ebx, [ebx + 4]
    // 10
    lea edx, [edx + 4]
    add eax, ecx
    // 11
    mov [edi], eax
    lea edi, [edi + 4]
    // 12
    sub ebp, 8
    jnz L22
    // kill (ebx, pyprev)
    // kill (ecx, t)
    // kill (edx, pynext)
    // kill (edi, pyspace)
    // kill (ebp, i)

    // restore YPlane
    mov edi, [esp + YPLANE]

    // pnext += BackTwoLines
L21:
    add esi, [esp + BACK_TWO_LINES]

    // YPlane += byte_ypitch_adj;
    add edi, [esp + BYTE_YPITCH_ADJ]

    // if (0 == (k&1))
    mov eax, [esp + LOOP_K]
    and eax, 1
    jnz L23
    // UPlane += byte_uvpitch_adj;
    // VPlane += byte_uvpitch_adj;
    mov eax, [esp + BYTE_UVPITCH_ADJ]
    add [esp + UPLANE], eax
    add [esp + VPLANE], eax
L23:
    inc DWORD PTR [esp + LOOP_K]
    mov eax, [esp + LOOP_K]
    cmp eax, [esp + MARK]
    jl L5

    // if (stretch)
    cmp DWORD PTR [esp + STRETCH], 0
    je L24
    // pyprev = YPlane - pitch
    mov eax, edi
    sub eax, [esp + PITCH_PARM]
    mov [esp + PYPREV], eax
    // pyspace = YPlane
    mov [esp + PYSPACE], edi
    // pynext = (YPlane += pitch)
    add edi, [esp + PITCH_PARM]
    mov [esp + PYNEXT], edi
L24:
    inc DWORD PTR [esp + LOOP_J]
    mov eax, [esp + LOOP_J]
    cmp eax, [esp + LUMA_ITERS]
    jl L4
    // kill (esi, pnext)
    // kill (edi, YPlane)

    // ASM version of C_HEIGHT_FILL
    // if (height_diff)
    mov eax, [esp + HEIGHT_DIFF]
    test eax, eax
    jz Lno_height_diff

    // pyspace = (U32 *)YPlane
    mov esi, edi
    // pyprev = (U32 *)(YPlane - pitch)
    sub esi, [esp + PITCH_PARM]
    // for (j = height_diff; j > 0; j--)
Lheight_yfill_loop:
    mov ebx, [esp + WIDTHX16]
    // for (i = widthx16; i > 0; i -= 4)
Lheight_yfill_row:
    // *pyspace++ = *pyprev++
    mov ecx, [esi]
    lea esi, [esi + 4]
    mov [edi], ecx
    lea edi, [edi + 4]
    sub ebx, 4
    jnz Lheight_yfill_row
    // pyspace += word_ypitch_adj
    // pyprev += word_ypitch_adj
    add esi, [esp + BYTE_YPITCH_ADJ]
    add edi, [esp + BYTE_YPITCH_ADJ]
    dec eax
    jnz Lheight_yfill_loop

    mov eax, [esp + HEIGHT_DIFF]
    mov edi, [esp + UPLANE]
    // puvspace = (U32 *)UPlane
    mov esi, edi
    // puvprev = (U32 *)(UPlane - pitch)
    sub esi, [esp + PITCH_PARM]
    // for (j = height_diff; j > 0; j -= 2)
Lheight_ufill_loop:
    mov ebx, [esp + WIDTHX16]
    // for (i = widthx16; i > 0; i -= 8)
Lheight_ufill_row:
    // *puvspace++ = *puvprev++
    mov ecx, [esi]
    mov [edi], ecx
    lea esi, [esi + 4]
    lea edi, [edi + 4]
    sub ebx, 8
    jnz Lheight_ufill_row
    // puvspace += word_uvpitch_adj
    // puvprev += word_uvpitch_adj
    add esi, [esp + BYTE_UVPITCH_ADJ]
    add edi, [esp + BYTE_UVPITCH_ADJ]
    sub eax, 2
    jnz Lheight_ufill_loop

    mov eax, [esp + HEIGHT_DIFF]
    mov edi, [esp + VPLANE]
    // puvspace = (U32 *)VPlane
    mov esi, edi
    // puvprev = (U32 *)(VPlane - pitch)
    sub esi, [esp + PITCH_PARM]
    // for (j = height_diff; j > 0; j -= 2)
Lheight_vfill_loop:
    mov ebx, [esp + WIDTHX16]
    // for (i = widthx16; i > 0; i -= 8)
Lheight_vfill_row:
    // *puvspace++ = *puvprev++
    mov ecx, [esi]
    mov [edi], ecx
    lea esi, [esi + 4]
    lea edi, [edi + 4]
    sub ebx, 8
    jnz Lheight_vfill_row
    // puvspace += word_uvpitch_adj
    // puvprev += word_uvpitch_adj
    add esi, [esp + BYTE_UVPITCH_ADJ]
    add edi, [esp + BYTE_UVPITCH_ADJ]
    sub eax, 2
    jnz Lheight_vfill_loop

Lno_height_diff:
    // if (stretch)
    mov esi, [esp + PYPREV]
    cmp DWORD PTR [esp + STRETCH], 0
    je L26

    // for (i = OutputWidth; i > 0; i -= 4)
    // assign (esi, pyprev)
    // assign (edi, pyspace)
    // assign (ebp, i)
    mov ebp, [esp + OUTPUT_WIDTH]
    mov edi, [esp + PYSPACE]
L25:
    mov ecx, [esi]
    lea esi, [esi + 4]
    mov [edi], ecx
    lea edi, [edi + 4]
    sub ebp, 4
    jnz L25
    // kill (esi, pyprev)
    // kill (edi, pyspace)
    // kill (ebp, i)
L26:
    add esp, LOCALSIZE

    pop edi
    pop esi
    pop ebx
    pop ebp

    ret

}
}
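// The stretch interpolation above averages two source rows four bytes at a
// time using the mask-and-shift idiom from the C reference near the top of
// this file. A stand-alone C rendering of that idiom follows (the helper name
// is made up for illustration):
#if 0 // { illustrative sketch, not part of the build
// Averages the four packed bytes of 'a' and 'b' in a single 32-bit operation.
// Clearing the low bit of every byte before the shift keeps a byte's
// shifted-out bit from spilling into its lower neighbour; the result is a
// truncating (non-rounding) average, matching the C and assembly loops above.
static U32 Sketch_AverageBytes(U32 a, U32 b)
{
    return ((a & 0xFEFEFEFE) >> 1) + ((b & 0xFEFEFEFE) >> 1);
}
#endif // } illustrative sketch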
#undef LOCALSIZE

#undef PITCH_PARM
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef OUTPUT_HEIGHT_WORD
#undef OUTPUT_WIDTH_WORD
#undef LPBI_INPUT

#undef OUTPUT_WIDTH
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef PUVPREV
#undef PUVSPACE
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef WIDTHX16
#undef HEIGHTX16
#undef WIDTH_DIFF
#undef HEIGHT_DIFF
#undef WIDTH_ADJ
#undef HEIGHT_ADJ
#undef STRETCH
#undef ASPECT
#undef LUMA_ITERS
#undef MARK
#undef BYTE_YPITCH_ADJ
#undef BYTE_UVPITCH_ADJ

#endif // } H263P