Leaked source code of Windows Server 2003


/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
#include "precomp.h"
#if defined(H263P) || defined(USE_BILINEAR_MSH26X) // {
//
// For the P5 versions, the strategy is to compute the Y value for an odd RGB value
// followed by computing the Y value for the corresponding even RGB value. The registers
// are then set with the proper values to compute U and V values for the even RGB
// value. This avoids repeating the shifting and masking needed to extract the Red,
// Green and Blue components.
//
/*****************************************************************************
*
* H26X_BGR24toYUV12()
*
* Convert from BGR24 to YUV12 (YCrCb 4:2:0) and copy to destination memory
* with pitch defined by the constant PITCH. The input data is stored in
* the order B,G,R,B,G,R...
*
*/
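//
// Illustrative sketch (not part of the original source): assuming BYUV, GYUV
// and RYUV are 128-entry lookup tables whose .YU member packs a component's Y
// contribution in the low bits and its U contribution in the high byte, whose
// .V member holds its V contribution, and that SHIFT_WIDTH is 8 as in the
// comments below, one even pixel would be converted roughly as follows. The
// helper name SketchConvertOnePixel is hypothetical.
//
#if 0 // { illustrative sketch
static void SketchConvertOnePixel(U32 bgr, U8 *y, U8 *u, U8 *v)
{
    // Index each table by the top 7 bits of the 8-bit B, G and R components.
    int t = (BYUV[(bgr >>  1) & 0x7F].YU +
             GYUV[(bgr >>  9) & 0x7F].YU +
             RYUV[(bgr >> 17) & 0x7F].YU);
    *y = (U8)((t >> SHIFT_WIDTH) + 8);   // luma, biased by +8
    *u = (U8)((t >> 24) + 64);           // chroma U from the packed high byte, biased by +64
    t = (RYUV[(bgr >> 17) & 0x7F].V +
         GYUV[(bgr >>  9) & 0x7F].V +
         BYUV[(bgr >>  1) & 0x7F].V);
    *v = (U8)((t >> SHIFT_WIDTH) + 64);  // chroma V, biased by +64
}
#endif // } illustrative sketch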
#if 0 // { 0
void C_H26X_BGR24toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    WORD OutputWidth,
    WORD OutputHeight,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    const int pitch)
{
    C_RGB_COLOR_CONVERT_INIT
    for (j = 0; j < LumaIters; j++) {
        for (k = 0; k < mark; k++) {
            for (i = OutputWidth; i > 0; i -= 4, pnext += 3) {
                tm = pnext[0];
                t = BYUV[tm>>25].YU;
                tm = pnext[1];
                t += (GYUV[(tm>>1)&0x7F].YU +
                      RYUV[(tm>>9)&0x7F].YU);
                *(YPlane+1) = (U8)((t>>SHIFT_WIDTH)+8);
                tm = pnext[0];
                t = (BYUV[(tm>>1)&0x7F].YU +
                     GYUV[(tm>>9)&0x7F].YU +
                     RYUV[(tm>>17)&0x7F].YU);
                *YPlane = (U8)((t>>SHIFT_WIDTH)+8);
                if (0 == (k&1)) {
                    *UPlane++ = (U8)((t>>24)+64);
                    t = (RYUV[(tm>>17)&0x7F].V +
                         GYUV[(tm>>9)&0x7F].V +
                         BYUV[(tm>>1)&0x7F].V);
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                }
                tm = pnext[2];
                t = (BYUV[(tm>>9)&0x7F].YU +
                     GYUV[(tm>>17)&0x7F].YU +
                     RYUV[tm>>25].YU);
                *(YPlane+3) = (U8)((t>>SHIFT_WIDTH)+8);
                tm = pnext[1];
                t = BYUV[(tm>>17)&0x7F].YU + GYUV[tm>>25].YU;
                tm = pnext[2];
                t += RYUV[(tm>>1)&0x7F].YU;
                *(YPlane+2) = (U8)((t>>SHIFT_WIDTH)+8);
                YPlane += 4;
                if (0 == (k&1)) {
                    *UPlane++ = (U8)((t>>24)+64);
                    t = RYUV[(tm>>1)&0x7F].V;
                    tm = pnext[1];
                    t += GYUV[tm>>25].V + BYUV[(tm>>17)&0x7F].V;
                    *VPlane++ = (U8)((t>>SHIFT_WIDTH)+64);
                }
            }
            // The next two cases are mutually exclusive.
            // If there is a width_diff there cannot be a stretch and
            // if there is a stretch, there cannot be a width_diff.
            C_WIDTH_FILL
            if (stretch && (0 == k) && j) {
                for (i = OutputWidth; i > 0; i -= 8) {
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                    tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
                    tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
                    *pyspace++ = tm;
                }
            }
            pnext += BackTwoLines;
            YPlane += byte_ypitch_adj;
            // Increment after even lines.
            if (0 == (k&1)) {
                UPlane += byte_uvpitch_adj;
                VPlane += byte_uvpitch_adj;
            }
        } // end of for k
        if (stretch) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            pynext = (U32 *)(YPlane += pitch);
        }
    } // end of for j
    // The next two cases are mutually exclusive.
    // If there is a height_diff there cannot be a stretch and
    // if there is a stretch, there cannot be a height_diff.
    C_HEIGHT_FILL
    if (stretch) {
        for (i = OutputWidth; i > 0; i -= 4) {
            *pyspace++ = *pyprev++;
        }
    }
} // end of C_H26X_BGR24toYUV12()
#endif // } 0
__declspec(naked)
void P5_H26X_BGR24toYUV12(
    LPBITMAPINFOHEADER lpbiInput,
    WORD OutputWidth,
    WORD OutputHeight,
    U8 *lpInput,
    U8 *YPlane,
    U8 *UPlane,
    U8 *VPlane,
    const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch             | +136
// | VPlane            | +132
// | UPlane            | +128
// | YPlane            | +124
// | lpInput           | +120
// | OutputHeight      | +116
// | OutputWidth       | +112
// | lpbiInput         | +108
// ----------------------------
// | return addr       | +104
// | saved ebp         | +100
// | saved ebx         | + 96
// | saved esi         | + 92
// | saved edi         | + 88
// | output_width      | + 84
// | pyprev            | + 80
// | pyspace           | + 76
// | pynext            | + 72
// | puvprev           | + 68
// | puvspace          | + 64
// | i                 | + 60
// | j                 | + 56
// | k                 | + 52
// | BackTwoLines      | + 48
// | widthx16          | + 44
// | heightx16         | + 40
// | width_diff        | + 36
// | height_diff       | + 32
// | width_adj         | + 28
// | height_adj        | + 24
// | stretch           | + 20
// | aspect            | + 16
// | LumaIters         | + 12
// | mark              | +  8
// | byte_ypitch_adj   | +  4
// | byte_uvpitch_adj  | +  0
#define LOCALSIZE 88
#define PITCH_PARM 136
#define VPLANE 132
#define UPLANE 128
#define YPLANE 124
#define LP_INPUT 120
#define OUTPUT_HEIGHT_WORD 116
#define OUTPUT_WIDTH_WORD 112
#define LPBI_INPUT 108
#define OUTPUT_WIDTH 84
#define PYPREV 80
#define PYSPACE 76
#define PYNEXT 72
#define PUVPREV 68
#define PUVSPACE 64
#define LOOP_I 60
#define LOOP_J 56
#define LOOP_K 52
#define BACK_TWO_LINES 48
#define WIDTHX16 44
#define HEIGHTX16 40
#define WIDTH_DIFF 36
#define HEIGHT_DIFF 32
#define WIDTH_ADJ 28
#define HEIGHT_ADJ 24
#define STRETCH 20
#define ASPECT 16
#define LUMA_ITERS 12
#define MARK 8
#define BYTE_YPITCH_ADJ 4
#define BYTE_UVPITCH_ADJ 0
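// Note: the offsets above follow from the prologue below - four register
// pushes (16 bytes) plus LOCALSIZE (88 bytes) of locals put the return
// address at [esp + 104], so the first argument, lpbiInput, lands at
// [esp + 108] and the remaining arguments follow at 4-byte intervals up to
// pitch at [esp + 136].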
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// int width_diff = 0
// int height_diff = 0
// int width_adj = 0
// int height_adj = 0
// int stretch = 0
// int aspect = 0
xor eax, eax
mov [esp + WIDTH_DIFF], eax
mov [esp + HEIGHT_DIFF], eax
mov [esp + WIDTH_ADJ], eax
mov [esp + HEIGHT_ADJ], eax
mov [esp + STRETCH], eax
mov [esp + ASPECT], eax
// int LumaIters = 1
inc eax
mov [esp + LUMA_ITERS], eax
// int mark = OutputHeight
// int output_width = OutputWidth
// int byte_ypitch_adj = pitch - OutputWidth
// int byte_uvpitch_adj = pitch - (OutputWidth >> 1)
xor ebx, ebx
mov bx, [esp + OUTPUT_HEIGHT_WORD]
mov [esp + MARK], ebx
mov bx, [esp + OUTPUT_WIDTH_WORD]
mov [esp + OUTPUT_WIDTH], ebx
mov ecx, [esp + PITCH_PARM]
mov edx, ecx
sub ecx, ebx
mov [esp + BYTE_YPITCH_ADJ], ecx
shr ebx, 1
sub edx, ebx
mov [esp + BYTE_UVPITCH_ADJ], edx
// if (lpbiInput->biHeight > OutputHeight)
mov ebx, [esp + LPBI_INPUT]
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
cmp ecx, edx
jle Lno_stretch
// for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) LumaIters += 4
xor ecx, ecx
Lrepeat48:
lea ecx, [ecx + 4]
sub edx, 48
jnz Lrepeat48
mov [esp + LUMA_ITERS], ecx
// aspect = LumaIters
mov [esp + ASPECT], ecx
// width_adj = (lpbiInput->biWidth - OutputWidth) >> 1
// width_adj *= lpbiInput->biBitCount
// width_adj >>= 3
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
mov edx, [esp + OUTPUT_WIDTH]
sub ecx, edx
shr ecx, 1
xor edx, edx
mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
imul ecx, edx
shr ecx, 3
mov [esp + WIDTH_ADJ], ecx
// height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
sub ecx, edx
add ecx, [esp + ASPECT]
shr ecx, 1
mov [esp + HEIGHT_ADJ], ecx
// stretch = 1
// mark = 11
mov ecx, 1
mov edx, 11
mov [esp + STRETCH], ecx
mov [esp + MARK], edx
jmp Lif_done
Lno_stretch:
// widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF
// width_diff = widthx16 - OutputWidth
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
add ecx, 00FH
and ecx, 0FFFFFFF0H
mov [esp + WIDTHX16], ecx
mov edx, [esp + OUTPUT_WIDTH]
sub ecx, edx
mov [esp + WIDTH_DIFF], ecx
// byte_ypitch_adj -= width_diff
mov edx, [esp + BYTE_YPITCH_ADJ]
sub edx, ecx
mov [esp + BYTE_YPITCH_ADJ], edx
// byte_uvpitch_adj -= (width_diff >> 1)
mov edx, [esp + BYTE_UVPITCH_ADJ]
shr ecx, 1
sub edx, ecx
mov [esp + BYTE_UVPITCH_ADJ], edx
// heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF
// height_diff = heightx16 - OutputHeight
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
add ecx, 00FH
and ecx, 0FFFFFFF0H
mov [esp + HEIGHTX16], ecx
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
sub ecx, edx
mov [esp + HEIGHT_DIFF], ecx
Lif_done:
// BackTwoLines = -(lpbiInput->biWidth + OutputWidth);
// BackTwoLines *= lpbiInput->biBitCount
// BackTwoLines >>= 3
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
mov edx, [esp + OUTPUT_WIDTH]
add ecx, edx
neg ecx
xor edx, edx
mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
imul ecx, edx
sar ecx, 3
mov [esp + BACK_TWO_LINES], ecx
// pnext = (U32 *)(lpInput +
// (((lpbiInput->biWidth * lpbiInput->biBitCount) >> 3)) *
// ((OutputHeight - aspect - 1) + height_adj)) +
// width_adj)
// assign (esi, pnext)
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
xor edx, edx
mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
imul ecx, edx
shr ecx, 3
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
sub edx, [esp + ASPECT]
dec edx
add edx, [esp + HEIGHT_ADJ]
imul ecx, edx
add ecx, [esp + WIDTH_ADJ]
add ecx, [esp + LP_INPUT]
mov esi, ecx
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
// for (k = 0; k < mark; k++)
L4:
xor eax, eax
mov [esp + LOOP_K], eax
// for (i = OutputWidth; i > 0; i -= 4, pnext += 12)
L5:
mov eax, [esp + OUTPUT_WIDTH]
mov [esp + LOOP_I], eax
// This jump is here to make sure the following loop starts in the U pipe
jmp L6
L6:
// ---------------------
// | B2 | R1 | G1 | B1 | pnext[0]
// ---------------------
// | G3 | B3 | R2 | G2 | pnext[1]
// ---------------------
// | R4 | G4 | B4 | R3 | pnext[2]
// ---------------------
// t0 = pnext[0]
// t1 = pnext[1]
// t = ( BYUV[t0>>25].YU +
// GYUV[(t1>> 1)&0x7F].YU +
// RYUV[(t1>> 9)&0x7F].YU )
// *(YPlane+1) = ((t>>8)+8)
// t = ( BYUV[(t0>> 1)&0x7F].YU +
// GYUV[(t0>> 9)&0x7F].YU +
// RYUV[(t0>>17)&0x7F].YU )
// *YPlane = ((t>>8)+8)
// assign(eax: B2,Y1,Y2,U)
// assign(ebx: B1,V)
// assign(ecx: G2,G1)
// assign(edx: R2,R1)
// assign(ebp: B1)
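// Note: the numbered comments below (// 1, // 2, ...) appear to mark the
// intended Pentium U/V-pipe pairings, two instructions per issue slot; the
// occasional nop fills a slot that would otherwise break pairing.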
// 1
mov eax, [esi]
mov ecx, [esi + 4]
// 2
mov ebx, eax
mov edx, ecx
// 3
shr eax, 25
and ecx, 0xFE
// 4
shr ecx, 1
and edx, 0xFE00
// 5
shr edx, 9
and ebx, 0xFEFEFE
// 6
mov eax, [BYUV+eax*8].YU
nop
// 7
add eax, [GYUV+ecx*8].YU
mov ecx, ebx
// 8
add eax, [RYUV+edx*8].YU
mov edx, ebx
// 9
sar eax, 8
and ebx, 0xFE
// 10
shr ebx, 1
add eax, 8
// 11
shr ecx, 9
mov [edi + 1], al
// 12
shr edx, 17
and ecx, 0x7F
// 13
mov eax, [BYUV+ebx*8].YU
and edx, 0x7F
// 14
add eax, [GYUV+ecx*8].YU
mov ebp, ebx
// 15
add eax, [RYUV+edx*8].YU
nop
// 16
sar eax, 8
mov ebx, [esp + LOOP_K]
// 17
add eax, 8
and ebx, 1
// 18
mov [edi], al
jnz L9
// At this point, ebp: B1, ecx: G1, edx: R1
// t0 = pnext[0]
// *UPlane++ = ((t>>24)+64)
// t = ( RYUV[(t0>>17)&0x7F].V +
// GYUV[(t0>> 9)&0x7F].V +
// BYUV[(t0>> 1)&0x7F].V )
// *VPlane++ = ((t>>8)+64)
// 19
mov ebx, [RYUV+edx*8].V
mov edx, [esp + UPLANE]
// 20
sar eax, 16
add ebx, [GYUV+ecx*8].V
// 21
add eax, 64
add ebx, [BYUV+ebp*8].V
// 22
mov [edx], al
inc edx
// 23
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 24
sar ebx, 8
inc edx
// 25
add ebx, 64
mov [esp + VPLANE], edx
// 26
mov [edx - 1], bl
nop
L9:
// ---------------------
// | B2 | R1 | G1 | B1 | pnext[0]
// ---------------------
// | G3 | B3 | R2 | G2 | pnext[1]
// ---------------------
// | R4 | G4 | B4 | R3 | pnext[2]
// ---------------------
// t1 = pnext[1]
// t2 = pnext[2]
// t = ( BYUV[(t2>> 9)&0x7F].YU +
// GYUV[(t2>>17)&0x7F].YU +
// RYUV[t2>>25].YU )
// *(YPlane+3) = ((t>>8)+8)
// t = ( BYUV[(t1>>17)&0x7F].YU +
// GYUV[t1>>25].YU +
// RYUV[(t2>> 1)&0x7F].YU )
// *(YPlane+2) = ((t>>8)+8)
// YPlane += 4
// assign(eax: B4,Y3,Y4,U)
// assign(ebx: R3,V)
// assign(ecx: G4,G3)
// assign(edx: R4/B3)
// assign(ebp: R3)
// 27
mov ebp, [esi + 4]
mov ebx, [esi + 8]
// 28
mov eax, ebx
mov ecx, ebx
// 29
shr eax, 9
mov edx, ebx
// 30
shr ecx, 17
and eax, 0x7F
// 31
shr edx, 25
and ecx, 0x7F
// 32
mov eax, [BYUV+eax*8].YU
nop
// 33
add eax, [GYUV+ecx*8].YU
and ebx, 0xFE
// 34
add eax, [RYUV+edx*8].YU
mov ecx, ebp
// 35
shr ebx, 1
add eax, 0x800
// 36
sar eax, 8
mov edx, ebp
// 37
shr edx, 17
mov [edi + 3], al
// 38
shr ecx, 25
and edx, 0x7F
// 39
mov eax, [RYUV+ebx*8].YU
mov ebp, ebx
// 40
add eax, [GYUV+ecx*8].YU
nop
// 41
add eax, [BYUV+edx*8].YU
nop
// 42
sar eax, 8
mov ebx, [esp + LOOP_K]
// 43
add eax, 8
and ebx, 1
// 44
mov [edi + 2], al
jnz L16
// At this point, ebp: R3, ecx: G3, edx: B3
// t1 = pnext[1]
// t2 = pnext[2]
// *UPlane++ = ((t>>16)+64)
// t = ( RYUV[(t2>> 1)&0x7F].V +
// GYUV[t1>>25].V +
// BYUV[(t1>>17)&0x7F].V )
// *VPlane++ = ((t>>8)+64)
// 45
mov ebx, [BYUV+edx*8].V
mov edx, [esp + UPLANE]
// 46
sar eax, 16
add ebx, [GYUV+ecx*8].V
// 47
add eax, 64
add ebx, [RYUV+ebp*8].V
// 48
mov [edx], al
inc edx
// 49
mov [esp + UPLANE], edx
mov edx, [esp + VPLANE]
// 50
sar ebx, 8
inc edx
// 51
add ebx, 64
mov [esp + VPLANE], edx
// 52
mov [edx - 1], bl
nop
L16:
// 53
mov eax, [esp + LOOP_I]
lea esi, [esi + 12]
// 54
sub eax, 4
lea edi, [edi + 4]
// 55
mov [esp + LOOP_I], eax
jnz L6
// Assembler version of C_WIDTH_DIFF
// if (width_diff)
mov eax, [esp + WIDTH_DIFF]
mov edx, eax
test eax, eax
jz Lno_width_diff
// tm = (*(YPlane-1)) << 24
// tm |= (tm>>8) | (tm>>16) | (tm>>24)
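// (The shifts below replicate the last real luma byte into all four bytes of
// tm, so the width_diff padding columns repeat the right-edge pixel; the
// chroma planes are padded the same way further down.)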
mov bl, [edi - 1]
shl ebx, 24
mov ecx, ebx
shr ebx, 8
or ecx, ebx
shr ebx, 8
or ecx, ebx
shr ebx, 8
or ecx, ebx
// *(U32 *)YPlane = tm
mov [edi], ecx
// if ((width_diff-4) > 0)
sub eax, 4
jz Lupdate_YPlane
// *(U32 *)(YPlane + 4) = tm
mov [edi + 4], ecx
sub eax, 4
// if ((width_diff-8) > 0)
jz Lupdate_YPlane
// *(U32 *)(YPlane + 8) = tm
mov [edi + 8], ecx
Lupdate_YPlane:
// YPlane += width_diff
lea edi, [edi + edx]
// if (0 == (k&1))
mov eax, [esp + LOOP_K]
test eax, 1
jnz Lno_width_diff
// t8u = *(UPlane-1)
// t8v = *(VPlane-1)
// *UPlane++ = t8u
// *UPlane++ = t8u
// *VPlane++ = t8v
// *VPlane++ = t8v
mov ebp, edx
mov eax, [esp + UPLANE]
mov ebx, [esp + VPLANE]
mov cl, [eax - 1]
mov ch, [ebx - 1]
mov [eax], cl
mov [eax + 1], cl
mov [ebx], ch
mov [ebx + 1], ch
// if ((width_diff-4) > 0)
sub ebp, 4
jz Lupdate_UVPlane
// *UPlane++ = t8u
// *UPlane++ = t8u
// *VPlane++ = t8v
// *VPlane++ = t8v
mov [eax + 2], cl
mov [eax + 3], cl
mov [ebx + 2], ch
mov [ebx + 3], ch
// if ((width_diff-8) > 0)
sub ebp, 4
jz Lupdate_UVPlane
// *UPlane++ = t8u
// *UPlane++ = t8u
// *VPlane++ = t8v
// *VPlane++ = t8v
mov [eax + 4], cl
mov [eax + 5], cl
mov [ebx + 4], ch
mov [ebx + 5], ch
Lupdate_UVPlane:
shr edx, 1
lea eax, [eax + edx]
mov [esp + UPLANE], eax
lea ebx, [ebx + edx]
mov [esp + VPLANE], ebx
Lno_width_diff:
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH]
test eax, eax
jz L21
mov eax, [esp + LOOP_K]
test eax, eax
jnz L21
mov eax, [esp + LOOP_J]
test eax, eax
jz L21
// spill YPlane ptr
mov [esp + YPLANE], edi
nop
// for (i = OutputWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV]
mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT]
mov ebp, [esp + OUTPUT_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
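// (Each dword holds four luma bytes. Clearing the low bit of every byte -
// with 0xFEFEFEFE before the shift or 0x7F7F7F7F after it - lets the halved
// values be added byte-wise without carries bleeding into neighbouring
// bytes, averaging the lines above and below the inserted row.)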
L22:
// 1
mov eax, [ebx]
lea ebx, [ebx + 4]
// 2
mov ecx, [edx]
lea edx, [edx + 4]
// 3
shr ecx, 1
and eax, 0xFEFEFEFE
// 4
shr eax, 1
and ecx, 0x7F7F7F7F
// 5
add eax, ecx
mov ecx, [ebx]
// 6
shr ecx, 1
mov [edi], eax
// 7
mov eax, [edx]
and ecx, 0x7F7F7F7F
// 8
shr eax, 1
lea edi, [edi + 4]
// 9
and eax, 0x7F7F7F7F
lea ebx, [ebx + 4]
// 10
lea edx, [edx + 4]
add eax, ecx
// 11
mov [edi], eax
lea edi, [edi + 4]
// 12
sub ebp, 8
jnz L22
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += BackTwoLines
L21:
add esi, [esp + BACK_TWO_LINES]
// YPlane += byte_ypitch_adj;
add edi, [esp + BYTE_YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K]
and eax, 1
jnz L23
// UPlane += byte_uvpitch_adj;
// VPlane += byte_uvpitch_adj;
mov eax, [esp + BYTE_UVPITCH_ADJ]
add [esp + UPLANE], eax
add [esp + VPLANE], eax
L23:
inc DWORD PTR [esp + LOOP_K]
mov eax, [esp + LOOP_K]
cmp eax, [esp + MARK]
jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0
je L24
// pyprev = YPlane - pitch
mov eax, edi
sub eax, [esp + PITCH_PARM]
mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM]
mov [esp + PYNEXT], edi
L24:
inc DWORD PTR [esp + LOOP_J]
mov eax, [esp + LOOP_J]
cmp eax, [esp + LUMA_ITERS]
jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// ASM version of C_HEIGHT_FILL
// if (height_diff)
mov eax, [esp + HEIGHT_DIFF]
test eax, eax
jz Lno_height_diff
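// Pad the frame out to a multiple of 16 rows: each padding row is copied
// from the row immediately above it, effectively replicating the last real
// row of the Y plane downward, then likewise for the U and V planes (one
// chroma row for every two rows of luma padding).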
// pyspace = (U32 *)YPlane
mov esi, edi
// pyprev = (U32 *)(YPlane - pitch)
sub esi, [esp + PITCH_PARM]
// for (j = height_diff; j > 0; j--)
Lheight_yfill_loop:
mov ebx, [esp + WIDTHX16]
// for (i = widthx16; i>0; i -=4)
Lheight_yfill_row:
// *pyspace++ = *pyprev++
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebx, 4
jnz Lheight_yfill_row
// pyspace += word_ypitch_adj
// pyprev += word_ypitch_adj
add esi, [esp + BYTE_YPITCH_ADJ]
add edi, [esp + BYTE_YPITCH_ADJ]
dec eax
jnz Lheight_yfill_loop
mov eax, [esp + HEIGHT_DIFF]
mov edi, [esp + UPLANE]
// puvspace = (U32 *)UPlane
mov esi, edi
// puvprev = (U32 *)(UPlane - pitch)
sub esi, [esp + PITCH_PARM]
// for (j = height_diff; j > 0; j -= 2)
Lheight_ufill_loop:
mov ebx, [esp + WIDTHX16]
// for (i = widthx16; i>0; i -= 8)
Lheight_ufill_row:
// *puvspace++ = *puvprev++
mov ecx, [esi]
mov [edi], ecx
lea esi, [esi + 4]
lea edi, [edi + 4]
sub ebx, 8
jnz Lheight_ufill_row
// puvspace += word_uvpitch_adj
// puvprev += word_uvpitch_adj
add esi, [esp + BYTE_UVPITCH_ADJ]
add edi, [esp + BYTE_UVPITCH_ADJ]
sub eax, 2
jnz Lheight_ufill_loop
mov eax, [esp + HEIGHT_DIFF]
mov edi, [esp + VPLANE]
// puvspace = (U32 *)VPlane
mov esi, edi
// puvprev = (U32 *)(VPlane - pitch)
sub esi, [esp + PITCH_PARM]
// for (j = height_diff; j > 0; j -= 2)
Lheight_vfill_loop:
mov ebx, [esp + WIDTHX16]
// for (i = widthx16; i>0; i -= 8)
Lheight_vfill_row:
// *puvspace++ = *puvprev++
mov ecx, [esi]
mov [edi], ecx
lea esi, [esi + 4]
lea edi, [edi + 4]
sub ebx, 8
jnz Lheight_vfill_row
// puvspace += word_uvpitch_adj
// puvprev += word_uvpitch_adj
add esi, [esp + BYTE_UVPITCH_ADJ]
add edi, [esp + BYTE_UVPITCH_ADJ]
sub eax, 2
jnz Lheight_vfill_loop
Lno_height_diff:
// if (stretch)
mov esi, [esp + PYPREV]
cmp DWORD PTR [esp + STRETCH], 0
je L26
// for (i = OutputWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
mov ebp, [esp + OUTPUT_WIDTH]
mov edi, [esp + PYSPACE]
L25:
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebp, 4
jnz L25
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L26:
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
#undef LOCALSIZE
#undef PITCH_PARM
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef OUTPUT_HEIGHT_WORD
#undef OUTPUT_WIDTH_WORD
#undef LPBI_INPUT
#undef OUTPUT_WIDTH
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef PUVPREV
#undef PUVSPACE
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef WIDTHX16
#undef HEIGHTX16
#undef WIDTH_DIFF
#undef HEIGHT_DIFF
#undef WIDTH_ADJ
#undef HEIGHT_ADJ
#undef STRETCH
#undef ASPECT
#undef LUMA_ITERS
#undef MARK
#undef BYTE_YPITCH_ADJ
#undef BYTE_UVPITCH_ADJ
#endif // } H263P