Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1189 lines
36 KiB

  1. /*++
  2. Copyright (c) 2002-2002 Microsoft Corporation
  3. Module Name:
  4. Utf8.c
  5. Abstract:
  6. UTF-8 manipulation routines
  7. Author:
  8. George V. Reilly (GeorgeRe) 01-Apr-2002
  9. Revision History:
  10. --*/
  11. #include "precomp.h"
  12. #if defined(ALLOC_PRAGMA) && defined(KERNEL_PRIV)
  13. #pragma alloc_text( INIT, HttpInitializeUtf8)
  14. #pragma alloc_text( PAGE, HttpUnicodeToUTF8)
  15. #pragma alloc_text( PAGE, HttpUTF8ToUnicode)
  16. #pragma alloc_text( PAGE, HttpUcs4toUtf16)
  17. #pragma alloc_text( PAGE, HttpUnicodeToUTF8Count)
  18. #pragma alloc_text( PAGE, HttpUnicodeToUTF8Encode)
  19. #pragma alloc_text( PAGE, HttpUtf8RawBytesToUnicode)
  20. #endif // ALLOC_PRAGMA && KERNEL_PRIV
  21. #if 0 // Non-Pageable Functions
  22. NOT PAGEABLE --
  23. #endif // Non-Pageable Functions
  24. DECLSPEC_ALIGN(UL_CACHE_LINE)
  25. const UCHAR
  26. Utf8OctetCount[256] =
  27. {
  28. // singletons: 0x00 - 0x7F
  29. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x
  30. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1x
  31. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2x
  32. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3x
  33. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4x
  34. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5x
  35. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6x
  36. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7x
  37. // UTF-8 trail bytes are not valid lead byte prefixes: 0x80 - 0xBF
  38. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
  39. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
  40. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
  41. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
  42. // two-byte prefixes: 0xC0 - 0xDF
  43. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // Cx
  44. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // Dx
  45. // three-byte prefixes: 0xE0 - 0xEF
  46. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
  47. // four-byte prefixes: 0xF0 - 0xF7
  48. 4, 4, 4, 4, 4, 4, 4, 4, // Fx
  49. // invalid prefixes: 0xF8 - 0xFF
  50. 0, 0, 0, 0, 0, 0, 0, 0, // Fx
  51. };
  52. const static char hexArray[] = "0123456789ABCDEF";
  53. VOID
  54. HttpInitializeUtf8(
  55. VOID
  56. )
  57. {
  58. #if DBG
  59. ULONG i;
  60. //
  61. // Validate Utf8OctetCount[]
  62. //
  63. for (i = 0; i < 256; ++i)
  64. {
  65. UCHAR OctetCount = UTF8_OCTET_COUNT(i);
  66. if (IS_UTF8_SINGLETON(i))
  67. {
  68. ASSERT(1 == OctetCount);
  69. }
  70. else if (IS_UTF8_1ST_BYTE_OF_2(i))
  71. {
  72. ASSERT(2 == OctetCount);
  73. }
  74. else if (IS_UTF8_1ST_BYTE_OF_3(i))
  75. {
  76. ASSERT(3 == OctetCount);
  77. }
  78. else if (IS_UTF8_1ST_BYTE_OF_4(i))
  79. {
  80. ASSERT(4 == OctetCount);
  81. }
  82. else
  83. {
  84. ASSERT(0 == OctetCount);
  85. }
  86. }
  87. #endif // DBG
  88. } // HttpInitializeUtf8
//
// Some Unicode to UTF-8 conversion utilities taken and modified from
// base\win32\winnls\utf.c. Use these until the same functionality is
// exposed in the kernel.
//
  94. /***************************************************************************++
  95. Routine Description:
  96. Maps a Unicode character string to its UTF-8 string counterpart
  97. Conversion continues until the source is finished or an error happens in
  98. either case it returns the number of UTF-8 characters written.
  99. If the supllied buffer is not big enough it returns 0.
  100. --***************************************************************************/
  101. ULONG
  102. HttpUnicodeToUTF8(
  103. IN PCWSTR lpSrcStr,
  104. IN LONG cchSrc,
  105. OUT LPSTR lpDestStr,
  106. IN LONG cchDest
  107. )
  108. {
  109. LPCWSTR lpWC = lpSrcStr;
  110. LONG cchU8 = 0; // # of UTF8 chars generated
  111. ULONG dwSurrogateChar;
  112. WCHAR wchHighSurrogate = 0;
  113. BOOLEAN bHandled;
  114. while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
  115. {
  116. bHandled = FALSE;
  117. //
  118. // Check if high surrogate is available
  119. //
  120. if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
  121. {
  122. if (cchDest)
  123. {
  124. // Another high surrogate, then treat the 1st as normal
  125. // Unicode character.
  126. if (wchHighSurrogate)
  127. {
  128. if ((cchU8 + 2) < cchDest)
  129. {
  130. lpDestStr[cchU8++] = (UCHAR) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
  131. lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
  132. lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
  133. }
  134. else
  135. {
  136. // not enough buffer
  137. cchSrc++;
  138. break;
  139. }
  140. }
  141. }
  142. else
  143. {
  144. cchU8 += 3;
  145. }
  146. wchHighSurrogate = *lpWC;
  147. bHandled = TRUE;
  148. }
  149. if (!bHandled && wchHighSurrogate)
  150. {
  151. if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
  152. {
  153. // wheee, valid surrogate pairs
  154. if (cchDest)
  155. {
  156. if ((cchU8 + 3) < cchDest)
  157. {
  158. dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);
  159. lpDestStr[cchU8++] = (UTF8_1ST_OF_4 | (UCHAR)(dwSurrogateChar >> 18)); // 3 bits from 1st byte
  160. lpDestStr[cchU8++] = (UTF8_TRAIL | (UCHAR)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte
  161. lpDestStr[cchU8++] = (UTF8_TRAIL | (UCHAR)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte
  162. lpDestStr[cchU8++] = (UTF8_TRAIL | (UCHAR)(0x3f &dwSurrogateChar)); // 6 bits from 4th byte
  163. }
  164. else
  165. {
  166. // not enough buffer
  167. cchSrc++;
  168. break;
  169. }
  170. }
  171. else
  172. {
  173. // we already counted 3 previously (in high surrogate)
  174. cchU8 += 1;
  175. }
  176. bHandled = TRUE;
  177. }
  178. else
  179. {
  180. // Bad Surrogate pair : ERROR
  181. // Just process wchHighSurrogate , and the code below will
  182. // process the current code point
  183. if (cchDest)
  184. {
  185. if ((cchU8 + 2) < cchDest)
  186. {
  187. lpDestStr[cchU8++] = (UCHAR) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
  188. lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
  189. lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
  190. }
  191. else
  192. {
  193. // not enough buffer
  194. cchSrc++;
  195. break;
  196. }
  197. }
  198. }
  199. wchHighSurrogate = 0;
  200. }
  201. if (!bHandled)
  202. {
  203. if (*lpWC <= UTF8_1_MAX)
  204. {
  205. //
  206. // Found ASCII.
  207. //
  208. if (cchDest)
  209. {
  210. lpDestStr[cchU8] = (char)*lpWC;
  211. }
  212. cchU8++;
  213. }
  214. else if (*lpWC <= UTF8_2_MAX)
  215. {
  216. //
  217. // Found 2 byte sequence if < 0x07ff (11 bits).
  218. //
  219. if (cchDest)
  220. {
  221. if ((cchU8 + 1) < cchDest)
  222. {
  223. //
  224. // Use upper 5 bits in first byte.
  225. // Use lower 6 bits in second byte.
  226. //
  227. lpDestStr[cchU8++] = (UCHAR) (UTF8_1ST_OF_2 | (*lpWC >> 6));
  228. lpDestStr[cchU8++] = (UCHAR) (UTF8_TRAIL | LOWER_6_BIT(*lpWC));
  229. }
  230. else
  231. {
  232. //
  233. // Error - buffer too small.
  234. //
  235. cchSrc++;
  236. break;
  237. }
  238. }
  239. else
  240. {
  241. cchU8 += 2;
  242. }
  243. }
  244. else
  245. {
  246. //
  247. // Found 3 byte sequence.
  248. //
  249. if (cchDest)
  250. {
  251. if ((cchU8 + 2) < cchDest)
  252. {
  253. //
  254. // Use upper 4 bits in first byte.
  255. // Use middle 6 bits in second byte.
  256. // Use lower 6 bits in third byte.
  257. //
  258. lpDestStr[cchU8++] = (UCHAR)(UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC));
  259. lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL | MIDDLE_6_BIT(*lpWC));
  260. lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL | LOWER_6_BIT(*lpWC));
  261. }
  262. else
  263. {
  264. //
  265. // Error - buffer too small.
  266. //
  267. cchSrc++;
  268. break;
  269. }
  270. }
  271. else
  272. {
  273. cchU8 += 3;
  274. }
  275. }
  276. }
  277. lpWC++;
  278. }
  279. //
  280. // If the last character was a high surrogate, then handle it as a normal
  281. // unicode character.
  282. //
  283. if ((cchSrc < 0) && (wchHighSurrogate != 0))
  284. {
  285. if (cchDest)
  286. {
  287. if ((cchU8 + 2) < cchDest)
  288. {
  289. lpDestStr[cchU8++] = (UCHAR)(UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
  290. lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
  291. lpDestStr[cchU8++] = (UCHAR)(UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
  292. }
  293. else
  294. {
  295. cchSrc++;
  296. }
  297. }
  298. }
  299. //
  300. // Make sure the destination buffer was large enough.
  301. //
  302. if (cchDest && (cchSrc >= 0))
  303. {
  304. return 0;
  305. }
  306. //
  307. // Return the number of UTF-8 characters written.
  308. //
  309. return cchU8;
  310. } // HttpUnicodeToUTF8
/***************************************************************************++

Routine Description:

    Maps a UTF-8 character string to its wide character (UTF-16) string
    counterpart. Four-byte UTF-8 sequences are emitted as a high/low
    surrogate pair. Non-shortest-form (overlong) sequences and lead bytes
    announcing more than four bytes are discarded, or rejected when strict
    checking is requested.

Arguments:

    lpSrcStr  - source UTF-8 bytes
    cchSrc    - number of bytes in lpSrcStr
    lpDestStr - destination wide-character buffer; not written when
                *pcchDest is 0 on entry
    pcchDest  - on entry, capacity of lpDestStr in WCHARs, or 0 to only
                count; on success, receives the number of WCHARs generated
    dwFlags   - the value 1 enables strict checking: stray trail bytes,
                interrupted sequences and invalid lead bytes fail the call
                instead of being skipped

Return Value:

    STATUS_SUCCESS           - *pcchDest has been updated
    STATUS_INVALID_PARAMETER - strict checking found a malformed sequence,
                               or no output characters were generated
    STATUS_BUFFER_TOO_SMALL  - lpDestStr could not hold the whole result

    NOTE(review): RETURN() is defined elsewhere; it is presumed to return
    the given status from this routine - confirm.

--***************************************************************************/
NTSTATUS
HttpUTF8ToUnicode(
    IN     LPCSTR lpSrcStr,
    IN     LONG   cchSrc,
       OUT LPWSTR lpDestStr,
    IN OUT PLONG  pcchDest,
    IN     ULONG  dwFlags
    )
{
    LONG        nTB = 0;                  // # trail bytes to follow
    LONG        cchWC = 0;                // # of Unicode code points generated
    CONST BYTE* pUTF8 = (CONST BYTE*)lpSrcStr;
    LONG        dwSurrogateChar = 0;      // Full surrogate char
    BOOLEAN     bSurrogatePair = FALSE;   // Indicate we're collecting a
                                          // surrogate pair
    BOOLEAN     bCheckInvalidBytes = (BOOLEAN)(dwFlags == 1);
    BYTE        UTF8;
    LONG        cchDest = *pcchDest;

    while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
    {
        //
        //  See if there are any trail bytes.
        //
        if (BIT7(*pUTF8) == 0)
        {
            //
            //  Found ASCII. Any sequence that was in progress is abandoned.
            //
            if (cchDest)
            {
                lpDestStr[cchWC] = (WCHAR)*pUTF8;
            }
            nTB = bSurrogatePair = 0;
            cchWC++;
        }
        else if (BIT6(*pUTF8) == 0)
        {
            //
            //  Found a trail byte.
            //  Note : Ignore the trail byte if there was no lead byte.
            //
            if (nTB != 0)
            {
                //
                //  Decrement the trail byte counter.
                //
                nTB--;

                if (bSurrogatePair)
                {
                    //
                    //  Four-byte sequence: accumulate the code point in
                    //  dwSurrogateChar, since it does not fit in one WCHAR.
                    //
                    dwSurrogateChar <<= 6;
                    dwSurrogateChar |= LOWER_6_BIT(*pUTF8);

                    if (nTB == 0)
                    {
                        //
                        //  Sequence complete: split into a surrogate pair.
                        //
                        if (cchDest)
                        {
                            if ((cchWC + 1) < cchDest)
                            {
                                lpDestStr[cchWC]   = (WCHAR)
                                                     (((dwSurrogateChar - 0x10000) >> 10) + HIGH_SURROGATE_START);

                                lpDestStr[cchWC+1] = (WCHAR)
                                                     ((dwSurrogateChar - 0x10000) % 0x400 + LOW_SURROGATE_START);
                            }
                            else
                            {
                                // Error : Buffer too small
                                cchSrc++;
                                break;
                            }
                        }

                        cchWC += 2;
                        bSurrogatePair = FALSE;
                    }
                }
                else
                {
                    //
                    //  Make room for the trail byte and add the trail byte
                    //  value. The partial character is accumulated directly
                    //  in the output slot.
                    //
                    if (cchDest)
                    {
                        lpDestStr[cchWC] <<= 6;
                        lpDestStr[cchWC] |= LOWER_6_BIT(*pUTF8);
                    }

                    if (nTB == 0)
                    {
                        //
                        //  End of sequence.  Advance the output counter.
                        //
                        cchWC++;
                    }
                }
            }
            else
            {
                if (bCheckInvalidBytes)
                {
                    RETURN(STATUS_INVALID_PARAMETER);
                }

                // error - not expecting a trail byte. That is, there is a trailing byte without leading byte.
                bSurrogatePair = FALSE;
            }
        }
        else
        {
            //
            //  Found a lead byte.
            //
            if (nTB > 0)
            {
                // error - A leading byte before the previous sequence is completed.
                if (bCheckInvalidBytes)
                {
                    RETURN(STATUS_INVALID_PARAMETER);
                }

                //
                //  Error - previous sequence not finished.
                //
                nTB = 0;
                bSurrogatePair = FALSE;
                // Put this character back so that we can start over another sequence.
                // (The pUTF8++ at the bottom of the loop undoes the decrement,
                // so the same byte is examined again with nTB == 0.)
                cchSrc++;
                pUTF8--;
            }
            else
            {
                //
                //  Calculate the number of bytes to follow.
                //  Look for the first 0 from left to right.
                //  Afterwards nTB is the number of leading 1 bits and UTF8
                //  holds the lead byte shifted left by that many bits.
                //
                UTF8 = *pUTF8;
                while (BIT7(UTF8) != 0)
                {
                    UTF8 <<= 1;
                    nTB++;
                }

                //
                // Check for non-shortest form.
                //
                switch (nTB) {
                case 1:
                    // A single leading 1 bit is a trail byte pattern; presumably
                    // unreachable here because bit 6 was found set above.
                    nTB = 0;
                    break;
                case 2:
                    // Make sure that bit 8 ~ bit 11 is not all zero.
                    // 110XXXXx 10xxxxxx
                    if ((*pUTF8 & 0x1e) == 0)
                    {
                        nTB = 0;
                    }
                    break;
                case 3:
                    // Look ahead to check for non-shortest form.
                    // 1110XXXX 10Xxxxxx 10xxxxxx
                    // The look-ahead is skipped when fewer than two bytes remain.
                    if (cchSrc >= 2)
                    {
                        if (((*pUTF8 & 0x0f) == 0) && (*(pUTF8 + 1) & 0x20) == 0)
                        {
                            nTB = 0;
                        }
                    }
                    break;
                case 4:
                    //
                    // This is a surrogate unicode pair
                    // (only recognized when at least three more bytes remain)
                    //
                    if (cchSrc >= 3)
                    {
                        SHORT word = (((SHORT)*pUTF8) << 8) | *(pUTF8 + 1);
                        // Look ahead to check for non-shortest form.
                        // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                        // Check for the 5 bits are not all zero.
                        // 0x0730 == 0000 0111 0011 0000
                        if ((word & 0x0730) == 0)
                        {
                            nTB = 0;
                        } else if ((word & 0x0400) == 0x0400)
                        {
                            // The 21st bit is 1.
                            // Make sure that the resulting Unicode is within the valid surrogate range.
                            // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                            // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                            // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                            // bit are all zero.
                            // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                            // XXXXX can only be 10000.
                            // 0x0330 = 0000 0011 0011 0000
                            if ((word & 0x0330) != 0)
                            {
                                nTB = 0;
                            }
                        }

                        if (nTB != 0)
                        {
                            // Seed the accumulator with the payload bits of
                            // the lead byte.
                            dwSurrogateChar = UTF8 >> nTB;
                            bSurrogatePair = TRUE;
                        }
                    }
                    break;
                default:
                    //
                    // If the bits is greater than 4, this is an invalid
                    // UTF8 lead byte.
                    //
                    nTB = 0;
                    break;
                }

                if (nTB != 0)
                {
                    //
                    //  Store the value from the first byte and decrement
                    //  the number of bytes to follow.
                    //
                    if (cchDest)
                    {
                        lpDestStr[cchWC] = (WCHAR)(UTF8 >> nTB);
                    }
                    nTB--;
                } else
                {
                    // The lead byte was rejected above (non-shortest form or
                    // invalid prefix); in lenient mode it is simply skipped.
                    if (bCheckInvalidBytes)
                    {
                        RETURN(STATUS_INVALID_PARAMETER);
                    }
                }
            }
        }
        pUTF8++;
    }

    if ((bCheckInvalidBytes && nTB != 0) || (cchWC == 0))
    {
        // About (cchWC == 0):
        // Because we now throw away non-shortest form, it is possible that we generate 0 chars.
        // In this case, we fail the call with STATUS_INVALID_PARAMETER rather than
        // report success with an empty result.
        //
        // About (nTB != 0): in strict mode, input that ends in the middle
        // of a multi-byte sequence is also an error.
        RETURN(STATUS_INVALID_PARAMETER);
    }

    //
    //  Make sure the destination buffer was large enough.
    //  cchSrc >= 0 here means some source bytes were left unconsumed.
    //
    if (cchDest && (cchSrc >= 0))
    {
        RETURN(STATUS_BUFFER_TOO_SMALL);
    }

    //
    //  Return the number of Unicode characters written.
    //
    *pcchDest = cchWC;

    return STATUS_SUCCESS;

} // HttpUTF8ToUnicode
  566. /***************************************************************************++
  567. Routine Description:
  568. Split a UCS-4 character (32 bits)
  569. into 1 or 2 UTF-16 characters (16 bits each)
  570. Arguments:
  571. UnicodeChar - UCS-4 character
  572. pHighSurrogate - First output character
  573. pLowSurrogate - Second output character. Zero unless UnicodeChar > 0xFFFF
  574. Return Value:
  575. STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD
  576. --***************************************************************************/
  577. NTSTATUS
  578. HttpUcs4toUtf16(
  579. IN ULONG UnicodeChar,
  580. OUT PWCHAR pHighSurrogate,
  581. OUT PWCHAR pLowSurrogate
  582. )
  583. {
  584. NTSTATUS Status = STATUS_SUCCESS;
  585. ASSERT(NULL != pHighSurrogate);
  586. ASSERT(NULL != pLowSurrogate);
  587. if (UnicodeChar <= 0xFFFF)
  588. {
  589. *pHighSurrogate = (WCHAR) UnicodeChar;
  590. *pLowSurrogate = 0;
  591. if (HIGH_SURROGATE_START <= UnicodeChar
  592. && UnicodeChar <= LOW_SURROGATE_END)
  593. {
  594. UlTraceError(PARSER, (
  595. "http!HttpUcs4toUtf16(): "
  596. "Illegal raw surrogate character, U+%04lX.\n",
  597. UnicodeChar
  598. ));
  599. Status = STATUS_INVALID_PARAMETER;
  600. }
  601. if ( IS_UNICODE_NONCHAR(UnicodeChar) )
  602. {
  603. UlTraceError(PARSER, (
  604. "http!HttpUcs4toUtf16(): "
  605. "Non-character code point, U+%04lX.\n",
  606. UnicodeChar
  607. ));
  608. Status = STATUS_INVALID_PARAMETER;
  609. }
  610. }
  611. else if (UnicodeChar <= UTF8_4_MAX)
  612. {
  613. if ( IS_UNICODE_NONCHAR(UnicodeChar) )
  614. {
  615. UlTraceError(PARSER, (
  616. "http!HttpUcs4toUtf16(): "
  617. "Non-character code point, U+%04lX.\n",
  618. UnicodeChar
  619. ));
  620. Status = STATUS_INVALID_PARAMETER;
  621. }
  622. else
  623. {
  624. *pHighSurrogate
  625. = (WCHAR) (((UnicodeChar - 0x10000) >> 10)
  626. + HIGH_SURROGATE_START);
  627. ASSERT(HIGH_SURROGATE_START <= *pHighSurrogate
  628. && *pHighSurrogate <= HIGH_SURROGATE_END);
  629. *pLowSurrogate
  630. = (WCHAR) (((UnicodeChar - 0x10000) & ((1 << 10) - 1))
  631. + LOW_SURROGATE_START);
  632. ASSERT(LOW_SURROGATE_START <= *pLowSurrogate
  633. && *pLowSurrogate <= LOW_SURROGATE_END);
  634. }
  635. }
  636. else
  637. {
  638. UlTraceError(PARSER, (
  639. "http!HttpUcs4toUtf16(): "
  640. "Illegal large character, 0x%08lX.\n",
  641. UnicodeChar
  642. ));
  643. Status = STATUS_INVALID_PARAMETER;
  644. }
  645. return Status;
  646. } // HttpUcs4toUtf16
  647. /***************************************************************************++
  648. Routine Description:
  649. Count number of BYTEs required for UTF-8 conversion of UNICODE string.
  650. Count is terminated after dwInLen characters
  651. Arguments:
  652. pwszIn - pointer to input wide-character string
  653. dwInLen - number of characters in pwszIn
  654. bEncode - TRUE if we are to hex encode characters >= 0x80
  655. Return Value:
  656. ULONG - number of BYTEs required for conversion
  657. --***************************************************************************/
  658. ULONG
  659. HttpUnicodeToUTF8Count(
  660. IN LPCWSTR pwszIn,
  661. IN ULONG dwInLen,
  662. IN BOOLEAN bEncode
  663. )
  664. {
  665. ULONG dwCount = 0;
  666. ULONG oneCharLen = bEncode ? 3 : 1;
  667. ULONG twoCharLen = 2 * oneCharLen;
  668. ASSERT(pwszIn != NULL);
  669. ASSERT(dwInLen != 0);
  670. //
  671. // N.B. code arranged to reduce number of jumps in loop to 1 (while)
  672. //
  673. do {
  674. ULONG wchar = *pwszIn++;
  675. dwCount += (wchar & 0xF800) ? oneCharLen : 0;
  676. dwCount += ((wchar & 0xFF80) ? 0xFFFFFFFF : 0) & (twoCharLen - 1);
  677. ++dwCount;
  678. } while (--dwInLen != 0);
  679. return dwCount;
  680. } // HttpUnicodeToUTF8Count
  681. /***************************************************************************++
  682. Routine Description:
  683. Maps a Unicode character string to its UTF-8 string counterpart. This
  684. also hex encodes the string.
  685. Conversion continues until the source is finished or an error happens in
  686. either case it returns the number of UTF-8 characters written.
  687. If the supllied buffer is not big enough it returns 0.
  688. Convert a string of UNICODE characters to UTF-8:
  689. 0000000000000000..0000000001111111: 0xxxxxxx
  690. 0000000010000000..0000011111111111: 110xxxxx 10xxxxxx
  691. 0000100000000000..1111111111111111: 1110xxxx 10xxxxxx 10xxxxxx
  692. Arguments:
  693. pwszIn - pointer to input wide-character string
  694. dwInLen - number of CHARACTERS in pwszIn INCLUDING terminating NUL
  695. pszOut - pointer to output narrow-character buffer
  696. dwOutLen - number of BYTEs in pszOut
  697. pdwOutLen - actual number of BYTES written to the output pszOut
  698. bEncode - TRUE if we are to hex encode characters >= 0x80
  699. Return Value:
  700. ULONG
  701. Success - STATUS_SUCCESS
  702. Failure - STATUS_INSUFFICIENT_RESOURCES
  703. Not enough space in pszOut to store results
  704. --***************************************************************************/
  705. NTSTATUS
  706. HttpUnicodeToUTF8Encode(
  707. IN LPCWSTR pwszIn,
  708. IN ULONG dwInLen,
  709. OUT PUCHAR pszOut,
  710. IN ULONG dwOutLen,
  711. OUT PULONG pdwOutLen,
  712. IN BOOLEAN bEncode
  713. )
  714. {
  715. PUCHAR pOutput = pszOut;
  716. ULONG pOutputLen = dwOutLen;
  717. UCHAR lead;
  718. int shift;
  719. ULONG outputSize = bEncode ? 3 : 1;
  720. ASSERT(pwszIn != NULL);
  721. ASSERT((int)dwInLen > 0);
  722. ASSERT(pszOut != NULL);
  723. ASSERT((int)dwOutLen > 0);
  724. while (dwInLen-- && dwOutLen) {
  725. ULONG wchar = *pwszIn++;
  726. UCHAR bchar;
  727. if (wchar <= 0x007F) {
  728. *pszOut++ = (UCHAR)(wchar);
  729. --dwOutLen;
  730. continue;
  731. }
  732. lead = ((wchar >= 0x0800) ? 0xE0 : 0xC0);
  733. shift = ((wchar >= 0x0800) ? 12 : 6);
  734. if ((int)(dwOutLen -= outputSize) < 0)
  735. {
  736. RETURN(STATUS_INSUFFICIENT_RESOURCES);
  737. }
  738. bchar = lead | (UCHAR)(wchar >> shift);
  739. if (bEncode) {
  740. *pszOut++ = '%';
  741. *pszOut++ = hexArray[bchar >> 4];
  742. bchar = hexArray[bchar & 0x0F];
  743. }
  744. *pszOut++ = bchar;
  745. if (wchar >= 0x0800) {
  746. if ((int)(dwOutLen -= outputSize) < 0)
  747. {
  748. RETURN(STATUS_INSUFFICIENT_RESOURCES);
  749. }
  750. bchar = 0x80 | (UCHAR)((wchar >> 6) & 0x003F);
  751. if (bEncode) {
  752. *pszOut++ = '%';
  753. *pszOut++ = hexArray[bchar >> 4];
  754. bchar = hexArray[bchar & 0x0F];
  755. }
  756. *pszOut++ = bchar;
  757. }
  758. if ((int)(dwOutLen -= outputSize) < 0)
  759. {
  760. RETURN(STATUS_INSUFFICIENT_RESOURCES);
  761. }
  762. bchar = 0x80 | (UCHAR)(wchar & 0x003F);
  763. if (bEncode) {
  764. *pszOut++ = '%';
  765. *pszOut++ = hexArray[bchar >> 4];
  766. bchar = hexArray[bchar & 0x0F];
  767. }
  768. *pszOut++ = bchar;
  769. }
  770. ASSERT(pszOut >= pOutput && pszOut <= pOutput + pOutputLen);
  771. UNREFERENCED_PARAMETER(pOutputLen);
  772. if (pdwOutLen)
  773. *pdwOutLen = (ULONG)(pszOut - pOutput);
  774. return STATUS_SUCCESS;
  775. } // HttpUnicodeToUTF8Encode
  776. /***************************************************************************++
  777. Routine Description:
  778. Splice together the bits from a UTF-8 lead byte and 0-3 trail bytes
  779. into a Unicode character.
  780. Arguments:
  781. pOctetArray - Input buffer: Raw lead byte + raw trail bytes
  782. SourceLength - Length of pOctetArray, in bytes
  783. pUnicodeChar - decoded character
  784. pOctetsToSkip - number of bytes consumed from pOctetArray
  785. Return Value:
  786. STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD
  787. --***************************************************************************/
  788. NTSTATUS
  789. HttpUtf8RawBytesToUnicode(
  790. IN PCUCHAR pOctetArray,
  791. IN ULONG SourceLength,
  792. OUT PULONG pUnicodeChar,
  793. OUT PULONG pOctetsToSkip
  794. )
  795. {
  796. ULONG i;
  797. ULONG UnicodeChar;
  798. UCHAR LeadByte = pOctetArray[0];
  799. ULONG OctetCount = UTF8_OCTET_COUNT(LeadByte);
  800. ASSERT(SourceLength > 0);
  801. if (0 == OctetCount)
  802. {
  803. UlTraceError(PARSER, (
  804. "http!HttpUtf8RawBytesToUnicode(): "
  805. "Invalid UTF-8 lead byte, %%%02X.\n",
  806. LeadByte
  807. ));
  808. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  809. }
  810. else if (OctetCount > SourceLength)
  811. {
  812. UlTraceError(PARSER, (
  813. "http!HttpUtf8RawBytesToUnicode(): "
  814. "UTF-8 lead byte, %%%02X, requires %lu bytes in buffer, "
  815. "but only have %lu.\n",
  816. LeadByte, OctetCount, SourceLength
  817. ));
  818. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  819. }
  820. // Check that the trail bytes are valid: 10xxxxxx.
  821. for (i = 1; i < OctetCount; ++i)
  822. {
  823. if (! IS_UTF8_TRAILBYTE(pOctetArray[i]))
  824. {
  825. UlTraceError(PARSER, (
  826. "http!HttpUtf8RawBytesToUnicode(): "
  827. "Invalid trail byte[%lu], %%%02X.\n",
  828. i, pOctetArray[i]
  829. ));
  830. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  831. }
  832. }
  833. //
  834. // Now splice together the bits from the lead byte and the trail byte(s)
  835. //
  836. switch (OctetCount)
  837. {
  838. case 1:
  839. // handle one-byte case:
  840. // (0xxx xxxx)
  841. // => 0xxx xxxx
  842. ASSERT(IS_UTF8_SINGLETON(LeadByte));
  843. ASSERT(SourceLength >= 1);
  844. UnicodeChar = LeadByte;
  845. ASSERT(UnicodeChar <= UTF8_1_MAX);
  846. break;
  847. case 2:
  848. // handle two-byte case:
  849. // (110y yyyy, 10xx xxxx)
  850. // => 0000 0yyy yyxx xxxx
  851. ASSERT(IS_UTF8_1ST_BYTE_OF_2(LeadByte));
  852. ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[1]));
  853. ASSERT(SourceLength >= 2);
  854. UnicodeChar = (
  855. ((pOctetArray[0] & 0x1f) << 6) |
  856. (pOctetArray[1] & 0x3f)
  857. );
  858. if (UnicodeChar <= UTF8_1_MAX)
  859. {
  860. UlTraceError(PARSER, (
  861. "http!HttpUtf8RawBytesToUnicode(): "
  862. "Overlong 2-byte sequence, "
  863. "%%%02X %%%02X = U+%04lX.\n",
  864. pOctetArray[0],
  865. pOctetArray[1],
  866. UnicodeChar
  867. ));
  868. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  869. }
  870. ASSERT(UTF8_1_MAX < UnicodeChar && UnicodeChar <= UTF8_2_MAX);
  871. break;
  872. case 3:
  873. // handle three-byte case:
  874. // (1110 zzzz, 10yy yyyy, 10xx xxxx)
  875. // => zzzz yyyy yyxx xxxx
  876. ASSERT(IS_UTF8_1ST_BYTE_OF_3(LeadByte));
  877. ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[1]));
  878. ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[2]));
  879. ASSERT(SourceLength >= 3);
  880. UnicodeChar = (
  881. ((pOctetArray[0] & 0x0f) << 12) |
  882. ((pOctetArray[1] & 0x3f) << 6) |
  883. (pOctetArray[2] & 0x3f)
  884. );
  885. if (UnicodeChar <= UTF8_2_MAX)
  886. {
  887. UlTraceError(PARSER, (
  888. "http!HttpUtf8RawBytesToUnicode(): "
  889. "Overlong 3-byte sequence, "
  890. "%%%02X %%%02X %%%02X = U+%04lX.\n",
  891. pOctetArray[0],
  892. pOctetArray[1],
  893. pOctetArray[2],
  894. UnicodeChar
  895. ));
  896. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  897. }
  898. ASSERT(UTF8_2_MAX < UnicodeChar && UnicodeChar <= UTF8_3_MAX);
  899. break;
  900. case 4:
  901. // handle four-byte case:
  902. // (1111 0uuu, 10uu zzzz, 10yy yyyy, 10xx xxxx)
  903. // => 000u uuuu zzzz yyyy yyxx xxxx
  904. ASSERT(IS_UTF8_1ST_BYTE_OF_4(LeadByte));
  905. ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[1]));
  906. ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[2]));
  907. ASSERT(IS_UTF8_TRAILBYTE(pOctetArray[3]));
  908. ASSERT(SourceLength >= 4);
  909. UnicodeChar = (
  910. ((pOctetArray[0] & 0x07) << 18) |
  911. ((pOctetArray[1] & 0x3f) << 12) |
  912. ((pOctetArray[2] & 0x3f) << 6) |
  913. (pOctetArray[3] & 0x3f)
  914. );
  915. if (UnicodeChar <= UTF8_3_MAX)
  916. {
  917. UlTraceError(PARSER, (
  918. "http!HttpUtf8RawBytesToUnicode(): "
  919. "Overlong 4-byte sequence, "
  920. "%%%02X %%%02X %%%02X %%%02X = U+%06lX.\n",
  921. pOctetArray[0],
  922. pOctetArray[1],
  923. pOctetArray[2],
  924. pOctetArray[3],
  925. UnicodeChar
  926. ));
  927. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  928. }
  929. // Not all values in the 21-bit range are valid
  930. if (UnicodeChar > UTF8_4_MAX)
  931. {
  932. UlTraceError(PARSER, (
  933. "http!HttpUtf8RawBytesToUnicode(): "
  934. "Overlarge 4-byte sequence, "
  935. "%%%02X %%%02X %%%02X %%%02X = U+%06lX.\n",
  936. pOctetArray[0],
  937. pOctetArray[1],
  938. pOctetArray[2],
  939. pOctetArray[3],
  940. UnicodeChar
  941. ));
  942. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  943. }
  944. ASSERT(UTF8_3_MAX < UnicodeChar && UnicodeChar <= UTF8_4_MAX);
  945. break;
  946. default:
  947. ASSERT(! "Impossible OctetCount");
  948. UnicodeChar = 0;
  949. break;
  950. }
  951. //
  952. // Do not allow characters in the high- or low-surrogate ranges
  953. // to be UTF-8-encoded directly.
  954. //
  955. if (HIGH_SURROGATE_START <= UnicodeChar && UnicodeChar <= LOW_SURROGATE_END)
  956. {
  957. UlTraceError(PARSER, (
  958. "http!HttpUtf8RawBytesToUnicode(): "
  959. "Illegal surrogate character, U+%04lX.\n",
  960. UnicodeChar
  961. ));
  962. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  963. }
  964. // For security reasons we will signal an error for all noncharacter code
  965. // points encountered.
  966. if ( IS_UNICODE_NONCHAR(UnicodeChar) )
  967. {
  968. ASSERT( (((LOW_NONCHAR_BOM & UnicodeChar) == LOW_NONCHAR_BOM) &&
  969. ((UnicodeChar >> 16) <= HIGH_NONCHAR_END)) ||
  970. ((LOW_NONCHAR_START <= UnicodeChar) &&
  971. (UnicodeChar <= LOW_NONCHAR_END)) );
  972. UlTraceError(PARSER, (
  973. "http!HttpUtf8RawBytesToUnicode(): "
  974. "Non-character code point, U+%04lX.\n",
  975. UnicodeChar
  976. ));
  977. RETURN(STATUS_OBJECT_PATH_SYNTAX_BAD);
  978. }
  979. *pUnicodeChar = UnicodeChar;
  980. *pOctetsToSkip = OctetCount;
  981. return STATUS_SUCCESS;
  982. } // HttpUtf8RawBytesToUnicode