Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

654 lines
14 KiB

/*++

Copyright (c) 2000 Microsoft Corporation

Module Name :
    ulparse.cxx

Abstract:
    Rip some useful UL code

Author:
    (RIPPED from UL driver code) (HenrySa, PaulMcd)

Environment:
    Win32 - User Mode

Project:
    ULW3.DLL

--*/
  14. #include "precomp.hxx"
  15. typedef enum _URL_PART
  16. {
  17. Scheme,
  18. HostName,
  19. AbsPath,
  20. QueryString
  21. } URL_PART;
  22. #define IS_UTF8_TRAILBYTE(ch) (((ch) & 0xc0) == 0x80)
  23. NTSTATUS
  24. Unescape(
  25. IN PUCHAR pChar,
  26. OUT PUCHAR pOutChar
  27. )
  28. {
  29. UCHAR Result, Digit;
  30. if (pChar[0] != '%' ||
  31. SAFEIsXDigit(pChar[1]) == FALSE ||
  32. SAFEIsXDigit(pChar[2]) == FALSE)
  33. {
  34. return STATUS_OBJECT_PATH_SYNTAX_BAD;
  35. }
  36. //
  37. // HexToChar() inlined
  38. //
  39. // uppercase #1
  40. //
  41. if (isalpha(pChar[1]))
  42. Digit = (UCHAR) toupper(pChar[1]);
  43. else
  44. Digit = pChar[1];
  45. Result = ((Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0')) << 4;
  46. // uppercase #2
  47. //
  48. if (isalpha(pChar[2]))
  49. Digit = (UCHAR) toupper(pChar[2]);
  50. else
  51. Digit = pChar[2];
  52. Result |= (Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0');
  53. *pOutChar = Result;
  54. return STATUS_SUCCESS;
  55. } // Unescape
  56. NTSTATUS
  57. PopChar(
  58. IN URL_PART UrlPart,
  59. IN PUCHAR pChar,
  60. OUT WCHAR * pUnicodeChar,
  61. OUT PULONG pCharToSkip
  62. )
  63. {
  64. NTSTATUS Status;
  65. WCHAR UnicodeChar;
  66. UCHAR Char;
  67. UCHAR Trail1;
  68. UCHAR Trail2;
  69. ULONG CharToSkip;
  70. //
  71. // need to unescape ?
  72. //
  73. // can't decode the query string. that would be lossy decodeing
  74. // as '=' and '&' characters might be encoded, but have meaning
  75. // to the usermode parser.
  76. //
  77. if (UrlPart != QueryString && pChar[0] == '%')
  78. {
  79. Status = Unescape(pChar, &Char);
  80. if (NT_SUCCESS(Status) == FALSE)
  81. goto end;
  82. CharToSkip = 3;
  83. }
  84. else
  85. {
  86. Char = pChar[0];
  87. CharToSkip = 1;
  88. }
  89. //
  90. // convert to unicode, checking for utf8 .
  91. //
  92. // 3 byte runs are the largest we can have. 16 bits in UCS-2 =
  93. // 3 bytes of (4+4,2+6,2+6) where it's code + char.
  94. // for a total of 6+6+4 char bits = 16 bits.
  95. //
  96. //
  97. // NOTE: we'll only bother to decode utf if it was escaped
  98. // thus the (CharToSkip == 3)
  99. //
  100. if ((CharToSkip == 3) && ((Char & 0xf0) == 0xe0))
  101. {
  102. // 3 byte run
  103. //
  104. // Unescape the next 2 trail bytes
  105. //
  106. Status = Unescape(pChar+CharToSkip, &Trail1);
  107. if (NT_SUCCESS(Status) == FALSE)
  108. goto end;
  109. CharToSkip += 3; // %xx
  110. Status = Unescape(pChar+CharToSkip, &Trail2);
  111. if (NT_SUCCESS(Status) == FALSE)
  112. goto end;
  113. CharToSkip += 3; // %xx
  114. if (IS_UTF8_TRAILBYTE(Trail1) == FALSE ||
  115. IS_UTF8_TRAILBYTE(Trail2) == FALSE)
  116. {
  117. // bad utf!
  118. //
  119. Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
  120. goto end;
  121. }
  122. // handle three byte case
  123. // 1110xxxx 10xxxxxx 10xxxxxx
  124. UnicodeChar = (USHORT) (((Char & 0x0f) << 12) |
  125. ((Trail1 & 0x3f) << 6) |
  126. (Trail2 & 0x3f));
  127. }
  128. else if ((CharToSkip == 3) && ((Char & 0xe0) == 0xc0))
  129. {
  130. // 2 byte run
  131. //
  132. // Unescape the next 1 trail byte
  133. //
  134. Status = Unescape(pChar+CharToSkip, &Trail1);
  135. if (NT_SUCCESS(Status) == FALSE)
  136. goto end;
  137. CharToSkip += 3; // %xx
  138. if (IS_UTF8_TRAILBYTE(Trail1) == FALSE)
  139. {
  140. // bad utf!
  141. //
  142. Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
  143. goto end;
  144. }
  145. // handle two byte case
  146. // 110xxxxx 10xxxxxx
  147. UnicodeChar = (USHORT) (((Char & 0x1f) << 6) |
  148. (Trail1 & 0x3f));
  149. }
  150. // now this can either be unescaped high-bit (bad)
  151. // or escaped high-bit. (also bad)
  152. //
  153. // thus not checking CharToSkip
  154. //
  155. else if ((Char & 0x80) == 0x80)
  156. {
  157. // high bit set ! bad utf!
  158. //
  159. Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
  160. goto end;
  161. }
  162. //
  163. // Normal character (again either escaped or unescaped)
  164. //
  165. else
  166. {
  167. //
  168. // Simple conversion to unicode, it's 7-bit ascii.
  169. //
  170. UnicodeChar = (USHORT)Char;
  171. }
  172. //
  173. // turn backslashes into forward slashes
  174. //
  175. if (UrlPart != QueryString && UnicodeChar == L'\\')
  176. {
  177. UnicodeChar = L'/';
  178. }
  179. else if (UnicodeChar == 0)
  180. {
  181. //
  182. // we pop'd a NULL. bad!
  183. //
  184. Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
  185. goto end;
  186. }
  187. *pCharToSkip = CharToSkip;
  188. *pUnicodeChar = UnicodeChar;
  189. Status = STATUS_SUCCESS;
  190. end:
  191. return Status;
  192. } // PopChar
  193. //
  194. // Private constants.
  195. //
  196. #define ACTION_NOTHING 0x00000000
  197. #define ACTION_EMIT_CH 0x00010000
  198. #define ACTION_EMIT_DOT_CH 0x00020000
  199. #define ACTION_EMIT_DOT_DOT_CH 0x00030000
  200. #define ACTION_BACKUP 0x00040000
  201. #define ACTION_MASK 0xFFFF0000
  202. //
  203. // Private globals
  204. //
  205. //
  206. // this table says what to do based on the current state and the current
  207. // character
  208. //
  209. ULONG pActionTable[16] =
  210. {
  211. //
  212. // state 0 = fresh, seen nothing exciting yet
  213. //
  214. ACTION_EMIT_CH, // other = emit it state = 0
  215. ACTION_EMIT_CH, // "." = emit it state = 0
  216. ACTION_NOTHING, // EOS = normal finish state = 4
  217. ACTION_EMIT_CH, // "/" = we saw the "/", emit it state = 1
  218. //
  219. // state 1 = we saw a "/" !
  220. //
  221. ACTION_EMIT_CH, // other = emit it, state = 0
  222. ACTION_NOTHING, // "." = eat it, state = 2
  223. ACTION_NOTHING, // EOS = normal finish state = 4
  224. ACTION_NOTHING, // "/" = extra slash, eat it, state = 1
  225. //
  226. // state 2 = we saw a "/" and ate a "." !
  227. //
  228. ACTION_EMIT_DOT_CH, // other = emit the dot we ate. state = 0
  229. ACTION_NOTHING, // "." = eat it, a .. state = 3
  230. ACTION_NOTHING, // EOS = normal finish state = 4
  231. ACTION_NOTHING, // "/" = we ate a "/./", swallow it state = 1
  232. //
  233. // state 3 = we saw a "/" and ate a ".." !
  234. //
  235. ACTION_EMIT_DOT_DOT_CH, // other = emit the "..". state = 0
  236. ACTION_EMIT_DOT_DOT_CH, // "." = 3 dots, emit the ".." state = 0
  237. ACTION_BACKUP, // EOS = we have a "/..\0", backup! state = 4
  238. ACTION_BACKUP // "/" = we have a "/../", backup! state = 1
  239. };
  240. //
  241. // this table says which newstate to be in given the current state and the
  242. // character we saw
  243. //
  244. ULONG pNextStateTable[16] =
  245. {
  246. // state 0
  247. 0 , // other
  248. 0 , // "."
  249. 4 , // EOS
  250. 1 , // "\"
  251. // state 1
  252. 0 , // other
  253. 2 , // "."
  254. 4 , // EOS
  255. 1 , // "\"
  256. // state 2
  257. 0 , // other
  258. 3 , // "."
  259. 4 , // EOS
  260. 1 , // "\"
  261. // state 3
  262. 0 , // other
  263. 0 , // "."
  264. 4 , // EOS
  265. 1 // "\"
  266. };
  267. //
  268. // this says how to index into pNextStateTable given our current state.
  269. //
  270. // since max states = 4, we calculate the index by multiplying with 4.
  271. //
  272. #define IndexFromState( st) ( (st) * 4)
  273. /***************************************************************************++
  274. Routine Description:
  275. Unescape
  276. Convert backslash to forward slash
  277. Remove double slashes (empty directiories names) - e.g. // or \\
  278. Handle /./
  279. Handle /../
  280. Convert to unicode
  281. Arguments:
  282. Return Value:
  283. HRESULT
  284. --***************************************************************************/
  285. HRESULT
  286. UlCleanAndCopyUrl(
  287. IN PUCHAR pSource,
  288. IN ULONG SourceLength,
  289. OUT PULONG pBytesCopied,
  290. OUT PWSTR pDestination,
  291. OUT PWSTR * ppQueryString OPTIONAL
  292. )
  293. {
  294. NTSTATUS Status;
  295. PWSTR pDest;
  296. PUCHAR pChar;
  297. ULONG CharToSkip;
  298. UCHAR Char;
  299. ULONG BytesCopied;
  300. PWSTR pQueryString;
  301. ULONG StateIndex;
  302. WCHAR UnicodeChar;
  303. BOOLEAN MakeCanonical;
  304. URL_PART UrlPart = AbsPath;
  305. //
  306. // a cool local helper macro
  307. //
  308. #define EMIT_CHAR(ch) \
  309. do { \
  310. pDest[0] = (ch); \
  311. pDest += 1; \
  312. BytesCopied += 2; \
  313. } while (0)
  314. pDest = pDestination;
  315. pQueryString = NULL;
  316. BytesCopied = 0;
  317. pChar = pSource;
  318. CharToSkip = 0;
  319. StateIndex = 0;
  320. MakeCanonical = (UrlPart == AbsPath) ? TRUE : FALSE;
  321. while (SourceLength > 0)
  322. {
  323. //
  324. // advance ! it's at the top of the loop to enable ANSI_NULL to
  325. // come through ONCE
  326. //
  327. pChar += CharToSkip;
  328. SourceLength -= CharToSkip;
  329. //
  330. // well? have we hit the end?
  331. //
  332. if (SourceLength == 0)
  333. {
  334. UnicodeChar = UNICODE_NULL;
  335. }
  336. else
  337. {
  338. //
  339. // Nope. Peek briefly to see if we hit the query string
  340. //
  341. if (UrlPart == AbsPath && pChar[0] == '?')
  342. {
  343. DBG_ASSERT(pQueryString == NULL);
  344. //
  345. // remember it's location
  346. //
  347. pQueryString = pDest;
  348. //
  349. // let it fall through ONCE to the canonical
  350. // in order to handle a trailing "/.." like
  351. // "http://foobar:80/foo/bar/..?v=1&v2"
  352. //
  353. UnicodeChar = L'?';
  354. CharToSkip = 1;
  355. //
  356. // now we are cleaning the query string
  357. //
  358. UrlPart = QueryString;
  359. }
  360. else
  361. {
  362. //
  363. // grab the next char
  364. //
  365. Status = PopChar(UrlPart, pChar, &UnicodeChar, &CharToSkip);
  366. if (NT_SUCCESS(Status) == FALSE)
  367. goto end;
  368. }
  369. }
  370. if (MakeCanonical)
  371. {
  372. //
  373. // now use the state machine to make it canonical .
  374. //
  375. //
  376. // from the old value of StateIndex, figure out our new base StateIndex
  377. //
  378. StateIndex = IndexFromState(pNextStateTable[StateIndex]);
  379. //
  380. // did we just hit the query string? this will only happen once
  381. // that we take this branch after hitting it, as we stop
  382. // processing after hitting it.
  383. //
  384. if (UrlPart == QueryString)
  385. {
  386. //
  387. // treat this just like we hit a NULL, EOS.
  388. //
  389. StateIndex += 2;
  390. }
  391. else
  392. {
  393. //
  394. // otherwise based the new state off of the char we
  395. // just popped.
  396. //
  397. switch (UnicodeChar)
  398. {
  399. case UNICODE_NULL: StateIndex += 2; break;
  400. case L'.': StateIndex += 1; break;
  401. case L'/': StateIndex += 3; break;
  402. default: StateIndex += 0; break;
  403. }
  404. }
  405. }
  406. else
  407. {
  408. StateIndex = (UnicodeChar == UNICODE_NULL) ? 2 : 0;
  409. }
  410. //
  411. // Perform the action associated with the state.
  412. //
  413. switch (pActionTable[StateIndex])
  414. {
  415. case ACTION_EMIT_DOT_DOT_CH:
  416. EMIT_CHAR(L'.');
  417. // fall through
  418. case ACTION_EMIT_DOT_CH:
  419. EMIT_CHAR(L'.');
  420. // fall through
  421. case ACTION_EMIT_CH:
  422. EMIT_CHAR(UnicodeChar);
  423. // fall through
  424. case ACTION_NOTHING:
  425. break;
  426. case ACTION_BACKUP:
  427. //
  428. // pDest currently points 1 past the last '/'. backup over it and
  429. // find the preceding '/', set pDest to 1 past that one.
  430. //
  431. //
  432. // backup to the '/'
  433. //
  434. pDest -= 1;
  435. BytesCopied -= 2;
  436. DBG_ASSERT(pDest[0] == L'/');
  437. //
  438. // are we at the start of the string? that's bad, can't go back!
  439. //
  440. if (pDest == pDestination)
  441. {
  442. DBG_ASSERT(BytesCopied == 0);
  443. Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
  444. goto end;
  445. }
  446. //
  447. // back up over the '/'
  448. //
  449. pDest -= 1;
  450. BytesCopied -= 2;
  451. DBG_ASSERT(pDest > pDestination);
  452. //
  453. // now find the previous slash
  454. //
  455. while (pDest > pDestination && pDest[0] != L'/')
  456. {
  457. pDest -= 1;
  458. BytesCopied -= 2;
  459. }
  460. //
  461. // we already have a slash, so don't have to store 1.
  462. //
  463. DBG_ASSERT(pDest[0] == L'/');
  464. //
  465. // simply skip it, as if we had emitted it just now
  466. //
  467. pDest += 1;
  468. BytesCopied += 2;
  469. break;
  470. default:
  471. DBG_ASSERT(!"w3core!UlpCleanAndCopyUrl: Invalid action code in state table!");
  472. Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
  473. goto end;
  474. }
  475. //
  476. // Just hit the query string ?
  477. //
  478. if (MakeCanonical && UrlPart == QueryString)
  479. {
  480. //
  481. // Stop canonical processing
  482. //
  483. MakeCanonical = FALSE;
  484. //
  485. // Need to emit the '?', it wasn't emitted above
  486. //
  487. DBG_ASSERT(pActionTable[StateIndex] != ACTION_EMIT_CH);
  488. EMIT_CHAR(L'?');
  489. }
  490. }
  491. //
  492. // terminate the string, it hasn't been done in the loop
  493. //
  494. DBG_ASSERT((pDest-1)[0] != UNICODE_NULL);
  495. pDest[0] = UNICODE_NULL;
  496. *pBytesCopied = BytesCopied;
  497. if (ppQueryString != NULL)
  498. {
  499. *ppQueryString = pQueryString;
  500. }
  501. Status = STATUS_SUCCESS;
  502. end:
  503. return HRESULT_FROM_WIN32( RtlNtStatusToDosError( Status ) );
  504. } // UlCleanAndCopyUrl