Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

450 lines
8.4 KiB

  1. /*
  2. * XML support functions
  3. * Copyright (C) 2000 Microsoft Corporation
  4. */
  5. #include "precomp.h"
  6. BOOL FIsXmlWhitespaceW(WCHAR wch)
  7. {
  8. return((wch == L' ') || (wch == L'\x9') || (wch == L'\xA') || (wch == L'\xD'));
  9. }
  10. BOOL FIsXmlWhitespaceA(char ch)
  11. {
  12. return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));
  13. }
  14. BOOL FIsXmlA(LPCSTR rgch, UINT cch)
  15. {
  16. if (memcmp(rgch, "<?xml", 5) != 0)
  17. {
  18. // Not XML
  19. return(FALSE);
  20. }
  21. return(TRUE);
  22. UNREFERENCED_PARAMETER( cch );
  23. }
  24. BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)
  25. {
  26. if (memcmp(rgwch, L"<?xml", 5 * sizeof(WCHAR)) != 0)
  27. {
  28. // Not XML
  29. return(FALSE);
  30. }
  31. return(TRUE);
  32. UNREFERENCED_PARAMETER( cch );
  33. }
  34. BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
  35. {
  36. LPCSTR pchMax;
  37. LPCSTR pch;
  38. char chQuote;
  39. // XML files encoded in UTF-16 are required to have a BOM which if present
  40. // would already have been detected. This means that if this file is XML
  41. // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
  42. // encoding of some form. We check for ASCII compatible encodings only
  43. // which includes everything we probably care about but excludes EBCDIC.
  44. // Check for file begining with <?xml ... encoding='...' ... ?>
  45. if (cch < 20)
  46. {
  47. // File is too small
  48. return(FALSE);
  49. }
  50. if (!FIsXmlA(rgch, cch))
  51. {
  52. // Not XML
  53. return(FALSE);
  54. }
  55. // Don't scan more than 4K looking for encoding even if it is valid XML
  56. cch = __min(cch, 4096);
  57. pchMax = rgch + cch;
  58. pch = rgch + 5;
  59. if (!FIsXmlWhitespaceA(*pch))
  60. {
  61. // Not XML
  62. return(FALSE);
  63. }
  64. pch++;
  65. chQuote = '\0';
  66. for (;;)
  67. {
  68. LPCSTR pchToken;
  69. if (pch == pchMax)
  70. {
  71. // Not XML
  72. break;
  73. }
  74. if (FIsXmlWhitespaceA(*pch))
  75. {
  76. pch++;
  77. continue;
  78. }
  79. if (*pch == '=')
  80. {
  81. pch++;
  82. continue;
  83. }
  84. if ((*pch == '\'') || (*pch == '"'))
  85. {
  86. if (*pch == chQuote)
  87. {
  88. chQuote = '\0';
  89. }
  90. else
  91. {
  92. chQuote = *pch;
  93. }
  94. pch++;
  95. continue;
  96. }
  97. if (chQuote != '\0')
  98. {
  99. // We are within a quoted string. Skip everything until closing quote.
  100. pch++;
  101. continue;
  102. }
  103. if ((pch + 2) > pchMax)
  104. {
  105. // Not XML
  106. break;
  107. }
  108. if ((pch[0] == '?') && (pch[1] == '>'))
  109. {
  110. // This looks like XML. At this point if we don't find an encoding
  111. // specification we could assume UTF-8. We don't because there are
  112. // malformed XML documents and assuming UTF-8 might affect Notepad
  113. // compatibility. This may be fine but we put it off for now.
  114. // *pcp = CP_UTF8;
  115. // return(TRUE);
  116. break;
  117. }
  118. pchToken = pch;
  119. while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
  120. {
  121. pch++;
  122. }
  123. if (pch != (pchToken + 8))
  124. {
  125. continue;
  126. }
  127. if (memcmp(pchToken, "encoding", 8) != 0)
  128. {
  129. continue;
  130. }
  131. while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
  132. {
  133. pch++;
  134. }
  135. if ((pch == pchMax) || (*pch++ != '='))
  136. {
  137. // Not XML
  138. break;
  139. }
  140. while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
  141. {
  142. pch++;
  143. }
  144. if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
  145. {
  146. // Not XML
  147. break;
  148. }
  149. chQuote = *pch++;
  150. pchToken = pch;
  151. while ((pch < pchMax) && (*pch != chQuote))
  152. {
  153. pch++;
  154. }
  155. if (pch == pchMax)
  156. {
  157. // Not XML
  158. break;
  159. }
  160. // We have an XML encoding declaration from pchToken to (pch - 1)
  161. if (pch == pchToken)
  162. {
  163. // Not XML
  164. break;
  165. }
  166. if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
  167. {
  168. // Encoding is not recognized
  169. break;
  170. }
  171. if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
  172. {
  173. // These are bogus since we know the file is MBCS
  174. break;
  175. }
  176. return(FValidateCodepage(hwndNP, *pcp));
  177. }
  178. return(FALSE);
  179. }
  180. BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
  181. {
  182. const WCHAR *pchMax;
  183. const WCHAR *pch;
  184. WCHAR chQuote;
  185. // XML files encoded in UTF-16 are required to have a BOM which if present
  186. // would already have been detected. This means that if this file is XML
  187. // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
  188. // encoding of some form. We check for ASCII compatible encodings only
  189. // which includes everything we probably care about but excludes EBCDIC.
  190. // Check for file begining with <?xml ... encoding='...' ... ?>
  191. if (cch < 20)
  192. {
  193. // File is too small
  194. return(FALSE);
  195. }
  196. if (!FIsXmlW(rgch, cch))
  197. {
  198. // Not XML
  199. return(FALSE);
  200. }
  201. // Don't scan more than 4K looking for encoding even if it is valid XML
  202. cch = __min(cch, 4096);
  203. pchMax = rgch + cch;
  204. pch = rgch + 5;
  205. if (!FIsXmlWhitespaceW(*pch))
  206. {
  207. // Not XML
  208. return(FALSE);
  209. }
  210. pch++;
  211. chQuote = L'\0';
  212. for (;;)
  213. {
  214. const WCHAR *pchToken;
  215. if (pch == pchMax)
  216. {
  217. // Not XML
  218. break;
  219. }
  220. if (FIsXmlWhitespaceW(*pch))
  221. {
  222. pch++;
  223. continue;
  224. }
  225. if (*pch == L'=')
  226. {
  227. pch++;
  228. continue;
  229. }
  230. if ((*pch == L'\'') || (*pch == L'"'))
  231. {
  232. if (*pch == chQuote)
  233. {
  234. chQuote = L'\0';
  235. }
  236. else
  237. {
  238. chQuote = *pch;
  239. }
  240. pch++;
  241. continue;
  242. }
  243. if (chQuote != L'\0')
  244. {
  245. // We are within a quoted string. Skip everything until closing quote.
  246. pch++;
  247. continue;
  248. }
  249. if ((pch + 2) > pchMax)
  250. {
  251. // Not XML
  252. break;
  253. }
  254. if ((pch[0] == L'?') && (pch[1] == L'>'))
  255. {
  256. // This looks like XML. At this point if we don't find an encoding
  257. // specification we could assume UTF-8. We don't because there are
  258. // malformed XML documents and assuming UTF-8 might affect Notepad
  259. // compatibility. This may be fine but we put it off for now.
  260. // *pcp = CP_UTF8;
  261. // return(TRUE);
  262. break;
  263. }
  264. pchToken = pch;
  265. while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
  266. {
  267. pch++;
  268. }
  269. if (pch != (pchToken + 8))
  270. {
  271. continue;
  272. }
  273. if (memcmp(pchToken, L"encoding", 8) != 0)
  274. {
  275. continue;
  276. }
  277. while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
  278. {
  279. pch++;
  280. }
  281. if ((pch == pchMax) || (*pch++ != L'='))
  282. {
  283. // Not XML
  284. break;
  285. }
  286. while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
  287. {
  288. pch++;
  289. }
  290. if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
  291. {
  292. // Not XML
  293. break;
  294. }
  295. chQuote = *pch++;
  296. pchToken = pch;
  297. while ((pch < pchMax) && (*pch != chQuote))
  298. {
  299. pch++;
  300. }
  301. if (pch == pchMax)
  302. {
  303. // Not XML
  304. break;
  305. }
  306. // We have an XML encoding declaration from pchToken to (pch - 1)
  307. if (pch == pchToken)
  308. {
  309. // Not XML
  310. break;
  311. }
  312. if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
  313. {
  314. // Encoding is not recognized
  315. break;
  316. }
  317. #if 0
  318. if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
  319. {
  320. // These are bogus since we know the file is MBCS
  321. break;
  322. }
  323. #endif
  324. return(FValidateCodepage(hwndNP, *pcp));
  325. }
  326. return(FALSE);
  327. }