Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

910 lines
28 KiB

  1. //------------------------------------------------------------------------
  2. //
  3. // Tabular Data Control Parsing Module
  4. // Copyright (C) Microsoft Corporation, 1996, 1997
  5. //
  6. // File: TDCParse.cpp
  7. //
  8. // Contents: Implementation of CTDCParse classes.
  9. //
  10. //------------------------------------------------------------------------
  11. #include "stdafx.h"
  12. #include <simpdata.h>
  13. #include "TDC.h"
  14. #include <MLang.h>
  15. #include "Notify.h"
  16. #include "TDCParse.h"
  17. #include "TDCArr.h"
  18. #include "locale.h"
  19. #include "wch.h"
  20. //#ifndef DISPID_AMBIENT_CODEPAGE
  21. //#define DISPID_AMBIENT_CODEPAGE (-725)
  22. //#endif
  23. #define BYTE_ORDER_MARK 0xFEFF
  24. #define REVERSE_BYTE_ORDER_MARK 0xFFFE
  25. //------------------------------------------------------------------------
  26. //
  27. // Function: IsSpace()
  28. //
  29. // Synopsis: Returns TRUE if the given character is a space or tab character.
  30. //
  31. // Arguments: ch Character to test.
  32. //
  33. // Returns: TRUE if 'ch' is a space or tab character.
  34. // FALSE otherwise.
  35. //
  36. //------------------------------------------------------------------------
  37. inline boolean IsSpace(WCHAR ch)
  38. {
  39. return (ch == L' ' || ch == L'\t');
  40. }
  41. //////////////////////////////////////////////////////////////////////////
  42. //
  43. // CTDCTokenise Class - see comments in file TDCParse.h
  44. // ------------------
  45. //////////////////////////////////////////////////////////////////////////
  46. //------------------------------------------------------------------------
  47. //
  48. // Method: CTDCTokenise::Create()
  49. //
  50. // Synopsis: Initialise the CTDCTokenise object
  51. //
  52. // Arguments: pFieldSink Object to send parsed fields to.
  53. // wchDelimField \
  54. // wchDelimRow | Set of characters that control
  55. // wchQuote | the parsing of fields
  56. // wchEscape /
  57. //
  58. // Returns: S_OK indicating success.
  59. //
  60. //------------------------------------------------------------------------
  61. HRESULT CTDCUnify::InitTokenizer(CTDCFieldSink *pFieldSink, WCHAR wchDelimField,
  62. WCHAR wchDelimRow, WCHAR wchQuote, WCHAR wchEscape)
  63. {
  64. _ASSERT(pFieldSink != NULL);
  65. m_pFieldSink = pFieldSink;
  66. m_wchDelimField = wchDelimField;
  67. m_wchDelimRow = wchDelimRow;
  68. m_wchQuote = wchQuote;
  69. m_wchEscape = wchEscape;
  70. m_ucParsed = 0;
  71. m_fIgnoreNextLF = FALSE;
  72. m_fIgnoreNextCR = FALSE;
  73. m_fIgnoreNextWhiteSpace = FALSE;
  74. m_fEscapeActive = FALSE;
  75. m_fQuoteActive = FALSE;
  76. m_fFoldWhiteSpace = FALSE;
  77. // Ensure that the field and row delimiters are set.
  78. //
  79. if (m_wchDelimRow == 0)
  80. m_wchDelimRow = DEFAULT_ROW_DELIM[0];
  81. // Remove conflicting delimiter values
  82. //
  83. if (m_wchDelimRow == m_wchDelimField)
  84. m_wchDelimRow = 0;
  85. if (m_wchQuote != 0)
  86. {
  87. if (m_wchQuote == m_wchDelimField || m_wchQuote == m_wchDelimRow)
  88. m_wchQuote = 0;
  89. }
  90. if (m_wchEscape != 0)
  91. {
  92. if (m_wchEscape == m_wchDelimField ||
  93. m_wchEscape == m_wchDelimRow ||
  94. m_wchEscape == m_wchQuote)
  95. m_wchEscape = 0;
  96. }
  97. m_fFoldCRLF = (m_wchDelimRow == L'\r' || m_wchDelimRow == L'\n');
  98. return S_OK;
  99. }
  100. //------------------------------------------------------------------------
  101. //
  102. // Method: CTDCTokenise::AddWcharBuffer()
  103. //
  104. // Synopsis: Takes a buffer of characters, breaks it up into fields
  105. // and passes them to the embedded CTDCFieldSink object
  106. // as fields.
  107. //
  108. // Arguments: pwch Buffer containing characters to be parsed.
  109. // dwSize Number of significant characters in 'pwch'
  110. // dwSize == 0 means "End-of-stream"
  111. //
  112. // Returns: S_OK upon success.
  113. // E_OUTOFMEMORY indicating insufficient memory to carry
  114. // out the parse operation.
  115. // Other misc error code upon failure.
  116. //
  117. //------------------------------------------------------------------------
  118. HRESULT CTDCUnify::AddWcharBuffer(BOOL fLastData)
  119. {
  120. OutputDebugStringX(_T("CTDCTokenise::AddWcharBuffer called\n"));
  121. _ASSERT(m_pFieldSink != NULL);
  122. HRESULT hr = S_OK;
  123. LPWCH pwchCurr; // Next character to process
  124. LPWCH pwchEnd; // End-of-buffer marker
  125. LPWCH pwchDest; // Where to write next char processed
  126. LPWCH pwchStart; // Beginning of current token
  127. pwchStart = &m_psWcharBuf[0];
  128. pwchCurr = pwchStart + m_ucParsed;
  129. pwchDest = pwchCurr;
  130. pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];
  131. // Read up to the next field boundary (field or row delimiter)
  132. //
  133. while (pwchCurr < pwchEnd)
  134. {
  135. // Security: If we see a null character, it's not a text file. Abort the
  136. // download, so that no one can use the TDC to download .exe's or other
  137. // binary files.
  138. if (*pwchCurr == 0)
  139. {
  140. hr = E_ABORT;
  141. goto Cleanup;
  142. }
  143. if (m_fIgnoreNextLF)
  144. {
  145. // We're expecting a LF to terminate a CR-LF sequence.
  146. //
  147. m_fIgnoreNextLF = FALSE;
  148. if (*pwchCurr == L'\n')
  149. {
  150. // Found a LF - ignore it
  151. //
  152. pwchCurr++;
  153. continue;
  154. }
  155. // Found something else - carry on ...
  156. //
  157. }
  158. if (m_fIgnoreNextCR)
  159. {
  160. // We're expecting a CR to terminate a LF-CR sequence.
  161. //
  162. m_fIgnoreNextCR = FALSE;
  163. if (*pwchCurr == L'\r')
  164. {
  165. // Found a CR - ignore it
  166. //
  167. pwchCurr++;
  168. continue;
  169. }
  170. // Found something else - carry on ...
  171. //
  172. }
  173. if (m_fIgnoreNextWhiteSpace)
  174. {
  175. // We're expecting the rest of a white-space sequence
  176. //
  177. if (IsSpace(*pwchCurr))
  178. {
  179. // Found white-space - ignore it
  180. //
  181. pwchCurr++;
  182. continue;
  183. }
  184. m_fIgnoreNextWhiteSpace = FALSE;
  185. }
  186. // Escape characters work, even in quoted strings
  187. //
  188. if (m_fEscapeActive)
  189. {
  190. *pwchDest++ = *pwchCurr++;
  191. m_fEscapeActive = FALSE;
  192. continue;
  193. }
  194. if (*pwchCurr == m_wchEscape)
  195. {
  196. pwchCurr++;
  197. m_fEscapeActive = TRUE;
  198. continue;
  199. }
  200. // Quotes activate/deactivate Field/Row delimiters
  201. //
  202. if (*pwchCurr == m_wchQuote)
  203. {
  204. pwchCurr++;
  205. m_fQuoteActive = !m_fQuoteActive;
  206. continue;
  207. }
  208. if (m_fQuoteActive)
  209. {
  210. *pwchDest++ = *pwchCurr++;
  211. continue;
  212. }
  213. if (*pwchCurr == m_wchDelimField ||
  214. (m_fFoldWhiteSpace && IsSpace(*pwchCurr)))
  215. {
  216. hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
  217. if (!SUCCEEDED(hr))
  218. goto Cleanup;
  219. pwchCurr++;
  220. if (m_fFoldWhiteSpace && IsSpace(*pwchCurr))
  221. m_fIgnoreNextWhiteSpace = TRUE;
  222. pwchStart = &m_psWcharBuf[0];
  223. pwchDest = pwchStart;
  224. continue;
  225. }
  226. if (*pwchCurr == m_wchDelimRow ||
  227. (m_fFoldCRLF && (*pwchCurr == L'\r' || *pwchCurr == L'\n')))
  228. {
  229. hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
  230. if (!SUCCEEDED(hr))
  231. goto Cleanup;
  232. hr = m_pFieldSink->EOLN();
  233. if (!SUCCEEDED(hr))
  234. goto Cleanup;
  235. if (m_fFoldCRLF)
  236. {
  237. m_fIgnoreNextLF = (*pwchCurr == L'\r');
  238. m_fIgnoreNextCR = (*pwchCurr == L'\n');
  239. }
  240. pwchCurr++;
  241. pwchStart = &m_psWcharBuf[0];
  242. pwchDest = pwchStart;
  243. continue;
  244. }
  245. *pwchDest++ = *pwchCurr++;
  246. }
  247. m_ucWcharBufCount = pwchDest - pwchStart;
  248. m_ucParsed = pwchDest - pwchStart; // amount we've already parsed
  249. // If this is the last data packet, and there's a fragment left,
  250. // parse it.
  251. if (m_ucWcharBufCount && fLastData)
  252. {
  253. hr = m_pFieldSink->AddField(pwchStart, m_ucParsed);
  254. if (!SUCCEEDED(hr))
  255. goto Cleanup;
  256. m_ucParsed = 0;
  257. hr = m_pFieldSink->EOLN();
  258. return hr;
  259. }
  260. Cleanup:
  261. return hr;
  262. }
  263. //////////////////////////////////////////////////////////////////////////
  264. //
  265. // CTDCUnify Class - see comments in file TDCParse.h
  266. // ---------------
  267. //////////////////////////////////////////////////////////////////////////
  268. //------------------------------------------------------------------------
  269. //
  270. // Method: CTDCUnify::CTDCUnify()
  271. //
  272. // Synopsis: Constuctor
  273. //
  274. //------------------------------------------------------------------------
  275. CTDCUnify::CTDCUnify()
  276. {
  277. m_pML = NULL;
  278. }
  279. //------------------------------------------------------------------------
  280. //
  281. // Method: CTDCUnify::~CTDCUnify()
  282. //
  283. // Synopsis: Destructor
  284. //
  285. //------------------------------------------------------------------------
  286. CTDCUnify::~CTDCUnify()
  287. {
  288. delete [] m_psByteBuf;
  289. delete [] m_psWcharBuf;
  290. if (m_pML != NULL)
  291. m_pML->Release();
  292. }
  293. //------------------------------------------------------------------------
  294. //
  295. // Method: CTDCUnify::Create()
  296. //
  297. // Synopsis: Initialise the CTDCUnify object
  298. //
  299. // Arguments: pTokenise Object to send converted buffers to.
  300. // nCodePage Code page for ASCII->Unicode conversions
  301. // pML MLANG COM object (used for conversions)
  302. //
  303. // Returns: S_OK to indicate success.
  304. //
  305. //------------------------------------------------------------------------
  306. HRESULT CTDCUnify::Create(UINT nCodePage, UINT nAmbientCodePage, IMultiLanguage *pML)
  307. {
  308. m_pML = pML;
  309. m_pML->AddRef();
  310. m_nCodePage = nCodePage;
  311. m_nAmbientCodePage = nAmbientCodePage;
  312. m_fDataMarkedUnicode = FALSE;
  313. m_fDataIsUnicode = FALSE;
  314. m_dwBytesProcessed = 0;
  315. m_fCanConvertToUnicode = 0;
  316. m_nUnicode = 0;
  317. m_fProcessedAllowDomainList = FALSE;
  318. m_dwConvertMode = 0;
  319. m_ucByteBufSize = 0;
  320. m_ucByteBufCount = 0;
  321. m_psByteBuf = NULL;
  322. m_ucWcharBufSize = 0;
  323. m_ucWcharBufCount = 0;
  324. m_psWcharBuf = NULL;
  325. if (m_nCodePage && S_OK != m_pML->IsConvertible(m_nCodePage, UNICODE_CP))
  326. {
  327. m_nCodePage = 0;
  328. }
  329. if (m_nAmbientCodePage && S_OK != m_pML->IsConvertible(m_nAmbientCodePage, UNICODE_CP))
  330. {
  331. m_nAmbientCodePage = 0;
  332. }
  333. return S_OK;
  334. }
  335. //------------------------------------------------------------------------
  336. //
  337. // Method: CTDCUnify::IsUnicode
  338. //
  339. // Synopsis: Determines if our text buffer is Unicode or not. Should
  340. // only be called once on the FIRST text buffer.
  341. //
  342. // Assume if the data is marked as Unicode, that it's correct.
  343. //
  344. // The determination this routine makes will override any
  345. // single byte codepage the user may have specified.
  346. //
  347. //
  348. // Arguments: pBytes Buffer containing characters to be converted.
  349. // dwSize Number of significant characters in 'pBytes'
  350. //
  351. // Returns: Code page of text, or zero if not Unicode (UNICODE_CP,
  352. // UNICODE_REVERSE_CP, or 0)
  353. //
  354. //
  355. //------------------------------------------------------------------------
  356. int
  357. CTDCUnify::IsUnicode(BYTE * pBytes, DWORD dwSize)
  358. {
  359. if (BYTE_ORDER_MARK == *(WCHAR *)pBytes)
  360. return UNICODE_CP;
  361. if (REVERSE_BYTE_ORDER_MARK == *(WCHAR *)pBytes)
  362. return UNICODE_REVERSE_CP;
  363. else return 0;
  364. }
  365. //------------------------------------------------------------------------
  366. //
  367. // Method: CTDCUnify::ConvertByteBuffer()
  368. //
  369. // Synopsis: Converts a byte-buffer into a wide-character stream
  370. // (applying unicode conversions if necessary) and passes
  371. // it to the embedded TDCTokenise object to be broken into
  372. // fields.
  373. //
  374. // Arguments: pBytes Buffer containing characters to be converted.
  375. // dwSize Number of significant characters in 'pBytes'
  376. // dwSize == 0 means "End-of-stream"
  377. //
  378. // Returns: S_OK upon success.
  379. // S_FALSE if not enough data has shown up yet to be useful
  380. // OLE_E_CANTCONVERT if a non-unicode buffer can't be
  381. // converted into unicode.
  382. // E_OUTOFMEMORY if there isn't enough memory to perform
  383. // a data conversion.
  384. //
  385. //------------------------------------------------------------------------
  386. HRESULT CTDCUnify::ConvertByteBuffer(BYTE *pBytes, DWORD dwSize)
  387. {
  388. OutputDebugStringX(_T("CTDCUnify::ConvertByteBuffer called\n"));
  389. _ASSERT(pBytes != NULL || dwSize == 0);
  390. HRESULT hr = S_OK;
  391. UINT ucBytes;
  392. UINT ucWchars;
  393. // Is there enough space in Byte buffer for this packet?
  394. if (dwSize > (m_ucByteBufSize - m_ucByteBufCount))
  395. {
  396. // No, the current buffer is too small, make a new one.
  397. BYTE * psTemp = new BYTE[m_ucByteBufCount + dwSize];
  398. if (psTemp==NULL)
  399. {
  400. hr = E_OUTOFMEMORY;
  401. goto Done;
  402. }
  403. if (m_psByteBuf != NULL) // if not first time
  404. {
  405. memmove(psTemp, m_psByteBuf, m_ucByteBufCount);
  406. delete [] m_psByteBuf;
  407. }
  408. m_ucByteBufSize = m_ucByteBufCount + dwSize;
  409. m_psByteBuf = psTemp;
  410. }
  411. // Append the new data to the old data.
  412. memmove(m_psByteBuf + m_ucByteBufCount, pBytes, dwSize);
  413. m_ucByteBufCount += dwSize;
  414. // Is there enough space in the Wchar buffer for the converted data?
  415. // We make a very conservative assumption here that N source buffer bytes
  416. // convert to N Wchar buffer chars (or 2*N bytes). This will ensure that
  417. // our call to ConvertToUnicode will never not finish because there wasn't
  418. // enough room in the output buffer.
  419. if (m_ucByteBufCount > (m_ucWcharBufSize - m_ucWcharBufCount))
  420. {
  421. // The current buffer is too small, make a new one.
  422. WCHAR * psTemp = new WCHAR[m_ucWcharBufCount + m_ucByteBufCount];
  423. if (psTemp==NULL)
  424. {
  425. hr = E_OUTOFMEMORY;
  426. goto Done;
  427. }
  428. if (m_psWcharBuf != NULL) // if not first time
  429. {
  430. memmove(psTemp, m_psWcharBuf,
  431. m_ucWcharBufCount*sizeof(WCHAR));
  432. delete [] m_psWcharBuf;
  433. }
  434. m_psWcharBuf = psTemp;
  435. m_ucWcharBufSize = m_ucWcharBufCount + m_ucByteBufCount;
  436. }
  437. if (0 == m_dwBytesProcessed)
  438. {
  439. // if we can't determine the codepage yet, try again later
  440. if (!DetermineCodePage(dwSize==0))
  441. {
  442. hr = S_FALSE;
  443. goto Done;
  444. }
  445. }
  446. // Convert as many source bytes as we can to Unicode chars
  447. ucBytes = m_ucByteBufCount;
  448. ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;
  449. // ConvertStringToUnicode won't convert Unicode to Unicode for us.
  450. // So we'll do it ourselves.
  451. if (m_nUnicode)
  452. {
  453. _ASSERT( ucWchars * sizeof(WCHAR) >= ucBytes);
  454. // This might copy an odd extra byte
  455. memmove((BYTE *)(m_psWcharBuf + m_ucWcharBufCount), m_psByteBuf,
  456. ucBytes);
  457. // But we only count the number of complete WCHAR's we copied.
  458. ucWchars = ucBytes / sizeof(WCHAR);
  459. ucBytes = ucWchars * sizeof(WCHAR);
  460. if (UNICODE_REVERSE_CP == m_nUnicode)
  461. {
  462. // need to byte swap
  463. BYTE *pByteSwap = (BYTE *)(m_psWcharBuf + m_ucWcharBufCount);
  464. BYTE bTemp;
  465. for (ULONG i = ucWchars; i != 0; i--)
  466. {
  467. // Well, OK, we've kind of hardwired WCHAR == 2 here, but ..
  468. bTemp = pByteSwap[0];
  469. pByteSwap[0] = pByteSwap[1];
  470. pByteSwap[1] = bTemp;
  471. pByteSwap += 2;
  472. }
  473. }
  474. // On first packet, need to remove Unicode signature.
  475. // Only need to look for 0xFFFE -- we already swapped bytes.
  476. if (0 == m_dwBytesProcessed && m_psWcharBuf[0] == BYTE_ORDER_MARK)
  477. {
  478. ucWchars--;
  479. memmove((BYTE *)m_psWcharBuf, (BYTE *)m_psWcharBuf+2,
  480. ucWchars*sizeof(ucWchars));
  481. }
  482. }
  483. else
  484. {
  485. hr = m_pML->ConvertStringToUnicode(&m_dwConvertMode, m_nCodePage,
  486. (char *)m_psByteBuf, &ucBytes,
  487. m_psWcharBuf +m_ucWcharBufCount,
  488. &ucWchars);
  489. // Some character(s) failed conversion. The best we can do is
  490. // attempt to skip the character that failed conversion.
  491. if (FAILED(hr))
  492. {
  493. // Did we come back around and try to unconvertable portion again?
  494. if (ucBytes==0)
  495. {
  496. // Yes, and it made no progress. Skip a char to try to make
  497. // forward progress.
  498. ucBytes++;
  499. }
  500. // We can't return this error, or we won't look a the rest of the
  501. // file.
  502. hr = S_OK;
  503. }
  504. }
  505. // Move any leftover source characters to the start of the buffer.
  506. // These are probably split Unicode chars, lead bytes without trail
  507. // bytes, etc.
  508. m_ucByteBufCount -= ucBytes;
  509. memmove(m_psByteBuf, m_psByteBuf + ucBytes,
  510. m_ucByteBufCount);
  511. // The number of useful chars in the output buf is increased by the
  512. // number we managed to convert.
  513. m_ucWcharBufCount += ucWchars;
  514. m_dwBytesProcessed += ucWchars;
  515. Done:
  516. return hr;
  517. }
  518. //------------------------------------------------------------------------
  519. //
  520. // Method: CTDCUnify::DetermineCodePage()
  521. //
  522. // Synopsis: Figures out what codepage to use to read the data.
  523. // Sets m_nCodePage and m_nUnicode appropriately.
  524. //
  525. // Arguments: fForce determine the answer, no matter what
  526. //
  527. // Returns: TRUE the codepage is determined.
  528. // FALSE not enough data yet to determine
  529. //
  530. //------------------------------------------------------------------------
  531. BOOL
  532. CTDCUnify::DetermineCodePage(BOOL fForce)
  533. {
  534. DWORD dwConvertMode = 0;
  535. HRESULT hr;
  536. UINT ucBytes = m_ucByteBufCount;
  537. UINT ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;
  538. UINT cpDetected;
  539. IMultiLanguage2 *pML2 = NULL;
  540. _ASSERT(m_dwBytesProcessed == 0 && m_pML);
  541. // First look for Unicode. Assume it's not Unicode to start.
  542. m_nUnicode = 0;
  543. // Need at least 2 chars for Unicode signature (0xFFFE or 0xFEFF)
  544. if (m_ucByteBufCount > 1)
  545. {
  546. // If we detect Unicode, it overrides any user specified code page.
  547. m_nUnicode = IsUnicode(m_psByteBuf, m_ucByteBufCount);
  548. if (m_nUnicode)
  549. {
  550. m_nCodePage = m_nUnicode;
  551. return TRUE;
  552. }
  553. // It's not Unicode. If the user specified a code page, use it.
  554. if (m_nCodePage)
  555. {
  556. return TRUE;
  557. }
  558. }
  559. // if we need an answer and user specified a code page, use it
  560. if (fForce && m_nCodePage)
  561. {
  562. return TRUE;
  563. }
  564. // At this point, we have to guess. If we have enough input or if we
  565. // need an answer now, use MLang to do the guessing
  566. if (fForce || m_ucByteBufCount >= CODEPAGE_BYTE_THRESHOLD)
  567. {
  568. // First see if the auto-detect interface is available.
  569. hr = m_pML->QueryInterface(IID_IMultiLanguage2, (void**)&pML2);
  570. if (!hr && pML2)
  571. {
  572. DetectEncodingInfo info[N_DETECTENCODINGINFO];
  573. int nInfo = N_DETECTENCODINGINFO;
  574. // auto-detect
  575. hr = pML2->DetectInputCodepage(
  576. MLDETECTCP_NONE,
  577. CP_ACP,
  578. (char *)m_psByteBuf,
  579. (int*)&ucBytes,
  580. info,
  581. &nInfo);
  582. pML2->Release();
  583. if (!hr)
  584. {
  585. // if one of the returned codepages is "good enough", use it.
  586. for (int i=0; i<nInfo; ++i)
  587. {
  588. if (info[i].nConfidence >= 90 && info[i].nDocPercent >= 90)
  589. {
  590. if (S_OK == m_pML->IsConvertible(info[i].nCodePage, UNICODE_CP))
  591. {
  592. m_nCodePage = info[i].nCodePage;
  593. return TRUE;
  594. }
  595. }
  596. }
  597. }
  598. }
  599. // Try plain old MLang.
  600. // Ask MLang to convert the input using the"auto-detect" codepage.
  601. hr = m_pML->ConvertStringToUnicode(&dwConvertMode, CP_AUTO,
  602. (char *)m_psByteBuf, &ucBytes,
  603. m_psWcharBuf + m_ucWcharBufCount,
  604. &ucWchars);
  605. cpDetected = HIWORD(dwConvertMode);
  606. // if MLang detected a codepage, use it
  607. if (!hr && cpDetected != 0)
  608. {
  609. if (S_OK == m_pML->IsConvertible(cpDetected, UNICODE_CP))
  610. {
  611. m_nCodePage = cpDetected;
  612. return TRUE;
  613. }
  614. }
  615. }
  616. // guessing didn't work. If we don't have to decide now, try again later
  617. if (!fForce)
  618. {
  619. return FALSE;
  620. }
  621. // if we have to decide and all else has failed, use the host page's
  622. // encoding. If even that isn't available, use the machine's ASCII codepage.
  623. m_nCodePage = m_nAmbientCodePage ? m_nAmbientCodePage : GetACP();
  624. // and if this still isn't convertible to Unicode, use windows-1252
  625. if (m_nCodePage == 0 || S_OK != m_pML->IsConvertible(m_nCodePage, UNICODE_CP))
  626. {
  627. m_nCodePage = CP_1252;
  628. }
  629. return TRUE;
  630. }
  631. LPWCH SkipSpace(LPWCH pwchCurr)
  632. {
  633. while (IsSpace(*pwchCurr)) pwchCurr++;
  634. return pwchCurr;
  635. }
  636. static
  637. boolean IsEnd(WCHAR ch)
  638. {
  639. return (ch == 0 || ch == L'\r' || ch == L'\n');
  640. }
  641. static
  642. boolean IsBreak(WCHAR ch)
  643. {
  644. return (ch == L';' || IsEnd(ch));
  645. }
  646. // Returns FALSE if names didn't match.
  647. // Returns TRUE if they did.
  648. // Sets *ppwchAdvance to terminator of the match name
  649. BOOL
  650. MatchName(LPWCH pwchMatchName, LPCWCH pwzHostName, LPWCH *ppwchAdvance)
  651. {
  652. // match from right to left
  653. LPWCH pwchMatchRight = &pwchMatchName[0];
  654. LPCWCH pwchHostRight = &pwzHostName[0] + ocslen(pwzHostName) -1;
  655. // handle empty match name
  656. if (IsBreak(*pwchMatchRight))
  657. {
  658. if (!IsEnd(*pwchMatchRight)) // be sure to advance (unless at end)
  659. ++ pwchMatchRight;
  660. *ppwchAdvance = pwchMatchRight;
  661. return FALSE;
  662. }
  663. // Find end of Match name.
  664. while (!IsBreak(*pwchMatchRight)) pwchMatchRight++;
  665. *ppwchAdvance = pwchMatchRight; // return pointer to terminator
  666. pwchMatchRight--;
  667. while (IsSpace(*pwchMatchRight) && pwchMatchRight >= pwchMatchName)
  668. -- pwchMatchRight; // ignore trailing whitespace
  669. // match full wildcard the easy way
  670. if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*')
  671. return TRUE;
  672. // match right-to-left, stop at mismatch or beginning of either string
  673. for (; pwchMatchRight>=pwchMatchName && pwchHostRight>=pwzHostName;
  674. --pwchMatchRight, --pwchHostRight)
  675. {
  676. if (*pwchMatchRight != *pwchHostRight || *pwchMatchRight == '*')
  677. break;
  678. }
  679. // it's a match if strings matched completely
  680. if (pwchMatchRight+1 == pwchMatchName && pwchHostRight+1 == pwzHostName)
  681. return TRUE;
  682. // or if match name started with "*." and the rest matched a suffix of host name
  683. if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*' &&
  684. pwchMatchRight[1] == '.')
  685. return TRUE;
  686. // otherwise it's not a match
  687. return FALSE;
  688. }
  689. HRESULT
  690. CTDCUnify::MatchAllowDomainList(LPCWSTR pwzURL)
  691. {
  692. HRESULT hr = E_FAIL; // assume failure
  693. LPWCH pwchCurr = &m_psWcharBuf[0];
  694. LPWCH pwchCurr2;
  695. int cchHostDoman = ocslen(pwzURL);
  696. // skip over white space
  697. pwchCurr = SkipSpace(pwchCurr);
  698. if (IsEnd(*pwchCurr))
  699. goto Cleanup;
  700. // must have the equal sign
  701. if (*pwchCurr++ != '=' || *pwchCurr == '\0')
  702. goto Cleanup;
  703. while (TRUE)
  704. {
  705. // skip over white space
  706. pwchCurr = SkipSpace(pwchCurr);
  707. if (IsEnd(*pwchCurr)) // terminate on \r, \n, \0
  708. break;
  709. if (IsBreak(*pwchCurr)) // Must be ';',
  710. pwchCurr++; // skip it.
  711. // skip over white space
  712. pwchCurr = SkipSpace(pwchCurr);
  713. if (MatchName(pwchCurr, pwzURL, &pwchCurr2))
  714. {
  715. hr = S_OK;
  716. break;
  717. }
  718. pwchCurr = pwchCurr2;
  719. }
  720. Cleanup:
  721. while (!IsEnd(*pwchCurr))
  722. pwchCurr++;
  723. // Skip CRLF combos
  724. if (*pwchCurr == '\r' && pwchCurr[1] == '\n') pwchCurr++;
  725. // Eat the AllowDomain line so it doesn't screw up the data.
  726. m_ucWcharBufCount -= (ULONG)(pwchCurr+1 - m_psWcharBuf);
  727. memmove(m_psWcharBuf, pwchCurr+1, m_ucWcharBufCount*sizeof(WCHAR));
  728. m_fProcessedAllowDomainList = TRUE;
  729. return hr;
  730. }
  731. //------------------------------------------------------------------------
  732. //
  733. // Method: CTDCUnify::CheckForAllowDomainList
  734. //
  735. // Synopsis: Checks the beggining of the Wide Char buffer to see if it
  736. // contains the string "@!allow.domains". This is used to
  737. // determine if this file has a list of domain names which are
  738. // allowed to access this file, even though the access may be
  739. // coming from another internet host.
  740. //
  741. // Arguments: uses CTDCUnify state variables for the Wide Char buffer:
  742. // m_psWcharBUf the Wide char buffer
  743. // m_ucWcharBufCount the # of chars in the wide char buf
  744. //
  745. // Returns: ALLOW_DOMAINLIST_NO signature not found
  746. // ALLOW_DOMAINLIST_YES signature was found
  747. // ALLOW_DOMAINLIST_DONTKNOW don't have enough characters
  748. // to know for sure yet.
  749. //
  750. //------------------------------------------------------------------------
  751. CTDCUnify::ALLOWDOMAINLIST
  752. CTDCUnify::CheckForAllowDomainList()
  753. {
  754. ULONG cAllowDomainLen = ocslen(ALLOW_DOMAIN_STRING);
  755. // Make sure we have a whole line.
  756. LPWCH pwchCurr = m_psWcharBuf;
  757. LPWCH pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];
  758. while (pwchCurr < pwchEnd)
  759. {
  760. if (IsEnd(*pwchCurr))
  761. break;
  762. ++ pwchCurr;
  763. }
  764. if (pwchCurr >= pwchEnd) // if buffer ended before line did
  765. return ALLOW_DOMAINLIST_DONTKNOW;
  766. if (0 == wch_incmp(m_psWcharBuf, ALLOW_DOMAIN_STRING, cAllowDomainLen))
  767. {
  768. // We matched equal and have the whole string.
  769. // Take the "@!allow.domains" out of the buffer..
  770. m_ucWcharBufCount -= cAllowDomainLen;
  771. memmove(m_psWcharBuf, &m_psWcharBuf[cAllowDomainLen],
  772. m_ucWcharBufCount*sizeof(WCHAR));
  773. return ALLOW_DOMAINLIST_YES;
  774. }
  775. // We didn't match equal, no point in looking any more.
  776. return ALLOW_DOMAINLIST_NO;
  777. }