Leaked source code of windows server 2003
// Tabular Data Control Parsing Module
// Copyright (C) Microsoft Corporation, 1996, 1997
// File: TDCParse.cpp
// Contents: Implementation of CTDCParse classes.
#include "stdafx.h"
#include <simpdata.h>
#include "TDC.h"
#include <MLang.h>
#include "Notify.h"
#include "TDCParse.h"
#include "TDCArr.h"
#include "locale.h"
#include "wch.h"
// Function: IsSpace()
// Synopsis: Returns TRUE if the given character is a space or tab character.
// Arguments: ch Character to test.
// Returns: TRUE if 'ch' is a space or tab character.
// FALSE otherwise.
// Horizontal white-space test: only the blank (L' ') and the tab (L'\t')
// qualify. Carriage return and line feed are deliberately NOT matched here.
inline boolean IsSpace(WCHAR ch)
{
    return (ch == L' ' || ch == L'\t');
}
// CTDCTokenise Class - see comments in file TDCParse.h
// ------------------
// Method: CTDCUnify::InitTokenizer()
// Synopsis: Initialise the tokeniser state of the CTDCUnify object
// Arguments: pFieldSink Object to send parsed fields to.
// wchDelimField \
// wchDelimRow | Set of characters that control
// wchQuote | the parsing of fields
// wchEscape /
// Returns: S_OK indicating success.
// Resets all tokeniser state and records the field sink together with the
// four characters that steer parsing.
//   pFieldSink     receiver of parsed fields; must not be NULL (asserted)
//   wchDelimField  field delimiter
//   wchDelimRow    row delimiter; 0 selects DEFAULT_ROW_DELIM[0]
//   wchQuote       quote character; 0 disables quoting
//   wchEscape      escape character; 0 disables escaping
// Always returns S_OK.
HRESULT CTDCUnify::InitTokenizer(CTDCFieldSink *pFieldSink, WCHAR wchDelimField,
                                 WCHAR wchDelimRow, WCHAR wchQuote, WCHAR wchEscape)
{
    _ASSERT(pFieldSink != NULL);

    m_pFieldSink = pFieldSink;
    m_wchDelimField = wchDelimField;
    m_wchDelimRow = wchDelimRow;
    m_wchQuote = wchQuote;
    m_wchEscape = wchEscape;

    // Start every parse from a clean slate.
    m_ucParsed = 0;
    m_fIgnoreNextLF = FALSE;
    m_fIgnoreNextCR = FALSE;
    m_fIgnoreNextWhiteSpace = FALSE;
    m_fEscapeActive = FALSE;
    m_fQuoteActive = FALSE;
    m_fFoldWhiteSpace = FALSE;

    // Ensure that the row delimiter is set.
    // NOTE(review): no matching default is applied to the field delimiter
    // in the code visible here -- confirm whether one is expected.
    if (m_wchDelimRow == 0)
    {
        m_wchDelimRow = DEFAULT_ROW_DELIM[0];
    }

    // Remove conflicting delimiter values. Precedence is field delimiter,
    // then row delimiter, then quote, then escape: a character that
    // collides with a higher-precedence one is switched off (set to 0).
    if (m_wchDelimRow == m_wchDelimField)
    {
        m_wchDelimRow = 0;
    }
    if (m_wchQuote != 0)
    {
        if (m_wchQuote == m_wchDelimField || m_wchQuote == m_wchDelimRow)
        {
            m_wchQuote = 0;
        }
    }
    if (m_wchEscape != 0)
    {
        if (m_wchEscape == m_wchDelimField ||
            m_wchEscape == m_wchDelimRow ||
            m_wchEscape == m_wchQuote)
        {
            m_wchEscape = 0;
        }
    }

    // When rows end on CR or LF, either character terminates a row.
    m_fFoldCRLF = (m_wchDelimRow == L'\r' || m_wchDelimRow == L'\n');

    return S_OK;
}
// Method: CTDCUnify::AddWcharBuffer()
// Synopsis: Breaks the characters buffered in m_psWcharBuf up into fields
// and passes them to the embedded CTDCFieldSink object
// as fields.
// Arguments: fLastData TRUE if no more data will follow; any trailing
// fragment left in the buffer is then emitted as a
// final field followed by an end-of-line.
// Returns: S_OK upon success.
// E_OUTOFMEMORY indicating insufficient memory to carry
// out the parse operation.
// Other misc error code upon failure.
// Tokenises the wide characters held in m_psWcharBuf, resuming at offset
// m_ucParsed, and reports every completed field (AddField) and row (EOLN)
// to m_pFieldSink. Kept characters are written back into the same buffer
// through pwchDest, so the text of the field in progress always sits at
// the front of m_psWcharBuf.
//   fLastData  when TRUE, a trailing fragment left after the scan is
//              emitted as a final field followed by EOLN.
// A NUL character in the input sets hr to E_ABORT; failures returned by
// the field sink are passed through.
// NOTE(review): in this view the brace lines, the declaration of 'hr', the
// 'Cleanup' label and some branch statements (presumably the skip/continue
// after an ignored LF, CR or white-space character) are not visible --
// confirm against the complete file before changing any logic here.
HRESULT CTDCUnify::AddWcharBuffer(BOOL fLastData)
OutputDebugStringX(_T("CTDCTokenise::AddWcharBuffer called\n"));
_ASSERT(m_pFieldSink != NULL);
LPWCH pwchCurr; // Next character to process
LPWCH pwchEnd; // End-of-buffer marker
LPWCH pwchDest; // Where to write next char processed
LPWCH pwchStart; // Beginning of current token
// Resume where the previous call stopped: the first m_ucParsed characters
// are the already-scanned start of the current field.
pwchStart = &m_psWcharBuf[0];
pwchCurr = pwchStart + m_ucParsed;
pwchDest = pwchCurr;
pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];
// Read up to the next field boundary (field or row delimiter)
while (pwchCurr < pwchEnd)
// Security: If we see a null character, it's not a text file. Abort the
// download, so that no one can use the TDC to download .exe's or other
// binary files.
if (*pwchCurr == 0)
hr = E_ABORT;
goto Cleanup;
if (m_fIgnoreNextLF)
// We're expecting a LF to terminate a CR-LF sequence.
// The flag is one-shot: it is cleared whatever the next character is.
m_fIgnoreNextLF = FALSE;
if (*pwchCurr == L'\n')
// Found a LF - ignore it
// Found something else - carry on ...
if (m_fIgnoreNextCR)
// We're expecting a CR to terminate a LF-CR sequence.
// One-shot, like m_fIgnoreNextLF above.
m_fIgnoreNextCR = FALSE;
if (*pwchCurr == L'\r')
// Found a CR - ignore it
// Found something else - carry on ...
if (m_fIgnoreNextWhiteSpace)
// We're expecting the rest of a white-space sequence
if (IsSpace(*pwchCurr))
// Found white-space - ignore it
m_fIgnoreNextWhiteSpace = FALSE;
// Escape characters work, even in quoted strings
if (m_fEscapeActive)
// The character following an escape is copied verbatim.
*pwchDest++ = *pwchCurr++;
m_fEscapeActive = FALSE;
if (*pwchCurr == m_wchEscape)
m_fEscapeActive = TRUE;
// Quotes activate/deactivate Field/Row delimiters
if (*pwchCurr == m_wchQuote)
m_fQuoteActive = !m_fQuoteActive;
if (m_fQuoteActive)
// Inside quotes every character is data, delimiters included.
*pwchDest++ = *pwchCurr++;
// End of field: a field delimiter, or white space when folding is enabled.
if (*pwchCurr == m_wchDelimField ||
(m_fFoldWhiteSpace && IsSpace(*pwchCurr)))
hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
if (!SUCCEEDED(hr))
goto Cleanup;
if (m_fFoldWhiteSpace && IsSpace(*pwchCurr))
m_fIgnoreNextWhiteSpace = TRUE;
// The next field is accumulated from the front of the buffer again.
pwchStart = &m_psWcharBuf[0];
pwchDest = pwchStart;
// End of row: the row delimiter, or CR / LF when CR-LF folding is enabled.
if (*pwchCurr == m_wchDelimRow ||
(m_fFoldCRLF && (*pwchCurr == L'\r' || *pwchCurr == L'\n')))
// The last field of the row is reported before the row itself is closed.
hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
if (!SUCCEEDED(hr))
goto Cleanup;
hr = m_pFieldSink->EOLN();
if (!SUCCEEDED(hr))
goto Cleanup;
if (m_fFoldCRLF)
// Remember which half of a CR-LF / LF-CR pair was seen so that the
// partner character can be dropped on the next iteration.
m_fIgnoreNextLF = (*pwchCurr == L'\r');
m_fIgnoreNextCR = (*pwchCurr == L'\n');
pwchStart = &m_psWcharBuf[0];
pwchDest = pwchStart;
// Ordinary character: keep it as part of the current field.
*pwchDest++ = *pwchCurr++;
// Only the unfinished field remains in the buffer, and all of it has
// been scanned.
m_ucWcharBufCount = pwchDest - pwchStart;
m_ucParsed = pwchDest - pwchStart; // amount we've already parsed
// If this is the last data packet, and there's a fragment left,
// parse it.
if (m_ucWcharBufCount && fLastData)
hr = m_pFieldSink->AddField(pwchStart, m_ucParsed);
if (!SUCCEEDED(hr))
goto Cleanup;
m_ucParsed = 0;
hr = m_pFieldSink->EOLN();
return hr;
return hr;
// CTDCUnify Class - see comments in file TDCParse.h
// ---------------
// Method: CTDCUnify::CTDCUnify()
// Synopsis: Constructor
m_pML = NULL;
// Method: CTDCUnify::~CTDCUnify()
// Synopsis: Destructor
delete [] m_psByteBuf;
delete [] m_psWcharBuf;
if (m_pML != NULL)
// Method: CTDCUnify::Create()
// Synopsis: Initialise the CTDCUnify object
// Arguments: nCodePage Code page for ASCII->Unicode conversions
// nAmbientCodePage Code page of the hosting page (fallback)
// pML MLANG COM object (used for conversions)
// Returns: S_OK to indicate success.
// Puts the converter into its initial state: remembers the MLang object
// and the two candidate code pages, clears all conversion state and starts
// with empty (unallocated) byte and wide-character buffers.
//   nCodePage         code page requested for the data; reset to 0 when
//                     MLang cannot convert it to Unicode
//   nAmbientCodePage  fallback code page; reset to 0 under the same rule
//   pML               MLang object used for the convertibility checks
// Always returns S_OK.
// NOTE(review): pML is stored without a visible AddRef, and it is
// dereferenced below whenever a non-zero code page is supplied -- confirm
// the ownership contract and that callers never pass NULL.
HRESULT CTDCUnify::Create(UINT nCodePage, UINT nAmbientCodePage, IMultiLanguage *pML)
{
    m_pML = pML;
    m_nCodePage = nCodePage;
    m_nAmbientCodePage = nAmbientCodePage;

    // Nothing has been seen or converted yet.
    m_fDataMarkedUnicode = FALSE;
    m_fDataIsUnicode = FALSE;
    m_dwBytesProcessed = 0;
    m_fCanConvertToUnicode = 0;
    m_nUnicode = 0;
    m_fProcessedAllowDomainList = FALSE;
    m_dwConvertMode = 0;

    // Both buffers are allocated lazily on first use.
    m_ucByteBufSize = 0;
    m_ucByteBufCount = 0;
    m_psByteBuf = NULL;
    m_ucWcharBufSize = 0;
    m_ucWcharBufCount = 0;
    m_psWcharBuf = NULL;

    // Discard code pages that MLang cannot convert to Unicode; zero means
    // "not specified" to the rest of the class.
    if (m_nCodePage && S_OK != m_pML->IsConvertible(m_nCodePage, UNICODE_CP))
    {
        m_nCodePage = 0;
    }
    if (m_nAmbientCodePage && S_OK != m_pML->IsConvertible(m_nAmbientCodePage, UNICODE_CP))
    {
        m_nAmbientCodePage = 0;
    }

    return S_OK;
}
// Method: CTDCUnify::IsUnicode
// Synopsis: Determines if our text buffer is Unicode or not. Should
// only be called once on the FIRST text buffer.
// Assume if the data is marked as Unicode, that it's correct.
// The determination this routine makes will override any
// single byte codepage the user may have specified.
// Arguments: pBytes Buffer containing characters to be converted.
// dwSize Number of significant characters in 'pBytes'
// Returns: UNICODE_CP if the buffer begins with the byte-order mark,
// zero otherwise.
// Reports whether the buffer starts with the Unicode byte-order mark.
// The first WCHAR of pBytes is read unconditionally and dwSize is not
// consulted, so the caller must supply at least sizeof(WCHAR) bytes.
// NOTE(review): the return-type line of this definition is not visible in
// this view; only BYTE_ORDER_MARK is tested here, so a byte-swapped
// signature yields 0 -- confirm whether that is intended.
CTDCUnify::IsUnicode(BYTE * pBytes, DWORD dwSize)
if (BYTE_ORDER_MARK == *(WCHAR *)pBytes)
return UNICODE_CP;
else return 0;
// Method: CTDCUnify::ConvertByteBuffer()
// Synopsis: Converts a byte-buffer into a wide-character stream
// (applying unicode conversions if necessary) and passes
// it to the embedded TDCTokenise object to be broken into
// fields.
// Arguments: pBytes Buffer containing characters to be converted.
// dwSize Number of significant characters in 'pBytes'
// dwSize == 0 means "End-of-stream"
// Returns: S_OK upon success.
// S_FALSE if not enough data has shown up yet to be useful
// OLE_E_CANTCONVERT if a non-unicode buffer can't be
// converted into unicode.
// E_OUTOFMEMORY if there isn't enough memory to perform
// a data conversion.
// Appends dwSize bytes from pBytes to the pending byte buffer, grows the
// wide-character buffer if needed, and converts as many pending bytes as
// possible into WCHARs appended to m_psWcharBuf. Bytes that could not be
// converted are kept at the front of m_psByteBuf for the next call.
// Before the first conversion (m_dwBytesProcessed == 0) the code page is
// determined; if that is not yet possible hr is set to S_FALSE.
// NOTE(review): in this view the brace lines, the declaration of 'hr', the
// 'Done' label, the 'else' joining the Unicode and MLang paths, and the
// trailing arguments of several memmove / ConvertStringToUnicode calls are
// not visible -- confirm against the complete file before editing logic.
HRESULT CTDCUnify::ConvertByteBuffer(BYTE *pBytes, DWORD dwSize)
OutputDebugStringX(_T("CTDCUnify::ConvertByteBuffer called\n"));
_ASSERT(pBytes != NULL || dwSize == 0);
UINT ucBytes;
UINT ucWchars;
// Is there enough space in Byte buffer for this packet?
if (dwSize > (m_ucByteBufSize - m_ucByteBufCount))
// No, the current buffer is too small, make a new one.
// The new buffer is sized to hold exactly the pending bytes plus this packet.
BYTE * psTemp = new BYTE[m_ucByteBufCount + dwSize];
// NOTE(review): presumably hr is set to E_OUTOFMEMORY on a line not
// visible here before jumping to Done.
if (psTemp==NULL)
goto Done;
if (m_psByteBuf != NULL) // if not first time
memmove(psTemp, m_psByteBuf, m_ucByteBufCount);
delete [] m_psByteBuf;
m_ucByteBufSize = m_ucByteBufCount + dwSize;
m_psByteBuf = psTemp;
// Append the new data to the old data.
memmove(m_psByteBuf + m_ucByteBufCount, pBytes, dwSize);
m_ucByteBufCount += dwSize;
// Is there enough space in the Wchar buffer for the converted data?
// We make a very conservative assumption here that N source buffer bytes
// convert to N Wchar buffer chars (or 2*N bytes). This will ensure that
// our call to ConvertToUnicode will never not finish because there wasn't
// enough room in the output buffer.
if (m_ucByteBufCount > (m_ucWcharBufSize - m_ucWcharBufCount))
// The current buffer is too small, make a new one.
WCHAR * psTemp = new WCHAR[m_ucWcharBufCount + m_ucByteBufCount];
if (psTemp==NULL)
goto Done;
if (m_psWcharBuf != NULL) // if not first time
memmove(psTemp, m_psWcharBuf,
delete [] m_psWcharBuf;
m_psWcharBuf = psTemp;
m_ucWcharBufSize = m_ucWcharBufCount + m_ucByteBufCount;
// Nothing has been converted yet, so the encoding is still undecided.
if (0 == m_dwBytesProcessed)
// if we can't determine the codepage yet, try again later
// dwSize == 0 marks end-of-stream and forces a decision.
if (!DetermineCodePage(dwSize==0))
hr = S_FALSE;
goto Done;
// Convert as many source bytes as we can to Unicode chars
ucBytes = m_ucByteBufCount;
ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;
// ConvertStringToUnicode won't convert Unicode to Unicode for us.
// So we'll do it ourselves.
if (m_nUnicode)
_ASSERT( ucWchars * sizeof(WCHAR) >= ucBytes);
// This might copy an odd extra byte
memmove((BYTE *)(m_psWcharBuf + m_ucWcharBufCount), m_psByteBuf,
// But we only count the number of complete WCHAR's we copied.
ucWchars = ucBytes / sizeof(WCHAR);
ucBytes = ucWchars * sizeof(WCHAR);
if (UNICODE_REVERSE_CP == m_nUnicode)
// need to byte swap
BYTE *pByteSwap = (BYTE *)(m_psWcharBuf + m_ucWcharBufCount);
BYTE bTemp;
for (ULONG i = ucWchars; i != 0; i--)
// Well, OK, we've kind of hardwired WCHAR == 2 here, but ..
bTemp = pByteSwap[0];
pByteSwap[0] = pByteSwap[1];
pByteSwap[1] = bTemp;
pByteSwap += 2;
// On first packet, need to remove Unicode signature.
// Only need to look for 0xFFFE -- we already swapped bytes.
if (0 == m_dwBytesProcessed && m_psWcharBuf[0] == BYTE_ORDER_MARK)
memmove((BYTE *)m_psWcharBuf, (BYTE *)m_psWcharBuf+2,
// Non-Unicode data: let MLang convert from m_nCodePage. ucBytes is passed
// by address; the code below treats its value after the call as the number
// of source bytes consumed.
hr = m_pML->ConvertStringToUnicode(&m_dwConvertMode, m_nCodePage,
(char *)m_psByteBuf, &ucBytes,
m_psWcharBuf +m_ucWcharBufCount,
// Some character(s) failed conversion. The best we can do is
// attempt to skip the character that failed conversion.
if (FAILED(hr))
// Did we come back around and try the unconvertible portion again?
if (ucBytes==0)
// Yes, and it made no progress. Skip a char to try to make
// forward progress.
// We can't return this error, or we won't look at the rest of the
// file.
hr = S_OK;
// Move any leftover source characters to the start of the buffer.
// These are probably split Unicode chars, lead bytes without trail
// bytes, etc.
m_ucByteBufCount -= ucBytes;
memmove(m_psByteBuf, m_psByteBuf + ucBytes,
// The number of useful chars in the output buf is increased by the
// number we managed to convert.
m_ucWcharBufCount += ucWchars;
// NOTE(review): a WCHAR count is added to a counter named "bytes"; the
// code visible in this file only tests the counter against zero.
m_dwBytesProcessed += ucWchars;
return hr;
// Method: CTDCUnify::DetermineCodePage()
// Synopsis: Figures out what codepage to use to read the data.
// Sets m_nCodePage and m_nUnicode appropriately.
// Arguments: fForce determine the answer, no matter what
// Returns: TRUE the codepage is determined.
// FALSE not enough data yet to determine
// Chooses the code page used to read the pending bytes in m_psByteBuf and
// stores it in m_nCodePage (and m_nUnicode when a Unicode signature is
// found). Order of preference, as far as the visible code shows:
//   1. a Unicode byte-order mark at the start of the data,
//   2. the code page already held in m_nCodePage,
//   3. MLang auto-detection (IMultiLanguage2, then CP_AUTO conversion),
//   4. when fForce is set: the ambient code page, GetACP(), then CP_1252.
//   fForce  TRUE when an answer is required now.
// Returns TRUE once a code page is chosen, FALSE to ask for more data.
// NOTE(review): in this view the return type, the brace lines, the
// declarations of 'hr' and 'nInfo', and the trailing arguments of the
// DetectInputCodepage / ConvertStringToUnicode calls are not visible.
CTDCUnify::DetermineCodePage(BOOL fForce)
DWORD dwConvertMode = 0;
UINT ucBytes = m_ucByteBufCount;
UINT ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;
UINT cpDetected;
IMultiLanguage2 *pML2 = NULL;
// Only meaningful before any data has been converted.
_ASSERT(m_dwBytesProcessed == 0 && m_pML);
// First look for Unicode. Assume it's not Unicode to start.
m_nUnicode = 0;
// Need at least 2 chars for Unicode signature (0xFFFE or 0xFEFF)
if (m_ucByteBufCount > 1)
// If we detect Unicode, it overrides any user specified code page.
m_nUnicode = IsUnicode(m_psByteBuf, m_ucByteBufCount);
if (m_nUnicode)
m_nCodePage = m_nUnicode;
return TRUE;
// It's not Unicode. If the user specified a code page, use it.
if (m_nCodePage)
return TRUE;
// if we need an answer and user specified a code page, use it
// NOTE(review): presumably this covers the case where fewer than two
// bytes are buffered and the check above was skipped -- the nesting is
// not visible here.
if (fForce && m_nCodePage)
return TRUE;
// At this point, we have to guess. If we have enough input or if we
// need an answer now, use MLang to do the guessing
if (fForce || m_ucByteBufCount >= CODEPAGE_BYTE_THRESHOLD)
// First see if the auto-detect interface is available.
hr = m_pML->QueryInterface(IID_IMultiLanguage2, (void**)&pML2);
// NOTE(review): no Release of pML2 is visible in this view -- confirm
// the interface obtained here is released on every path.
if (!hr && pML2)
DetectEncodingInfo info[N_DETECTENCODINGINFO];
// auto-detect
hr = pML2->DetectInputCodepage(
(char *)m_psByteBuf,
if (!hr)
// if one of the returned codepages is "good enough", use it.
for (int i=0; i<nInfo; ++i)
// "Good enough" = at least 90 for both confidence and document share.
if (info[i].nConfidence >= 90 && info[i].nDocPercent >= 90)
if (S_OK == m_pML->IsConvertible(info[i].nCodePage, UNICODE_CP))
m_nCodePage = info[i].nCodePage;
return TRUE;
// Try plain old MLang.
// Ask MLang to convert the input using the "auto-detect" codepage.
hr = m_pML->ConvertStringToUnicode(&dwConvertMode, CP_AUTO,
(char *)m_psByteBuf, &ucBytes,
m_psWcharBuf + m_ucWcharBufCount,
// The detected code page is taken from the high word of the mode value.
cpDetected = HIWORD(dwConvertMode);
// if MLang detected a codepage, use it
if (!hr && cpDetected != 0)
if (S_OK == m_pML->IsConvertible(cpDetected, UNICODE_CP))
m_nCodePage = cpDetected;
return TRUE;
// guessing didn't work. If we don't have to decide now, try again later
if (!fForce)
return FALSE;
// if we have to decide and all else has failed, use the host page's
// encoding. If even that isn't available, use the machine's ASCII codepage.
m_nCodePage = m_nAmbientCodePage ? m_nAmbientCodePage : GetACP();
// and if this still isn't convertible to Unicode, use windows-1252
if (m_nCodePage == 0 || S_OK != m_pML->IsConvertible(m_nCodePage, UNICODE_CP))
m_nCodePage = CP_1252;
return TRUE;
// Advances past any run of blanks and tabs and returns a pointer to the
// first character that is neither. The scan has no length limit: it relies
// on the text containing some non-space character (such as a terminating
// NUL) to stop.
LPWCH SkipSpace(LPWCH pwchCurr)
{
    while (IsSpace(*pwchCurr))
    {
        pwchCurr++;
    }
    return pwchCurr;
}
// TRUE when 'ch' ends a line of text: NUL, carriage return or line feed.
boolean IsEnd(WCHAR ch)
{
    return (ch == 0 || ch == L'\r' || ch == L'\n');
}
// TRUE when 'ch' terminates one entry of a ';'-separated list: either the
// separator itself or any end-of-line character (NUL, CR, LF).
boolean IsBreak(WCHAR ch)
{
    return (ch == L';' || IsEnd(ch));
}
// Returns FALSE if names didn't match.
// Returns TRUE if they did.
// Sets *ppwchAdvance to terminator of the match name
// Compares one entry of a ';'-separated name list against a host name.
//   pwchMatchName  start of the entry; it runs up to the next ';', CR, LF
//                  or NUL
//   pwzHostName    NUL-terminated host name to compare with
//   ppwchAdvance   receives a pointer to the entry's terminator (for an
//                  empty entry: just past the ';', or the end character)
// An entry matches when it equals the host name, is the single character
// '*', or starts with "*." and the remainder matches a suffix of the host.
// The comparison is an exact character comparison (no case folding is
// visible here).
// NOTE(review): in this view the return type, the brace lines and the
// statement that leaves the comparison loop on a mismatch are not visible.
MatchName(LPWCH pwchMatchName, LPCWCH pwzHostName, LPWCH *ppwchAdvance)
// match from right to left
LPWCH pwchMatchRight = &pwchMatchName[0];
// Points at the last character of the host name.
// NOTE(review): for an empty host name this is one before the start.
LPCWCH pwchHostRight = &pwzHostName[0] + ocslen(pwzHostName) -1;
// handle empty match name
if (IsBreak(*pwchMatchRight))
if (!IsEnd(*pwchMatchRight)) // be sure to advance (unless at end)
++ pwchMatchRight;
*ppwchAdvance = pwchMatchRight;
return FALSE;
// Find end of Match name.
while (!IsBreak(*pwchMatchRight)) pwchMatchRight++;
*ppwchAdvance = pwchMatchRight; // return pointer to terminator
// NOTE(review): pwchMatchRight still points at the terminator here, which
// is never white space; presumably a step back onto the last character of
// the name happens on a line not visible in this view.
while (IsSpace(*pwchMatchRight) && pwchMatchRight >= pwchMatchName)
-- pwchMatchRight; // ignore trailing whitespace
// match full wildcard the easy way
if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*')
return TRUE;
// match right-to-left, stop at mismatch or beginning of either string
for (; pwchMatchRight>=pwchMatchName && pwchHostRight>=pwzHostName;
--pwchMatchRight, --pwchHostRight)
// A '*' in the entry also stops the walk, so it is handled below.
if (*pwchMatchRight != *pwchHostRight || *pwchMatchRight == '*')
// it's a match if strings matched completely
// (both pointers have stepped one position before their string's start)
if (pwchMatchRight+1 == pwchMatchName && pwchHostRight+1 == pwzHostName)
return TRUE;
// or if match name started with "*." and the rest matched a suffix of host name
if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*' &&
pwchMatchRight[1] == '.')
return TRUE;
// otherwise it's not a match
return FALSE;
// Parses the list that starts at the front of m_psWcharBuf -- an '='
// followed by ';'-separated names -- and checks each name against pwzURL
// with MatchName. hr starts as E_FAIL and becomes S_OK when a name matches.
// Afterwards the list line is removed from m_psWcharBuf and
// m_fProcessedAllowDomainList is set.
//   pwzURL  host name to look for in the list (passed to MatchName as the
//           host name)
// NOTE(review): in this view the return type, the brace lines, the loop
// exit statements and the 'Cleanup' label are not visible, so which of the
// trailing statements run on the early-exit paths cannot be told from here.
CTDCUnify::MatchAllowDomainList(LPCWSTR pwzURL)
HRESULT hr = E_FAIL; // assume failure
LPWCH pwchCurr = &m_psWcharBuf[0];
LPWCH pwchCurr2;
// Not referenced again in the code visible here.
int cchHostDoman = ocslen(pwzURL);
// skip over white space
pwchCurr = SkipSpace(pwchCurr);
if (IsEnd(*pwchCurr))
goto Cleanup;
// must have the equal sign
if (*pwchCurr++ != '=' || *pwchCurr == '\0')
goto Cleanup;
// Walk the list one name at a time.
while (TRUE)
// skip over white space
pwchCurr = SkipSpace(pwchCurr);
if (IsEnd(*pwchCurr)) // terminate on \r, \n, \0
if (IsBreak(*pwchCurr)) // Must be ';',
pwchCurr++; // skip it.
// skip over white space
pwchCurr = SkipSpace(pwchCurr);
// pwchCurr2 receives the position of the terminator of the name just tested.
if (MatchName(pwchCurr, pwzURL, &pwchCurr2))
hr = S_OK;
pwchCurr = pwchCurr2;
// Move on to the end of the list line.
while (!IsEnd(*pwchCurr))
// Skip CRLF combos
if (*pwchCurr == '\r' && pwchCurr[1] == '\n') pwchCurr++;
// Eat the AllowDomain line so it doesn't screw up the data.
// Everything up to and including the character at pwchCurr is dropped.
m_ucWcharBufCount -= (ULONG)(pwchCurr+1 - m_psWcharBuf);
memmove(m_psWcharBuf, pwchCurr+1, m_ucWcharBufCount*sizeof(WCHAR));
m_fProcessedAllowDomainList = TRUE;
return hr;
// Method: CTDCUnify::CheckForAllowDomainList
// Synopsis: Checks the beginning of the Wide Char buffer to see if it
// contains the string "@!allow.domains". This is used to
// determine if this file has a list of domain names which are
// allowed to access this file, even though the access may be
// coming from another internet host.
// Arguments: uses CTDCUnify state variables for the Wide Char buffer:
// m_psWcharBuf the Wide char buffer
// m_ucWcharBufCount the # of chars in the wide char buf
// Returns: ALLOW_DOMAINLIST_NO signature not found
// ALLOW_DOMAINLIST_YES signature was found
// ALLOW_DOMAINLIST_DONTKNOW don't have enough characters
// to know for sure yet.
ULONG cAllowDomainLen = ocslen(ALLOW_DOMAIN_STRING);
// Make sure we have a whole line.
LPWCH pwchCurr = m_psWcharBuf;
LPWCH pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];
while (pwchCurr < pwchEnd)
if (IsEnd(*pwchCurr))
++ pwchCurr;
if (pwchCurr >= pwchEnd) // if buffer ended before line did
if (0 == wch_incmp(m_psWcharBuf, ALLOW_DOMAIN_STRING, cAllowDomainLen))
// We matched equal and have the whole string.
// Take the "@!allow.domains" out of the buffer..
m_ucWcharBufCount -= cAllowDomainLen;
memmove(m_psWcharBuf, &m_psWcharBuf[cAllowDomainLen],
// We didn't match equal, no point in looking any more.