You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
910 lines
28 KiB
910 lines
28 KiB
//------------------------------------------------------------------------
|
|
//
|
|
// Tabular Data Control Parsing Module
|
|
// Copyright (C) Microsoft Corporation, 1996, 1997
|
|
//
|
|
// File: TDCParse.cpp
|
|
//
|
|
// Contents: Implementation of CTDCParse classes.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
|
|
#include "stdafx.h"
|
|
#include <simpdata.h>
|
|
#include "TDC.h"
|
|
#include <MLang.h>
|
|
#include "Notify.h"
|
|
#include "TDCParse.h"
|
|
#include "TDCArr.h"
|
|
#include "locale.h"
|
|
#include "wch.h"
|
|
|
|
//#ifndef DISPID_AMBIENT_CODEPAGE
|
|
//#define DISPID_AMBIENT_CODEPAGE (-725)
|
|
//#endif
|
|
|
|
#define BYTE_ORDER_MARK 0xFEFF
|
|
#define REVERSE_BYTE_ORDER_MARK 0xFFFE
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Function: IsSpace()
|
|
//
|
|
// Synopsis: Returns TRUE if the given character is a space or tab character.
|
|
//
|
|
// Arguments: ch Character to test.
|
|
//
|
|
// Returns: TRUE if 'ch' is a space or tab character.
|
|
// FALSE otherwise.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
inline boolean IsSpace(WCHAR ch)
|
|
{
|
|
return (ch == L' ' || ch == L'\t');
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// CTDCTokenise Class - see comments in file TDCParse.h
|
|
// ------------------
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCTokenise::Create()
|
|
//
|
|
// Synopsis: Initialise the CTDCTokenise object
|
|
//
|
|
// Arguments: pFieldSink Object to send parsed fields to.
|
|
// wchDelimField \
|
|
// wchDelimRow | Set of characters that control
|
|
// wchQuote | the parsing of fields
|
|
// wchEscape /
|
|
//
|
|
// Returns: S_OK indicating success.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
HRESULT CTDCUnify::InitTokenizer(CTDCFieldSink *pFieldSink, WCHAR wchDelimField,
|
|
WCHAR wchDelimRow, WCHAR wchQuote, WCHAR wchEscape)
|
|
{
|
|
_ASSERT(pFieldSink != NULL);
|
|
m_pFieldSink = pFieldSink;
|
|
m_wchDelimField = wchDelimField;
|
|
m_wchDelimRow = wchDelimRow;
|
|
m_wchQuote = wchQuote;
|
|
m_wchEscape = wchEscape;
|
|
m_ucParsed = 0;
|
|
|
|
m_fIgnoreNextLF = FALSE;
|
|
m_fIgnoreNextCR = FALSE;
|
|
m_fIgnoreNextWhiteSpace = FALSE;
|
|
m_fEscapeActive = FALSE;
|
|
m_fQuoteActive = FALSE;
|
|
m_fFoldWhiteSpace = FALSE;
|
|
|
|
// Ensure that the field and row delimiters are set.
|
|
//
|
|
if (m_wchDelimRow == 0)
|
|
m_wchDelimRow = DEFAULT_ROW_DELIM[0];
|
|
|
|
// Remove conflicting delimiter values
|
|
//
|
|
if (m_wchDelimRow == m_wchDelimField)
|
|
m_wchDelimRow = 0;
|
|
if (m_wchQuote != 0)
|
|
{
|
|
if (m_wchQuote == m_wchDelimField || m_wchQuote == m_wchDelimRow)
|
|
m_wchQuote = 0;
|
|
}
|
|
if (m_wchEscape != 0)
|
|
{
|
|
if (m_wchEscape == m_wchDelimField ||
|
|
m_wchEscape == m_wchDelimRow ||
|
|
m_wchEscape == m_wchQuote)
|
|
m_wchEscape = 0;
|
|
}
|
|
|
|
m_fFoldCRLF = (m_wchDelimRow == L'\r' || m_wchDelimRow == L'\n');
|
|
|
|
return S_OK;
|
|
}
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCTokenise::AddWcharBuffer()
|
|
//
|
|
// Synopsis: Takes a buffer of characters, breaks it up into fields
|
|
// and passes them to the embedded CTDCFieldSink object
|
|
// as fields.
|
|
//
|
|
// Arguments: pwch Buffer containing characters to be parsed.
|
|
// dwSize Number of significant characters in 'pwch'
|
|
// dwSize == 0 means "End-of-stream"
|
|
//
|
|
// Returns: S_OK upon success.
|
|
// E_OUTOFMEMORY indicating insufficient memory to carry
|
|
// out the parse operation.
|
|
// Other misc error code upon failure.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
HRESULT CTDCUnify::AddWcharBuffer(BOOL fLastData)
|
|
{
|
|
|
|
OutputDebugStringX(_T("CTDCTokenise::AddWcharBuffer called\n"));
|
|
|
|
_ASSERT(m_pFieldSink != NULL);
|
|
|
|
HRESULT hr = S_OK;
|
|
|
|
LPWCH pwchCurr; // Next character to process
|
|
LPWCH pwchEnd; // End-of-buffer marker
|
|
LPWCH pwchDest; // Where to write next char processed
|
|
LPWCH pwchStart; // Beginning of current token
|
|
|
|
pwchStart = &m_psWcharBuf[0];
|
|
pwchCurr = pwchStart + m_ucParsed;
|
|
pwchDest = pwchCurr;
|
|
pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];
|
|
|
|
// Read up to the next field boundary (field or row delimiter)
|
|
//
|
|
while (pwchCurr < pwchEnd)
|
|
{
|
|
// Security: If we see a null character, it's not a text file. Abort the
|
|
// download, so that no one can use the TDC to download .exe's or other
|
|
// binary files.
|
|
if (*pwchCurr == 0)
|
|
{
|
|
hr = E_ABORT;
|
|
goto Cleanup;
|
|
}
|
|
|
|
if (m_fIgnoreNextLF)
|
|
{
|
|
// We're expecting a LF to terminate a CR-LF sequence.
|
|
//
|
|
m_fIgnoreNextLF = FALSE;
|
|
if (*pwchCurr == L'\n')
|
|
{
|
|
// Found a LF - ignore it
|
|
//
|
|
pwchCurr++;
|
|
continue;
|
|
}
|
|
|
|
// Found something else - carry on ...
|
|
//
|
|
}
|
|
|
|
if (m_fIgnoreNextCR)
|
|
{
|
|
// We're expecting a CR to terminate a LF-CR sequence.
|
|
//
|
|
m_fIgnoreNextCR = FALSE;
|
|
if (*pwchCurr == L'\r')
|
|
{
|
|
// Found a CR - ignore it
|
|
//
|
|
pwchCurr++;
|
|
continue;
|
|
}
|
|
|
|
// Found something else - carry on ...
|
|
//
|
|
}
|
|
|
|
if (m_fIgnoreNextWhiteSpace)
|
|
{
|
|
// We're expecting the rest of a white-space sequence
|
|
//
|
|
if (IsSpace(*pwchCurr))
|
|
{
|
|
// Found white-space - ignore it
|
|
//
|
|
pwchCurr++;
|
|
continue;
|
|
}
|
|
m_fIgnoreNextWhiteSpace = FALSE;
|
|
}
|
|
|
|
// Escape characters work, even in quoted strings
|
|
//
|
|
if (m_fEscapeActive)
|
|
{
|
|
*pwchDest++ = *pwchCurr++;
|
|
m_fEscapeActive = FALSE;
|
|
continue;
|
|
}
|
|
if (*pwchCurr == m_wchEscape)
|
|
{
|
|
pwchCurr++;
|
|
m_fEscapeActive = TRUE;
|
|
continue;
|
|
}
|
|
|
|
// Quotes activate/deactivate Field/Row delimiters
|
|
//
|
|
if (*pwchCurr == m_wchQuote)
|
|
{
|
|
pwchCurr++;
|
|
m_fQuoteActive = !m_fQuoteActive;
|
|
continue;
|
|
}
|
|
|
|
if (m_fQuoteActive)
|
|
{
|
|
*pwchDest++ = *pwchCurr++;
|
|
continue;
|
|
}
|
|
|
|
|
|
if (*pwchCurr == m_wchDelimField ||
|
|
(m_fFoldWhiteSpace && IsSpace(*pwchCurr)))
|
|
{
|
|
hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
|
|
if (!SUCCEEDED(hr))
|
|
goto Cleanup;
|
|
pwchCurr++;
|
|
if (m_fFoldWhiteSpace && IsSpace(*pwchCurr))
|
|
m_fIgnoreNextWhiteSpace = TRUE;
|
|
pwchStart = &m_psWcharBuf[0];
|
|
pwchDest = pwchStart;
|
|
continue;
|
|
}
|
|
|
|
if (*pwchCurr == m_wchDelimRow ||
|
|
(m_fFoldCRLF && (*pwchCurr == L'\r' || *pwchCurr == L'\n')))
|
|
{
|
|
hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
|
|
if (!SUCCEEDED(hr))
|
|
goto Cleanup;
|
|
hr = m_pFieldSink->EOLN();
|
|
if (!SUCCEEDED(hr))
|
|
goto Cleanup;
|
|
if (m_fFoldCRLF)
|
|
{
|
|
m_fIgnoreNextLF = (*pwchCurr == L'\r');
|
|
m_fIgnoreNextCR = (*pwchCurr == L'\n');
|
|
}
|
|
pwchCurr++;
|
|
pwchStart = &m_psWcharBuf[0];
|
|
pwchDest = pwchStart;
|
|
continue;
|
|
}
|
|
|
|
*pwchDest++ = *pwchCurr++;
|
|
}
|
|
|
|
m_ucWcharBufCount = pwchDest - pwchStart;
|
|
m_ucParsed = pwchDest - pwchStart; // amount we've already parsed
|
|
|
|
// If this is the last data packet, and there's a fragment left,
|
|
// parse it.
|
|
if (m_ucWcharBufCount && fLastData)
|
|
{
|
|
hr = m_pFieldSink->AddField(pwchStart, m_ucParsed);
|
|
if (!SUCCEEDED(hr))
|
|
goto Cleanup;
|
|
m_ucParsed = 0;
|
|
hr = m_pFieldSink->EOLN();
|
|
return hr;
|
|
}
|
|
|
|
|
|
Cleanup:
|
|
return hr;
|
|
}
|
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// CTDCUnify Class - see comments in file TDCParse.h
|
|
// ---------------
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::CTDCUnify()
|
|
//
|
|
// Synopsis: Constuctor
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
CTDCUnify::CTDCUnify()
|
|
{
|
|
m_pML = NULL;
|
|
}
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::~CTDCUnify()
|
|
//
|
|
// Synopsis: Destructor
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
CTDCUnify::~CTDCUnify()
|
|
{
|
|
delete [] m_psByteBuf;
|
|
delete [] m_psWcharBuf;
|
|
|
|
if (m_pML != NULL)
|
|
m_pML->Release();
|
|
}
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::Create()
|
|
//
|
|
// Synopsis: Initialise the CTDCUnify object
|
|
//
|
|
// Arguments: pTokenise Object to send converted buffers to.
|
|
// nCodePage Code page for ASCII->Unicode conversions
|
|
// pML MLANG COM object (used for conversions)
|
|
//
|
|
// Returns: S_OK to indicate success.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
HRESULT CTDCUnify::Create(UINT nCodePage, UINT nAmbientCodePage, IMultiLanguage *pML)
|
|
{
|
|
m_pML = pML;
|
|
m_pML->AddRef();
|
|
m_nCodePage = nCodePage;
|
|
m_nAmbientCodePage = nAmbientCodePage;
|
|
m_fDataMarkedUnicode = FALSE;
|
|
m_fDataIsUnicode = FALSE;
|
|
m_dwBytesProcessed = 0;
|
|
m_fCanConvertToUnicode = 0;
|
|
m_nUnicode = 0;
|
|
m_fProcessedAllowDomainList = FALSE;
|
|
|
|
m_dwConvertMode = 0;
|
|
m_ucByteBufSize = 0;
|
|
m_ucByteBufCount = 0;
|
|
m_psByteBuf = NULL;
|
|
|
|
m_ucWcharBufSize = 0;
|
|
m_ucWcharBufCount = 0;
|
|
m_psWcharBuf = NULL;
|
|
|
|
if (m_nCodePage && S_OK != m_pML->IsConvertible(m_nCodePage, UNICODE_CP))
|
|
{
|
|
m_nCodePage = 0;
|
|
}
|
|
|
|
if (m_nAmbientCodePage && S_OK != m_pML->IsConvertible(m_nAmbientCodePage, UNICODE_CP))
|
|
{
|
|
m_nAmbientCodePage = 0;
|
|
}
|
|
|
|
return S_OK;
|
|
}
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::IsUnicode
|
|
//
|
|
// Synopsis: Determines if our text buffer is Unicode or not. Should
|
|
// only be called once on the FIRST text buffer.
|
|
//
|
|
// Assume if the data is marked as Unicode, that it's correct.
|
|
//
|
|
// The determination this routine makes will override any
|
|
// single byte codepage the user may have specified.
|
|
//
|
|
//
|
|
// Arguments: pBytes Buffer containing characters to be converted.
|
|
// dwSize Number of significant characters in 'pBytes'
|
|
//
|
|
// Returns: Code page of text, or zero if not Unicode (UNICODE_CP,
|
|
// UNICODE_REVERSE_CP, or 0)
|
|
//
|
|
//
|
|
//------------------------------------------------------------------------
|
|
int
|
|
CTDCUnify::IsUnicode(BYTE * pBytes, DWORD dwSize)
|
|
{
|
|
if (BYTE_ORDER_MARK == *(WCHAR *)pBytes)
|
|
return UNICODE_CP;
|
|
|
|
if (REVERSE_BYTE_ORDER_MARK == *(WCHAR *)pBytes)
|
|
return UNICODE_REVERSE_CP;
|
|
|
|
else return 0;
|
|
}
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::ConvertByteBuffer()
|
|
//
|
|
// Synopsis: Converts a byte-buffer into a wide-character stream
|
|
// (applying unicode conversions if necessary) and passes
|
|
// it to the embedded TDCTokenise object to be broken into
|
|
// fields.
|
|
//
|
|
// Arguments: pBytes Buffer containing characters to be converted.
|
|
// dwSize Number of significant characters in 'pBytes'
|
|
// dwSize == 0 means "End-of-stream"
|
|
//
|
|
// Returns: S_OK upon success.
|
|
// S_FALSE if not enough data has shown up yet to be useful
|
|
// OLE_E_CANTCONVERT if a non-unicode buffer can't be
|
|
// converted into unicode.
|
|
// E_OUTOFMEMORY if there isn't enough memory to perform
|
|
// a data conversion.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
HRESULT CTDCUnify::ConvertByteBuffer(BYTE *pBytes, DWORD dwSize)
|
|
{
|
|
OutputDebugStringX(_T("CTDCUnify::ConvertByteBuffer called\n"));
|
|
|
|
_ASSERT(pBytes != NULL || dwSize == 0);
|
|
|
|
HRESULT hr = S_OK;
|
|
UINT ucBytes;
|
|
UINT ucWchars;
|
|
|
|
// Is there enough space in Byte buffer for this packet?
|
|
if (dwSize > (m_ucByteBufSize - m_ucByteBufCount))
|
|
{
|
|
// No, the current buffer is too small, make a new one.
|
|
BYTE * psTemp = new BYTE[m_ucByteBufCount + dwSize];
|
|
if (psTemp==NULL)
|
|
{
|
|
hr = E_OUTOFMEMORY;
|
|
|
|
goto Done;
|
|
}
|
|
|
|
if (m_psByteBuf != NULL) // if not first time
|
|
{
|
|
memmove(psTemp, m_psByteBuf, m_ucByteBufCount);
|
|
delete [] m_psByteBuf;
|
|
}
|
|
m_ucByteBufSize = m_ucByteBufCount + dwSize;
|
|
m_psByteBuf = psTemp;
|
|
}
|
|
|
|
// Append the new data to the old data.
|
|
memmove(m_psByteBuf + m_ucByteBufCount, pBytes, dwSize);
|
|
m_ucByteBufCount += dwSize;
|
|
|
|
// Is there enough space in the Wchar buffer for the converted data?
|
|
// We make a very conservative assumption here that N source buffer bytes
|
|
// convert to N Wchar buffer chars (or 2*N bytes). This will ensure that
|
|
// our call to ConvertToUnicode will never not finish because there wasn't
|
|
// enough room in the output buffer.
|
|
if (m_ucByteBufCount > (m_ucWcharBufSize - m_ucWcharBufCount))
|
|
{
|
|
// The current buffer is too small, make a new one.
|
|
WCHAR * psTemp = new WCHAR[m_ucWcharBufCount + m_ucByteBufCount];
|
|
if (psTemp==NULL)
|
|
{
|
|
hr = E_OUTOFMEMORY;
|
|
goto Done;
|
|
}
|
|
|
|
if (m_psWcharBuf != NULL) // if not first time
|
|
{
|
|
memmove(psTemp, m_psWcharBuf,
|
|
m_ucWcharBufCount*sizeof(WCHAR));
|
|
delete [] m_psWcharBuf;
|
|
}
|
|
m_psWcharBuf = psTemp;
|
|
m_ucWcharBufSize = m_ucWcharBufCount + m_ucByteBufCount;
|
|
}
|
|
|
|
if (0 == m_dwBytesProcessed)
|
|
{
|
|
// if we can't determine the codepage yet, try again later
|
|
if (!DetermineCodePage(dwSize==0))
|
|
{
|
|
hr = S_FALSE;
|
|
goto Done;
|
|
}
|
|
}
|
|
|
|
// Convert as many source bytes as we can to Unicode chars
|
|
ucBytes = m_ucByteBufCount;
|
|
ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;
|
|
|
|
// ConvertStringToUnicode won't convert Unicode to Unicode for us.
|
|
// So we'll do it ourselves.
|
|
if (m_nUnicode)
|
|
{
|
|
_ASSERT( ucWchars * sizeof(WCHAR) >= ucBytes);
|
|
|
|
// This might copy an odd extra byte
|
|
memmove((BYTE *)(m_psWcharBuf + m_ucWcharBufCount), m_psByteBuf,
|
|
ucBytes);
|
|
|
|
// But we only count the number of complete WCHAR's we copied.
|
|
ucWchars = ucBytes / sizeof(WCHAR);
|
|
ucBytes = ucWchars * sizeof(WCHAR);
|
|
|
|
if (UNICODE_REVERSE_CP == m_nUnicode)
|
|
{
|
|
// need to byte swap
|
|
BYTE *pByteSwap = (BYTE *)(m_psWcharBuf + m_ucWcharBufCount);
|
|
BYTE bTemp;
|
|
for (ULONG i = ucWchars; i != 0; i--)
|
|
{
|
|
// Well, OK, we've kind of hardwired WCHAR == 2 here, but ..
|
|
bTemp = pByteSwap[0];
|
|
pByteSwap[0] = pByteSwap[1];
|
|
pByteSwap[1] = bTemp;
|
|
pByteSwap += 2;
|
|
}
|
|
}
|
|
|
|
// On first packet, need to remove Unicode signature.
|
|
// Only need to look for 0xFFFE -- we already swapped bytes.
|
|
if (0 == m_dwBytesProcessed && m_psWcharBuf[0] == BYTE_ORDER_MARK)
|
|
{
|
|
ucWchars--;
|
|
memmove((BYTE *)m_psWcharBuf, (BYTE *)m_psWcharBuf+2,
|
|
ucWchars*sizeof(ucWchars));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
hr = m_pML->ConvertStringToUnicode(&m_dwConvertMode, m_nCodePage,
|
|
(char *)m_psByteBuf, &ucBytes,
|
|
m_psWcharBuf +m_ucWcharBufCount,
|
|
&ucWchars);
|
|
|
|
// Some character(s) failed conversion. The best we can do is
|
|
// attempt to skip the character that failed conversion.
|
|
if (FAILED(hr))
|
|
{
|
|
// Did we come back around and try to unconvertable portion again?
|
|
if (ucBytes==0)
|
|
{
|
|
// Yes, and it made no progress. Skip a char to try to make
|
|
// forward progress.
|
|
ucBytes++;
|
|
}
|
|
// We can't return this error, or we won't look a the rest of the
|
|
// file.
|
|
hr = S_OK;
|
|
}
|
|
|
|
}
|
|
|
|
// Move any leftover source characters to the start of the buffer.
|
|
// These are probably split Unicode chars, lead bytes without trail
|
|
// bytes, etc.
|
|
m_ucByteBufCount -= ucBytes;
|
|
memmove(m_psByteBuf, m_psByteBuf + ucBytes,
|
|
m_ucByteBufCount);
|
|
|
|
// The number of useful chars in the output buf is increased by the
|
|
// number we managed to convert.
|
|
m_ucWcharBufCount += ucWchars;
|
|
m_dwBytesProcessed += ucWchars;
|
|
|
|
Done:
|
|
return hr;
|
|
}
|
|
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::DetermineCodePage()
|
|
//
|
|
// Synopsis: Figures out what codepage to use to read the data.
|
|
// Sets m_nCodePage and m_nUnicode appropriately.
|
|
//
|
|
// Arguments: fForce determine the answer, no matter what
|
|
//
|
|
// Returns: TRUE the codepage is determined.
|
|
// FALSE not enough data yet to determine
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
BOOL
|
|
CTDCUnify::DetermineCodePage(BOOL fForce)
|
|
{
|
|
DWORD dwConvertMode = 0;
|
|
HRESULT hr;
|
|
UINT ucBytes = m_ucByteBufCount;
|
|
UINT ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;
|
|
UINT cpDetected;
|
|
IMultiLanguage2 *pML2 = NULL;
|
|
|
|
_ASSERT(m_dwBytesProcessed == 0 && m_pML);
|
|
|
|
// First look for Unicode. Assume it's not Unicode to start.
|
|
m_nUnicode = 0;
|
|
|
|
// Need at least 2 chars for Unicode signature (0xFFFE or 0xFEFF)
|
|
if (m_ucByteBufCount > 1)
|
|
{
|
|
// If we detect Unicode, it overrides any user specified code page.
|
|
m_nUnicode = IsUnicode(m_psByteBuf, m_ucByteBufCount);
|
|
if (m_nUnicode)
|
|
{
|
|
m_nCodePage = m_nUnicode;
|
|
return TRUE;
|
|
}
|
|
|
|
// It's not Unicode. If the user specified a code page, use it.
|
|
if (m_nCodePage)
|
|
{
|
|
return TRUE;
|
|
}
|
|
}
|
|
|
|
// if we need an answer and user specified a code page, use it
|
|
if (fForce && m_nCodePage)
|
|
{
|
|
return TRUE;
|
|
}
|
|
|
|
// At this point, we have to guess. If we have enough input or if we
|
|
// need an answer now, use MLang to do the guessing
|
|
if (fForce || m_ucByteBufCount >= CODEPAGE_BYTE_THRESHOLD)
|
|
{
|
|
// First see if the auto-detect interface is available.
|
|
hr = m_pML->QueryInterface(IID_IMultiLanguage2, (void**)&pML2);
|
|
if (!hr && pML2)
|
|
{
|
|
DetectEncodingInfo info[N_DETECTENCODINGINFO];
|
|
int nInfo = N_DETECTENCODINGINFO;
|
|
|
|
// auto-detect
|
|
hr = pML2->DetectInputCodepage(
|
|
MLDETECTCP_NONE,
|
|
CP_ACP,
|
|
(char *)m_psByteBuf,
|
|
(int*)&ucBytes,
|
|
info,
|
|
&nInfo);
|
|
pML2->Release();
|
|
|
|
if (!hr)
|
|
{
|
|
// if one of the returned codepages is "good enough", use it.
|
|
for (int i=0; i<nInfo; ++i)
|
|
{
|
|
if (info[i].nConfidence >= 90 && info[i].nDocPercent >= 90)
|
|
{
|
|
if (S_OK == m_pML->IsConvertible(info[i].nCodePage, UNICODE_CP))
|
|
{
|
|
m_nCodePage = info[i].nCodePage;
|
|
return TRUE;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try plain old MLang.
|
|
// Ask MLang to convert the input using the"auto-detect" codepage.
|
|
hr = m_pML->ConvertStringToUnicode(&dwConvertMode, CP_AUTO,
|
|
(char *)m_psByteBuf, &ucBytes,
|
|
m_psWcharBuf + m_ucWcharBufCount,
|
|
&ucWchars);
|
|
cpDetected = HIWORD(dwConvertMode);
|
|
|
|
// if MLang detected a codepage, use it
|
|
if (!hr && cpDetected != 0)
|
|
{
|
|
if (S_OK == m_pML->IsConvertible(cpDetected, UNICODE_CP))
|
|
{
|
|
m_nCodePage = cpDetected;
|
|
return TRUE;
|
|
}
|
|
}
|
|
}
|
|
|
|
// guessing didn't work. If we don't have to decide now, try again later
|
|
if (!fForce)
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
// if we have to decide and all else has failed, use the host page's
|
|
// encoding. If even that isn't available, use the machine's ASCII codepage.
|
|
m_nCodePage = m_nAmbientCodePage ? m_nAmbientCodePage : GetACP();
|
|
|
|
// and if this still isn't convertible to Unicode, use windows-1252
|
|
if (m_nCodePage == 0 || S_OK != m_pML->IsConvertible(m_nCodePage, UNICODE_CP))
|
|
{
|
|
m_nCodePage = CP_1252;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
LPWCH SkipSpace(LPWCH pwchCurr)
|
|
{
|
|
while (IsSpace(*pwchCurr)) pwchCurr++;
|
|
return pwchCurr;
|
|
}
|
|
|
|
static
|
|
boolean IsEnd(WCHAR ch)
|
|
{
|
|
return (ch == 0 || ch == L'\r' || ch == L'\n');
|
|
}
|
|
|
|
static
|
|
boolean IsBreak(WCHAR ch)
|
|
{
|
|
return (ch == L';' || IsEnd(ch));
|
|
}
|
|
|
|
// Returns FALSE if names didn't match.
|
|
// Returns TRUE if they did.
|
|
// Sets *ppwchAdvance to terminator of the match name
|
|
BOOL
|
|
MatchName(LPWCH pwchMatchName, LPCWCH pwzHostName, LPWCH *ppwchAdvance)
|
|
{
|
|
// match from right to left
|
|
LPWCH pwchMatchRight = &pwchMatchName[0];
|
|
LPCWCH pwchHostRight = &pwzHostName[0] + ocslen(pwzHostName) -1;
|
|
|
|
// handle empty match name
|
|
if (IsBreak(*pwchMatchRight))
|
|
{
|
|
if (!IsEnd(*pwchMatchRight)) // be sure to advance (unless at end)
|
|
++ pwchMatchRight;
|
|
*ppwchAdvance = pwchMatchRight;
|
|
return FALSE;
|
|
}
|
|
|
|
// Find end of Match name.
|
|
while (!IsBreak(*pwchMatchRight)) pwchMatchRight++;
|
|
|
|
*ppwchAdvance = pwchMatchRight; // return pointer to terminator
|
|
|
|
pwchMatchRight--;
|
|
|
|
while (IsSpace(*pwchMatchRight) && pwchMatchRight >= pwchMatchName)
|
|
-- pwchMatchRight; // ignore trailing whitespace
|
|
|
|
// match full wildcard the easy way
|
|
if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*')
|
|
return TRUE;
|
|
|
|
// match right-to-left, stop at mismatch or beginning of either string
|
|
for (; pwchMatchRight>=pwchMatchName && pwchHostRight>=pwzHostName;
|
|
--pwchMatchRight, --pwchHostRight)
|
|
{
|
|
if (*pwchMatchRight != *pwchHostRight || *pwchMatchRight == '*')
|
|
break;
|
|
}
|
|
|
|
// it's a match if strings matched completely
|
|
if (pwchMatchRight+1 == pwchMatchName && pwchHostRight+1 == pwzHostName)
|
|
return TRUE;
|
|
|
|
// or if match name started with "*." and the rest matched a suffix of host name
|
|
if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*' &&
|
|
pwchMatchRight[1] == '.')
|
|
return TRUE;
|
|
|
|
// otherwise it's not a match
|
|
return FALSE;
|
|
}
|
|
|
|
HRESULT
|
|
CTDCUnify::MatchAllowDomainList(LPCWSTR pwzURL)
|
|
{
|
|
HRESULT hr = E_FAIL; // assume failure
|
|
LPWCH pwchCurr = &m_psWcharBuf[0];
|
|
LPWCH pwchCurr2;
|
|
int cchHostDoman = ocslen(pwzURL);
|
|
|
|
// skip over white space
|
|
pwchCurr = SkipSpace(pwchCurr);
|
|
if (IsEnd(*pwchCurr))
|
|
goto Cleanup;
|
|
|
|
// must have the equal sign
|
|
if (*pwchCurr++ != '=' || *pwchCurr == '\0')
|
|
goto Cleanup;
|
|
|
|
while (TRUE)
|
|
{
|
|
// skip over white space
|
|
pwchCurr = SkipSpace(pwchCurr);
|
|
|
|
if (IsEnd(*pwchCurr)) // terminate on \r, \n, \0
|
|
break;
|
|
|
|
if (IsBreak(*pwchCurr)) // Must be ';',
|
|
pwchCurr++; // skip it.
|
|
|
|
// skip over white space
|
|
pwchCurr = SkipSpace(pwchCurr);
|
|
|
|
if (MatchName(pwchCurr, pwzURL, &pwchCurr2))
|
|
{
|
|
hr = S_OK;
|
|
break;
|
|
}
|
|
pwchCurr = pwchCurr2;
|
|
}
|
|
|
|
Cleanup:
|
|
while (!IsEnd(*pwchCurr))
|
|
pwchCurr++;
|
|
|
|
// Skip CRLF combos
|
|
if (*pwchCurr == '\r' && pwchCurr[1] == '\n') pwchCurr++;
|
|
|
|
// Eat the AllowDomain line so it doesn't screw up the data.
|
|
m_ucWcharBufCount -= (ULONG)(pwchCurr+1 - m_psWcharBuf);
|
|
memmove(m_psWcharBuf, pwchCurr+1, m_ucWcharBufCount*sizeof(WCHAR));
|
|
|
|
m_fProcessedAllowDomainList = TRUE;
|
|
|
|
return hr;
|
|
}
|
|
|
|
//------------------------------------------------------------------------
|
|
//
|
|
// Method: CTDCUnify::CheckForAllowDomainList
|
|
//
|
|
// Synopsis: Checks the beggining of the Wide Char buffer to see if it
|
|
// contains the string "@!allow.domains". This is used to
|
|
// determine if this file has a list of domain names which are
|
|
// allowed to access this file, even though the access may be
|
|
// coming from another internet host.
|
|
//
|
|
// Arguments: uses CTDCUnify state variables for the Wide Char buffer:
|
|
// m_psWcharBUf the Wide char buffer
|
|
// m_ucWcharBufCount the # of chars in the wide char buf
|
|
//
|
|
// Returns: ALLOW_DOMAINLIST_NO signature not found
|
|
// ALLOW_DOMAINLIST_YES signature was found
|
|
// ALLOW_DOMAINLIST_DONTKNOW don't have enough characters
|
|
// to know for sure yet.
|
|
//
|
|
//------------------------------------------------------------------------
|
|
|
|
CTDCUnify::ALLOWDOMAINLIST
|
|
CTDCUnify::CheckForAllowDomainList()
|
|
{
|
|
ULONG cAllowDomainLen = ocslen(ALLOW_DOMAIN_STRING);
|
|
|
|
// Make sure we have a whole line.
|
|
LPWCH pwchCurr = m_psWcharBuf;
|
|
LPWCH pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];
|
|
|
|
while (pwchCurr < pwchEnd)
|
|
{
|
|
if (IsEnd(*pwchCurr))
|
|
break;
|
|
++ pwchCurr;
|
|
}
|
|
|
|
if (pwchCurr >= pwchEnd) // if buffer ended before line did
|
|
return ALLOW_DOMAINLIST_DONTKNOW;
|
|
|
|
if (0 == wch_incmp(m_psWcharBuf, ALLOW_DOMAIN_STRING, cAllowDomainLen))
|
|
{
|
|
// We matched equal and have the whole string.
|
|
// Take the "@!allow.domains" out of the buffer..
|
|
m_ucWcharBufCount -= cAllowDomainLen;
|
|
memmove(m_psWcharBuf, &m_psWcharBuf[cAllowDomainLen],
|
|
m_ucWcharBufCount*sizeof(WCHAR));
|
|
return ALLOW_DOMAINLIST_YES;
|
|
}
|
|
|
|
// We didn't match equal, no point in looking any more.
|
|
return ALLOW_DOMAINLIST_NO;
|
|
}
|
|
|