windows-xp/Source/XPSP1/NT/shell/osshell/accesory/newpad/npxml.c


								/*

								 * XML support functions

								 *  Copyright (C) 2000 Microsoft Corporation

								 */


								#include "precomp.h"


								/*
								 * FIsXmlWhitespaceW
								 *
								 * Reports whether wch is one of the four characters treated as XML
								 * whitespace here: space, tab (0x9), line feed (0xA) or carriage
								 * return (0xD).
								 *
								 * Returns TRUE for those four characters and FALSE for anything else.
								 */
								BOOL FIsXmlWhitespaceW(WCHAR wch)
								{
								    BOOL fWhitespace = FALSE;

								    switch (wch)
								    {
								        case L' ' :
								        case L'\x9' :
								        case L'\xA' :
								        case L'\xD' :
								            fWhitespace = TRUE;
								            break;

								        default :
								            break;
								    }

								    return(fWhitespace);
								}


								BOOL FIsXmlWhitespaceA(char ch)

								{

								    return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));

								}


								/*
								 * FIsXmlA
								 *
								 * Reports whether a byte buffer begins with the five characters "<?xml".
								 *
								 *  rgch - buffer to examine (need not be NUL terminated)
								 *  cch  - number of chars available in rgch
								 *
								 * Returns TRUE when the buffer holds at least five chars and they are
								 * exactly "<?xml" (case sensitive); FALSE otherwise.
								 */
								BOOL FIsXmlA(LPCSTR rgch, UINT cch)
								{
								    // memcmp below reads five chars, so a shorter buffer cannot match
								    // and must not be read past its end.

								    if (cch < 5)
								    {
								        // Too short to be XML

								        return(FALSE);
								    }

								    if (memcmp(rgch, "<?xml", 5) != 0)
								    {
								        // Not XML

								        return(FALSE);
								    }

								    return(TRUE);
								}


								/*
								 * FIsXmlW
								 *
								 * Reports whether a wide-character buffer begins with L"<?xml".
								 *
								 *  rgwch - buffer to examine (need not be NUL terminated)
								 *  cch   - number of WCHARs available in rgwch
								 *
								 * Returns TRUE when the buffer holds at least five WCHARs and they are
								 * exactly L"<?xml" (case sensitive); FALSE otherwise.
								 */
								BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)
								{
								    // memcmp below reads five WCHARs, so a shorter buffer cannot match
								    // and must not be read past its end.

								    if (cch < 5)
								    {
								        // Too short to be XML

								        return(FALSE);
								    }

								    // memcmp counts bytes, hence the scaling by sizeof(WCHAR)

								    if (memcmp(rgwch, L"<?xml", 5 * sizeof(WCHAR)) != 0)
								    {
								        // Not XML

								        return(FALSE);
								    }

								    return(TRUE);
								}


								/*
								 * FDetectXmlEncodingA
								 *
								 * Looks for an XML declaration of the form
								 *
								 *      <?xml ... encoding='name' ... ?>
								 *
								 * at the very start of a byte buffer and, when an encoding name is
								 * present, maps it to a codepage with FLookupCodepageNameA.
								 *
								 *  rgch - buffer to examine (need not be NUL terminated)
								 *  cch  - number of chars in rgch; at most the first 4096 are scanned
								 *  pcp  - passed to FLookupCodepageNameA to receive the codepage
								 *
								 * Returns the result of FValidateCodepage(hwndNP, *pcp) when an encoding
								 * name was found and recognized.  Returns FALSE when the buffer is
								 * shorter than 20 chars, does not start with "<?xml" followed by
								 * whitespace, contains no usable encoding attribute before "?>" or the
								 * scan limit, names an unrecognized encoding, or names UTF-16.  *pcp may
								 * have been written by the lookup even when FALSE is returned.
								 */
								BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
								{
								    LPCSTR pchMax;
								    LPCSTR pch;
								    char chQuote;

								    // XML files encoded in UTF-16 are required to have a BOM which if present
								    // would already have been detected.  This means that if this file is XML
								    // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
								    // encoding of some form.  We check for ASCII compatible encodings only
								    // which includes everything we probably care about but excludes EBCDIC.

								    // Check for file beginning with <?xml ... encoding='...' ... ?>

								    if (cch < 20)
								    {
								        // File is too small

								        return(FALSE);
								    }

								    if (!FIsXmlA(rgch, cch))
								    {
								        // Not XML

								        return(FALSE);
								    }

								    // Don't scan more than 4K looking for encoding even if it is valid XML

								    cch = __min(cch, 4096);

								    pchMax = rgch + cch;
								    pch = rgch + 5;

								    // "<?xml" must be followed by whitespace, otherwise this is some other
								    // processing instruction whose target merely starts with "xml".

								    if (!FIsXmlWhitespaceA(*pch))
								    {
								        // Not XML

								        return(FALSE);
								    }

								    pch++;

								    // chQuote is the quote character of the attribute value we are
								    // currently inside, or '\0' when we are between values.

								    chQuote = '\0';

								    for (;;)
								    {
								        LPCSTR pchToken;

								        if (pch == pchMax)
								        {
								            // Not XML

								            break;
								        }

								        if (chQuote != '\0')
								        {
								            // We are within a quoted string.  Skip everything until the
								            // matching closing quote.  The other kind of quote is plain
								            // data here and must not change the quote state.

								            if (*pch == chQuote)
								            {
								                chQuote = '\0';
								            }

								            pch++;
								            continue;
								        }

								        if (FIsXmlWhitespaceA(*pch) || (*pch == '='))
								        {
								            pch++;
								            continue;
								        }

								        if ((*pch == '\'') || (*pch == '"'))
								        {
								            // Start of a quoted value belonging to some other attribute

								            chQuote = *pch;

								            pch++;
								            continue;
								        }

								        if ((pch + 2) > pchMax)
								        {
								            // Not XML

								            break;
								        }

								        if ((pch[0] == '?') && (pch[1] == '>'))
								        {
								            // This looks like XML.  At this point if we don't find an encoding
								            // specification we could assume UTF-8.  We don't because there are
								            // malformed XML documents and assuming UTF-8 might affect Notepad
								            // compatibility.  This may be fine but we put it off for now.

								            break;
								        }

								        // Collect an attribute name: everything up to '=', '?' or whitespace

								        pchToken = pch;

								        while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
								        {
								            pch++;
								        }

								        if (pch == pchToken)
								        {
								            // A '?' that is not part of "?>" produces an empty token.  Step
								            // over it; without this the loop would revisit the same
								            // character forever.

								            pch++;
								            continue;
								        }

								        if (pch != (pchToken + 8))
								        {
								             continue;
								        }

								        if (memcmp(pchToken, "encoding", 8) != 0)
								        {
								             continue;
								        }

								        // Found the encoding attribute.  Expect: [ws] '=' [ws] quote name quote

								        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
								        {
								            pch++;
								        }

								        if ((pch == pchMax) || (*pch++ != '='))
								        {
								            // Not XML

								            break;
								        }

								        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
								        {
								            pch++;
								        }

								        if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
								        {
								            // Not XML

								            break;
								        }

								        chQuote = *pch++;

								        pchToken = pch;

								        while ((pch < pchMax) && (*pch != chQuote))
								        {
								            pch++;
								        }

								        if (pch == pchMax)
								        {
								            // Not XML

								            break;
								        }

								        // We have an XML encoding declaration from pchToken to (pch - 1)

								        if (pch == pchToken)
								        {
								            // Not XML

								            break;
								        }

								        if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
								        {
								            // Encoding is not recognized

								            break;
								        }

								        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
								        {
								            // These are bogus since we know the file is MBCS

								            break;
								        }

								        return(FValidateCodepage(hwndNP, *pcp));
								    }

								    return(FALSE);
								}


								/*
								 * FDetectXmlEncodingW
								 *
								 * Wide-character counterpart of FDetectXmlEncodingA.  Looks for an XML
								 * declaration of the form
								 *
								 *      <?xml ... encoding='name' ... ?>
								 *
								 * at the very start of a WCHAR buffer and, when an encoding name is
								 * present, maps it to a codepage with FLookupCodepageNameW.
								 *
								 *  rgch - buffer to examine (need not be NUL terminated)
								 *  cch  - number of WCHARs in rgch; at most the first 4096 are scanned
								 *  pcp  - passed to FLookupCodepageNameW to receive the codepage
								 *
								 * Returns the result of FValidateCodepage(hwndNP, *pcp) when an encoding
								 * name was found and recognized.  Returns FALSE when the buffer is
								 * shorter than 20 WCHARs, does not start with L"<?xml" followed by
								 * whitespace, contains no usable encoding attribute before "?>" or the
								 * scan limit, or names an unrecognized encoding.  *pcp may have been
								 * written by the lookup even when FALSE is returned.
								 */
								BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
								{
								    const WCHAR *pchMax;
								    const WCHAR *pch;
								    WCHAR chQuote;

								    // Check for text beginning with <?xml ... encoding='...' ... ?>

								    if (cch < 20)
								    {
								        // File is too small

								        return(FALSE);
								    }

								    if (!FIsXmlW(rgch, cch))
								    {
								        // Not XML

								        return(FALSE);
								    }

								    // Don't scan more than 4K looking for encoding even if it is valid XML

								    cch = __min(cch, 4096);

								    pchMax = rgch + cch;
								    pch = rgch + 5;

								    // "<?xml" must be followed by whitespace, otherwise this is some other
								    // processing instruction whose target merely starts with "xml".

								    if (!FIsXmlWhitespaceW(*pch))
								    {
								        // Not XML

								        return(FALSE);
								    }

								    pch++;

								    // chQuote is the quote character of the attribute value we are
								    // currently inside, or L'\0' when we are between values.

								    chQuote = L'\0';

								    for (;;)
								    {
								        const WCHAR *pchToken;

								        if (pch == pchMax)
								        {
								            // Not XML

								            break;
								        }

								        if (chQuote != L'\0')
								        {
								            // We are within a quoted string.  Skip everything until the
								            // matching closing quote.  The other kind of quote is plain
								            // data here and must not change the quote state.

								            if (*pch == chQuote)
								            {
								                chQuote = L'\0';
								            }

								            pch++;
								            continue;
								        }

								        if (FIsXmlWhitespaceW(*pch) || (*pch == L'='))
								        {
								            pch++;
								            continue;
								        }

								        if ((*pch == L'\'') || (*pch == L'"'))
								        {
								            // Start of a quoted value belonging to some other attribute

								            chQuote = *pch;

								            pch++;
								            continue;
								        }

								        if ((pch + 2) > pchMax)
								        {
								            // Not XML

								            break;
								        }

								        if ((pch[0] == L'?') && (pch[1] == L'>'))
								        {
								            // This looks like XML.  At this point if we don't find an encoding
								            // specification we could assume UTF-8.  We don't because there are
								            // malformed XML documents and assuming UTF-8 might affect Notepad
								            // compatibility.  This may be fine but we put it off for now.

								            break;
								        }

								        // Collect an attribute name: everything up to '=', '?' or whitespace

								        pchToken = pch;

								        while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
								        {
								            pch++;
								        }

								        if (pch == pchToken)
								        {
								            // A '?' that is not part of "?>" produces an empty token.  Step
								            // over it; without this the loop would revisit the same
								            // character forever.

								            pch++;
								            continue;
								        }

								        if (pch != (pchToken + 8))
								        {
								             continue;
								        }

								        // memcmp counts bytes, so all eight WCHARs of the name must be
								        // covered; comparing only 8 bytes would accept any name that
								        // merely starts with "enco".

								        if (memcmp(pchToken, L"encoding", 8 * sizeof(WCHAR)) != 0)
								        {
								             continue;
								        }

								        // Found the encoding attribute.  Expect: [ws] '=' [ws] quote name quote

								        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
								        {
								            pch++;
								        }

								        if ((pch == pchMax) || (*pch++ != L'='))
								        {
								            // Not XML

								            break;
								        }

								        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
								        {
								            pch++;
								        }

								        if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
								        {
								            // Not XML

								            break;
								        }

								        chQuote = *pch++;

								        pchToken = pch;

								        while ((pch < pchMax) && (*pch != chQuote))
								        {
								            pch++;
								        }

								        if (pch == pchMax)
								        {
								            // Not XML

								            break;
								        }

								        // We have an XML encoding declaration from pchToken to (pch - 1)

								        if (pch == pchToken)
								        {
								            // Not XML

								            break;
								        }

								        if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
								        {
								            // Encoding is not recognized

								            break;
								        }

								        // Unlike FDetectXmlEncodingA, CP_UTF16 and CP_UTF16BE are not
								        // rejected here.

								        return(FValidateCodepage(hwndNP, *pcp));
								    }

								    return(FALSE);
								}