/*++

Copyright (c) 1989-91  Microsoft Corporation

Module Name:

    gettoken.c

Abstract:

    The GetToken() function, which takes a pathname splits it into
    individual tokens.  This function is a lexical analyzer which
    is called by the parsing routines of NetpPathType.

Author:

    Danny Glasser (dannygl) 19 June 1989

Notes:

    For efficiency, there is code here which is different for the
    DBCS and non-DBCS environments.  This allows us to take advantage
    of short cuts which are not valid in the DBCS world (such as
    scanning a string from right to left).

    See the comments below for a detailed description of the behavior
    of this function.

Revision History:

    27-Sep-1991 JohnRo
        Changed TEXT macro usage to allow UNICODE.

    06 May 1991 rfirth
        32-bit version

--*/


#include "nticanon.h"
#include "winnls.h"


#define TEXT_LENGTH(s)  ((sizeof(s)/sizeof(TCHAR)) - 1)


static  TCHAR   szAUXName[]             = TEXT("AUX");
static  TCHAR   szCOMMName[]            = TEXT("COMM");
static  TCHAR   szCONName[]             = TEXT("CON");
static  TCHAR   szDEVName[]             = TEXT("DEV");
static  TCHAR   szMAILSLOTName[]        = TEXT("MAILSLOT");
static  TCHAR   szNULName[]             = TEXT("NUL");
static  TCHAR   szPIPEName[]            = TEXT("PIPE");
static  TCHAR   szPRINTName[]           = TEXT("PRINT");
static  TCHAR   szPRNName[]             = TEXT("PRN");
static  TCHAR   szQUEUESName[]          = TEXT("QUEUES");
static  TCHAR   szSEMName[]             = TEXT("SEM");
static  TCHAR   szSHAREMEMName[]        = TEXT("SHAREMEM");
static  TCHAR   szLPTName[]             = TEXT("LPT");
static  TCHAR   szCOMName[]             = TEXT("COM");

#define LPT_TOKEN_LEN   TEXT_LENGTH(szLPTName)
#define COM_TOKEN_LEN   TEXT_LENGTH(szCOMName)

static  TCHAR   szWildcards[]           = TEXT("*?");
static  TCHAR   szIllegalChars[]        = ILLEGAL_CHARS;
static  TCHAR   szNonComponentChars[]   = NON_COMPONENT_CHARS ILLEGAL_CHARS;

static  TCHAR   _text_SingleDot[]       = TEXT(".");


typedef struct {
    LPTSTR  pszTokenName;
    DWORD   cbTokenLen;
    DWORD   flTokenType;
} STRING_TOKEN;


//
// IMPORTANT:  In order for the binary table traversal to work, the strings
//             in this table MUST be in lexically-sorted order.  Please
//             bear this in mind when adding strings to the table.
//

STATIC STRING_TOKEN StringTokenTable[] = {
    szDEVName,         TEXT_LENGTH(szDEVName),      TOKEN_TYPE_DEV
};

#define NUM_STRING_TOKENS   (sizeof(StringTokenTable) / sizeof(*StringTokenTable))


STATIC DWORD    TrailingDotsAndSpaces(LPTSTR pszToken, DWORD cbTokenLen );
STATIC BOOL     IsIllegalCharacter(LPTSTR pszString);


DWORD
GetToken(
    IN  LPTSTR  pszBegin,
    OUT LPTSTR* ppszEnd,
    OUT LPDWORD pflTokenType,
    IN  DWORD   flFlags
    )

/*++

Routine Description:

    GetToken attempts to locate and type the next token.  It takes the
    beginning of the token and determines the end of the token (i.e.
    the beginning of the next token, so that it can be called again).
    It also sets the TOKEN_TYPE_* bits for all of the token types which
    are appropriate to the specified type.

Arguments:

    pszBegin    - A pointer to the first character in the token.

    ppszEnd     - A pointer to the location in which to store the end of
                  the current token (actually, the first character of the
                  next token).

    pflTokenType- The place to store the token type.  Token types are
                  defined in TOKEN.H.

    flFlags     - Flags to determine operation.  Currently MBZ.

Return Value:

    DWORD
        Success - 0
        Failure - ERROR_INVALID_PARAMETER
                  ERROR_INVALID_NAME
                  ERROR_FILENAME_EXCED_RANGE

--*/

{
    register    TCHAR   chFirstChar;
    register    DWORD   cbTokenLen;
                BOOL    fComputernameOnly = FALSE;
                DWORD   usNameError = 0;
                DWORD   cbTrailingDotSpace;
                DWORD   iLow, iHigh, iMid;
                LONG    iCmpVal;
                LCID    lcid  = GetThreadLocale();
                BOOL    bDBCS = (PRIMARYLANGID( LANGIDFROMLCID(lcid)) == LANG_JAPANESE) ||
                                (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_KOREAN) ||
                                (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_CHINESE);

    extern      DWORD   cbMaxPathCompLen;

    //
    // This macro is used to make sure that the error value is set only
    // once in the computername-only case.
    //

#define SET_COMPUTERNAMEONLY(err)   if (! fComputernameOnly)            \
                                    {                                   \
                                        fComputernameOnly = TRUE;       \
                                        usNameError = err;              \
                                    }

    if (flFlags & GTF_RESERVED) {
        return ERROR_INVALID_PARAMETER;
    }

    //
    // Initialize the token type to 0
    //

    *pflTokenType = 0;

    //
    // Store the first character
    //

    chFirstChar = *pszBegin;

    //
    // Return immediately if the string is a null string
    //

    if (chFirstChar == TCHAR_EOS) {
        *ppszEnd = pszBegin;
        *pflTokenType = TOKEN_TYPE_EOS;
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning TOKEN_TYPE_EOS\n");
#endif
        return 0;
    }

    //
    // Handle single-character, non-component tokens
    //

    if ((chFirstChar == TCHAR_BACKSLASH) || (chFirstChar == TCHAR_FWDSLASH)) {
        *pflTokenType = TOKEN_TYPE_SLASH;
    } else if (chFirstChar == TCHAR_COLON) {
        *pflTokenType = TOKEN_TYPE_COLON;
    }

    //
    // If we get here and the token type is non-zero, we have a single
    // character token.  We set <ppszEnd> and return 0.
    //

    if (*pflTokenType) {
        *ppszEnd = pszBegin + 1;
#ifdef DEVDEBUG
        DbgPrint("GetToken - *pflTokenType=%x\n", *pflTokenType);
#endif
        return 0;
    }

    //
    // If we get here, the token is a component, find the end of the
    // component by looking for the first character in the string which
    // isn't a valid component character.
    //
    // IMPORTANT:  There are certain names which are not valid component
    //             names but which may be valid computernames.  If we hit
    //             such a name, we set the <fComputernameOnly> flag.  Later
    //             on, we check to see if the name is a valid computername.
    //             If it is, we allow it; otherwise, we return an error.
    //

    cbTokenLen = STRCSPN(pszBegin, szNonComponentChars);

    //
    // We return an error if the first character is not a valid component
    // character, if the component is too long, or if the first
    // non-component character in the string is an illegal character.
    //

    if (cbTokenLen == 0) {
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning ERROR_INVALID_NAME (token len = 0)\n");
#endif
        return ERROR_INVALID_NAME;
    }

    if (cbTokenLen > cbMaxPathCompLen) {
        SET_COMPUTERNAMEONLY(ERROR_FILENAME_EXCED_RANGE);
    }

    if (IsIllegalCharacter(pszBegin + cbTokenLen)) {
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning ERROR_INVALID_NAME (illegal char)\n");
#endif
        return ERROR_INVALID_NAME;
    }

    //
    // Now we need to determine where the trailing dots and spaces begin,
    // and make sure that the component name contains something other
    // than dots and spaces, unless it's "." or ".."
    //
    // NOTE: If there are not trailing dots or spaces, <cbTrailingDotSpace>
    //       is set to <cbTokenLen>.
    //

    cbTrailingDotSpace = TrailingDotsAndSpaces(pszBegin, cbTokenLen );

    //
    // See if the token has only trailing dots and spaces
    //

    if (cbTrailingDotSpace == 0) {

        //
        // Return an error if the length of the token is greater than 2.
        //

        if (cbTokenLen > 2) {
            SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
        }

        //
        // Return an error if the first character is not a dot or if the
        // token length is 2 and the second character is not a dot.
        //

        if ((chFirstChar != TCHAR_DOT) || ((cbTokenLen == 2) && (pszBegin[1] != TCHAR_DOT))) {
            SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
        }

        //
        // Now we're OK, since the token is either "." or ".."
        //
    }

    //
    // WE HAVE A VALID COMPONENT
    //

    *pflTokenType = TOKEN_TYPE_COMPONENT;

    //
    // Now we determine if this token matches any of the component-based
    // types.
    //


    //
    // Is it a drive?
    //

    if (IS_DRIVE(chFirstChar) && (cbTokenLen == 1)) {
        *pflTokenType |= TOKEN_TYPE_DRIVE;
    }

    //
    // Is it "." or ".." ?
    //
    // Since we've already validated this string, we know that if it
    // contains nothing but dots and spaces it must be one of these
    // two.
    //

    if (cbTrailingDotSpace == 0) {
        *pflTokenType |= cbTokenLen == 1 ? TOKEN_TYPE_DOT : TOKEN_TYPE_DOTDOT;
    }

    //
    // If the 8.3 flag is specified, we also have to check that the
    // component is in 8.3 format.  We determine this as follows:
    //
    // Find the first dot in the token (or the end of the token).
    // Verify that at least 1 and at most 8 characters precede it.
    // Verify that at most 3 characters follow it.
    // Verify that none of the characters which follow it are dots.
    //
    // The exceptions to this are "." and "..".  Therefore, we don't check
    // this until after we've already determined that this component is
    // neither of those.
    //

    if ((cbTrailingDotSpace != 0) && (flFlags & GTF_8_DOT_3)) {
        DWORD   cbFirstDot;
        BOOL    fNoDot;

        cbFirstDot = STRCSPN(pszBegin, _text_SingleDot);

        if (fNoDot = cbFirstDot >= cbTokenLen) {
            cbFirstDot = cbTokenLen;
        }

        if (cbFirstDot == 0
            || cbFirstDot > 8
            || cbTokenLen - cbFirstDot > 4
            || (! fNoDot && STRCSPN(pszBegin + cbFirstDot + 1, _text_SingleDot)
                            < cbTokenLen - (cbFirstDot + 1))) {
            SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
	    }

        if( bDBCS ) {
            //
            // In case of MBCS, We also need to check the string is valid in MBCS
            // because Unicode character count is not eqaul MBCS byte count

            CHAR szCharToken[13]; // 8 + 3 + dot + null
            int  cbConverted = 0;
            BOOL bDefaultUsed = FALSE;

            // Convert Unicode string to Mbcs.
            cbConverted = WideCharToMultiByte( CP_OEMCP,  0,
                                               pszBegin, -1,
                                               szCharToken, sizeof(szCharToken),
                                               NULL, &bDefaultUsed );

            // If the converted langth is larger than the buffer, or the WideChar string
            // contains some character that is can not be repesented by MultiByte code page,
            // set error.

            if( cbConverted == FALSE || bDefaultUsed == TRUE ) {
                SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
            } else {
                cbConverted -= 1; // Remove NULL;

                cbFirstDot = strcspn(szCharToken, ".");

                if (fNoDot = cbFirstDot >= (DWORD)cbConverted) {
                    cbFirstDot = cbConverted;
                }

                if (cbFirstDot == 0
                    || cbFirstDot > 8
                    || cbConverted - cbFirstDot > 4
                    || (! fNoDot && strcspn(szCharToken + cbFirstDot + 1, ".")
                                    < cbConverted - (cbFirstDot + 1))) {
                    SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
                }
            }
	    }
    }

    //
    // Does it contain wildcards?
    //
    // If so, set the appropriate flag(s).
    //
    // If not, it may be a valid computername.
    //

    if (STRCSPN(pszBegin, szWildcards) < cbTokenLen) {

        *pflTokenType |= TOKEN_TYPE_WILDCARD;

        //
        // Special case the single '*' token
        //

        if (cbTokenLen == 1 && chFirstChar == TCHAR_STAR) {
            *pflTokenType |= TOKEN_TYPE_WILDONE;
        }
    } else {
        if( cbTokenLen <= MAX_PATH ) {
            *pflTokenType |= TOKEN_TYPE_COMPUTERNAME;
        }
    }

    //
    // IMPORTANT:  Now we've determined if the token is a valid computername.
    //             If the <fComputernameOnly> flag is set and it's a valid
    //             computername, then we turn off all other bits.  If it's
    //             not a valid computername, we return the stored error.
    //             If the flag isn't set, we continue with the component name
    //             processing.
    //

    if (fComputernameOnly) {
        if (*pflTokenType & TOKEN_TYPE_COMPUTERNAME) {
            *pflTokenType = TOKEN_TYPE_COMPUTERNAME;
        } else {
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning usNameError (%u)\n", usNameError);
#endif
            return usNameError;
        }
    } else {

        //
        // Is this an LPT[1-9] token?
        //

        if (STRNICMP(pszBegin, szLPTName, LPT_TOKEN_LEN) == 0
            && IS_NON_ZERO_DIGIT(pszBegin[LPT_TOKEN_LEN])
            && cbTrailingDotSpace == LPT_TOKEN_LEN + 1) {
            *pflTokenType |= TOKEN_TYPE_LPT;
        }

        //
        // Is this an COM[1-9] token?
        //

        if (STRNICMP(pszBegin, szCOMName, COM_TOKEN_LEN) == 0
            && IS_NON_ZERO_DIGIT(pszBegin[COM_TOKEN_LEN])
            && cbTrailingDotSpace == COM_TOKEN_LEN + 1) {
            *pflTokenType |= TOKEN_TYPE_COM;
        }

        //
        // The remainder of the component-based token types are determined
        // by string comparisons.  In order to speed things up, we store
        // these strings in sorted order and do a binary search on them,
        // which reduces the worst-case number of comparisons from N to
        // log N (where N is the number of strings).
        //

        iLow = (ULONG)-1;
        iHigh = NUM_STRING_TOKENS;

        while (iHigh - iLow > 1) {
            iMid = (iLow + iHigh) / 2;

            //
            // We do the comparison up to the length of the longer of the
            // two strings.  This guarantees us a valid non-zero value for
            // iCmpVal if they don't match.  It also means that they won't
            // match unless they're the same length.
            //

            iCmpVal = STRNICMP(pszBegin,
                                StringTokenTable[iMid].pszTokenName,
                                max(StringTokenTable[iMid].cbTokenLen,
                                    cbTrailingDotSpace) );

            if (iCmpVal < 0) {
                iHigh = iMid;
            } else if (iCmpVal > 0) {
                iLow = iMid;
            } else {

                //
                // We have a match!
                //

                *pflTokenType |= StringTokenTable[iMid].flTokenType;

                //
                // We can only match one, so don't bother continuing
                //

                break;
            }
        }
    }

    //
    // We're done; set the end pointer and return with success
    //

    *ppszEnd = pszBegin + cbTokenLen;
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning success\n");
#endif
    return 0;
}

STATIC DWORD TrailingDotsAndSpaces(LPTSTR pszToken, DWORD cbTokenLen )
{
    LPTSTR pszDotSpace = pszToken + cbTokenLen - 1;

    //
    // Scan the token until we reach the beginning or we find a
    // non-dot/space.
    //

    while (pszDotSpace >= pszToken
        && (*pszDotSpace == TCHAR_DOT || *pszDotSpace == TCHAR_SPACE)) {
        pszDotSpace--;
    }

    //
    // Increment pszDotSpace so that it points to the beginning of
    // the trailing dots and spaces (or one past the end of the token
    // if there are no trailing dots or spaces).
    //

    pszDotSpace++;

    //
    // Return the index of the first trailing dot or space (or the length
    // of the token if there were none).
    //

    return (DWORD)(pszDotSpace - pszToken);
}


STATIC BOOL IsIllegalCharacter(LPTSTR pszString)
{
//    TCHAR   chTemp;
//    BOOL    fRetVal;

    //
    // Return FALSE immediately for a null character
    //

    if (*pszString == TCHAR_EOS) {
        return FALSE;
    }

    //
    // If the character is a single-byte character, we can simply see if
    // it's illegal by calling strchrf() on the illegal character array.
    // If it's a double-byte character, we have to do it the slower way
    // (with strcspnf).
    //

//    if (!IS_LEAD_BYTE(*pszString)) {
        return (STRCHR(szIllegalChars, *pszString) != NULL);
//    } else {
//
//        //
//        // We set the character after the double-byte character to the
//        // null character, to speed things up.
//        //
//
//        chTemp = pszString[2];
//        pszString[2] = TCHAR_EOS;
//        fRetVal = STRCSPN(pszString, szIllegalChars) == 0;
//        pszString[2] = chTemp;
//
//        return fRetVal;
//    }
}