/*++

Copyright (c) 1989-91  Microsoft Corporation

Module Name:

    gettoken.c

Abstract:

    The GetToken() function, which takes a pathname and splits it into
    individual tokens. This function is a lexical analyzer which is
    called by the parsing routines of NetpPathType.

Author:

    Danny Glasser (dannygl) 19 June 1989

Notes:

    For efficiency, there is code here which is different for the DBCS
    and non-DBCS environments. This allows us to take advantage of
    short cuts which are not valid in the DBCS world (such as
    scanning a string from right to left).

    See the comments below for a detailed description of the behavior
    of this function.

Revision History:

    27-Sep-1991 JohnRo
        Changed TEXT macro usage to allow UNICODE.

    06 May 1991 rfirth
        32-bit version

--*/

#include "nticanon.h"
#include "winnls.h"

//
// Number of characters in a static TCHAR string, excluding the terminator.
//

#define TEXT_LENGTH(s)  ((sizeof(s)/sizeof(TCHAR)) - 1)

//
// Reserved names. Only some of these are referenced by the code in this
// file; the others are kept as they were declared.
//

static  TCHAR   szAUXName[]         = TEXT("AUX");
static  TCHAR   szCOMMName[]        = TEXT("COMM");
static  TCHAR   szCONName[]         = TEXT("CON");
static  TCHAR   szDEVName[]         = TEXT("DEV");
static  TCHAR   szMAILSLOTName[]    = TEXT("MAILSLOT");
static  TCHAR   szNULName[]         = TEXT("NUL");
static  TCHAR   szPIPEName[]        = TEXT("PIPE");
static  TCHAR   szPRINTName[]       = TEXT("PRINT");
static  TCHAR   szPRNName[]         = TEXT("PRN");
static  TCHAR   szQUEUESName[]      = TEXT("QUEUES");
static  TCHAR   szSEMName[]         = TEXT("SEM");
static  TCHAR   szSHAREMEMName[]    = TEXT("SHAREMEM");

static  TCHAR   szLPTName[]         = TEXT("LPT");
static  TCHAR   szCOMName[]         = TEXT("COM");

#define LPT_TOKEN_LEN   TEXT_LENGTH(szLPTName)
#define COM_TOKEN_LEN   TEXT_LENGTH(szCOMName)

static  TCHAR   szWildcards[]           = TEXT("*?");
static  TCHAR   szIllegalChars[]        = ILLEGAL_CHARS;
static  TCHAR   szNonComponentChars[]   = NON_COMPONENT_CHARS ILLEGAL_CHARS;

static  TCHAR   _text_SingleDot[]       = TEXT(".");

//
// One entry of the table of component names which map directly to a
// token type.
//

typedef struct {
    LPTSTR  pszTokenName;       // the name to match (case-insensitively)
    DWORD   cbTokenLen;         // length of the name in characters
    DWORD   flTokenType;        // TOKEN_TYPE_* bit to set on a match
} STRING_TOKEN;

//
// IMPORTANT: In order for the binary table traversal to work, the strings
//            in this table MUST be in lexically-sorted order.  Please
//            bear this in mind when adding strings to the table.
//

STATIC  STRING_TOKEN    StringTokenTable[] = {
    { szDEVName, TEXT_LENGTH(szDEVName), TOKEN_TYPE_DEV }
};

#define NUM_STRING_TOKENS   (sizeof(StringTokenTable) / sizeof(*StringTokenTable))

STATIC  DWORD   TrailingDotsAndSpaces(LPTSTR pszToken, DWORD cbTokenLen );
STATIC  BOOL    IsIllegalCharacter(LPTSTR pszString);


DWORD
GetToken(
    IN  LPTSTR  pszBegin,
    OUT LPTSTR* ppszEnd,
    OUT LPDWORD pflTokenType,
    IN  DWORD   flFlags
    )

/*++

Routine Description:

    GetToken attempts to locate and type the next token.  It takes the
    beginning of the token and determines the end of the token (i.e.
    the beginning of the next token, so that it can be called again).
    It also sets the TOKEN_TYPE_* bits for all of the token types which
    are appropriate to the specified type.

Arguments:

    pszBegin    - A pointer to the first character in the token.

    ppszEnd     - A pointer to the location in which to store the end of
                  the current token (actually, the first character of the
                  next token).  Only written when the function succeeds.

    pflTokenType- The place to store the token type.  Token types are
                  defined in TOKEN.H.

    flFlags     - Flags to determine operation.  Any bit in GTF_RESERVED
                  must be zero.  If GTF_8_DOT_3 is set, components are
                  additionally checked for 8.3 format.

Return Value:

    DWORD
        Success - 0
        Failure - ERROR_INVALID_PARAMETER   (a reserved flag bit was set)
                  ERROR_INVALID_NAME
                  ERROR_FILENAME_EXCED_RANGE

--*/

{
    TCHAR   chFirstChar;
    DWORD   cbTokenLen;
    BOOL    fComputernameOnly = FALSE;
    DWORD   usNameError = 0;
    DWORD   cbTrailingDotSpace;
    DWORD   iLow, iHigh, iMid;
    LONG    iCmpVal;
    LCID    lcid = GetThreadLocale();
    BOOL    bDBCS = (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_JAPANESE) ||
                    (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_KOREAN) ||
                    (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_CHINESE);

    extern  DWORD   cbMaxPathCompLen;

    //
    // This macro is used to make sure that the error value is set only
    // once in the computername-only case.
    //

#define SET_COMPUTERNAMEONLY(err)   if (! fComputernameOnly)            \
                                    {                                   \
                                        fComputernameOnly = TRUE;       \
                                        usNameError = err;              \
                                    }

    if (flFlags & GTF_RESERVED) {
        return ERROR_INVALID_PARAMETER;
    }

    //
    // Initialize the token type to 0
    //

    *pflTokenType = 0;

    //
    // Store the first character
    //

    chFirstChar = *pszBegin;

    //
    // Return immediately if the string is a null string
    //

    if (chFirstChar == TCHAR_EOS) {
        *ppszEnd = pszBegin;
        *pflTokenType = TOKEN_TYPE_EOS;
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning TOKEN_TYPE_EOS\n");
#endif
        return 0;
    }

    //
    // Handle single-character, non-component tokens
    //

    if ((chFirstChar == TCHAR_BACKSLASH) || (chFirstChar == TCHAR_FWDSLASH)) {
        *pflTokenType = TOKEN_TYPE_SLASH;
    } else if (chFirstChar == TCHAR_COLON) {
        *pflTokenType = TOKEN_TYPE_COLON;
    }

    //
    // If we get here and the token type is non-zero, we have a single
    // character token.  We set <*ppszEnd> and return 0.
    //

    if (*pflTokenType) {
        *ppszEnd = pszBegin + 1;
#ifdef DEVDEBUG
        DbgPrint("GetToken - *pflTokenType=%x\n", *pflTokenType);
#endif
        return 0;
    }

    //
    // If we get here, the token is a component, find the end of the
    // component by looking for the first character in the string which
    // isn't a valid component character.
    //
    // IMPORTANT: There are certain names which are not valid component
    //            names but which may be valid computernames.  If we hit
    //            such a name, we set the <fComputernameOnly> flag.  Later
    //            on, we check to see if the name is a valid computername.
    //            If it is, we allow it; otherwise, we return an error.
    //

    cbTokenLen = STRCSPN(pszBegin, szNonComponentChars);

    //
    // We return an error if the first character is not a valid component
    // character, if the component is too long, or if the first
    // non-component character in the string is an illegal character.
    //

    if (cbTokenLen == 0) {
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning ERROR_INVALID_NAME (token len = 0)\n");
#endif
        return ERROR_INVALID_NAME;
    }

    if (cbTokenLen > cbMaxPathCompLen) {
        SET_COMPUTERNAMEONLY(ERROR_FILENAME_EXCED_RANGE);
    }

    if (IsIllegalCharacter(pszBegin + cbTokenLen)) {
#ifdef DEVDEBUG
        DbgPrint("GetToken - returning ERROR_INVALID_NAME (illegal char)\n");
#endif
        return ERROR_INVALID_NAME;
    }

    //
    // Now we need to determine where the trailing dots and spaces begin,
    // and make sure that the component name contains something other
    // than dots and spaces, unless it's "." or ".."
    //
    // NOTE: If there are no trailing dots or spaces, <cbTrailingDotSpace>
    //       is set to <cbTokenLen>.
    //

    cbTrailingDotSpace = TrailingDotsAndSpaces(pszBegin, cbTokenLen );

    //
    // See if the token has only trailing dots and spaces
    //

    if (cbTrailingDotSpace == 0) {

        //
        // Flag an error if the length of the token is greater than 2.
        //

        if (cbTokenLen > 2) {
            SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
        }

        //
        // Flag an error if the first character is not a dot or if the
        // token length is 2 and the second character is not a dot.
        //

        if ((chFirstChar != TCHAR_DOT) ||
            ((cbTokenLen == 2) && (pszBegin[1] != TCHAR_DOT))) {
            SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
        }

        //
        // Now we're OK, since the token is either "." or ".."
        //
    }

    //
    // WE HAVE A VALID COMPONENT
    //

    *pflTokenType = TOKEN_TYPE_COMPONENT;

    //
    // Now we determine if this token matches any of the component-based
    // types.
    //

    //
    // Is it a drive?
    //

    if (IS_DRIVE(chFirstChar) && (cbTokenLen == 1)) {
        *pflTokenType |= TOKEN_TYPE_DRIVE;
    }

    //
    // Is it "." or ".." ?
    //
    // Since we've already validated this string, we know that if it
    // contains nothing but dots and spaces it must be one of these
    // two.
    //

    if (cbTrailingDotSpace == 0) {
        *pflTokenType |= (cbTokenLen == 1) ? TOKEN_TYPE_DOT : TOKEN_TYPE_DOTDOT;
    }

    //
    // If the 8.3 flag is specified, we also have to check that the
    // component is in 8.3 format.  We determine this as follows:
    //
    //     Find the first dot in the token (or the end of the token).
    //     Verify that at least 1 and at most 8 characters precede it.
    //     Verify that at most 3 characters follow it.
    //     Verify that none of the characters which follow it are dots.
    //
    // The exceptions to this are "." and "..".  Therefore, we don't check
    // this until after we've already determined that this component is
    // neither of those.
    //

    if ((cbTrailingDotSpace != 0) && (flFlags & GTF_8_DOT_3)) {
        DWORD   cbFirstDot;
        BOOL    fNoDot;

        //
        // STRCSPN scans past the end of the token, so clamp the result
        // to the token length when the token itself has no dot.
        //

        cbFirstDot = STRCSPN(pszBegin, _text_SingleDot);
        fNoDot = (cbFirstDot >= cbTokenLen);

        if (fNoDot) {
            cbFirstDot = cbTokenLen;
        }

        if (cbFirstDot == 0
            || cbFirstDot > 8
            || cbTokenLen - cbFirstDot > 4
            || (! fNoDot && STRCSPN(pszBegin + cbFirstDot + 1, _text_SingleDot)
                                < cbTokenLen - (cbFirstDot + 1))) {
            SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
        }

        if (bDBCS) {

            //
            // In the MBCS case we also need to check that the name is
            // valid 8.3 after conversion to the OEM code page, because
            // the Unicode character count is not equal to the MBCS byte
            // count.
            //

            CHAR    szCharToken[13];    // 8 + 3 + dot + null
            int     cbConverted = 0;
            BOOL    bDefaultUsed = FALSE;

            //
            // Convert ONLY the current token (cbTokenLen characters), not
            // the rest of the path that follows it.  When an explicit
            // length is passed, WideCharToMultiByte does not write a
            // terminator, so one byte of the buffer is reserved for the
            // terminator added below.  A token which has at least as many
            // characters as the buffer has bytes is left unconverted and
            // is treated as a conversion failure.
            //

            if (cbTokenLen < sizeof(szCharToken)) {
                cbConverted = WideCharToMultiByte(CP_OEMCP,
                                                  0,
                                                  pszBegin,
                                                  (int)cbTokenLen,
                                                  szCharToken,
                                                  (int)(sizeof(szCharToken) - 1),
                                                  NULL,
                                                  &bDefaultUsed
                                                  );
            }

            //
            // If the converted length is larger than the buffer, or the
            // wide-character string contains a character that cannot be
            // represented in the multibyte code page, set the error.
            //

            if (cbConverted <= 0 || bDefaultUsed) {
                SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
            } else {
                DWORD   cbMbcsLen = (DWORD)cbConverted;

                szCharToken[cbMbcsLen] = '\0';

                //
                // Same 8.3 test as above, applied to the byte string.
                //

                cbFirstDot = (DWORD)strcspn(szCharToken, ".");
                fNoDot = (cbFirstDot >= cbMbcsLen);

                if (fNoDot) {
                    cbFirstDot = cbMbcsLen;
                }

                if (cbFirstDot == 0
                    || cbFirstDot > 8
                    || cbMbcsLen - cbFirstDot > 4
                    || (! fNoDot && strcspn(szCharToken + cbFirstDot + 1, ".")
                                        < cbMbcsLen - (cbFirstDot + 1))) {
                    SET_COMPUTERNAMEONLY(ERROR_INVALID_NAME);
                }
            }
        }
    }

    //
    // Does it contain wildcards?
    //
    // If so, set the appropriate flag(s).
    //
    // If not, it may be a valid computername.
    //

    if (STRCSPN(pszBegin, szWildcards) < cbTokenLen) {

        *pflTokenType |= TOKEN_TYPE_WILDCARD;

        //
        // Special case the single '*' token
        //

        if (cbTokenLen == 1 && chFirstChar == TCHAR_STAR) {
            *pflTokenType |= TOKEN_TYPE_WILDONE;
        }
    } else {
        if (cbTokenLen <= MAX_PATH) {
            *pflTokenType |= TOKEN_TYPE_COMPUTERNAME;
        }
    }

    //
    // IMPORTANT: Now we've determined if the token is a valid computername.
    //            If the <fComputernameOnly> flag is set and it's a valid
    //            computername, then we turn off all other bits.  If it's
    //            not a valid computername, we return the stored error.
    //            If the flag isn't set, we continue with the component name
    //            processing.
    //

    if (fComputernameOnly) {
        if (*pflTokenType & TOKEN_TYPE_COMPUTERNAME) {
            *pflTokenType = TOKEN_TYPE_COMPUTERNAME;
        } else {
#ifdef DEVDEBUG
            DbgPrint("GetToken - returning usNameError (%u)\n", usNameError);
#endif
            return usNameError;
        }
    } else {

        //
        // Is this an LPT[1-9] token?
        //

        if (STRNICMP(pszBegin, szLPTName, LPT_TOKEN_LEN) == 0
            && IS_NON_ZERO_DIGIT(pszBegin[LPT_TOKEN_LEN])
            && cbTrailingDotSpace == LPT_TOKEN_LEN + 1) {
            *pflTokenType |= TOKEN_TYPE_LPT;
        }

        //
        // Is this a COM[1-9] token?
        //

        if (STRNICMP(pszBegin, szCOMName, COM_TOKEN_LEN) == 0
            && IS_NON_ZERO_DIGIT(pszBegin[COM_TOKEN_LEN])
            && cbTrailingDotSpace == COM_TOKEN_LEN + 1) {
            *pflTokenType |= TOKEN_TYPE_COM;
        }

        //
        // The remainder of the component-based token types are determined
        // by string comparisons.  In order to speed things up, we store
        // these strings in sorted order and do a binary search on them,
        // which reduces the worst-case number of comparisons from N to
        // log N (where N is the number of strings).
        //
        // iLow starts at (ULONG)-1, i.e. "one before index 0"; the
        // unsigned arithmetic below wraps so that the first midpoint is
        // (NUM_STRING_TOKENS - 1) / 2.
        //

        iLow = (ULONG)-1;
        iHigh = NUM_STRING_TOKENS;

        while (iHigh - iLow > 1) {
            iMid = (iLow + iHigh) / 2;

            //
            // We do the comparison up to the length of the longer of the
            // two strings.  This guarantees us a valid non-zero value for
            // iCmpVal if they don't match.  It also means that they won't
            // match unless they're the same length.
            //

            iCmpVal = STRNICMP(pszBegin,
                               StringTokenTable[iMid].pszTokenName,
                               max(StringTokenTable[iMid].cbTokenLen,
                                   cbTrailingDotSpace)
                               );

            if (iCmpVal < 0) {
                iHigh = iMid;
            } else if (iCmpVal > 0) {
                iLow = iMid;
            } else {

                //
                // We have a match!
                //

                *pflTokenType |= StringTokenTable[iMid].flTokenType;

                //
                // We can only match one, so don't bother continuing
                //

                break;
            }
        }
    }

    //
    // We're done; set the end pointer and return with success
    //

    *ppszEnd = pszBegin + cbTokenLen;

#ifdef DEVDEBUG
    DbgPrint("GetToken - returning success\n");
#endif
    return 0;
}


STATIC
DWORD
TrailingDotsAndSpaces(
    LPTSTR  pszToken,
    DWORD   cbTokenLen
    )

/*++

Routine Description:

    Finds where the run of trailing dots and spaces of a token begins.

Arguments:

    pszToken    - A pointer to the first character in the token.

    cbTokenLen  - The length of the token, in characters.

Return Value:

    The index of the first trailing dot or space, or cbTokenLen if the
    token has none.  0 means the token consists of nothing but dots and
    spaces (or is empty).

--*/

{
    DWORD   cbLeading = cbTokenLen;

    //
    // Walk backwards from the end of the token while the preceding
    // character is a dot or a space.  An index is used rather than a
    // pointer so that we never form a pointer to before the start of
    // the token when every character is a dot or a space.
    //

    while (cbLeading > 0
           && (pszToken[cbLeading - 1] == TCHAR_DOT
               || pszToken[cbLeading - 1] == TCHAR_SPACE)) {
        cbLeading--;
    }

    return cbLeading;
}


STATIC
BOOL
IsIllegalCharacter(
    LPTSTR  pszString
    )

/*++

Routine Description:

    Determines whether the first character of a string is one of the
    characters in szIllegalChars.

Arguments:

    pszString   - A pointer to the character to test.

Return Value:

    TRUE if the character is in szIllegalChars.  FALSE otherwise,
    including when the character is the string terminator.

--*/

{
//    TCHAR   chTemp;
//    BOOL    fRetVal;

    //
    // Return FALSE immediately for a null character
    //

    if (*pszString == TCHAR_EOS) {
        return FALSE;
    }

    //
    // If the character is a single-byte character, we can simply see if
    // it's illegal by calling strchrf() on the illegal character array.
    // If it's a double-byte character, we have to do it the slower way
    // (with strcspnf).
    //
    // The double-byte path below is disabled; it is retained as a
    // comment only.
    //

//    if (!IS_LEAD_BYTE(*pszString)) {
        return (STRCHR(szIllegalChars, *pszString) != NULL);
//    } else {
//
//        //
//        // We set the character after the double-byte character to the
//        // null character, to speed things up.
//        //
//
//        chTemp = pszString[2];
//        pszString[2] = TCHAR_EOS;
//        fRetVal = STRCSPN(pszString, szIllegalChars) == 0;
//        pszString[2] = chTemp;
//
//        return fRetVal;
//    }
}