%{
//--------------------------------------------------------------------
// Microsoft Monarch
//
// Copyright (c) Microsoft Corporation, 1997 - 1999.
//
// @doc OPTIONAL EXTRACTION CODES
//
// @module  ms-sql.l |
//          LEX tokenizer script
//
// @devnotes none
//
// @rev 0 | 04-Feb-97 | v-charca  | Created
//

/**
** NOTE : when adding a new token (XXX) modify the following:
**      1.) Add %token _XXX to sql.y
**      2.) Add lexeme pattern to sql.l stating whether the token returns a TOKEN
**          or a VALUE.  If the token returns a value a node will need to be created to
**          contain the value information. Therefore the VALUE macro will also need to
**          specify a valid VARIANT type for the value.
**/


#include "msidxtr.h"

// YYTRACE: in DEBUG builds, passes the current lexeme (yytext, yyleng) and the
// token number to LexerTrace; in all other builds it expands to nothing.  The
// DEBUG expansion already ends in a semicolon, which is why the macros below
// invoke YYTRACE(...) without one.
#ifdef DEBUG
# define YYTRACE(tknNum) LexerTrace(yytext, yyleng, tknNum);
#else
# define YYTRACE(tknNum)
#endif


// TOKEN: trace, then return a token that carries no value.  Expands to bare
// statements (no enclosing braces), so it must be used inside a braced action.
#define TOKEN(tknNum) YYTRACE(tknNum) return(tknNum);

// VALUE: trace, build the value node for tknNum via CreateTknValue(yylval, tknNum),
// then return tknNum.  Expands to a braced block.
#define VALUE(tknNum)                   \
    {                                   \
    YYTRACE(tknNum)                     \
    CreateTknValue(yylval, tknNum);     \
    return tknNum;                      \
    }

// STRING_VALUE: like VALUE, but also hands CreateTknValue the escape character
// (wch) and a flag (fQuote) saying whether surrounding quotes are to be stripped.
#define STRING_VALUE(tknNum, wch, fQuote)           \
    {                                               \
    YYTRACE(tknNum)                                 \
    CreateTknValue(yylval, tknNum, wch, fQuote);    \
    return tknNum;                                  \
    }

// ID_VALUE: builds the value node for tknNum (passing the escape character wch)
// but always returns _ID to the parser, whatever tknNum was.
#define ID_VALUE(tknNum, wch)               \
    {                                       \
    YYTRACE(tknNum)                         \
    CreateTknValue(yylval, tknNum, wch);    \
    return _ID;                             \
    }

/*
** Make Lex read from a block of data by replacing the generated YY_INPUT
** with a call to yybufferinput():
**    b  is the character buffer to fill,
**    r  is the variable that receives yybufferinput's return value
**       (the number of chars read),
**    ms is the size of the buffer
*/
#undef YY_INPUT
#define YY_INPUT(b, r, ms) (r = yybufferinput(b, ms))

//--------------------------------------------------------------------------------------------
// @func Makes a new copy of a UNICODE string in which every doubled occurrence
//       of the escape character (wch followed by wch) is replaced by a single
//       wch.  Pairs are consumed left to right, so a run of four escape
//       characters yields two, not one.  A lone wch is copied unchanged.
// @parm pwszOrig | NUL-terminated source string; it is not modified
// @parm wch      | Escape character whose doubled form is collapsed
// @side Allocates the copy with CoTaskMemAlloc; the caller owns it and
//       releases it with CoTaskMemFree
// @rdesc Pointer to new UNICODE string, or 0 if the allocation failed

LPWSTR PwszDupFilter(
    LPWSTR  pwszOrig,
    WCHAR   wch )
{
    // The filtered text is never longer than the source, so a buffer sized
    // from the source length always has room for the copy and its terminator.
    LPWSTR pwszCopy = (LPWSTR)CoTaskMemAlloc( (wcslen(pwszOrig)+2)*sizeof(WCHAR) );
    if ( 0 != pwszCopy )
    {
        LPWSTR pwsz = pwszCopy;
        while ( 0 != *pwszOrig )
        {
            // Escaped pair: step over the first half here and let the copy
            // below consume the second half.  Both halves are consumed in the
            // same iteration so the second can never pair up with whatever
            // follows it (which used to collapse "''''" down to "'").
            if ( wch == *pwszOrig && wch == *(pwszOrig+1) )
                pwszOrig++;

            *pwsz++ = *pwszOrig++;
        }
        *pwsz = L'\0';
    }

    return pwszCopy;
}

//--------------------------------------------------------------------------------------------
//      YYLEXER::CreateTknValue
//      Creates the QUERYTREE node for a value-bearing token and stores it in
//      *ppct, which is passed to the YACC value stack.  The node is built from
//      the current lexeme (yytext_ptr):
//
//        _ID, _TEMPVIEW      DBOP_table_name / WSTR node; text duplicated with
//                            CoTaskStrDup and upper-cased with _wcsupr.
//        _DELIMITED_ID       DBOP_table_name / WSTR node; the surrounding quotes
//                            are dropped and doubled wch collapsed.  Case is kept.
//        _URL, _STRING,      DBOP_scalar_constant node holding a VT_BSTR; when
//        _PREFIX_STRING      fQuote is set the surrounding quotes are dropped,
//                            then doubled wch is collapsed.
//        _INTNUM             DBOP_scalar_constant node; the text is stored as a
//                            BSTR and converted by PropVariantChangeTypeI64.
//        _REALNUM            DBOP_scalar_constant node; the text is stored as a
//                            BSTR and converted in place to VT_R8.
//
//      ppct    - receives the new node; set to null before any throw that
//                follows a successful node allocation
//      tknNum  - token being produced; selects the node layout above
//      wch     - escape character whose doubled form is collapsed
//                (used by _DELIMITED_ID and the string tokens only)
//      fQuote  - TRUE if a leading/trailing quote is to be stripped
//                (string tokens only)
//
//      Throws E_OUTOFMEMORY if any allocation fails, or the failing HRESULT
//      if a numeric conversion fails.  In both cases a node that was already
//      allocated is released with DeleteDBQT first.
//
void YYLEXER::CreateTknValue(
    YYSTYPE *ppct,
    short tknNum,
    YY_CHAR wch,
    BOOL fQuote )
{
    // Note that values containing variants can only be CONSTANTS or ID's
    // SHOULD BE DONE BY valType
    switch ( tknNum )
    {
        case _ID:
        case _TEMPVIEW:
            {
                // Assume table_name for now.  Might have to correct this when I
                // see the context in the parser.
                if ( 0 == (*ppct = PctAllocNode(DBVALUEKIND_WSTR)) )
                    throw(E_OUTOFMEMORY);

                (*ppct)->op    = DBOP_table_name;
                (*ppct)->wKind = DBVALUEKIND_WSTR;
                (*ppct)->value.pwszValue = CoTaskStrDup(yytext_ptr);
                if( 0 == (*ppct)->value.pwszValue )
                {
                    DeleteDBQT( *ppct );
                    *ppct = NULL;
                    throw(E_OUTOFMEMORY);
                }
                // Undelimited identifiers are normalized to upper case.
                _wcsupr((*ppct)->value.pwszValue);
                break;
            }

        case _DELIMITED_ID:
            {
                if ( 0 == (*ppct = PctAllocNode(DBVALUEKIND_WSTR)) )
                    throw(E_OUTOFMEMORY);

                (*ppct)->op    = DBOP_table_name;
                (*ppct)->wKind = DBVALUEKIND_WSTR;
                // Strip quotes on delimited identifier: overwrite the closing
                // quote with a terminator, then copy from just past the opening one.
                yytext_ptr[wcslen(yytext_ptr)-1] = L'\0';
                (*ppct)->value.pwszValue = PwszDupFilter(yytext_ptr+1, wch);
                if( 0 == (*ppct)->value.pwszValue )
                {
                    DeleteDBQT( *ppct );
                    *ppct = NULL;
                    throw(E_OUTOFMEMORY);
                }
                break;
            }

        case _URL:
        case _STRING:
        case _PREFIX_STRING:
            {
                // NOTE:  This is really a PROPVARIANT node, but there is no DBVALUEKIND for PROPVARIANT.
                if ( 0 == (*ppct = PctAllocNode(DBVALUEKIND_VARIANT, DBOP_scalar_constant)) )
                    throw(E_OUTOFMEMORY);

                LPWSTR pwsz = yytext_ptr;
                LPWSTR pwszCopy = 0;

                // Strip quotes on literals BEFORE collapsing escaped quotes.
                // When the delimiter is also the escape character, filtering
                // the delimited text would let a delimiter pair up with an
                // adjacent escaped quote (e.g. 'abc''' lost its final quote),
                // and an empty literal left nothing to strip afterwards.
                if ( fQuote && (*pwsz == L'\"' || *pwsz == L'\'') )
                {
                    size_t cwc = wcslen(pwsz);
                    Assert(cwc >= 2 && (pwsz[cwc-1] == L'\"' || pwsz[cwc-1] == L'\''));
                    if ( cwc >= 2 )
                    {
                        // Hide the closing quote only for the duration of the
                        // copy so the lexeme is left exactly as it was matched.
                        WCHAR wchClose = pwsz[cwc-1];
                        pwsz[cwc-1] = L'\0';
                        pwszCopy = PwszDupFilter(pwsz+1, wch);
                        pwsz[cwc-1] = wchClose;
                    }
                    else
                    {
                        // Only an opening quote is present; skip it.
                        pwszCopy = PwszDupFilter(pwsz+1, wch);
                    }
                }
                else
                {
                    pwszCopy = PwszDupFilter(pwsz, wch);
                }

                if ( 0 == pwszCopy )
                {
                    DeleteDBQT( *ppct );
                    *ppct = NULL;
                    throw(E_OUTOFMEMORY);
                }

                ((PROPVARIANT*)(*ppct)->value.pvValue)->bstrVal = SysAllocString( pwszCopy );
                CoTaskMemFree( pwszCopy );  // throw away temporary before testing for out of memory
                ((PROPVARIANT*)(*ppct)->value.pvValue)->vt = VT_BSTR;
                if( 0 == ((PROPVARIANT*)(*ppct)->value.pvValue)->bstrVal )
                {
                    DeleteDBQT( *ppct );
                    *ppct = 0;
                    throw(E_OUTOFMEMORY);
                }
            }
            break;

        case _INTNUM:
            if ( 0 == (*ppct = PctAllocNode(DBVALUEKIND_VARIANT, DBOP_scalar_constant)) )
                throw(E_OUTOFMEMORY);

            // Store the digits as a BSTR first; the conversion below works on the variant.
            ((PROPVARIANT*)(*ppct)->value.pvValue)->bstrVal = SysAllocString( yytext_ptr );
            ((PROPVARIANT*)(*ppct)->value.pvValue)->vt = VT_BSTR;
            if ( 0 == ((PROPVARIANT*)(*ppct)->value.pvValue)->bstrVal )
            {
                DeleteDBQT( *ppct );
                *ppct = 0;
                throw(E_OUTOFMEMORY);
            }

            (*ppct)->hrError = PropVariantChangeTypeI64( (PROPVARIANT*)(*ppct)->value.pvValue );
            if ( FAILED((*ppct)->hrError) )
            {
                // Save the HRESULT before the node that holds it is released.
                HRESULT hr = (*ppct)->hrError;
                DeleteDBQT( *ppct );
                *ppct = 0;
                throw(hr);
            }
            break;

        case _REALNUM:
            if ( 0 == (*ppct = PctAllocNode(DBVALUEKIND_VARIANT, DBOP_scalar_constant)) )
                throw(E_OUTOFMEMORY);

            // Store the text as a BSTR first; the conversion below works on the variant.
            ((PROPVARIANT*)(*ppct)->value.pvValue)->bstrVal = SysAllocString( yytext_ptr );
            ((PROPVARIANT*)(*ppct)->value.pvValue)->vt = VT_BSTR;
            if ( 0 == ((PROPVARIANT*)(*ppct)->value.pvValue)->bstrVal )
            {
                DeleteDBQT( *ppct );
                *ppct = NULL;
                throw(E_OUTOFMEMORY);
            }

            (*ppct)->hrError = VariantChangeTypeEx( (*ppct)->value.pvarValue,  // convert in place
                                                    (*ppct)->value.pvarValue,
                                                    LOCALE_SYSTEM_DEFAULT,
                                                    0,
                                                    VT_R8 );
            if ( FAILED((*ppct)->hrError) )
            {
                // Save the HRESULT before the node that holds it is released.
                HRESULT hr = (*ppct)->hrError;
                DeleteDBQT( *ppct );
                *ppct = 0;
                throw(hr);
            }
            break;

        default:
            // Not a value-bearing token: *ppct is left untouched.
            Assert( !"Unkown token value" );
    }
}


%}
 /* Exclusive start conditions used by the rules below:                      */
 /*   contains  - entered by the CONTAINS keyword                            */
 /*   cntntsrch - entered from contains on the opening single quote          */
 /*   scope0    - entered by SCOPE, ..SCOPE and ...SCOPE                     */
 /*   scope1    - entered from scope0, and re-entered from scope2 on ')'     */
 /*   scope2    - entered from scope1 on '('                                 */
 /*   view      - entered by '..' and '...'                                  */
%x  contains
%x  cntntsrch
%x  scope0
%x  scope1
%x  scope2
%x  view

 /* white         - one or more blanks, tabs, newlines, form feeds, CRs      */
 /* id            - a letter followed by letters, digits or underscores      */
 /* simpleterm    - any mix of '' pairs and runs of characters that are not  */
 /*                 whitespace or one of  ' ( ) [ ] & | ~ ! ,                */
 /* br_id         - body of a double-quoted item: "" pairs and runs of       */
 /*                 characters other than a double quote or newline          */
 /* integer       - optionally signed decimal digits, or optionally signed   */
 /*                 0x followed by hex digits                                */
 /* real          - optionally signed digits with optional fraction and      */
 /*                 exponent, or a fraction starting with '.'                */
 /*                 NOTE(review): the leading-dot form accepts only '-' as a */
 /*                 sign while the other form accepts '-' or '+'; confirm    */
 /*                 whether that difference is intended.                     */
 /* quoted_string - single-quoted text on one line; '' stands for a quote    */
 /* string        - single-quoted text on one line with no embedded quote    */
 /*                 NOTE(review): no rule visible in this file refers to it. */
 /* comment       - two dashes through the end of the line                   */
white           [ \t\n\f\r]+
id              [a-zA-Z][a-zA-Z0-9_]*
simpleterm      ([^ \n\t\f\r\'\(\)\[\]\&\|\~\!\,]+|\'\')*
br_id           ([^\"\n]*|\"\")*
integer         [-+]?[0-9]+|[-+]?0x[a-fA-F0-9]+
real            [-+]?([0-9]+(\.[0-9]*)?)([eE][-+]?[0-9]+)?|-?\.[0-9]+([eE][-+]?[0-9]+)?
quoted_string   \'([^'\n]*|\'\')*\'
string          \'[^'\n]*\'
comment         --[^\n]*
%%

%{
/***
 *** Reserved words.  These rules have no start-condition prefix; because every
 *** start condition declared in this file is exclusive (%x), they are presumably
 *** active only in the INITIAL state rather than in every context
 *** (NOTE(review): confirm against the lex tool in use).
 ***
 *** Each keyword returns its own token with no value attached.  Two of them also
 *** change state: CONTAINS enters the contains state and SCOPE enters scope0.
 *** Multi-word forms (IS NOT, NOT LIKE, ORDER BY) are matched as a single token
 *** with any amount of {white} between the words.
 ***/
%}
ALL                             { TOKEN(_ALL);                      }
AND                             { TOKEN(_AND);                      }
ANY                             { TOKEN(_ANY);                      }
ARRAY                           { TOKEN(_ARRAY);                    }
AS                              { TOKEN(_AS);                       }
ASC                             { TOKEN(_ASC);                      }
CAST                            { TOKEN(_CAST);                     }
CREATE                          { TOKEN(_CREATE);                   }
CONTAINS                        { BEGIN contains;TOKEN(_CONTAINS);  }
DESC                            { TOKEN(_DESC);                     }
DROP                            { TOKEN(_DROP);                     }
FALSE                           { TOKEN(_FALSE);                    }
FREETEXT                        { TOKEN(_FREETEXT);                 }
FROM                            { TOKEN(_FROM);                     }
IS                              { TOKEN(_IS);                       }
IS{white}NOT                    { TOKEN(_IS_NOT);                   }
LIKE                            { TOKEN(_LIKE);                     }
MATCHES                         { TOKEN(_MATCHES);                  }
NOT                             { TOKEN(_NOT);                      }
NOT{white}LIKE                  { TOKEN(_NOT_LIKE);                 }
NULL                            { TOKEN(_NULL);                     }
OR                              { TOKEN(_OR);                       }
ORDER{white}BY                  { TOKEN(_ORDER_BY);                 }
PASSTHROUGH                     { TOKEN(_PASSTHROUGH);              }
PROPERTYNAME                    { TOKEN(_PROPERTYNAME);             }
PROPID                          { TOKEN(_PROPID);                   }
RANKMETHOD                      { TOKEN(_RANKMETHOD);               }
SCOPE                           { BEGIN scope0; TOKEN(_SCOPE);      }
SELECT                          { TOKEN(_SELECT);                   }
SET                             { TOKEN(_SET);                      }
SOME                            { TOKEN(_SOME);                     }
TABLE                           { TOKEN(_TABLE);                    }
TRUE                            { TOKEN(_TRUE);                     }
TYPE                            { TOKEN(_TYPE);                     }
UNION                           { TOKEN(_UNION);                    }
UNKNOWN                         { TOKEN(_UNKNOWN);                  }
VIEW                            { TOKEN(_VIEW);                     }
WHERE                           { TOKEN(_WHERE);                    }

{white}                         { /* whitespace between tokens is discarded */ }
{id}                            { VALUE(_ID);                       /* listed after the keywords, so a keyword wins a same-length match */ }
\#{id}                          { VALUE(_TEMPVIEW);                 /* one leading # */ }
\#\#{id}                        { VALUE(_TEMPVIEW);                 /* two leading #; same token as above */ }
\"{br_id}\"                     { ID_VALUE(_DELIMITED_ID, L'"');        /* double-quoted identifier; handed to the parser as _ID */ }
{quoted_string}                 { STRING_VALUE(_STRING, L'\'', TRUE);/* single-quoted literal; quotes are stripped */ }
{integer}                       { VALUE(_INTNUM);                   /* listed before {real}, so plain digits become _INTNUM */ }
{real}                          { VALUE(_REALNUM);                  }
{comment}                       { /* -- comments are discarded through end of line */ }

%{
/***
 *** Comparison operators and dot punctuation (no start-condition prefix).
 ***    >=  <=              - _GE, _LE
 ***    <>  !=              - both spellings return _NE
 ***    .                   - _DOT
 ***    ..  ...             - _DOTDOT / _DOTDOTDOT.  Also switch to the view state.
 ***    ..SCOPE  ...SCOPE   - _DOTDOT_SCOPE / _DOTDOTDOT_SCOPE.  Also switch to scope0.
 ***                          These are longer matches than .. or ... alone, so they
 ***                          win when SCOPE follows the dots directly.
 ***    anything else       - matched by .  and returned as its own character code.
 ***/
%}
\>\=                            { TOKEN(_GE);                       }
\<\=                            { TOKEN(_LE);                       }
\<\>                            { TOKEN(_NE);                       }
\!\=                            { TOKEN(_NE);                       }
\.                              { TOKEN(_DOT);                      }
\.\.                            { BEGIN view; TOKEN(_DOTDOT);       }
\.\.\.                          { BEGIN view; TOKEN(_DOTDOTDOT);    }
\.\.SCOPE                       { BEGIN scope0; TOKEN(_DOTDOT_SCOPE);}
\.\.\.SCOPE                     { BEGIN scope0; TOKEN(_DOTDOTDOT_SCOPE);}
.                               { YYTRACE(yytext[0]); return yytext[0]; }

%{
/***
 *** A <contains predicate> has been started (the CONTAINS keyword switched us to
 *** the contains state).  The only things we should see are:
 ***    (                   - matched by .
 ***    <column reference>  - matched by {id} or "{br_id}"; both reach the parser as _ID
 ***    ,                   - matched by .
 ***    '                   - matched by \'.  Also switch to content search state (cntntsrch).
 *** Whitespace is discarded.  Any other single character is returned as its own
 *** character code by the final . rule.
 ***/
%}
<contains>\'                    { BEGIN cntntsrch;YYTRACE(yytext[0]); return yytext[0];}
<contains>{id}                  { VALUE(_ID);                       }
<contains>\"{br_id}\"           { ID_VALUE(_DELIMITED_ID, L'"');        }
<contains>{white}               { /* empty lex rule */              }
<contains>.                     { YYTRACE(yytext[0]); return yytext[0];}

%{
/***
 *** View state, entered after .. or ... has been returned.
 *** The only things we should see are:
 ***    <global view name>  - matched by {id}
 ***    _TEMPVIEW           - matched by \#{id} or \#\#{id}
 *** Every rule here switches back to the INITIAL state before returning its value.
 ***
 *** NOTE(review): this exclusive state has no {white} rule and no catch-all . rule,
 *** so input that is not one of the forms above is left to the lexer's default
 *** handling -- confirm that is acceptable.
 ***/
%}
<view>{id}                      { BEGIN INITIAL; VALUE(_ID);        }
<view>\#{id}                    { BEGIN INITIAL; VALUE(_TEMPVIEW);  }
<view>\#\#{id}                  { BEGIN INITIAL; VALUE(_TEMPVIEW);  }

%{
/***
 *** A <content search condition> has been started.  There are several keywords we can see here.
 *** We are also looking for a quoted string, a prefix string, or a simple term.  We are taken
 *** back to the initial state by a single quote (').
 ***
 ***    AND, NOT, OR        - recognized only with {white} on both sides.  One blank is pushed
 ***                          back with unput, presumably so that a following rule which starts
 ***                          with {white} (such as NEAR) can still match.
 ***    NEAR                - needs leading {white} only.
 ***    COERCE, ISABOUT,    - recognized without surrounding whitespace.
 ***    FORMSOF, WEIGHT
 ***    "...*"              - _PREFIX_STRING: double-quoted text whose last character is *.
 ***    "..."               - _STRING: double-quoted phrase.
 ***    {simpleterm}        - _STRING taken as-is (fQuote is FALSE, so no quotes are stripped).
 ***    anything else       - matched by .  and returned as its own character code.
 *** In all three string forms the escape character handed on is the single quote.
 ***/
%}
<cntntsrch>{white}AND{white}    { unput(L' '); TOKEN(_AND);         }
<cntntsrch>COERCE               { TOKEN(_COERCE);                   }
<cntntsrch>ISABOUT              { TOKEN(_ISABOUT);                  }
<cntntsrch>{white}NEAR          { TOKEN(_NEAR);                     }
<cntntsrch>{white}NOT{white}    { unput(L' '); TOKEN(_NOT);         }
<cntntsrch>{white}OR{white}     { unput(L' '); TOKEN(_OR);          }
<cntntsrch>FORMSOF              { TOKEN(_FORMSOF);                  }
<cntntsrch>WEIGHT               { TOKEN(_WEIGHT);                   }
<cntntsrch>\"{br_id}\*\"        { STRING_VALUE(_PREFIX_STRING, L'\'', TRUE);}
<cntntsrch>\"{br_id}\"          { STRING_VALUE(_STRING, L'\'', TRUE);}
<cntntsrch>\'                   { BEGIN INITIAL; YYTRACE(yytext[0]); return yytext[0];}
<cntntsrch>{white}              { /* empty lex rule */              }
<cntntsrch>{simpleterm}         { STRING_VALUE(_STRING, L'\'', FALSE)}
<cntntsrch>.                    { YYTRACE(yytext[0]); return yytext[0];}

%{
/***
 *** A <from clause> has been started.  We've already seen the keyword SCOPE, so this
 *** is not a FROM <view name>.  We're just looking for a ( now to put us into the
 *** next state (scope1).
 ***    (                   - matched by \(.  Also switch to scope1 state.
 ***    whitespace          - discarded; we stay in scope0.
 ***    anything else       - matched by .  and returned as its own character code.
 ***                          Note that this ALSO switches to scope1, so scope0 never
 ***                          consumes more than one non-blank character.
 ***/
%}
<scope0>\(                      { BEGIN scope1; YYTRACE(yytext[0]); return yytext[0];}
<scope0>{white}                 { /* empty lex rule */              }
<scope0>.                       { BEGIN scope1; YYTRACE(yytext[0]); return yytext[0];}

%{
/***
 *** We're in the middle of a <from clause>.  We've seen FROM SCOPE(, so now we need to recognize
 *** the various scope definitions that we might see here.  The two important things to recognize
 *** are:
 ***    (                   - matched by \(.  Also switch to scope2 state to match parens.
 ***    )                   - matched by \).  Also switch to the initial (finished <from clause>).
 *** Also recognized here:
 ***    "..."               - _URL; the quotes are stripped and "" stands for one double quote.
 ***    ALL, OF             - keyword tokens.
 ***    DEEP TRAVERSAL, SHALLOW TRAVERSAL, EXCLUDE SEARCH TRAVERSAL
 ***                        - each matched as a single token, with {white} between the words.
 ***    whitespace          - discarded.
 ***    anything else       - matched by .  and returned as its own character code.
 ***/
%}
<scope1>\"{br_id}\"             { STRING_VALUE(_URL, L'"', TRUE);   }
<scope1>ALL                     { TOKEN(_ALL);                      }
<scope1>DEEP{white}TRAVERSAL    { TOKEN(_DEEP_TRAVERSAL);           }
<scope1>EXCLUDE{white}SEARCH{white}TRAVERSAL { TOKEN(_EXCLUDE_SEARCH_TRAVERSAL);}
<scope1>OF                      { TOKEN(_OF);                       }
<scope1>SHALLOW{white}TRAVERSAL { TOKEN(_SHALLOW_TRAVERSAL);        }
<scope1>{white}                 { /* empty lex rule */              }
<scope1>\(                      { BEGIN scope2; YYTRACE(yytext[0]); return yytext[0];}
<scope1>\)                      { BEGIN INITIAL; YYTRACE(yytext[0]); return yytext[0];}
<scope1>.                       { YYTRACE(yytext[0]); return yytext[0];}

%{
/***
 *** We're still in the middle of a <from clause>.  So far we've seen:
 ***    FROM SCOPE( ... (
 *** We need to find a ')' to finish out the element we're working on:
 ***    )                   - matched by \).  Also switch back to scope1 state.
 *** Until then:
 ***    "..."               - _URL, handled exactly as in scope1.
 ***    whitespace          - discarded.
 ***    anything else       - matched by .  and returned as its own character code.
 *** There is no \( rule here, so a further ( is just another character for the . rule
 *** and does not change state.
 ***/
%}
<scope2>{white}                 { /* empty lex rule */              }
<scope2>\"{br_id}\"             { STRING_VALUE(_URL, L'"', TRUE);  }
<scope2>\)                      { BEGIN scope1; YYTRACE(yytext[0]); return yytext[0];}
<scope2>.                       { YYTRACE(yytext[0]); return yytext[0];}

%%