windows-xp/Source/XPSP1/NT/inetsrv/intlwb/cht/ctplus0.c

//+--------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1995 - 1999.
//
//  File:       ctplus.c
//
//  Contents:   Contains character type (orthography) data and routine
//                    to get at it.
//
//  History:    23-May-96   pathal      Created.
//
//---------------------------------------------------------------------------

#include "precomp.h"
#include <winnls.h>

//----------------------------------------------------------------------------
//  s_abBreakList
//
//  This array starts at -1, so that EOF can be found in the array.  It
//  depends on (EOF == -1) being true.  Also, all references to it must be
//  of the form (s_abCharTypeList+1)[x]
//
//  000
//  EOF
//
//  001-080
//  The lower 7F entries from the ASCII Code Page (0000-00ff) are mapped in place
//  (ex. UNICODE 0009 (HT) == 009)
//      The word characters are: $,0-9,A-Z,_,a-z
//      The word separators are: bs,tab,lf,vtab,cr,spc,
//                               ",#,%,&,',(,),*,+,comma,-,/,
//                               :,;,<,=,>,@,[,],`
//      The phrase seperators are: !,.,?,\,^,{,|,},~
//
//  NOTE: Symbols are treated as WS or PS.
//
//  081-0FF
//  The lower 7E entries from the Half Width Variant Code Page (FF00-FF7F) are
//  mapped to 081-0FF.
//
//  100-1FF
//  The lower FF entries from the General Punctuation Code Page (2000-2044) are
//  mapped to 100-1ff.
//
//  200-2FF
//  The lower FF entries from the CJK Auxiliary Code Page (3000-30FF) are mapped
//  to 200-2ff.
//
// pathal - 5/20/96
// Special default character processing for selection
// The following is a list of white space characters that T-Hammer will not right select on:
//          0x0009 (tab), 0x0020 (ansi space), 0x2005 (narrow space, 0x3000 (wide space)
// (Note: see AnalyzeHPBs for special end SPB processing of adjacent white space)
// The following is a list of nls characters to be treated as text by T-Hammer:
//      (in other words T-Hammer will neither right nor left-select on them):
//          0x001F (non-required hyphen), 0x0027 (single quote), 0x2019 (right quote),
//          0x200C (non-width optional break), 0x200D (non-width no break)
//----------------------------------------------------------------------------

const BYTE
s_abCharTypeList[0x301] =
    {
        (BYTE) -1,                                       // EOF (-1)
        PS,PS,PS,PS,PS,PS,PS,PS, WS,WS,WS,WS,PS,WS,PS,PS, // 000 - 015
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,CH,PS, // 016 - 031
        WS,PS,WS,WS,CH,CH,WS,PS,                          // sp ! " # $ % & '
        WS,WS,WS,WS,WS,WS,PS,WS,                          //  ( ) * + , - . /
        CH,CH,CH,CH,CH,CH,CH,CH,                          //  0 1 2 3 4 5 6 7
        CH,CH,WS,WS,WS,WS,WS,PS,                          //  8 9 : ; < = > ?
        WS,CH,CH,CH,CH,CH,CH,CH,                          //  @ A B C D E F G
        CH,CH,CH,CH,CH,CH,CH,CH,                          //  H I J K M L N O
        CH,CH,CH,CH,CH,CH,CH,CH,                          //  P Q R S T U V Y
        CH,CH,CH,WS,PS,WS,PS,CH,                          //  X Y Z [ \ ] ^ _
        WS,CH,CH,CH,CH,CH,CH,CH,                          //  ` a b c d e f g
        CH,CH,CH,CH,CH,CH,CH,CH,                          //  h i j k m l n o
        CH,CH,CH,CH,CH,CH,CH,CH,                          //  p q r s t u v y
        CH,CH,CH,PS,PS,PS,CH,PS,                          //  x y z { | } ~ del
        WS,PS,WS,WS,CH,CH,WS,WS,                          // FF00-FF07 (sp ! " # $ % & ')
        WS,WS,WS,WS,WS,WS,PS,WS,                          //  ( ) * + , - . /
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  0 1 2 3 4 5 6 7
        VC,VC,WS,WS,WS,WS,WS,PS,                          //  8 9 : ; < = > ?
        WS,VC,VC,VC,VC,VC,VC,VC,                          //  @ A B C D E F G
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  H I J K M L N O
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  P Q R S T U V Y
        VC,VC,VC,WS,VC,WS,PS,VC,                          //  X Y Z [ \ ] ^ _
        WS,VC,VC,VC,VC,VC,VC,VC,                          //  ` a b c d e f g
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  h i j k m l n o
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  p q r s t u v y
        VC,VC,VC,PS,PS,PS,VC,PS,                          //  x y z { | } ~ del
        VC,PS,WS,WS,WS,WS,VC,VC,                          //  FF60-FF67
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  FF68-FF6F
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  FF70-FF77
        VC,VC,VC,VC,VC,VC,VC,VC,                          //  FF70-FF7E
        WS,WS,WS,WS,WS,WS,WS,WS,                          // 2000-2007
       WS,WS,WS,WS,CH,CH,WS,WS,                          //  2008-200F
        WS,CH,WS,WS,WS,KC,PS,WS,                          //  2010-2017
        WS,CH,WS,WS,WS,WS,WS,WS,                          //  2018-201F
        WS,WS,PS,PS,PS,PS,PS,CH,                          //  2020-2027
        PS,PS,CH,CH,CH,CH,CH,PS,                          //  2028-202F
        WS,WS,WS,WS,WS,WS,WS,WS,                          //  2030-2037
        WS,WS,WS,WS,WS,WS,WS,WS,                          //  2038-203F
        WS,WS,WS,PS,WS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  2040-204F
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  2050-205F
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  2060-206F
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  2070-207F
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  2080-208F
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  2090-209F
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  20A0-20AF
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  20B0-20BF
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  20C0-20CF
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  20D0-20DF
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  20E0-20EF
        PS,PS,PS,PS,PS,PS,PS,PS, PS,PS,PS,PS,PS,PS,PS,PS, //  20F0-20FF
        WS,WS,PS,HC,HC,IC,IC,HC,                          // 3000-3007
        WS,WS,WS,WS,WS,WS,WS,WS,                          //  3008-300F
        WS,WS,WS,WS,WS,WS,WS,WS,                          //  3010-3017
        WS,WS,WS,WS,WS,WS,WS,WS,                          //  3018-301F
        HC,HC,HC,HC,HC,HC,HC,HC,                          //  3020-3027
        HC,HC,HC,HC,HC,HC,HC,HC,                          //  3028-302F
        WS,HC,IC,HC,IC,HC,HC,HC,                          //  3030-3037
        PS,PS,PS,PS,PS,PS,PS,WS,                          //  3038-303F
        WS,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, //  3040-304F
        HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, //  3050-305F
        HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, //  3060-306F
        HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, //  3070-307F
        HC,HC,HC,HC,HC,HC,HC,HC, HC,HC,HC,HC,HC,HC,HC,HC, //  3080-308F
        HC,HC,HC,HC,HC,PS,PS,PS,                          //  3090-3097
        PS,HC,HC,WS,WS,HC,HC,PS,                          //  3098-309F
        WS,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, //  30A0-30AF
        KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, //  30B0-30BF
        KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, //  30C0-30CF
        KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, //  30D0-30DF
        KC,KC,KC,KC,KC,KC,KC,KC, KC,KC,KC,KC,KC,KC,KC,KC, //  30E0-30EF
        KC,KC,KC,KC,KC,KC,IC,PS,                          //  30F0-30F7
        PS,PS,PS,WS,KC,KC,KC,PS,                          //  30F8-30FF
    };

//
// Type C1 bits are:
//
//   C1_UPPER                  0x0001      // upper case
//   C1_LOWER                  0x0002      // lower case
//   C1_DIGIT                  0x0004      // decimal digits             1
//   C1_SPACE                  0x0008      // spacing characters         2
//   C1_PUNCT                  0x0010      // punctuation characters     4
//   C1_CNTRL                  0x0020      // control characters         8
//   C1_BLANK                  0x0040      // blank characters          10
//   C1_XDIGIT                 0x0080      // other digits              20
//   C1_ALPHA                  0x0100      // any linguistic character  40
//
// But since I don't care about C1_UPPER and C1_LOWER I can right-shift
// the output of GetStringTypeEx and keep a 128 Byte lookup table.
//
// The precedence rules are: (Alpha, XDigit, Digit) --> CH
//                           (Punct) --> PS
//                           (Space, Blank, Control) --> WS
//

const BYTE
s_abCTypeList[128] =
    {
      WS, CH, WS, CH, PS, CH, WS, CH,   // 00 - 07
      WS, CH, WS, CH, PS, CH, WS, CH,   // 08 - 0F
      WS, CH, WS, CH, PS, CH, WS, CH,   // 10 - 17
      WS, CH, WS, CH, PS, CH, WS, CH,   // 18 - 1F
      CH, CH, CH, CH, CH, CH, CH, CH,   // 20 - 27
      CH, CH, CH, CH, CH, CH, CH, CH,   // 20 - 27
      CH, CH, CH, CH, CH, CH, CH, CH,   // 30 - 37
      CH, CH, CH, CH, CH, CH, CH, CH,   // 30 - 37
      CH, CH, CH, CH, CH, CH, CH, CH,   // 40 - 47
      CH, CH, CH, CH, CH, CH, CH, CH,   // 48 - 4F
      CH, CH, CH, CH, CH, CH, CH, CH,   // 50 - 57
      CH, CH, CH, CH, CH, CH, CH, CH,   // 58 - 5F
      CH, CH, CH, CH, CH, CH, CH, CH,   // 60 - 67
      CH, CH, CH, CH, CH, CH, CH, CH,   // 68 - 6F
      CH, CH, CH, CH, CH, CH, CH, CH,   // 70 - 77
      CH, CH, CH, CH, CH, CH, CH, CH,   // 78 - 7F
    };

//+---------------------------------------------------------------------------
//  Member:     WBREAKJ::TypeOf
//
//  Synopsis:   Returns the type of a character
//
//  Arguments:  [c]   -- Unicode Character
//
//  Returns:    type, one of CH, WS, PS, EOF
//
//  History:    08-Apr-94   PatHal      Adapted for Japanese WB
//
//  Notes:      This returns the type of a character, using the static
//              array s_abCharTypeList.  It adds 1 so that EOF (-1) can be in
//              the array, and accessed normally.
//
//              This is not done by overloading the [] opeator, because in
//              future versions it will not necessarly be a table lookup.
//
//  See above (typeof comments) for an explanation of the mapping
//
//----------------------------------------------------------------------------
BYTE
GetCharType(WCHAR wc )
{
    WCHAR wc2;

    // Map interesting stuff (0000, 2000, 3000, FF00) to the table range,
    // 0x0000 - 0x0300.
    //
    wc2 = (wc & 0x00FF);

    switch (wc & 0xFF00) {

        case 0xFF00:  // Half-Width Variants
            if (wc2 & 0x80) {
                return(VC);  // including Hangul
            }
            wc2 |=  0x0080;
            break;

        case 0xFE00:  // Small Variants
            if ((wc2 <= 0x006B) && (wc2 != 0x0069)) {
                return(WS);
            }
            // Treat Small $ and arabic symbols as CH
            return(CH);
            // break;

        case 0x3000:  // CJK Auxiliary
            wc2 |=  0x0200;
            break;

        case 0x2000:  // General Punctuation
            wc2 |=  0x0100;
            break;

        case 0x0000:  // Code page 0
            // Use System NLS map for code page 0
            if (wc2 & 0x80)
            {
                WORD wCharType = 0;

                GetStringTypeEx( MAKELANGID( LANG_CHINESE, SUBLANG_CHINESE_TRADITIONAL ),
                                 CT_CTYPE1,
                                 &wc2,
                                 1,
                                 &wCharType );
                return s_abCTypeList[wCharType >> 2];
            }
            break;

        default:
            //
            // Treat the whole CJK Range as Kanji
            //
            if ((wc >= 0x4E00) && (wc <= 0x9FFF)) {
                return(IC);
            }

            //
            // Treat All Gaiji as Kanji Char, too
            //
            if ((wc >= 0xE000) && (wc < 0xE758)) {
                return(IC);
            }

            //
            // Treat all CJK symbols as word separators
            // NOTE: This means that the stemmer must be smart about searching
            // for zipcodes when given one with a preceding zipcode char.
            //
            if ((wc >= 0x3200) && (wc <= 0x33DD)) {
                return(WS);
            }

            // If it's not interesting return CH as default;
            return(CH);
            // break;
    }

    return( (s_abCharTypeList+1)[wc2] );
}