// IWBreak.cpp // // CWordBreak implementation // // Copyright 2000 Microsoft Corp. // // Modification History: // 18 APR 2000 bhshin added WordBreak destructor // 30 MAR 2000 bhshin created #include "StdAfx.h" #include "KorWbrk.h" #include "IWBreak.h" #include "Lex.h" #include "Token.h" #include "Record.h" #include "Analyze.h" #include "IndexRec.h" #include "unikor.h" #include "Morpho.h" extern CRITICAL_SECTION g_CritSect; extern MAPFILE g_LexMap; extern BOOL g_fLoaded; ///////////////////////////////////////////////////////////////////////////// // CWordBreaker member functions // CWordBreaker::Init // // intialize WordBreaker object & lexicon // // Parameters: // fQuery -> (BOOL) query time flag // ulMaxTokenSize -> (ULONG) maximum input token length // *pfLicense <- (BOOL*) always return TRUE // // Result: // (HRESULT) // // 30MAR00 bhshin began STDMETHODIMP CWordBreaker::Init(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense) { if (pfLicense == NULL) return E_INVALIDARG; if (IsBadWritePtr(pfLicense, sizeof(DWORD))) return E_INVALIDARG; // store intitializing information m_fQuery = fQuery; m_ulMaxTokenSize = ulMaxTokenSize; *pfLicense = TRUE; if (!g_fLoaded) { // load lexicon file ATLTRACE(L"Load lexicon...\r\n"); if (!InitLexicon(&g_LexMap)) return LANGUAGE_E_DATABASE_NOT_FOUND; g_fLoaded = TRUE; } m_PI.lexicon = g_LexMap; WB_LOG_PRINT_HEADER(fQuery); return S_OK; } // CWordBreaker::BreakText // // main word breaking method // // Parameters: // pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text // pWordSink -> (IWordSink*) pointer to the word sink // pPhraseSink -> (IPhraseSink*) pointer to the phrase sink // // Result: // (HRESULT) // // 30MAR00 bhshin began STDMETHODIMP CWordBreaker::BreakText(TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink) { WT Type; int cchTextProcessed, cchProcessed, cchHanguel; WCHAR wchLast = L'\0'; if (pTextSource == NULL) return E_INVALIDARG; if (pWordSink == NULL) return S_OK; if (pTextSource->iCur == pTextSource->iEnd) return S_OK; ATLASSERT(pTextSource->iCur < pTextSource->iEnd); do { while (pTextSource->iCur < pTextSource->iEnd) { Tokenize(TRUE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel); if (Type == WT_REACHEND) break; cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast); if (cchProcessed < 0) return E_UNEXPECTED; pTextSource->iCur += cchProcessed; } } while (SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource))); while ( pTextSource->iCur < pTextSource->iEnd ) { Tokenize(FALSE, pTextSource, pTextSource->iCur, &Type, &cchTextProcessed, &cchHanguel); cchProcessed = WordBreak(pTextSource, Type, cchTextProcessed, cchHanguel, pWordSink, pPhraseSink, &wchLast); if (cchProcessed < 0) return E_UNEXPECTED; pTextSource->iCur += cchProcessed; } return S_OK; } // CWordBreaker::ComposePhrase // // convert a noun and modifier back into a source phrase (NOT USED) // // Parameters: // pwcNoun -> (const WCHAR*) input noun // cwcNoun -> (ULONG) length of input noun // pwcModifier -> (const WCHAR *) input modifier // cwcModifier -> (ULONG) length of input modifier // ulAttachmentType -> (ULONG) value about the method of composition // pwcPhrase -> (WCHAR *) pointer to the returned buffer // pcwcPhrase -> (ULONG *) length of returned string // // Result: // (HRESULT) // // 30MAR00 bhshin began STDMETHODIMP CWordBreaker::ComposePhrase(const WCHAR *pwcNoun, ULONG cwcNoun, const WCHAR *pwcModifier, ULONG cwcModifier, ULONG ulAttachmentType, WCHAR *pwcPhrase, ULONG *pcwcPhrase) { if (m_fQuery) return E_NOTIMPL; return WBREAK_E_QUERY_ONLY; } // CWordBreaker::GetLicenseToUse // // return license information // // Parameters: // ppwcsLicense -> (const WCHAR **) output pointer to the license information // // Result: // (HRESULT) // // 30MAR00 bhshin began STDMETHODIMP CWordBreaker::GetLicenseToUse(const WCHAR ** ppwcsLicense) { static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-2000"; if (ppwcsLicense == NULL) return E_INVALIDARG; if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) return E_INVALIDARG; *ppwcsLicense = wcsCopyright; return S_OK; } // CWordBreaker::WordBreak // // main hangul word breaking operator // // Parameters: // pTextSource -> (TEXT_SOURCE*) pointer to the structure of source text // Type -> (WT) word token type // cchTextProcessed -> (int) input length to process // cchHanguel -> (int) hangul token length (hanguel+romaji case only) // pWordSink -> (IWordSink*) pointer to the word sink // pPhraseSink -> (IPhraseSink*) pointer to the phrase sink // pwchLast -> (WCHAR*) input & output last character of previous token // // Result: // (int) -1 if error occurs, text length to process // // 30MAR00 bhshin began int CWordBreaker::WordBreak(TEXT_SOURCE *pTextSource, WT Type, int cchTextProcessed, int cchHanguel, IWordSink *pWordSink, IPhraseSink *pPhraseSink, WCHAR *pwchLast) { const WCHAR *pwcStem; int iCur; int cchToken, cchProcessed, cchHg; int cchPrefix; ATLASSERT(cchTextProcessed > 0); if (cchTextProcessed <= 0) return -1; iCur = pTextSource->iCur; pwcStem = pTextSource->awcBuffer + iCur; cchProcessed = cchTextProcessed; cchToken = cchTextProcessed; // check too long token if (cchToken > (int)m_ulMaxTokenSize || cchToken > MAX_INDEX_STRING) { cchProcessed = (m_ulMaxTokenSize < MAX_INDEX_STRING) ? m_ulMaxTokenSize : MAX_INDEX_STRING; pWordSink->PutWord(cchProcessed, pwcStem, cchProcessed, pTextSource->iCur); return cchProcessed; } //================================================= // query & index time //================================================= if (Type == WT_PHRASE_SEP) { // phrase separator *pwchLast = L'\0'; pWordSink->PutBreak(WORDREP_BREAK_EOS); } else if (Type == WT_WORD_SEP) { if (!fIsWhiteSpace(*pwcStem)) *pwchLast = L'\0'; // Korean WB do not add EOW. } else if (Type == WT_ROMAJI) { // symbol, alphabet, hanja, romaji + hanguel // get next token iCur += cchToken; Tokenize(FALSE, pTextSource, iCur, &Type, &cchToken, &cchHg); if (Type == WT_ROMAJI) { if (cchHg > 0) { // romaji+(hanguel+romaji) case -> put word itself cchProcessed += cchToken; iCur += cchToken; cchProcessed += GetWordPhrase(FALSE, pTextSource, iCur); WB_LOG_START(pwcStem, cchProcessed); pWordSink->PutWord(cchProcessed, pwcStem, cchProcessed, pTextSource->iCur); WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_SYMBOL); } else { WB_LOG_START(pwcStem, cchProcessed); // {romaj}{romaj} case : -> breaking first {romaji} CIndexInfo IndexInfo; if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink)) goto ErrorReturn; AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed, cchHanguel, &IndexInfo, &cchPrefix); if (m_fQuery) { IndexInfo.AddIndex(pwcStem, cchProcessed+cchToken, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1); WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY); if (!IndexInfo.PutQueryIndexList()) goto ErrorReturn; } else { if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur)) goto ErrorReturn; } } } else if (Type == WT_HANGUEL) { // romaji(hanguel+romaji) + hanguel case WCHAR wzRomaji[MAX_INDEX_STRING+1]; int cchRomaji; cchRomaji = (cchProcessed > MAX_INDEX_STRING) ? MAX_INDEX_STRING : cchProcessed; wcsncpy(wzRomaji, pwcStem, cchRomaji); wzRomaji[cchRomaji] = L'\0'; WB_LOG_START(pwcStem, cchProcessed+cchToken); cchProcessed += cchToken; // start position include romanji CIndexInfo IndexInfo; if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink)) goto ErrorReturn; if (cchHanguel > 0) { AnalyzeRomaji(pwcStem, cchRomaji, pTextSource->iCur, cchRomaji, cchHanguel, &IndexInfo, &cchPrefix); } else { cchPrefix = CheckURLPrefix(pwcStem, cchProcessed-cchToken); } // analyze string starts from last hangul pwcStem = pTextSource->awcBuffer + iCur; if (cchRomaji > 0) IndexInfo.SetRomajiInfo(wzRomaji, cchRomaji, cchPrefix); // analyze string always with indexing mode on symbol processing if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchToken, iCur, &IndexInfo, *pwchLast)) goto ErrorReturn; if (m_fQuery) { if (cchRomaji > 0) IndexInfo.SetRomajiInfo(NULL, 0, 0); IndexInfo.AddIndex(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed+cchToken-1); WB_LOG_ADD_INDEX(pTextSource->awcBuffer + pTextSource->iCur, cchProcessed, INDEX_QUERY); if (!IndexInfo.PutQueryIndexList()) goto ErrorReturn; } else { if (!IndexInfo.MakeSingleLengthMergedIndex()) goto ErrorReturn; if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur)) goto ErrorReturn; } *pwchLast = *(pwcStem + cchToken - 1); } else // next: WT_START, WT_PHRASE_SEP, WT_WORD_SEP, WT_REACHEND { WB_LOG_START(pwcStem, cchProcessed); CIndexInfo IndexInfo; if (!IndexInfo.Initialize(cchProcessed, pTextSource->iCur, pWordSink, pPhraseSink)) goto ErrorReturn; AnalyzeRomaji(pwcStem, cchProcessed, pTextSource->iCur, cchProcessed, cchHanguel, &IndexInfo, &cchPrefix); if (m_fQuery) { IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1); WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY); if (!IndexInfo.PutQueryIndexList()) goto ErrorReturn; } else { if (!IndexInfo.PutFinalIndexList(pTextSource->awcBuffer + pTextSource->iCur)) goto ErrorReturn; } } } else if (Type == WT_HANGUEL) { // hangul input WB_LOG_START(pwcStem, cchProcessed); CIndexInfo IndexInfo; if (!IndexInfo.Initialize(cchProcessed, iCur, pWordSink, pPhraseSink)) goto ErrorReturn; if (!AnalyzeString(&m_PI, m_fQuery, pwcStem, cchProcessed, iCur, &IndexInfo, *pwchLast)) goto ErrorReturn; if (m_fQuery) { IndexInfo.AddIndex(pwcStem, cchProcessed, WEIGHT_HARD_MATCH, 0, cchProcessed-1); WB_LOG_ADD_INDEX(pwcStem, cchProcessed, INDEX_QUERY); if (!IndexInfo.PutQueryIndexList()) goto ErrorReturn; } else { if (!IndexInfo.MakeSingleLengthMergedIndex()) goto ErrorReturn; if (!IndexInfo.PutFinalIndexList(pwcStem)) goto ErrorReturn; } *pwchLast = *(pwcStem + cchProcessed - 1); } WB_LOG_PRINT_ALL(); WB_LOG_END(); return cchProcessed; ErrorReturn: WB_LOG_END(); return -1; } // CWordBreaker::AnalyzeRomaji // // helper function for romaji token wordbreaking // // Parameters: // pwcStem -> (const WCHAR*) input token string // cchStem -> (int) length of input romaji token // iCur -> (int) source string position // cchProcessed -> (int) input length to process // cchHanguel -> (int) hangul token length (hanguel+romaji case only) // pIndexInfo -> (CIndexInfo *) output index list // pcchPrefix -> (int*) output prefix length // // Result: // (void) // // 23NOV00 bhshin began void CWordBreaker::AnalyzeRomaji(const WCHAR *pwcStem, int cchStem, int iCur, int cchProcessed, int cchHanguel, CIndexInfo *pIndexInfo, int *pcchPrefix) { int cchPrefix = 0; // hanguel+romaji case if (cchHanguel < cchProcessed) { // hanguel if (cchHanguel > 0) { pIndexInfo->AddIndex(pwcStem, cchHanguel, WEIGHT_HARD_MATCH, 0, cchHanguel-1); WB_LOG_ADD_INDEX(pwcStem, cchHanguel, INDEX_SYMBOL); } // romaji if ((cchStem-cchHanguel) > 0) { pIndexInfo->AddIndex(pwcStem + cchHanguel, cchStem - cchHanguel, WEIGHT_HARD_MATCH, cchHanguel, cchStem-1); WB_LOG_ADD_INDEX(pwcStem + cchHanguel, cchStem - cchHanguel, INDEX_SYMBOL); } } if (cchHanguel == 1 || (cchStem-cchHanguel) == 1) { // romaji(hangul+romaji) pIndexInfo->AddIndex(pwcStem, cchStem, WEIGHT_HARD_MATCH, 0, cchStem-1); WB_LOG_ADD_INDEX(pwcStem, cchStem, INDEX_SYMBOL); } // check URL prefix cchPrefix = CheckURLPrefix(pwcStem, cchProcessed); if (cchPrefix > 0 && cchPrefix < cchProcessed) { pIndexInfo->AddIndex(pwcStem + cchPrefix, cchStem - cchPrefix, WEIGHT_HARD_MATCH, cchPrefix, cchStem-1); WB_LOG_ADD_INDEX(pwcStem + cchPrefix, cchStem - cchPrefix, INDEX_SYMBOL); } *pcchPrefix = cchPrefix; // return it }