#include "base.h"
#include "SpanishDict.h"

#define MAX_WORD_LEN 128

//
// CSpanishDict::CSpanishDict
//
// Loads the word list found at pwcsInitFilePath into the three in-memory
// tables, choosing the table by the length reported by SpanishDictItem:
//   - up to COMPRESS_4_SIZE characters -> m_vaDictItem4   (key: m_ulStrCompress)
//   - up to COMPRESS_8_SIZE characters -> m_vaDictItem8   (key: m_ullStrCompress)
//   - anything longer                  -> m_vaDictItemStr (key: converted string)
// Every entry also stores the item's m_dwCompress value as its ulData.
//
// The file is read one line at a time with fgetws into a MAX_WORD_LEN buffer;
// lines that consist only of a newline are skipped.
//
// NOTE(review): entries are stored in file order while Find() uses
// BinaryFind(), so the input file is presumably pre-sorted - confirm.
// NOTE(review): the tables are indexed past their initial size without an
// explicit grow call here; presumably the array type grows on access - confirm.
//
CSpanishDict::CSpanishDict(WCHAR* pwcsInitFilePath) :
    m_vaDictItem4(DICT_4_INIT_SIZE),
    m_vaDictItem8(DICT_8_INIT_SIZE),
    m_vaDictItemStr(DICT_STR_INIT_SIZE),
    m_ulDictItem4Count(0),
    m_ulDictItem8Count(0),
    m_ulDictItemStrCount(0)
{
    m_apSpanishSuffix = new CSpanishSuffixDict();

    CStandardCFile Words(pwcsInitFilePath, L"r");
    WCHAR pwcsBuf[MAX_WORD_LEN];

    while (fgetws(pwcsBuf, MAX_WORD_LEN, (FILE*) Words))
    {
        if (pwcsBuf[0] == L'\n')
        {
            continue;
        }

        SpanishDictItem pItem(pwcsBuf);

        if (pItem.m_ulLen <= COMPRESS_4_SIZE)
        {
            m_vaDictItem4[m_ulDictItem4Count].ulStr = pItem.m_ulStrCompress;
            m_vaDictItem4[m_ulDictItem4Count].ulData = pItem.m_dwCompress;
            m_ulDictItem4Count++;
        }
        else if (pItem.m_ulLen <= COMPRESS_8_SIZE)
        {
            m_vaDictItem8[m_ulDictItem8Count].ullStr = pItem.m_ullStrCompress;
            m_vaDictItem8[m_ulDictItem8Count].ulData = pItem.m_dwCompress;
            m_ulDictItem8Count++;
        }
        else
        {
            //
            // Long words keep their own buffer: m_ulLen characters plus a
            // terminator. The buffer is not released in this function.
            //
            m_vaDictItemStr[m_ulDictItemStrCount].pszStr =
                new unsigned char[pItem.m_ulLen + 1];

            bool bRet;
            bRet = g_apSpanishUtil->ConvertToChar(
                            pItem.m_pwcs,
                            pItem.m_ulLen,
                            m_vaDictItemStr[m_ulDictItemStrCount].pszStr,
                            pItem.m_ulLen + 1);
            Assert(bRet);

            m_vaDictItemStr[m_ulDictItemStrCount].ulData = pItem.m_dwCompress;
            m_ulDictItemStrCount++;
        }
    }
}

//
// CSpanishDict::BreakWord
//
// Tries to strip a known suffix from pwcsWord and produce an alternative
// form that exists in the dictionary.
//
//   ulLen       - number of characters in pwcsWord.
//   pwcsWord    - the word to analyse; it is only read.
//   pfExistAlt  - receives true when an alternative was produced, false
//                 otherwise.
//   pulAltLen   - receives the length, in characters, of the string written
//                 to pwcsAlt when *pfExistAlt is true.
//   pwcsAlt     - receives the null-terminated alternative. It is set to the
//                 empty string when all suffix candidates were tried without
//                 success, and is left untouched on the early exits below.
//                 NOTE(review): the required capacity is not visible here;
//                 up to three characters may be appended to the stem, so the
//                 caller's buffer presumably holds at least ulLen + 4 - confirm.
//
// Words of MAX_WORD_LEN characters or more are rejected (no alternative)
// because the word is copied into MAX_WORD_LEN-sized stack buffers.
//
void CSpanishDict::BreakWord(
    ULONG ulLen,
    WCHAR* pwcsWord,
    bool* pfExistAlt,
    ULONG* pulAltLen,
    WCHAR* pwcsAlt)
{
    *pfExistAlt = false;

    if (ulLen <= 2)
    {
        return;
    }

    //
    // pwcsBuf and pwcsTemp below hold ulLen characters plus a terminator.
    // Previously this was only guarded by a debug Assert inside the loop,
    // so an over-long word overran the stack buffers in release builds.
    //
    if (ulLen >= MAX_WORD_LEN)
    {
        return;
    }

    //
    // very fast heuristic to find non breakable words: only words whose last
    // character is 'e' or 's', or whose second to last character is 'l',
    // are examined any further.
    //
    if (pwcsWord[ulLen - 1] != L'e' &&
        pwcsWord[ulLen - 1] != L's' &&
        pwcsWord[ulLen - 2] != L'l')
    {
        return;
    }

    //
    // The suffix trie is searched with the word reversed.
    //
    WCHAR pwcsBuf[MAX_WORD_LEN];
    WCHAR* pwcs = pwcsWord;
    ULONG ul = ulLen;

    pwcsBuf[ul] = L'\0';
    while (ul > 0)
    {
        pwcsBuf[ul - 1] = *pwcs;
        ul--;
        pwcs++;
    }

    //
    // sResCount starts at zero so that a failed lookup which does not touch
    // it yields "no candidates" instead of an indeterminate count.
    // NOTE(review): status is not inspected, matching the previous behavior.
    //
    DictStatus status;
    short sResCount = 0;
    CSuffixTerm* prTerm[10];

    status = m_apSpanishSuffix->m_SuffixTrie.trie_Find(
                                        pwcsBuf,
                                        TRIE_ALL_MATCHES | TRIE_IGNORECASE,
                                        10,
                                        prTerm,
                                        &sResCount);

    WCHAR pwcsTemp[MAX_WORD_LEN];
    ULONG ulTempLen;

    //
    // Candidates are tried from the last one returned to the first.
    //
    for (; sResCount > 0; sResCount--)
    {
        CSuffixTerm* pTerm = prTerm[sResCount - 1];

        //
        // Every candidate starts from a fresh copy of the whole word.
        // wcsncpy does not terminate when it copies exactly ulLen
        // characters, hence the explicit terminator.
        //
        wcsncpy(pwcsTemp, pwcsWord, ulLen);
        pwcsTemp[ulLen] = L'\0';
        ulTempLen = ulLen;

        bool bRet;
        ULONG ulCompressedData;

        if (!(pTerm->ulType & (TYPE11 | TYPE12 | TYPE13 | TYPE14)))
        {
            //
            // Cut ulCut characters from the end and look the stem up.
            // "<=" rather than "==" so that a violated Assert cannot turn
            // into an unsigned underflow in release builds.
            //
            Assert(ulLen >= pTerm->ulCut);
            if (ulLen <= pTerm->ulCut)
            {
                continue;
            }

            ulTempLen = ulLen - pTerm->ulCut;
            pwcsTemp[ulTempLen] = L'\0';

            bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);

            //
            // For TYPE1 a second lookup is made with an 's' appended to the
            // stem. The bounds test keeps the terminator inside pwcsTemp.
            //
            if (pTerm->ulType == TYPE1 &&
                (!bRet) &&
                (ulTempLen + 1 < MAX_WORD_LEN))
            {
                pwcsTemp[ulTempLen] = L's';
                pwcsTemp[ulTempLen + 1] = L'\0';
                bRet = Find(pwcsTemp, ulTempLen + 1, ulCompressedData);
                if (bRet)
                {
                    //
                    // The matched stem includes the appended 's'; count it
                    // so that the reported length matches what is copied to
                    // pwcsAlt (it used to be reported one character short).
                    //
                    ulTempLen++;
                }
            }

            if ((!bRet) ||
                (!(g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) &
                   pTerm->ulType)))
            {
                continue;
            }

            *pfExistAlt = true;
            wcscpy(pwcsAlt, pwcsTemp);
            *pulAltLen = ulTempLen;
            g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);

            //
            // Some types need an ending put back on the stem.
            //
            switch (pTerm->ulType)
            {
            case TYPE1:
                return;

            case TYPE2:
                *pulAltLen += 3;
                wcscat(pwcsAlt, L"ndo");
                return;

            case TYPE3:
                *pulAltLen += 1;
                wcscat(pwcsAlt, L"n");
                return;

            case TYPE4:
                *pulAltLen += 3;
                wcscat(pwcsAlt, L"mos");
                return;

            case TYPE5:
                *pulAltLen += 1;
                wcscat(pwcsAlt, L"d");
                return;

            case TYPE6:
                *pulAltLen += 1;
                wcscat(pwcsAlt, L"r");
                return;

            case TYPE7:
            case TYPE8:
            case TYPE9:
            case TYPE10:
            case TYPE15:
            case TYPE16:
                return;

            default:
                //
                // Unexpected type: fall out and try the next candidate. The
                // outputs are reset at the end if nothing else matches.
                //
                Assert(false);
            }
        }
        else
        {
            switch (pTerm->ulType)
            {
            case TYPE11:
                {
                    //
                    // Remove the ulLen characters of the suffix term and
                    // look the stem up.
                    //
                    Assert(ulTempLen >= pTerm->ulLen);
                    if (ulTempLen <= pTerm->ulLen)
                    {
                        break;
                    }

                    ulTempLen -= pTerm->ulLen;
                    pwcsTemp[ulTempLen] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) &
                         pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        *pulAltLen = ulTempLen;
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
                        *pfExistAlt = true;
                        return;
                    }
                }
                break;

            case TYPE12:
            case TYPE14:
                {
                    //
                    // First attempt: removing the "no" from the "nos", i.e.
                    // the last three characters are replaced by a single 's'.
                    // ulTempLen equals ulLen here, which is at least 3.
                    //
                    pwcsTemp[ulTempLen - 3] = L's';
                    pwcsTemp[ulTempLen - 2] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen - 2, ulCompressedData);
                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) &
                         pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
                        *pulAltLen = ulTempLen - 2;
                        *pfExistAlt = true;
                        return;
                    }

                    //
                    // Second attempt: remove the whole suffix term. Because
                    // the term is at least three characters long the cut
                    // falls on or before the 's' written above.
                    //
                    Assert(pTerm->ulLen >= 3);
                    Assert(ulTempLen >= pTerm->ulLen);
                    if (ulTempLen <= pTerm->ulLen)
                    {
                        break;
                    }

                    ulTempLen -= pTerm->ulLen;
                    pwcsTemp[ulTempLen] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) &
                         pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
                        //
                        // ulTempLen is already the stem length; the former
                        // "ulTempLen - 2" reported two characters too few.
                        //
                        *pulAltLen = ulTempLen;
                        *pfExistAlt = true;
                        return;
                    }
                }
                break;

            case TYPE13:
                {
                    //
                    // First attempt: drop only the last character.
                    //
                    pwcsTemp[ulTempLen - 1] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen - 1, ulCompressedData);
                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) &
                         pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
                        *pulAltLen = ulTempLen - 1;
                        *pfExistAlt = true;
                        return;
                    }

                    //
                    // Second attempt: remove the whole suffix term.
                    //
                    Assert(pTerm->ulLen >= 3);
                    Assert(ulTempLen >= pTerm->ulLen);
                    if (ulTempLen <= pTerm->ulLen)
                    {
                        break;
                    }

                    ulTempLen -= pTerm->ulLen;
                    pwcsTemp[ulTempLen] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) &
                         pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, ulCompressedData);
                        //
                        // ulTempLen is already the stem length; the former
                        // "ulTempLen - 2" reported two characters too few.
                        //
                        *pulAltLen = ulTempLen;
                        *pfExistAlt = true;
                        return;
                    }
                }
                break;
            }
        }
    }

    //
    // No candidate produced a dictionary word.
    //
    pwcsAlt[0] = L'\0';
    *pfExistAlt = false;
}

//
// CSpanishDict::Find
//
// Looks up the first ulLen characters of pwcs in the table that matches the
// length (the same split the constructor uses when loading):
//   - up to COMPRESS_4_SIZE -> key built with CompressStr4, m_vaDictItem4
//   - up to COMPRESS_8_SIZE -> key built with CompressStr8, m_vaDictItem8
//   - longer                -> key built with ConvertToChar, m_vaDictItemStr
//
// Returns true and stores the entry's ulData in ulData when the word is
// found. Returns false, leaving ulData untouched, when the key cannot be
// built or no entry matches.
//
// Long words are converted into a 32 byte buffer whose size is passed to
// ConvertToChar.
// NOTE(review): words that do not fit are presumably rejected by
// ConvertToChar returning false - confirm.
//
bool CSpanishDict::Find(WCHAR* pwcs, ULONG ulLen, ULONG& ulData)
{
    if (ulLen <= COMPRESS_4_SIZE)
    {
        CompressDictItem4 Key;
        if (!g_apSpanishUtil->CompressStr4(pwcs, ulLen, Key.ulStr))
        {
            return false;
        }

        CompressDictItem4* pItem;
        pItem = BinaryFind(
                    (CompressDictItem4*)m_vaDictItem4,
                    m_ulDictItem4Count,
                    Key);
        if (!pItem)
        {
            return false;
        }

        ulData = pItem->ulData;
        return true;
    }

    if (ulLen <= COMPRESS_8_SIZE)
    {
        CompressDictItem8 Key;
        if (!g_apSpanishUtil->CompressStr8(pwcs, ulLen, Key.ullStr))
        {
            return false;
        }

        CompressDictItem8* pItem;
        pItem = BinaryFind(
                    (CompressDictItem8*)m_vaDictItem8,
                    m_ulDictItem8Count,
                    Key);
        if (!pItem)
        {
            return false;
        }

        ulData = pItem->ulData;
        return true;
    }

    unsigned char psz[32];
    if (!g_apSpanishUtil->ConvertToChar(pwcs, ulLen, psz, 32))
    {
        return false;
    }

    PsudoCompressDictItemStr Key;
    Key.pszStr = psz;

    CompressDictItemStr* pItem;
    pItem = BinaryFind(
                (CompressDictItemStr*)m_vaDictItemStr,
                m_ulDictItemStrCount,
                Key);
    if (!pItem)
    {
        return false;
    }

    ulData = pItem->ulData;
    return true;
}