You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
404 lines
12 KiB
404 lines
12 KiB
#include "base.h"
|
|
#include "SpanishDict.h"
|
|
|
|
//
// Builds the in-memory dictionary from the word list file pwcsInitFilePath.
//
// The file is read one line at a time (at most MAX_LINE_LEN bytes per read),
// converted from code page 1252 to wide characters and parsed by
// SpanishDictItem. Each item is stored in one of three tables picked by its
// length:
//   m_ulLen <= COMPRESS_4_SIZE  -> m_vaDictItem4   (m_ulStrCompress key)
//   m_ulLen <= COMPRESS_8_SIZE  -> m_vaDictItem8   (m_ullStrCompress key)
//   otherwise                   -> m_vaDictItemStr (heap-allocated string)
//
// NOTE(review): Find() binary-searches these tables and nothing here sorts
// them, so the input file is presumably already in the required order -
// confirm against the dictionary build step.
//
CSpanishDict::CSpanishDict(WCHAR* pwcsInitFilePath) :
    m_vaDictItem4(DICT_4_INIT_SIZE),
    m_vaDictItem8(DICT_8_INIT_SIZE),
    m_vaDictItemStr(DICT_STR_INIT_SIZE),
    m_ulDictItem4Count(0),
    m_ulDictItem8Count(0),
    m_ulDictItemStrCount(0)
{
    m_apSpanishSuffix = new CSpanishSuffixDict();

    CStandardCFile Words(pwcsInitFilePath, L"r");

    WCHAR pwcsBuf[MAX_LINE_LEN];
    char pszBuf[MAX_LINE_LEN];

    while(fgets(pszBuf, MAX_LINE_LEN, (FILE*) Words))
    {
        //
        // empty line - nothing to load
        //
        if (pszBuf[0] == '\n')
        {
            continue;
        }

        int imbRet = MultiByteToWideChar(
                            1252,   // English / Spanish code page
                            MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
                            pszBuf, -1,
                            pwcsBuf,
                            MAX_LINE_LEN);

        Assert(imbRet > 0);
        if (imbRet <= 0)
        {
            //
            // The conversion failed, so pwcsBuf does not hold this line (it
            // holds the previous line, or nothing at all on the first pass).
            // Skip the line instead of loading a bogus entry.
            //
            continue;
        }

        SpanishDictItem pItem(pwcsBuf);

        if (pItem.m_ulLen <= COMPRESS_4_SIZE)
        {
            m_vaDictItem4[m_ulDictItem4Count].ulStr = pItem.m_ulStrCompress;
            m_vaDictItem4[m_ulDictItem4Count].ulData = pItem.m_dwCompress;
            m_ulDictItem4Count++;
        }
        else if (pItem.m_ulLen <= COMPRESS_8_SIZE)
        {
            m_vaDictItem8[m_ulDictItem8Count].ullStr = pItem.m_ullStrCompress;
            m_vaDictItem8[m_ulDictItem8Count].ulData = pItem.m_dwCompress;
            m_ulDictItem8Count++;
        }
        else
        {
            //
            // Convert into a private buffer first, so that a failed
            // conversion never leaves an entry with undefined contents in
            // the table that Find() searches.
            //
            unsigned char* pszStr = new unsigned char[pItem.m_ulLen + 1];

            bool bRet;
            bRet = g_apSpanishUtil->ConvertToChar(
                                        pItem.m_pwcs,
                                        pItem.m_ulLen,
                                        pszStr,
                                        pItem.m_ulLen + 1);

            Assert(bRet);
            if (!bRet)
            {
                delete[] pszStr;
                continue;
            }

            m_vaDictItemStr[m_ulDictItemStrCount].pszStr = pszStr;
            m_vaDictItemStr[m_ulDictItemStrCount].ulData = pItem.m_dwCompress;
            m_ulDictItemStrCount++;
        }
    }
}
|
|
|
|
|
|
//
// Tries to produce an alternative form of a word by matching its ending
// against the suffix trie and looking the remaining stem up with Find().
//
// ulLen      - number of characters of pwcsWord to examine. Words of two
//              characters or fewer, or of MAX_WORD_LEN characters or more,
//              are never broken.
// pwcsWord   - the word; only the first ulLen characters are read.
// pfExistAlt - set to true when an alternative form was written to pwcsAlt,
//              false otherwise.
// pulAltLen  - on entry must be at least MAX_WORD_LEN, otherwise the function
//              returns without producing anything (presumably this is the
//              capacity of pwcsAlt - confirm against callers). On success
//              receives the length of the alternative form.
// pwcsAlt    - receives the null terminated alternative form. When every
//              suffix candidate was tried and rejected it is set to the
//              empty string.
//
void CSpanishDict::BreakWord(
    ULONG ulLen,
    WCHAR* pwcsWord,
    bool* pfExistAlt,
    ULONG* pulAltLen,
    WCHAR* pwcsAlt)
{
    *pfExistAlt = false;
    if (ulLen <= 2)
    {
        return;
    }

    if (*pulAltLen < MAX_WORD_LEN)
    {
        return;
    }

    //
    // The scratch buffers below hold MAX_WORD_LEN characters including the
    // terminator. A longer word used to be written past their end (it was
    // only caught by an Assert), so reject it up front.
    //
    if (ulLen >= MAX_WORD_LEN)
    {
        return;
    }

    //
    // very fast heuristic to find non breakable words
    //
    if (pwcsWord[ulLen - 1] != L'e' &&
        pwcsWord[ulLen - 1] != L's' &&
        pwcsWord[ulLen - 2] != L'l')
    {
        return;
    }

    DictStatus status;
    short sResCount = 0;    // stays 0 if trie_Find does not report a count

    //
    // The suffix trie is queried with the word reversed.
    //
    WCHAR pwcsBuf[MAX_WORD_LEN];
    WCHAR* pwcs = pwcsWord;

    ULONG ul = ulLen;
    pwcsBuf[ul] = L'\0';
    while (ul > 0)
    {
        pwcsBuf[ul - 1] = *pwcs;
        ul--;
        pwcs++;
    }

    CSuffixTerm* prTerm[10];
    status = m_apSpanishSuffix->m_SuffixTrie.trie_Find(
                                        pwcsBuf,
                                        TRIE_ALL_MATCHES | TRIE_IGNORECASE,
                                        10,
                                        prTerm,
                                        &sResCount);

    WCHAR pwcsTemp[MAX_WORD_LEN];
    ULONG ulTempLen;

    //
    // Try the candidates from the last one returned to the first. The first
    // candidate whose stem is found in the dictionary with a matching type
    // wins.
    //
    for (; sResCount > 0; sResCount--)
    {
        CSuffixTerm* pTerm = prTerm[sResCount - 1];
        Assert(ulLen < MAX_WORD_LEN);
        wcsncpy(pwcsTemp, pwcsWord, ulLen);
        pwcsTemp[ulLen] = L'\0';

        ulTempLen = ulLen;
        bool bRet;
        ULONG ulCompressedData;

        if (!(pTerm->ulType & (TYPE11 | TYPE12 | TYPE13 |TYPE14)))
        {
            //
            // Plain suffix: cut ulCut characters and look the stem up.
            // "<=" rather than "==" so that an unexpected ulCut > ulLen
            // cannot underflow the index in builds where Assert is a no-op.
            //
            Assert(ulLen >= pTerm->ulCut);
            if (ulLen <= pTerm->ulCut)
            {
                continue;
            }
            pwcsTemp[ulLen - pTerm->ulCut] = L'\0';
            ulTempLen = ulLen - pTerm->ulCut;

            bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);

            if (pTerm->ulType == TYPE1 && (!bRet))
            {
                //
                // Second chance for TYPE1: stem + 's'.
                // NOTE(review): when this lookup succeeds pwcsTemp is one
                // character longer than ulTempLen, yet ulTempLen is what is
                // reported below - confirm whether that is intended.
                //
                pwcsTemp[ulTempLen] = L's';
                pwcsTemp[ulTempLen + 1] = L'\0';
                bRet = Find(pwcsTemp, ulTempLen + 1, ulCompressedData);
            }

            if ( (!bRet) ||
                 (!(g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType)))
            {
                continue;
            }

            *pfExistAlt = true;
            wcscpy(pwcsAlt, pwcsTemp);
            *pulAltLen = ulTempLen;
            g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);

            //
            // Some types put a fixed ending back onto the stem.
            //
            switch (pTerm->ulType)
            {
            case TYPE1:
                return;
            case TYPE2:
                *pulAltLen += 3;
                wcscat(pwcsAlt, L"ndo");
                return;
            case TYPE3:
                *pulAltLen += 1;
                wcscat(pwcsAlt, L"n");
                return;
            case TYPE4:
                *pulAltLen += 3;
                wcscat(pwcsAlt, L"mos");
                return;
            case TYPE5:
                *pulAltLen += 1;
                wcscat(pwcsAlt, L"d");
                return;
            case TYPE6:
                *pulAltLen += 1;
                wcscat(pwcsAlt, L"r");
                return;
            case TYPE7:
            case TYPE8:
            case TYPE9:
            case TYPE10:
            case TYPE15:
            case TYPE16:
                return;
            default:
                Assert(false);
            }
        }
        else
        {
            *pfExistAlt = true;

            switch (pTerm->ulType)
            {
            case TYPE11:
                {
                    Assert(ulTempLen >= pTerm->ulLen);
                    if (ulTempLen <= pTerm->ulLen)
                    {
                        break;
                    }
                    pwcsTemp[ulTempLen - pTerm->ulLen] = L'\0';
                    ulTempLen -= pTerm->ulLen;

                    bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);

                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        *pulAltLen = ulTempLen;
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
                        *pfExistAlt = true;
                        return;
                    }
                }
                break;

            case TYPE12:
            case TYPE14:
                {
                    //
                    // First attempt: replace the trailing three characters
                    // by 's' (result is two characters shorter).
                    //
                    pwcsTemp[ulTempLen-3] = L's'; // removing the "no" from the "nos"
                    pwcsTemp[ulTempLen-2] = L'\0';
                    bRet = Find(pwcsTemp, ulTempLen - 2, ulCompressedData);

                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        *pulAltLen = ulTempLen - 2;
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
                        *pfExistAlt = true;
                        return;
                    }

                    //
                    // Second attempt: drop the whole suffix.
                    //
                    Assert(pTerm->ulLen >= 3);
                    Assert(ulTempLen >= pTerm->ulLen);
                    if (ulTempLen <= pTerm->ulLen)
                    {
                        break;
                    }

                    ulTempLen -= pTerm->ulLen;
                    pwcsTemp[ulTempLen] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);
                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        //
                        // The stem copied above is ulTempLen characters
                        // long. This used to report ulTempLen - 2, which
                        // disagreed with the string and underflowed for
                        // stems shorter than two characters.
                        //
                        *pulAltLen = ulTempLen;
                        *pfExistAlt = true;
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
                        return;
                    }
                }
                break;

            case TYPE13:
                {
                    //
                    // First attempt: drop only the last character.
                    //
                    pwcsTemp[ulTempLen-1] = L'\0';
                    bRet = Find(pwcsTemp, ulTempLen - 1, ulCompressedData);

                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        *pulAltLen = ulTempLen - 1;
                        *pfExistAlt = true;
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
                        return;
                    }

                    //
                    // Second attempt: drop the whole suffix.
                    //
                    Assert(pTerm->ulLen >= 3);
                    Assert(ulTempLen >= pTerm->ulLen);
                    if (ulTempLen <= pTerm->ulLen)
                    {
                        break;
                    }

                    ulTempLen -= pTerm->ulLen;
                    pwcsTemp[ulTempLen] = L'\0';

                    bRet = Find(pwcsTemp, ulTempLen, ulCompressedData);

                    if (bRet &&
                        (g_apSpanishUtil->GetTypeFromCompressedData(ulCompressedData) & pTerm->ulType))
                    {
                        wcscpy(pwcsAlt, pwcsTemp);
                        //
                        // Report the real length of the copied stem (this
                        // used to be ulTempLen - 2, see TYPE12 / TYPE14).
                        //
                        *pulAltLen = ulTempLen;
                        *pfExistAlt = true;
                        g_apSpanishUtil->ReplaceAccent(pwcsAlt, *pulAltLen, ulCompressedData);
                        return;
                    }
                }
                break;
            }
        }
    }

    //
    // No candidate produced a dictionary word.
    //
    pwcsAlt[0] = L'\0';
    *pfExistAlt = false;
}
|
|
|
|
|
|
//
// Looks a word up in the dictionary table that matches its length.
//
// pwcs   - the word to look for.
// ulLen  - its length in characters; selects the table:
//            <= COMPRESS_4_SIZE -> m_vaDictItem4,
//            <= COMPRESS_8_SIZE -> m_vaDictItem8,
//            otherwise          -> m_vaDictItemStr.
// ulData - receives the ulData value of the matching entry. It is left
//          untouched when the function returns false.
//
// Returns true when the word was found. Returns false when the word could
// not be converted into the key form of its table or is not in the table.
//
bool CSpanishDict::Find(WCHAR* pwcs, ULONG ulLen, ULONG& ulData)
{
    if (ulLen <= COMPRESS_4_SIZE)
    {
        CompressDictItem4 Probe;
        if (!g_apSpanishUtil->CompressStr4(pwcs, ulLen, Probe.ulStr))
        {
            return false;
        }

        CompressDictItem4* pHit = BinaryFind<CompressDictItem4>(
                                        (CompressDictItem4*)m_vaDictItem4,
                                        m_ulDictItem4Count,
                                        Probe);
        if (!pHit)
        {
            return false;
        }

        ulData = pHit->ulData;
        return true;
    }

    if (ulLen <= COMPRESS_8_SIZE)
    {
        CompressDictItem8 Probe;
        if (!g_apSpanishUtil->CompressStr8(pwcs, ulLen, Probe.ullStr))
        {
            return false;
        }

        CompressDictItem8* pHit = BinaryFind<CompressDictItem8>(
                                        (CompressDictItem8*)m_vaDictItem8,
                                        m_ulDictItem8Count,
                                        Probe);
        if (!pHit)
        {
            return false;
        }

        ulData = pHit->ulData;
        return true;
    }

    //
    // Long words are searched by their converted character string; the
    // conversion is given a 32 byte buffer.
    //
    unsigned char pszKey[32];
    if (!g_apSpanishUtil->ConvertToChar(pwcs, ulLen, pszKey, 32))
    {
        return false;
    }

    PsudoCompressDictItemStr Probe;
    Probe.pszStr = pszKey;

    CompressDictItemStr* pHit = BinaryFind<CompressDictItemStr>(
                                    (CompressDictItemStr*)m_vaDictItemStr,
                                    m_ulDictItemStrCount,
                                    Probe);
    if (!pHit)
    {
        return false;
    }

    ulData = pHit->ulData;
    return true;
}
|