|
|
////////////////////////////////////////////////////////////////////////////////
//
// Filename : SpanishUtils.h
// Purpose : General utilities for Spanish
//
// Project : WordBreakers
// Component: Spanish word breaker
//
// Author : yairh
//
// Log:
//
// Jun 20 2000 yairh creation
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _SPANISH_UTILS_H_
#define _SPANISH_UTILS_H_
#include <wchar.h>

#include "trie.h"
//
// Word-type bit flags (one bit each, TYPE1 = bit 0 ... TYPE16 = bit 15).
// Every body is parenthesized so the macro behaves as a single value inside
// larger expressions: '<<' binds looser than '+', and '==' binds looser than
// '<<', so the unparenthesized form mis-parses (e.g. TYPE3 == 8).
//
#define TYPE1 (1<<0)
#define TYPE2 (1<<1)
#define TYPE3 (1<<2)
#define TYPE4 (1<<3)
#define TYPE5 (1<<4)
#define TYPE6 (1<<5)
#define TYPE7 (1<<6)
#define TYPE8 (1<<7)
#define TYPE9 (1<<8)
#define TYPE10 (1<<9)
#define TYPE11 (1<<10)
#define TYPE12 (1<<11)
#define TYPE13 (1<<12)
#define TYPE14 (1<<13)
#define TYPE15 (1<<14)
#define TYPE16 (1<<15)
// Longest word (in characters) that SpanishDictItem packs with CompressStr4:
// 5 bits per character, first shift 27, so 6 characters use shifts 27..2.
#define COMPRESS_4_SIZE 6
// Longest word (in characters) that SpanishDictItem packs with CompressStr8:
// 5 bits per character, first shift 59, so 12 characters use shifts 59..4.
#define COMPRESS_8_SIZE 12
class CSpanishUtil { public: CSpanishUtil::CSpanishUtil();
int aiWcscmp(const WCHAR* p, const WCHAR* t); int aiStrcmp(const unsigned char* p, const unsigned char* t); int aiWcsncmp(const WCHAR* p, const WCHAR* t, const int iLen);
void ReplaceAccent(WCHAR* pwcs, ULONG ulLen, DWORD dwCompressedBuf) { WORD w = (WORD) dwCompressedBuf; BYTE bLoc; BYTE bc = 0; bc = (w & 0xF00) >> 8; if (bc) { bLoc = (w & 0xF000) >> 12; Assert(bLoc < ulLen); pwcs[bLoc] = (WCHAR)m_rReverseAccentConvert[bc]; } bc = w & 0xF; if (bc) { bLoc = (w & 0xF0) >> 4; Assert(bLoc < ulLen); pwcs[bLoc] = (WCHAR) m_rReverseAccentConvert[bc]; } }
ULONG GetTypeFromCompressedData(DWORD dw) { return dw >> 16; } DWORD CompressData( ULONG ulType, BYTE bLoc1, BYTE bChar1, BYTE bLoc2, BYTE bChar2) { return (ulType << 16) | (bLoc1 << 12) | (bChar1 << 8) | (bLoc2 << 4) | (bChar2); }
ULONG AddTypeToCompressedData(ULONG ul, ULONG ulType) { return (ul | (ulType << 16)); }
bool CompressStr4(WCHAR* pwcsStr, ULONG ulLen, ULONG& ulCompress) { //
// each char is 5 bits
//
int iShift = 27; ulCompress = 0; ULONG ul = 0; while(ul < ulLen) { Assert(iShift>=0); if ((*pwcsStr > 0xFF) || (m_rCharCompress[*pwcsStr] == 0) ) { return false; } ulCompress |= m_rCharCompress[*pwcsStr] << iShift;
iShift -= 5; pwcsStr++; ul++; }
return true; }
bool CompressStr8(WCHAR* pwcsStr, ULONG ulLen, ULONGLONG& ullCompress) { //
// each char is 5 bits
//
int iShift = 59; ullCompress = 0;
ULONG ul = 0; while(ul < ulLen) { Assert(iShift>=0); if ((*pwcsStr > 0xFF) || m_rCharCompress[*pwcsStr] == 0 ) { return false; } ullCompress |= ((ULONGLONG)m_rCharCompress[*pwcsStr]) << iShift;
iShift -= 5; pwcsStr++; ul++; }
return true; }
bool ConvertToChar(const WCHAR* pwcs, const ULONG ulLen, unsigned char* pszOut, ULONG ulOutLen) { if (ulOutLen < ulLen + 1) { return false; }
ULONG ul = 0; while (ul < ulLen) { if (*pwcs > 0xFF) { return false; } *pszOut = *((char*)pwcs); pszOut++; pwcs++; ul++; }
*pszOut = '\0'; return true; } public:
//
// members.
//
WCHAR m_rCharConvert[256]; BYTE m_rCharCompress[256]; char m_rAccentConvert[256]; WCHAR m_rReverseAccentConvert[16];
};
// Shared CSpanishUtil instance; declared here, defined elsewhere.
// Code in this header reads its lookup tables and helpers through operator->.
extern CAutoClassPointer<CSpanishUtil> g_apSpanishUtil;
//
// CToAccUpper
//
// Character mapping policy (passed as a template argument to CTrie below).
//
class CToAccUpper
{
public:
    //
    // Characters with any of the high 8 bits set go through towupper();
    // everything else is looked up in g_apSpanishUtil->m_rCharConvert.
    //
    static WCHAR MapToUpper(IN WCHAR wc)
    {
        const bool fOutsideTable = ((wc & 0xff00) != 0);

        if (fOutsideTable)
        {
            return towupper(wc);
        }

        return g_apSpanishUtil->m_rCharConvert[wc];
    }
};
class SpanishDictItem { public: SpanishDictItem(ULONG ulW, WCHAR* pwcsW, ULONG ulAL, WCHAR* pwcsA, ULONG ulC, ULONG ulT) { m_fOwnMemory = true; Assert(ulW == ulAL);
m_ulLen = ulW; m_pwcs = new WCHAR[ulW + 1]; wcsncpy(m_pwcs, pwcsW, ulW); m_pwcs[ulW] = L'\0';
m_pwcsAlt = new WCHAR[ulAL + 1]; wcsncpy(m_pwcsAlt, pwcsA, ulAL); m_pwcsAlt[ulAL] = L'\0'; m_ulAltLen = ulAL;
m_ulCounter = ulC; m_ulType = ulT;
WCHAR* p = pwcsW; BYTE i = 0; BYTE k = 0; BYTE r[4] = {0}; while (*p) { if (*p != pwcsA[i]) { Assert(k < 4); Assert(i < 16); Assert( g_apSpanishUtil->m_rCharConvert[*p] == g_apSpanishUtil->m_rCharConvert[pwcsA[i]]); r[k] = i; r[k+1] = g_apSpanishUtil->m_rAccentConvert[pwcsA[i]]; k+=2; }
i++; p++; }
m_dwCompress = g_apSpanishUtil->CompressData(m_ulType, r[0], r[1], r[2], r[3]); if (m_ulLen <= COMPRESS_4_SIZE) { bool b = g_apSpanishUtil->CompressStr4(m_pwcs, m_ulLen, m_ulStrCompress); Assert(b); } else if (m_ulLen <= COMPRESS_8_SIZE) { bool b = g_apSpanishUtil->CompressStr8(m_pwcs, m_ulLen, m_ullStrCompress); Assert(b); }
}
SpanishDictItem(WCHAR* pwcsBuf) { m_fOwnMemory = false;
ULONG ul = wcslen(pwcsBuf); pwcsBuf[ul - 1] = L'\0'; WCHAR* p = pwcsBuf;
WCHAR* ppwcsParams[7]; ppwcsParams[0] = p; int i = 1; while(*p) { if (*p == L';') { *p = L'\0'; ppwcsParams[i] = p+1; i++; } p++; }
m_pwcs = ppwcsParams[0]; m_ulLen = _wtol(ppwcsParams[1]); m_pwcsAlt = ppwcsParams[2]; m_ulAltLen = _wtol(ppwcsParams[3]); m_ulType = _wtol(ppwcsParams[4]); m_dwCompress = _wtol(ppwcsParams[5]); if (m_ulLen <= COMPRESS_4_SIZE) { m_ulStrCompress = _wtol(ppwcsParams[6]); } else if (m_ulLen <= COMPRESS_8_SIZE) { m_ullStrCompress = _wtoi64(ppwcsParams[6]); }
}
~SpanishDictItem() { if (m_fOwnMemory) { delete[] m_pwcs; delete[] m_pwcsAlt; } }
void AddType(ULONG ulType) { m_ulType |= ulType; m_dwCompress = g_apSpanishUtil->AddTypeToCompressedData(m_dwCompress, ulType); }
int Serialize(WCHAR* pwcsBuf) { if (m_ulLen <= COMPRESS_4_SIZE) { return swprintf( pwcsBuf, L"%s;%d;%s;%d;%d;%u;%u\n", m_pwcs, m_ulLen, m_pwcsAlt, m_ulAltLen, m_ulType, m_dwCompress, m_ulStrCompress); } else if (m_ulLen <= COMPRESS_8_SIZE) { return swprintf( pwcsBuf, L"%s;%d;%s;%d;%d;%u;%I64u\n", m_pwcs, m_ulLen, m_pwcsAlt, m_ulAltLen, m_ulType, m_dwCompress, m_ullStrCompress);
} return swprintf( pwcsBuf, L"%s;%d;%s;%d;%d;%u;0\n", m_pwcs, m_ulLen, m_pwcsAlt, m_ulAltLen, m_ulType, m_dwCompress);
}
ULONG m_ulLen; WCHAR* m_pwcs; ULONG m_ulAltLen; WCHAR* m_pwcsAlt; ULONG m_ulCounter; ULONG m_ulType; DWORD m_dwCompress; ULONG m_ulStrCompress; ULONGLONG m_ullStrCompress;
bool m_fOwnMemory;
};
class CStandardCFile { public: CStandardCFile(WCHAR *pwcsFileName, WCHAR *pwcsMode, bool fThrowExcptionOn = true) { m_pFile = _wfopen(pwcsFileName, pwcsMode); if (! m_pFile && fThrowExcptionOn) { throw CGenericException(L"Could not open file"); } }
~CStandardCFile() { if (m_pFile) { fclose(m_pFile); } }
operator FILE*() { return m_pFile; }
protected: FILE *m_pFile; };
// One entry of the suffix table (g_rSpanishSuffix) and the item type stored in
// CSpanishSuffixDict's trie. Field order matters if entries are brace-initialized.
// NOTE(review): field meanings are inferred from their names - presumably the
// suffix text (pwcs), its length (ulLen), a number of characters to cut (ulCut)
// and TYPE* flags (ulType); confirm against the table definition.
struct CSuffixTerm { WCHAR* pwcs; ULONG ulLen; ULONG ulCut; ULONG ulType; };
// Constant table of suffix terms; declared here, defined elsewhere
// (its size is not visible from this header).
extern const CSuffixTerm g_rSpanishSuffix[] ;
// Holds a trie of CSuffixTerm items that uses CToAccUpper as its character
// mapping. The constructor is defined out of line.
// NOTE(review): presumably the constructor fills the trie from
// g_rSpanishSuffix - confirm in the implementation file.
class CSpanishSuffixDict { public: CSpanishSuffixDict();
CTrie<CSuffixTerm, CToAccUpper> m_SuffixTrie; };
#endif // _SPANISH_UTILS_H_
|