|
|
// =========================================================================
// Copyright (C) 1997 - 1998, Microsoft Corporation. All Rights Reserved.
//
// File Name : BASEAPI.CPP
// Function : NLP BASE ENGINE API Definition
// =========================================================================
#include <string.h>
#include <malloc.h>
#include <sys\stat.h>
#include "basecore.hpp"
#include "basecode.hpp"
#include "basedef.hpp"
#include "basegbl.hpp"
#include "MainDict.h"
extern int Compose_RIEUL_Irregular (char *, char *); extern int Compose_HIEUH_Irregular (char *, char *); extern int Compose_PIEUP_Irregular (char *, char *); extern int Compose_TIEUT_Irregular (char *, char *); extern int Compose_SIOS_Irregular (char *, char *); extern BOOL Compose_YEO_Irregular (char *, char *); extern BOOL Compose_REO_REU_Irregular (char *, char *); extern BOOL Compose_GEORA_Irregular (char *, char *); extern BOOL Compose_Regular (char *, char *);
extern void SetSilHeosa (int, WORD *);
#include "stemkor.h"
// by dhyu -- 1996. 1
typedef struct { LPCSTR contract; LPCSTR noconstract; } contract_tossi;
contract_tossi ContractTossi [] = { { "\xa4\xa4", "\xB4\xC2"}, { "\xA4\xA9", "\xB8\xA6"}, { "\xA4\xA4\xC4\xBF\xB3\xE7", "\xB4\xC2\xC4\xBF\xB3\xE7"}, { NULL, NULL} };
/*
char ChangableFirstStem [][2] = { {__K_D_D, __V_m}, // ssangtikeut, eu
{
} */
inline BOOL isHANGEUL(char cCh1,char cCh2) { unsigned char ch1,ch2 ;
ch1=(unsigned char)cCh1; ch2 =(unsigned char)cCh2;
if ( ((ch1 >= 0xb0) && (ch1 <= 0xc8)) && (ch2>=0xa1) ) return TRUE; else if ( ((ch1 >= 0x81) && (ch1 <= 0xc5)) && ( ((ch2 >= 0x41) && (ch2 <= 0x5a)) || ((ch2 >= 0x61) && (ch2 <= 0x7a)) || ((ch2 >= 0x81) && (ch2 <= 0xa0)) ) ) return TRUE; else if ( ((ch1 >= 0x81) && (ch1 <= 0xa0)) && (ch2 >= 0xa1) ) return TRUE; //else if ( ((ch1 >= 0xca) && (ch1 <= 0xfe)) && (ch2 >= 0xa1) )
// return TRUE;
else if ((ch1 == 0xa4) && (ch2 >= 0xa1)) return TRUE;
return FALSE; }
WINSRC StemmerInit(HSTM *hStm) // Stemmer Engine session Handle
{ STMI *pstmi; HGLOBAL hgbl;
hgbl = GlobalAlloc(GHND, sizeof(STMI)); if (hgbl == NULL) return FAIL; else *hStm = (HSTM) hgbl;
pstmi = (STMI*)GlobalLock(hgbl); if (pstmi == NULL) return FAIL;
pstmi->Option = 0x00000000;
GlobalUnlock(hgbl); return NULL; // normal operation
}
WINSRC StemmerSetOption (HSTM hStm, UINT Option) { STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hStm;
pstmi = (STMI *)GlobalLock(hgbl);
if (pstmi == NULL) { MessageBox (NULL, "StemmerSetOption", "Fail", MB_OK); GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
pstmi->Option = Option;
GlobalUnlock (hgbl); return NULL; }
WINSRC StemmerGetOption (HSTM hStm, UINT *Option) { STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hStm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
*Option = pstmi->Option;
GlobalUnlock (hgbl); return NULL; }
WINSRC StemmerOpenMdr(HSTM sid, char *lpspathMain) // Dictionary File path
{ STMI *pstmi; HGLOBAL hgbl;
hgbl = (HGLOBAL) sid; pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
if (lstrlen(lpspathMain) == 0) { GlobalUnlock(hgbl); return srcIOErrorMdr | srcInvalidMdr; }
if (!OpenMainDict (lpspathMain)) { GlobalUnlock(hgbl); return srcIOErrorMdr | srcInvalidMdr; }
GlobalUnlock(hgbl);
return NULL; // normal operation
}
WINSRC StemmerCloseMdr(HSTM sid) {
STMI *pstmi; HGLOBAL hgbl;
hgbl = (HGLOBAL) sid; pstmi = (STMI *)GlobalLock(hgbl);
if (pstmi == NULL) return FAIL;
if (pstmi->bMdr) CloseMainDict ();
GlobalUnlock(hgbl);
return NULL; // normal operation
}
WINSRC StemmerDecomposeW (HSTM hStm, LPCWSTR iword, LPWDOB lpSob) { LPSTR MultiByteIword; DOB sob; int index = 0;
int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) iword, -1, NULL, 0, NULL, NULL); MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len); // add a check for this point
if ( MultiByteIword == NULL ) { return srcModuleError; }
len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) iword, -1, MultiByteIword, len, NULL, NULL);
sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
// add a check for this point
if ( sob.wordlist == NULL ) { LocalFree(MultiByteIword); return srcModuleError; }
sob.sch = lpSob->sch; SRC src = StemmerDecompose(hStm, MultiByteIword, &sob); lpSob->num = sob.num;
if (src == NULL) { char *tmpstr;
for (int j = 0, index2 = 0; j < sob.num; j++) { tmpstr = sob.wordlist+index2; len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0); LPWSTR tmpwstr = (LPWSTR) LocalAlloc (LPTR, sizeof (WCHAR) * len);
// add a check for this point
if ( tmpwstr == NULL ) { LocalFree (MultiByteIword); LocalFree (sob.wordlist); return srcModuleError; }
MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, (LPWSTR) tmpwstr, len); memcpy (lpSob->wordlist+index, tmpwstr, len*sizeof(WCHAR)); memcpy (lpSob->wordlist+index+len, tmpstr+lstrlen (tmpstr)+1, 2); memcpy (lpSob->wordlist+index+len + 1, tmpwstr+len-1, sizeof(WCHAR)); index += (len+2); index2 += (lstrlen(tmpstr)+4); LocalFree (tmpwstr); } } lpSob->len = (WORD)index;
LocalFree (MultiByteIword); LocalFree (sob.wordlist);
return src; }
SRC GetOneResult (RLIST *rList, LPDOB lpSob) { WORD value; int count;
if (rList->num >= rList->max) return srcNoMoreResult;
lpSob->len = 0; lpSob->num = 0;
for (UINT i = rList->num, index = 0; i < rList->max; i++) { count = 0; while (rList->next [index+count] != '+' && rList->next [index+count] != '\t') count++;
if (lpSob->len + count < lpSob->sch) { memcpy (lpSob->wordlist+lpSob->len, rList->next+index, count); lpSob->num++; } else return srcOOM | srcExcessBuffer;
lpSob->len += (WORD)count; lpSob->wordlist [lpSob->len++] = '\0'; SetSilHeosa(rList->vbuf [i], &value); memcpy (lpSob->wordlist + lpSob->len, &value, 2); lpSob->wordlist [lpSob->len+2] = '\0'; lpSob->len += 3; if (rList->next[index+count] == '\t') break; index += (count + 1); }
rList->next += (index+count+1); rList->num = i+1;
return NULL; }
WINSRC StemmerDecompose(HSTM hstm, LPCSTR iword, // input word
LPDOB psob) // the number of candidates
{ int len = lstrlen ((char *) iword);
if (len >= 45) { psob->num = 1; lstrcpy ((LPSTR) psob->wordlist, (LPSTR) iword); psob->len = (WORD)len; return srcInvalid; }
for (int i = 0; i < len; i += 2) if (!isHANGEUL (iword [i], iword [i+1])) { psob->num = 1; lstrcpy ((LPSTR) psob->wordlist, (LPSTR) iword); psob->len = (WORD)len; return srcInvalid; }
STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hstm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
BaseEngine BaseCheck;
char lrgsz [400]; memset (pstmi->rList.lrgsz, NULLCHAR, 400); lstrcpy (pstmi->rList.iword, iword); pstmi->rList.max = 0; BOOL affixFlag = TRUE;
if (pstmi->Option & SO_ALONE) { int num = BaseCheck.NLP_BASE_ALONE (iword, lrgsz); if (num > 0) { affixFlag = FALSE; lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } }
if (pstmi->Option & SO_NOUNPHRASE) { int num = BaseCheck.NLP_BASE_NOUN (iword, lrgsz); if (num > 0) { affixFlag = FALSE; lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } }
if (pstmi->Option & SO_PREDICATE) { int num = BaseCheck.NLP_BASE_VERB (iword, lrgsz); if (num > 0) { lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } }
if (pstmi->Option & SO_COMPOUND) { if (pstmi->rList.max == 0) { int num = BaseCheck.NLP_BASE_COMPOUND (iword, lrgsz); if (num > 0) { lstrcpy (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [i] = BaseCheck.vbuf [i]; pstmi->rList.max = num; } } }
if (affixFlag && pstmi->Option & SO_SUFFIX) { int num = BaseCheck.NLP_BASE_AFFIX (iword, lrgsz); if (num > 0) { lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } }
pstmi->rList.num = 0; pstmi->rList.next = pstmi->rList.lrgsz;
SRC src = GetOneResult (&(pstmi->rList), psob); if (src == srcNoMoreResult) { src = srcInvalid; lstrcpy (psob->wordlist, iword); }
GlobalUnlock(hgbl); return src; }
WINSRC StemmerDecomposeMoreW (HSTM hStm, LPCWSTR lpWord, LPWDOB lpSob) { LPSTR MultiByteIword; DOB sob;
int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, NULL, 0, NULL, NULL); MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
// add a check for this point
if ( MultiByteIword == NULL ) { return srcModuleError; }
len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, MultiByteIword, len, NULL, NULL);
sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
// add a check for this point
if ( sob.wordlist == NULL ) { LocalFree(MultiByteIword); return srcModuleError; }
sob.sch = lpSob->sch; SRC src = StemmerDecomposeMore(hStm, MultiByteIword, &sob); lpSob->num = sob.num;
int index = 0; if (src == NULL) { char *tmpstr; for (int j = 0, index2 = 0; j < sob.num; j++) { tmpstr = sob.wordlist+index2; len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0); LPWSTR tmpwstr = (LPWSTR) LocalAlloc (LPTR, sizeof (WCHAR) * len);
// add a check for this point
if ( tmpwstr == NULL ) { LocalFree(MultiByteIword); LocalFree(sob.wordlist); return srcModuleError; }
MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, (LPWSTR) tmpwstr, len); memcpy (lpSob->wordlist+index, tmpwstr, len*sizeof(WCHAR)); memcpy (lpSob->wordlist+index+len, tmpstr+lstrlen (tmpstr)+1, 2); memcpy (lpSob->wordlist+index+len + 1, tmpwstr+len-1, sizeof(WCHAR)); index += (len+2); index2 += (lstrlen(tmpstr)+4); LocalFree (tmpwstr); } } lpSob->len = (WORD)index;
LocalFree (MultiByteIword); LocalFree (sob.wordlist);
return src; }
WINSRC StemmerDecomposeMore (HSTM hStm, LPCSTR lpWord, LPDOB lpSob) { STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hStm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
if (lstrcmp (pstmi->rList.iword, lpWord)) { return srcModuleError; }
SRC src = GetOneResult (&(pstmi->rList), lpSob);
GlobalUnlock(hgbl);
return src; }
WINSRC StemmerEnumDecomposeW (HSTM hStm, LPCWSTR lpWord, LPWDOB lpSob, LPFNDECOMPOSEW lpfnCallBack) { LPSTR MultiByteIword; DOB sob;
int len = lstrlen ((char *) lpWord); if (len >= 45) { lpSob->num = 1; wcscpy (lpSob->wordlist, lpWord); lpSob->len = (WORD)len; return srcInvalid; }
for (int i = 0; i < len; i++) if (0xabff < lpWord [i] && lpWord [i] < 0xd7a4) { lpSob->num = 1; lstrcpy ((LPSTR) lpSob->wordlist, (LPSTR) lpWord); lpSob->len = (WORD)len; return srcInvalid; }
STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hStm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
BaseEngine BaseCheck;
len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, NULL, 0, NULL, NULL); MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
// add a check for this point
if ( MultiByteIword == NULL ) { GlobalUnlock(hgbl); return srcModuleError; }
len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, MultiByteIword, len, NULL, NULL);
sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
// add a check for this point
if ( sob.wordlist == NULL ) { GlobalUnlock(hgbl); LocalFree(MultiByteIword); return srcModuleError; }
sob.sch = lpSob->sch;
char lrgsz [400]; memset (pstmi->rList.lrgsz, NULLCHAR, 400); lstrcpy (pstmi->rList.iword, MultiByteIword); pstmi->rList.max = 0; int num = BaseCheck.NLP_BASE_NOUN (MultiByteIword, lrgsz); if (num > 0) { lstrcpy (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [i] = BaseCheck.vbuf [i]; pstmi->rList.max = num; } num = BaseCheck.NLP_BASE_ALONE (MultiByteIword, lrgsz); if (num > 0) { lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } num = BaseCheck.NLP_BASE_VERB (MultiByteIword, lrgsz); if (num > 0) { lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } if (num == 0) { num = BaseCheck.NLP_BASE_COMPOUND (MultiByteIword, lrgsz); if (num > 0) { lstrcpy (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [i] = BaseCheck.vbuf [i]; pstmi->rList.max = num; } }
pstmi->rList.num = 0; pstmi->rList.next = pstmi->rList.lrgsz;
while (GetOneResult (&(pstmi->rList), &sob) == NULL) { char *tmpstr; for (int j = 0, index2 = 0, index = 0; j < sob.num; j++) { tmpstr = sob.wordlist+index2; len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0); LPWSTR tmpwstr = (LPWSTR) LocalAlloc (LPTR, sizeof (WCHAR) * len); // add a check for this point
if ( tmpwstr == NULL ) { GlobalUnlock(hgbl); LocalFree (MultiByteIword); LocalFree (sob.wordlist); return srcModuleError; }
MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, (LPWSTR) tmpwstr, len); memcpy (lpSob->wordlist+index, tmpwstr, len*sizeof(WCHAR)); memcpy (lpSob->wordlist+index+len, tmpstr+lstrlen (tmpstr)+1, 2); memcpy (lpSob->wordlist+index+len + 1, tmpwstr+len-1, sizeof(WCHAR)); index += (len+2); index2 += (lstrlen(tmpstr)+4); LocalFree (tmpwstr); } lpSob->len = (WORD)index; lpSob->num = sob.num; lpfnCallBack (lpSob); }
GlobalUnlock(hgbl);
LocalFree (MultiByteIword); LocalFree (sob.wordlist);
return NULL; }
WINSRC StemmerEnumDecompose (HSTM hStm, LPCSTR lpWord, LPDOB lpSob, LPFNDECOMPOSE lpfnCallBack) { int len = lstrlen ((char *) lpWord); if (len >= 45) { lpSob->num = 1; lstrcpy ((LPSTR) lpSob->wordlist, lpWord); lpSob->len = (WORD)len; return srcInvalid; }
for (int i = 0; i < len; i += 2) if (!isHANGEUL (lpWord [i], lpWord [i+1])) { lpSob->num = 1; lstrcpy ((LPSTR) lpSob->wordlist, (LPSTR) lpWord); lpSob->len = (WORD)len; return srcInvalid; }
STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hStm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
BaseEngine BaseCheck;
char lrgsz [400]; memset (pstmi->rList.lrgsz, NULLCHAR, 400); lstrcpy (pstmi->rList.iword, lpWord); int num = BaseCheck.NLP_BASE_NOUN (lpWord, lrgsz); pstmi->rList.max = 0; if (num > 0) { lstrcpy (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [i] = BaseCheck.vbuf [i]; pstmi->rList.max = num; } num = BaseCheck.NLP_BASE_ALONE (lpWord, lrgsz); if (num > 0) { lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } num = BaseCheck.NLP_BASE_VERB (lpWord, lrgsz); if (num > 0) { lstrcat (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i]; pstmi->rList.max += num; } if (num == 0) { num = BaseCheck.NLP_BASE_COMPOUND (lpWord, lrgsz); if (num > 0) { lstrcpy (pstmi->rList.lrgsz, lrgsz); for (int i = 0; i < num; i++) pstmi->rList.vbuf [i] = BaseCheck.vbuf [i]; pstmi->rList.max = num; } }
pstmi->rList.num = 0; pstmi->rList.next = pstmi->rList.lrgsz;
while (GetOneResult (&(pstmi->rList), lpSob) == NULL) lpfnCallBack (lpSob);
GlobalUnlock(hgbl);
return NULL; }
WINSRC StemmerComposeW (HSTM hstm, WCIB sib, LPWSTR rword) { CIB tmpsib; LPSTR MultiByteRword;
int len = (wcslen (sib.silsa) + 1) * 2; tmpsib.silsa = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
// add a check for this point.
if ( tmpsib.silsa == NULL ) { return srcModuleError; }
len = WideCharToMultiByte (CP_ACP, 0, (LPCWSTR) sib.silsa, -1, tmpsib.silsa, len, NULL, NULL);
int len2 = (wcslen (sib.heosa) + 1) * 2; tmpsib.heosa = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len2);
// add a check for this point.
if ( tmpsib.heosa == NULL ) { LocalFree(tmpsib.silsa); return srcModuleError; }
len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) sib.heosa, -1, tmpsib.heosa, len2, NULL, NULL);
MultiByteRword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * (len + len2));
// add a check for this point.
if ( MultiByteRword == NULL ) { LocalFree(tmpsib.silsa); LocalFree(tmpsib.heosa); return srcModuleError; }
tmpsib.pos = sib.pos; SRC src = StemmerCompose (hstm, tmpsib, MultiByteRword);
len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteRword, -1, NULL, 0); MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteRword, -1, (LPWSTR) rword, len);
LocalFree (tmpsib.silsa); LocalFree (tmpsib.heosa); LocalFree (MultiByteRword);
return src; }
int CVCheckNP(char *stem, char *ending, BYTE action) // Check vowel harmony for NOUN + Tossi. If the last letter of stem is RIEUR, that should seriously be considered.
{ int len = strlen (ending) + 1;
if ((action & 0x80) && (action & 0x40)) // CV = 11
return VALID;
if (!(action & 0x80) && (action & 0x40)) { // CV = 01
if (stem[0] >= __V_k) return VALID; if (stem[0] == __K_R && ending[0] == __K_R && ending[1] == __V_h) // Tossi is "RO"(CV=01) and the last letter of stem is RIEUR.
return VALID; if (ending[0] == __K_S && ending[1] == __V_j) { // "SEO" --> "E SEO"
memmove (ending+2, ending, len); ending [0] = __K_I; ending [1] = __V_p; return MORECHECK; } if (ending[0] == __K_N && ending[1] == __V_m && ending[2] == __K_N) { // "NEUN" --> "EUN"
ending [0] = __K_I; } if (ending[0] == __K_G && ending[1] == __V_k) { // "GA" --> "I"
ending[0] = __K_I; ending[1] = __V_l; return MORECHECK; } if (ending[0] == __K_I && ending[1] == __V_hk) { // "WA" --> "GWA"
ending [0] = __K_G; return MORECHECK; } if (ending [0] == __K_R) { if (ending[1] == __V_m && ending[2] == __K_R) { // "REUL" --> "EUL"
ending [0] = __K_I; return INVALID; } if (ending[1] == __V_h) { // "RO" --> "EU RO"
memmove (ending+2, ending, len); ending [0] = __K_I; ending [1] = __V_m; return MORECHECK; } // add "I" to the first part of ending
memmove (ending+2, ending, len); ending [0] = __K_I; ending [1] = __V_l; return MORECHECK; } if ((ending [0] == __K_N) || (ending [0] == __K_S && ending [1] == __V_l) || // "SI"
(ending [0] == __K_I && ending [1] == __V_u) || // "YEO"
(ending[0] == __K_I && ending[1] == __V_i && ending[2] == __K_M // "YA MAL RO" --> "I YA MAL RO"
&& ending[3] == __V_k && ending[4] == __K_R && ending[5] == __K_R && ending[6] == __V_h)) { // Add "I" to the first part of ending
memmove (ending+2, ending, len); ending [0] = __K_I; ending [1] = __V_l; return MORECHECK; } return MORECHECK; }
// CV==10
if (stem[0] >= __V_k) { if (ending [0] == __K_G) { // "GWA" --> "WA"
ending [0] = __K_I; return MORECHECK; } if (ending[1] == __V_l) { if (len == 3) { // "I" --> "GA"
ending [0] = __K_G; ending [1] = __V_k; return MORECHECK; } else { // remove "I"
memmove (ending, ending+2, len-2); return INVALID; } } if (ending[1] == __V_k) { ending [1] = __V_i; return MORECHECK; } if (ending[2] == __K_N) { // "EUN" --> "NEUN"
ending [0] = __K_N; return MORECHECK; } if (len == 4) { // "EUL" --> "REUL"
ending [0] = __K_R; return MORECHECK; } else { // Remove "EU"
memmove (ending, ending+2, len-2); return INVALID; } } if (stem[0] == __K_R && ending[0] == __K_I && ending[1] == __V_m && ending[2] == __K_R && ending[3] == __V_h) { // Remove "EU"
memmove (ending, ending+2, len-2); return INVALID; } return VALID; }
WINSRC StemmerCompose (HSTM hstm, CIB sib, LPSTR rword) { STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hstm; int ret, i; BYTE action;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
lstrcpy (rword, (char *)sib.silsa);
for (i = 0; sib.silsa [i] != 0; i += 2) if (!isHANGEUL (sib.silsa [i], sib.silsa [i+1])) { lstrcat (rword, sib.heosa); return NULL; }
for (i = 0; sib.heosa [i] != 0; i +=2) if (!isHANGEUL (sib.heosa [i], sib.heosa [i+1])) { lstrcat (rword, sib.heosa); return NULL; }
CODECONVERT conv;
char *incode = (char *) LocalAlloc (LPTR, sizeof (char) * (lstrlen (sib.silsa)*3+1 + lstrlen (sib.heosa)*3+7));
// add a check for this point.
if ( incode == NULL ) { GlobalUnlock(hgbl); return srcModuleError; }
char *inheosa = (char *) LocalAlloc (LPTR, sizeof (char) * (lstrlen (sib.heosa)*3+7));
// add a check for this point.
if ( inheosa == NULL ) { GlobalUnlock(hgbl); LocalFree(incode); return srcModuleError; }
conv.HAN2INS (sib.silsa, incode, codeWanSeong); conv.HAN2INR (sib.heosa, inheosa, codeWanSeong);
LPSTR tmptossi = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lstrlen (sib.heosa)*2 ); // add a check for this point
if (tmptossi == NULL ) { GlobalUnlock(hgbl); LocalFree(incode); LocalFree(inheosa); return srcModuleError; }
char *inending = (char *) LocalAlloc (LPTR, sizeof (char) * (lstrlen(sib.heosa)*3+7)); // add a check for this point
if ( inending== NULL ) { GlobalUnlock(hgbl); LocalFree(incode); LocalFree(inheosa); LocalFree(tmptossi); return srcModuleError; }
char *inrword = (char *) LocalAlloc (LPTR, sizeof (char) * (lstrlen(sib.silsa)*3+lstrlen(sib.heosa)*3+6)); // add a check for this point
if (inrword == NULL ) { GlobalUnlock(hgbl); LocalFree(incode); LocalFree(inheosa); LocalFree(tmptossi); LocalFree(inending); return srcModuleError; }
switch (sib.pos & 0x0f00) { case POS_NOUN : case POS_PRONOUN : case POS_NUMBER :
lstrcpy (tmptossi, sib.heosa); if (FindHeosaWord (inheosa, _TOSSI, &action) & FINAL) { conv.ReverseIN (inheosa, inending); conv.ReverseIN (incode, inrword); CVCheckNP (inrword, inending, action);
conv.INS2HAN (inending, tmptossi, codeWanSeong);
// we should check contraction tossi, for example, Nieun, Rieul
for (i = 0; ContractTossi [i].contract != NULL; i++) if (lstrcmp (ContractTossi [i].contract, tmptossi)==0) conv.HAN2INS ((char *)tmptossi, inending, codeWanSeong);
lstrcat (incode, inending); conv.INS2HAN(incode, (char *)rword, codeWanSeong); //LocalFree (incode);
LocalFree (inheosa); LocalFree (tmptossi); LocalFree (inending); LocalFree (inrword); GlobalUnlock (hgbl); return NULL; }
lstrcat (rword, tmptossi); LocalFree (incode); LocalFree (inheosa); LocalFree (tmptossi); LocalFree (inending); LocalFree (inrword); GlobalUnlock (hgbl); return srcComposeError;
break;
case POS_VERB : case POS_ADJECTIVE : case POS_AUXVERB : case POS_AUXADJ :
conv.HAN2INS ((char *)sib.heosa, inending, codeWanSeong); conv.HAN2INR ((char *)sib.silsa, incode, codeWanSeong); if ((ret = Compose_RIEUL_Irregular (incode, inending)) != NOT_COMPOSED) goto ErrorCheck; if ((ret = Compose_HIEUH_Irregular (incode, inending)) != NOT_COMPOSED) goto ErrorCheck; if ((ret = Compose_PIEUP_Irregular (incode, inending)) != NOT_COMPOSED) goto ErrorCheck; if ((ret = Compose_TIEUT_Irregular (incode, inending)) != NOT_COMPOSED) goto ErrorCheck; if ((ret = Compose_SIOS_Irregular (incode, inending)) != NOT_COMPOSED) goto ErrorCheck; if (Compose_YEO_Irregular (incode, inending)) goto Quit; if (Compose_REO_REU_Irregular (incode, inending)) goto Quit; if (Compose_GEORA_Irregular (incode, inending)) goto Quit; Compose_Regular (incode, inending);
ErrorCheck : if (ret == COMPOSE_ERROR) { lstrcat (rword, sib.heosa); LocalFree (incode); LocalFree (inheosa); LocalFree (tmptossi); LocalFree (inending); LocalFree (inrword); GlobalUnlock (hgbl); return srcComposeError; } Quit: conv.ReverseIN (incode, inrword); lstrcat (inrword, inending); conv.INS2HAN (inrword, (char *)rword, codeWanSeong);
break; default : lstrcat (rword, sib.heosa); LocalFree (incode); LocalFree (inheosa); LocalFree (tmptossi); LocalFree (inending); LocalFree (inrword); GlobalUnlock (hgbl); return srcComposeError; }
LocalFree (incode); LocalFree (inheosa); LocalFree (tmptossi); LocalFree (inending); LocalFree (inrword); GlobalUnlock (hgbl); return NULL; }
WINSRC StemmerTerminate(HSTM hstm) { STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hstm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
GlobalUnlock (hgbl); GlobalFree (hgbl);
return NULL; //normal operation
}
WINSRC StemmerOpenUdr (HSTM stmi, LPCSTR lpPathUdr) { return NULL; }
WINSRC StemmerCloseUdr (HSTM stmi) { return NULL; }
WINSRC StemmerCompareW (HSTM hstm, LPCWSTR lpStr1, LPCWSTR lpStr2, LPWSTR lpStem, LPWSTR lpEnding1, LPWSTR lpEnding2, WORD *pos) { LPSTR MultiByteStr1, MultiByteStr2, MultiByteStem, MultiByteEnding1, MultiByteEnding2;
int len1 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr1, -1, NULL, 0, NULL, NULL); MultiByteStr1 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len1); // add a check for this point.
if (MultiByteStr1 == NULL ) { return srcModuleError; }
len1 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr1, -1, MultiByteStr1, len1, NULL, NULL);
int len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr2, -1, NULL, 0, NULL, NULL); MultiByteStr2 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len2); // add a check for this point.
if (MultiByteStr2 == NULL ) { LocalFree(MultiByteStr1); return srcModuleError; }
len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr2, -1, MultiByteStr2, len2, NULL, NULL);
int len = len1 > len2 ? len1 : len2;
MultiByteStem = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len); // add a check for this point.
if (MultiByteStem == NULL ) { LocalFree(MultiByteStr1); LocalFree(MultiByteStr2); return srcModuleError; }
MultiByteEnding1 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len); // add a check for this point.
if (MultiByteEnding1 == NULL ) { LocalFree(MultiByteStr1); LocalFree(MultiByteStr2); LocalFree(MultiByteStem); return srcModuleError; }
MultiByteEnding2 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len); // add a check for this point.
if (MultiByteEnding2 == NULL ) { LocalFree(MultiByteStr1); LocalFree(MultiByteStr2); LocalFree(MultiByteStem); LocalFree(MultiByteEnding1); return srcModuleError; }
SRC src = StemmerCompare(hstm, MultiByteStr1, MultiByteStr2, MultiByteStem, MultiByteEnding1, MultiByteEnding2, pos);
MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteStem, -1, lpStem, sizeof (lpStem)); MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteEnding1, -1, lpEnding1, sizeof (lpEnding1)); MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteEnding2, -1, lpEnding2, sizeof (lpEnding2));
LocalFree (MultiByteStr1); LocalFree (MultiByteStr2); LocalFree (MultiByteStem); LocalFree (MultiByteEnding1); LocalFree (MultiByteEnding2);
return src; }
WINSRC StemmerCompare (HSTM hstm, LPCSTR lpStr1, LPCSTR lpStr2, LPSTR lpStem, LPSTR lpEnding1, LPSTR lpEnding2, WORD *pos) { // First, check the chosung of two strings
// if they are different, we may not use stemming.
CODECONVERT conv; char inheosa1 [80], inheosa2 [80]; BYTE action;
char *incodeStr1 = new char [lstrlen (lpStr1) * 4 + 1]; char *incodeStr2 = new char [lstrlen (lpStr2) * 4 + 1]; conv.HAN2INS ((char *)lpStr1, incodeStr1, codeWanSeong); conv.HAN2INS ((char *)lpStr2, incodeStr2, codeWanSeong);
if (incodeStr1 [0] != incodeStr2 [0]) return srcInvalid;
if (incodeStr1 [1] != incodeStr2 [1]) { return srcInvalid; }
delete incodeStr1; delete incodeStr2;
STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hstm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
BaseEngine BaseCheck;
char stem1[10][100], stem2[10][100], ending1[10][100], ending2[10][100], lrgsz [400]; int num1, num2, count; WORD winfo [10]; if ((pstmi->Option & SO_NOUNPHRASE) && (pstmi->Option & (SO_NP_NOUN | SO_NP_PRONOUN | SO_NP_NUMBER | SO_NP_DEPENDENT))) { int num = BaseCheck.NLP_BASE_NOUN (lpStr1, lrgsz); BOOL first = TRUE; for (int i = 0, index = 0, l = 0, index2 = 0; i < num; i++) { count = 0; while (lrgsz [index+count] != '+' && lrgsz[index+count] != '\t') count++;
if (first) { memcpy (stem1 [l], lrgsz+index, count); stem1 [l][count] = '\0'; winfo [l] = BaseCheck.vbuf [i]; first = FALSE; } else { memcpy (ending1 [l]+index2, lrgsz+index, count); index2 += count; }
if (lrgsz[index+count] == '\t') { ending1 [l][index2] = '\0'; l++; first = TRUE; index2 = 0; } index += (count + 1); } num1 = l; num = BaseCheck.NLP_BASE_NOUN (lpStr2, lrgsz); for (i = 0, index = 0, l = 0, index2 = 0; i < num; i++) { count = 0; while (lrgsz [index+count] != '+' && lrgsz [index+count] != '\t') count++;
if (first) { memcpy (stem2 [l], lrgsz+index, count); stem2 [l][count] = '\0'; first = FALSE; } else { memcpy (ending2 [l]+index2, lrgsz+index, count); index2 += count; }
if (lrgsz[index+count] == '\t') { ending2 [l][index2] = '\0'; l++; first = TRUE; index2 = 0; } index += (count + 1); } num2 = l;
int j; for (i = 0; i < num1; i++) { for (j = 0; j < num2; j++) if (lstrcmp (stem1[i], stem2 [j]) == 0) break; if (j != num2) break; }
if (i != num1) { lstrcpy (lpStem, stem1 [i]); lstrcpy (lpEnding1, ending1 [i]); lstrcpy (lpEnding2, ending2 [j]); *pos = winfo [i]; GlobalUnlock (hgbl); return NULL; } }
if (pstmi->Option & (SO_PREDICATE | SO_AUXILIARY)) { int num = BaseCheck.NLP_BASE_VERB (lpStr1, lrgsz); BOOL first = TRUE; for (int i = 0, index = 0, l = 0, index2 = 0; i < num; i++) { count = 0; while (lrgsz [index+count] != '+' && lrgsz[index+count] != '\t') count++;
if (first) { memcpy (stem1 [l], lrgsz+index, count); stem1 [l][count] = '\0'; winfo [l] = BaseCheck.vbuf [i]; first = FALSE; } else { memcpy (ending1 [l]+index2, lrgsz+index, count); index2 += count; }
if (lrgsz[index+count] == '\t') { ending1 [l][index2] = '\0'; l++; first = TRUE; index2 = 0; } index += (count + 1); } num1 = l; num = BaseCheck.NLP_BASE_VERB (lpStr2, lrgsz); for (i = 0, index = 0, l = 0, index2 = 0; i < num; i++) { count = 0; while (lrgsz [index+count] != '+' && lrgsz [index+count] != '\t') count++;
if (first) { memcpy (stem2 [l], lrgsz+index, count); stem2 [l][count] = '\0'; first = FALSE; } else { memcpy (ending2 [l]+index2, lrgsz+index, count); index2 += count; }
if (lrgsz[index+count] == '\t') { ending2 [l][index2] = '\0'; l++; first = TRUE; index2 = 0; } index += (count + 1); } num2 = l;
int j; for (i = 0; i < num1; i++) { for (j = 0; j < num2; j++) if (lstrcmp (stem1[i], stem2 [j]) == 0) break; if (j != num2) break; }
if (i != num1) { lstrcpy (lpStem, stem1 [i]); lstrcpy (lpEnding1, ending1 [i]); lstrcpy (lpEnding2, ending2 [j]); *pos = winfo [i]; GlobalUnlock (hgbl); return NULL; } }
// for proper noun, for example, name
if (pstmi->Option & SO_NP_PROPER) { int len1 = lstrlen(lpStr1); int len2 = lstrlen(lpStr2); int shortlen = len1 > len2 ? len2 : len1; if (strncmp (lpStr1, lpStr2, shortlen) == 0) { lstrcpy (lpStem, lpStr1); lpStem [shortlen] = '\0'; char index [1]; index[0] = 'm';
CODECONVERT Conv; BOOL res1 = TRUE, res2= TRUE;
lstrcpy (lpEnding1, lpStr1 + shortlen); lstrcpy (lpEnding2, lpStr2 + shortlen); if (lstrlen (lpEnding1)) { Conv.HAN2INS ((char *)lpEnding1, inheosa1, codeWanSeong); if (!(FindHeosaWord(inheosa1, _TOSSI, &action) & FINAL)) res1 = FALSE; }
if (lstrlen (lpEnding2)) { Conv.HAN2INS ((char *)lpEnding2, inheosa2, codeWanSeong); if (!(FindHeosaWord(inheosa2, _TOSSI, &action) & FINAL)) res2 = FALSE; }
if (res1 && res2) { *pos = POS_NOUN | PROPER_NOUN; GlobalUnlock (hgbl); return NULL; } } }
GlobalUnlock (hgbl); return srcInvalid; }
WINSRC StemmerIsEndingW (HSTM hstm, LPCWSTR lpStr, UINT flag, BOOL *found) {
LPSTR MultiByteStr;
int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr, -1, NULL, 0, NULL, NULL); MultiByteStr = (LPSTR) LocalAlloc (LPTR, len); // add a check for this point
if (MultiByteStr == NULL ) { return srcModuleError; } len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr, -1, MultiByteStr, len, NULL, NULL);
SRC src = StemmerIsEnding(hstm, MultiByteStr, flag, found);
LocalFree (MultiByteStr); return src;
}
WINSRC StemmerIsEnding (HSTM hstm, LPCSTR lpStr, UINT flag, BOOL *found) { BOOL tossiCheck, endingCheck;
switch (flag) { case IS_TOSSI : tossiCheck = TRUE; endingCheck = FALSE; break; case IS_ENDING : endingCheck = TRUE; tossiCheck = FALSE; break; case IS_TOSSI | IS_ENDING : tossiCheck = endingCheck = TRUE; break; default : return srcModuleError; }
STMI *pstmi; HGLOBAL hgbl = (HGLOBAL) hstm;
pstmi = (STMI *)GlobalLock(hgbl); if (pstmi == NULL) { GlobalUnlock(hgbl); return srcModuleError | srcInvalidID; }
BYTE action; char *inheosa = (char *)LocalAlloc (LPTR, lstrlen(lpStr) * 4 + 1); // add a check for this point
if (inheosa == NULL ) { GlobalUnlock(hgbl); return srcModuleError; }
CODECONVERT Conv; Conv.HAN2INR ((char *)lpStr, inheosa, codeWanSeong);
*found = FALSE;
if (tossiCheck) { int res = FindHeosaWord(inheosa, _TOSSI, &action); if (res & FINAL) { *found = TRUE; endingCheck = FALSE; } } if (endingCheck) { int res = FindHeosaWord(inheosa, _ENDING, &action); if (res == FINAL) *found = TRUE; }
LocalFree (inheosa);
GlobalUnlock (hgbl); return NULL; }
/*
BOOL WINAPI DllMain (HINSTANCE hDLL, DWORD dwReason, LPVOID lpReserved){ extern char TempJumpNum [], TempSujaNum [], TempBaseNum [], TempNumNoun [], TempSuffixOut []; extern char bTemp [], TempETC [], TempDap []; extern LenDict JumpNum; extern LenDict SujaNum; extern LenDict BaseNum; extern LenDict NumNoun; extern LenDict Suffix; extern LenDict B_Dict; extern LenDict T_Dict; extern LenDict Dap;
switch(dwReason) { case DLL_PROCESS_ATTACH : JumpNum.InitLenDict(TempJumpNum, 5, 5); SujaNum.InitLenDict(TempSujaNum, 8, 27); BaseNum.InitLenDict(TempBaseNum, 5, 3); NumNoun.InitLenDict(TempNumNoun, 8, 32); Suffix.InitLenDict(TempSuffixOut, 8, 8); B_Dict.InitLenDict(bTemp, 5, 1); T_Dict.InitLenDict(TempETC, 10, 7); Dap.InitLenDict(TempDap, 5, 1); break ; case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: break; case DLL_PROCESS_DETACH : break ; } //switch
return TRUE ; } */
|