mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1537 lines
43 KiB
1537 lines
43 KiB
// =========================================================================
|
|
// Copyright (C) 1997 - 1998, Microsoft Corporation. All Rights Reserved.
|
|
//
|
|
// File Name : BASEAPI.CPP
|
|
// Function : NLP BASE ENGINE API Definition
|
|
// =========================================================================
|
|
|
|
#include <string.h>
|
|
#include <malloc.h>
|
|
#include <sys\stat.h>
|
|
|
|
#include "basecore.hpp"
|
|
#include "basecode.hpp"
|
|
#include "basedef.hpp"
|
|
#include "basegbl.hpp"
|
|
#include "MainDict.h"
|
|
|
|
extern int Compose_RIEUL_Irregular (char *, char *);
|
|
extern int Compose_HIEUH_Irregular (char *, char *);
|
|
extern int Compose_PIEUP_Irregular (char *, char *);
|
|
extern int Compose_TIEUT_Irregular (char *, char *);
|
|
extern int Compose_SIOS_Irregular (char *, char *);
|
|
extern BOOL Compose_YEO_Irregular (char *, char *);
|
|
extern BOOL Compose_REO_REU_Irregular (char *, char *);
|
|
extern BOOL Compose_GEORA_Irregular (char *, char *);
|
|
extern BOOL Compose_Regular (char *, char *);
|
|
|
|
extern void SetSilHeosa (int, WORD *);
|
|
|
|
#include "stemkor.h"
|
|
|
|
|
|
// by dhyu -- 1996. 1
|
|
typedef struct
|
|
{
|
|
LPCSTR contract;
|
|
LPCSTR noconstract;
|
|
} contract_tossi;
|
|
|
|
// Table of contracted particles and their full spellings, terminated by a
// { NULL, NULL } sentinel row (StemmerCompose iterates until `contract` is NULL).
// The byte strings are presumably Wansung (KS C 5601) text -- TODO confirm
// the exact characters against a code-page table.
contract_tossi ContractTossi [] =
{
    //  contracted form               full form
    { "\xa4\xa4",                 "\xB4\xC2"                 },
    { "\xA4\xA9",                 "\xB8\xA6"                 },
    { "\xA4\xA4\xC4\xBF\xB3\xE7", "\xB4\xC2\xC4\xBF\xB3\xE7" },
    { NULL,                       NULL                       }     // end marker
};
|
|
|
|
/*
|
|
char ChangableFirstStem [][2] =
|
|
{
|
|
{__K_D_D, __V_m}, // ssangtikeut, eu
|
|
{
|
|
|
|
}
|
|
*/
|
|
|
|
inline
|
|
BOOL isHANGEUL(char cCh1,char cCh2)
|
|
{
|
|
unsigned char ch1,ch2 ;
|
|
|
|
ch1=(unsigned char)cCh1;
|
|
ch2 =(unsigned char)cCh2;
|
|
|
|
if ( ((ch1 >= 0xb0) && (ch1 <= 0xc8)) && (ch2>=0xa1) )
|
|
return TRUE;
|
|
else if ( ((ch1 >= 0x81) && (ch1 <= 0xc5)) && ( ((ch2 >= 0x41) && (ch2 <= 0x5a)) || ((ch2 >= 0x61) && (ch2 <= 0x7a)) || ((ch2 >= 0x81) && (ch2 <= 0xa0)) ) )
|
|
return TRUE;
|
|
else if ( ((ch1 >= 0x81) && (ch1 <= 0xa0)) && (ch2 >= 0xa1) )
|
|
return TRUE;
|
|
//else if ( ((ch1 >= 0xca) && (ch1 <= 0xfe)) && (ch2 >= 0xa1) )
|
|
// return TRUE;
|
|
else if ((ch1 == 0xa4) && (ch2 >= 0xa1))
|
|
return TRUE;
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
// Creates a stemmer session.
//
//   hStm  [out] receives the session handle; written only on success.
//
// Returns NULL (0) on success, FAIL if hStm is NULL or the session block
// cannot be allocated or locked.  The block is allocated with GHND, so every
// field starts out zeroed; Option is additionally reset explicitly.
WINSRC StemmerInit(HSTM *hStm)                // Stemmer Engine session Handle
{
    if (hStm == NULL)
        return FAIL;

    HGLOBAL hgbl = GlobalAlloc(GHND, sizeof(STMI));
    if (hgbl == NULL)
        return FAIL;

    STMI *pstmi = (STMI *) GlobalLock(hgbl);
    if (pstmi == NULL)
    {
        // Do not leak the block (or hand out a handle to it) when it
        // cannot be locked.
        GlobalFree(hgbl);
        return FAIL;
    }

    pstmi->Option = 0x00000000;
    GlobalUnlock(hgbl);

    *hStm = (HSTM) hgbl;
    return NULL;                              // normal operation
}
|
|
|
|
// Stores the option bit mask for a session.
//
//   hStm    session handle.
//   Option  new option bits (the SO_* flags tested by the decompose routines).
//
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the handle
// cannot be locked.  The failure is reported through the return code only:
// the modal debug MessageBox that used to pop up here has been removed so
// that this routine behaves like the other session functions and never
// blocks the calling process on UI.
WINSRC StemmerSetOption (HSTM hStm, UINT Option)
{
    HGLOBAL hgbl  = (HGLOBAL) hStm;
    STMI   *pstmi = (STMI *) GlobalLock(hgbl);

    if (pstmi == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    pstmi->Option = Option;

    GlobalUnlock (hgbl);
    return NULL;
}
|
|
|
|
// Reads back the option bit mask of a session.
//
//   hStm    session handle.
//   Option  [out] receives the current option bits.
//
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the handle
// cannot be locked (in which case *Option is not written).
WINSRC StemmerGetOption (HSTM hStm, UINT *Option)
{
    HGLOBAL hgbl    = (HGLOBAL) hStm;
    STMI   *session = (STMI *) GlobalLock(hgbl);

    if (session != NULL)
    {
        *Option = session->Option;
        GlobalUnlock (hgbl);
        return NULL;
    }

    GlobalUnlock(hgbl);
    return srcModuleError | srcInvalidID;
}
|
|
|
|
// Opens the main dictionary for a session.
//
//   sid          session handle.
//   lpspathMain  path of the main dictionary file.
//
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the handle
// cannot be locked, and srcIOErrorMdr | srcInvalidMdr when the path is empty
// or OpenMainDict() reports failure.
//
// NOTE(review): the session block is locked but never written here, while
// StemmerCloseMdr only closes the dictionary when pstmi->bMdr is set --
// confirm where bMdr is supposed to be set.
WINSRC StemmerOpenMdr(HSTM sid, char *lpspathMain)        // Dictionary File path
{
    HGLOBAL hgbl    = (HGLOBAL) sid;
    STMI   *session = (STMI *) GlobalLock(hgbl);

    if (session == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    // An empty path is rejected before the dictionary layer is called.
    if (lstrlen(lpspathMain) == 0 || !OpenMainDict (lpspathMain))
    {
        GlobalUnlock(hgbl);
        return srcIOErrorMdr | srcInvalidMdr;
    }

    GlobalUnlock(hgbl);
    return NULL;                                          // normal operation
}
|
|
|
|
|
|
// Closes the main dictionary if the session has it marked as open.
//
//   sid  session handle.
//
// Returns NULL (0) on success, FAIL when the handle cannot be locked.
WINSRC StemmerCloseMdr(HSTM sid)
{
    HGLOBAL hgbl    = (HGLOBAL) sid;
    STMI   *session = (STMI *) GlobalLock(hgbl);

    if (session == NULL)
        return FAIL;

    // CloseMainDict() is only invoked when the session's bMdr flag is set.
    if (session->bMdr)
    {
        CloseMainDict ();
    }

    GlobalUnlock(hgbl);
    return NULL;                    // normal operation
}
|
|
|
|
// Unicode front end of StemmerDecompose.
//
//   hStm   session handle.
//   iword  NUL-terminated wide input word.
//   lpSob  [in/out] wide result buffer; lpSob->sch is used as the size of the
//          temporary multi-byte result buffer, and num / len / wordlist are
//          filled in on return.
//
// The word is converted to the Wansung code page, analysed by
// StemmerDecompose, and each returned entry is converted back.  A multi-byte
// entry is laid out as  word, NUL, 2-byte tag, NUL  (see GetOneResult); the
// wide entry written here is  wide word, NUL, one WCHAR holding the same
// 2-byte tag, NUL.  lpSob->len is the number of WCHARs written (0 when the
// analysis did not succeed).
//
// Returns the StemmerDecompose result, or srcModuleError when a conversion
// or allocation fails.
WINSRC StemmerDecomposeW (HSTM hStm,
                          LPCWSTR iword,
                          LPWDOB lpSob)
{
    // Size, allocate and fill the multi-byte copy of the input word.
    int cbWord = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, iword, -1, NULL, 0, NULL, NULL);
    if (cbWord <= 0)
        return srcModuleError;          // conversion failed: nothing to analyse

    LPSTR MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * cbWord);
    if (MultiByteIword == NULL)
        return srcModuleError;

    if (WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, iword, -1, MultiByteIword, cbWord, NULL, NULL) == 0)
    {
        LocalFree (MultiByteIword);
        return srcModuleError;
    }

    // Temporary multi-byte result buffer with the caller's capacity.
    DOB sob;
    sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
    if (sob.wordlist == NULL)
    {
        LocalFree (MultiByteIword);
        return srcModuleError;
    }
    sob.sch = lpSob->sch;
    sob.num = 0;        // StemmerDecompose does not set these on every failure
    sob.len = 0;        // path, so give them defined values up front

    SRC src = StemmerDecompose (hStm, MultiByteIword, &sob);
    lpSob->num = sob.num;

    int index = 0;                      // WCHAR offset into lpSob->wordlist
    if (src == NULL)
    {
        int index2 = 0;                 // byte offset into sob.wordlist
        for (int j = 0; j < sob.num; j++)
        {
            char *tmpstr = sob.wordlist + index2;
            int   cch    = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0);
            LPWSTR tmpwstr = (LPWSTR) LocalAlloc (LPTR, sizeof (WCHAR) * cch);
            if (tmpwstr == NULL)
            {
                LocalFree (MultiByteIword);
                LocalFree (sob.wordlist);
                return srcModuleError;
            }

            MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, tmpstr, -1, tmpwstr, cch);

            // wide word including its terminator
            memcpy (lpSob->wordlist + index, tmpwstr, cch * sizeof (WCHAR));
            // the 2-byte tag that follows the multi-byte word's terminator
            memcpy (lpSob->wordlist + index + cch, tmpstr + lstrlen (tmpstr) + 1, 2);
            // closing NUL (copied from the wide word's own terminator)
            memcpy (lpSob->wordlist + index + cch + 1, tmpwstr + cch - 1, sizeof (WCHAR));

            index  += (cch + 2);
            index2 += (lstrlen (tmpstr) + 4);
            LocalFree (tmpwstr);
        }
    }
    lpSob->len = (WORD) index;

    LocalFree (MultiByteIword);
    LocalFree (sob.wordlist);

    return src;
}
|
|
|
|
// Copies the next analysis result from the session's result list into lpSob.
//
//   rList  result cursor: `next` points at the unread part of the result
//          text, `num` is the index of the next component and `max` the
//          total number of components.  Components of one result are
//          separated by '+', results are separated by '\t'.
//   lpSob  [out] receives one entry per component, laid out as
//          word, NUL, 2-byte tag from SetSilHeosa(), NUL.  lpSob->num is the
//          number of entries and lpSob->len the number of bytes written;
//          lpSob->sch is the buffer capacity.
//
// Returns NULL (0) on success, srcNoMoreResult when the list is exhausted
// (lpSob is then left untouched) and srcOOM | srcExcessBuffer when the next
// entry does not fit; in that case the cursor is not advanced.
//
// The capacity test covers the whole entry (word + 4 bytes).  The original
// test covered only the word and could write up to three bytes past the end
// of the buffer.  A NUL in the result text is treated as the end of the
// list so that malformed text cannot be scanned past its terminator.
SRC GetOneResult (RLIST *rList, LPDOB lpSob)
{
    if (rList->num >= rList->max)
        return srcNoMoreResult;

    lpSob->len = 0;
    lpSob->num = 0;

    WORD value;
    UINT i     = rList->num;    // component index, also needed after the loop
    UINT index = 0;             // offset of the current component in rList->next
    int  count = 0;             // length of the current component

    for (; i < rList->max; i++)
    {
        count = 0;
        while (rList->next [index+count] != '+'
            && rList->next [index+count] != '\t'
            && rList->next [index+count] != '\0')
            count++;

        // word + NUL + 2-byte tag + NUL must all fit
        if (lpSob->len + count + 4 > lpSob->sch)
            return srcOOM | srcExcessBuffer;

        memcpy (lpSob->wordlist + lpSob->len, rList->next + index, count);
        lpSob->num++;

        lpSob->len += (WORD)count;
        lpSob->wordlist [lpSob->len++] = '\0';
        SetSilHeosa (rList->vbuf [i], &value);
        memcpy (lpSob->wordlist + lpSob->len, &value, 2);
        lpSob->wordlist [lpSob->len+2] = '\0';
        lpSob->len += 3;

        if (rList->next [index+count] == '\0')
        {
            // Unexpected end of text: park the cursor on the terminator and
            // mark the list as exhausted.
            rList->next += (index + count);
            rList->num   = rList->max;
            return NULL;
        }

        if (rList->next [index+count] == '\t')
            break;                          // end of this result

        index += (count + 1);               // skip the '+' separator
    }

    rList->next += (index + count + 1);
    rList->num   = i + 1;

    return NULL;
}
|
|
|
|
// Analyses one multi-byte word and returns the first decomposition.
//
//   hstm   session handle; its Option bits select which analysers run.
//   iword  NUL-terminated input word.
//   psob   [out] result buffer (see GetOneResult for the entry layout).
//
// Words of 45 bytes or more, or containing a byte pair rejected by
// isHANGEUL, are not analysed: the word is echoed into psob (num = 1,
// len = byte length) and srcInvalid is returned.  The same echo is produced
// when no analyser yields a result; that exit now also sets num and len,
// which the original left unset.
//
// Otherwise the analysers enabled in Option run in the order ALONE, NOUN,
// VERB, COMPOUND (only if nothing was found so far) and AFFIX (only if
// neither ALONE nor NOUN matched); their results are accumulated in the
// session's result list, from which further results can be fetched with
// StemmerDecomposeMore.
//
// Returns the GetOneResult code for the first result, srcInvalid as
// described above, or srcModuleError | srcInvalidID for a bad handle.
WINSRC StemmerDecompose(HSTM hstm,
                        LPCSTR iword,        // input word
                        LPDOB psob)          // the number of candidates
{
    int len = lstrlen ((char *) iword);

    // Reject over-long words and anything that is not made of Hangul pairs.
    BOOL analysable = (len < 45);
    for (int k = 0; analysable && k < len; k += 2)
    {
        if (!isHANGEUL (iword [k], iword [k+1]))
            analysable = FALSE;
    }
    if (!analysable)
    {
        psob->num = 1;
        lstrcpy ((LPSTR) psob->wordlist, (LPSTR) iword);
        psob->len = (WORD)len;
        return srcInvalid;
    }

    HGLOBAL hgbl  = (HGLOBAL) hstm;
    STMI   *pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    BaseEngine BaseCheck;

    char lrgsz [400];                            // one analyser's result text
    memset (pstmi->rList.lrgsz, NULLCHAR, 400);
    lstrcpy (pstmi->rList.iword, iword);
    pstmi->rList.max = 0;
    BOOL affixFlag = TRUE;                       // cleared once ALONE or NOUN matches

    if (pstmi->Option & SO_ALONE)
    {
        int num = BaseCheck.NLP_BASE_ALONE (iword, lrgsz);
        if (num > 0)
        {
            affixFlag = FALSE;
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }

    if (pstmi->Option & SO_NOUNPHRASE)
    {
        int num = BaseCheck.NLP_BASE_NOUN (iword, lrgsz);
        if (num > 0)
        {
            affixFlag = FALSE;
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }

    if (pstmi->Option & SO_PREDICATE)
    {
        int num = BaseCheck.NLP_BASE_VERB (iword, lrgsz);
        if (num > 0)
        {
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }

    // Compound analysis is a fallback: it only runs on an empty result list,
    // which is why it may overwrite the list instead of appending to it.
    if ((pstmi->Option & SO_COMPOUND) && pstmi->rList.max == 0)
    {
        int num = BaseCheck.NLP_BASE_COMPOUND (iword, lrgsz);
        if (num > 0)
        {
            lstrcpy (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
            pstmi->rList.max = num;
        }
    }

    if (affixFlag && (pstmi->Option & SO_SUFFIX))
    {
        int num = BaseCheck.NLP_BASE_AFFIX (iword, lrgsz);
        if (num > 0)
        {
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }

    // Rewind the cursor and hand out the first result.
    pstmi->rList.num  = 0;
    pstmi->rList.next = pstmi->rList.lrgsz;

    SRC src = GetOneResult (&(pstmi->rList), psob);
    if (src == srcNoMoreResult)
    {
        // Nothing found: echo the input exactly like the other srcInvalid exits.
        src = srcInvalid;
        psob->num = 1;
        lstrcpy (psob->wordlist, iword);
        psob->len = (WORD)len;
    }

    GlobalUnlock(hgbl);
    return src;
}
|
|
|
|
// Unicode front end of StemmerDecomposeMore: fetches the next decomposition
// of the word most recently analysed on this session.
//
//   hStm    session handle.
//   lpWord  NUL-terminated wide word; must match the analysed word.
//   lpSob   [in/out] wide result buffer; lpSob->sch is used as the size of
//           the temporary multi-byte buffer, and num / len / wordlist are
//           filled in on return.
//
// Entries are converted exactly as in StemmerDecomposeW: wide word, NUL, one
// WCHAR carrying the 2-byte tag, NUL.  When there is no further result
// lpSob->num and lpSob->len are both 0 (num used to be copied from an
// uninitialised local in that case).
//
// Returns the StemmerDecomposeMore result, or srcModuleError when a
// conversion or allocation fails.
WINSRC StemmerDecomposeMoreW (HSTM hStm, LPCWSTR lpWord, LPWDOB lpSob)
{
    int cbWord = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, NULL, 0, NULL, NULL);
    if (cbWord <= 0)
        return srcModuleError;          // conversion failed

    LPSTR MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * cbWord);
    if (MultiByteIword == NULL)
        return srcModuleError;

    if (WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, MultiByteIword, cbWord, NULL, NULL) == 0)
    {
        LocalFree (MultiByteIword);
        return srcModuleError;
    }

    DOB sob;
    sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
    if (sob.wordlist == NULL)
    {
        LocalFree (MultiByteIword);
        return srcModuleError;
    }
    sob.sch = lpSob->sch;
    sob.num = 0;        // not written by the callee when no result is left
    sob.len = 0;

    SRC src = StemmerDecomposeMore (hStm, MultiByteIword, &sob);
    lpSob->num = sob.num;

    int index = 0;                      // WCHAR offset into lpSob->wordlist
    if (src == NULL)
    {
        int index2 = 0;                 // byte offset into sob.wordlist
        for (int j = 0; j < sob.num; j++)
        {
            char *tmpstr = sob.wordlist + index2;
            int   cch    = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0);
            LPWSTR tmpwstr = (LPWSTR) LocalAlloc (LPTR, sizeof (WCHAR) * cch);
            if (tmpwstr == NULL)
            {
                LocalFree (MultiByteIword);
                LocalFree (sob.wordlist);
                return srcModuleError;
            }

            MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, tmpstr, -1, tmpwstr, cch);

            // wide word with terminator, then the 2-byte tag, then a NUL
            memcpy (lpSob->wordlist + index, tmpwstr, cch * sizeof (WCHAR));
            memcpy (lpSob->wordlist + index + cch, tmpstr + lstrlen (tmpstr) + 1, 2);
            memcpy (lpSob->wordlist + index + cch + 1, tmpwstr + cch - 1, sizeof (WCHAR));

            index  += (cch + 2);
            index2 += (lstrlen (tmpstr) + 4);
            LocalFree (tmpwstr);
        }
    }
    lpSob->len = (WORD) index;

    LocalFree (MultiByteIword);
    LocalFree (sob.wordlist);

    return src;
}
|
|
|
|
// Fetches the next decomposition of the word most recently analysed on this
// session.
//
//   hStm    session handle.
//   lpWord  must equal the word stored by the last analysis (compared with
//           lstrcmp against the session's rList.iword).
//   lpSob   [out] result buffer (see GetOneResult for the entry layout).
//
// Returns the GetOneResult code, srcModuleError when lpWord is not the
// analysed word, or srcModuleError | srcInvalidID for a bad handle.  The
// session block is unlocked on every exit; the word-mismatch exit used to
// return with the lock still held.
WINSRC StemmerDecomposeMore (HSTM hStm, LPCSTR lpWord, LPDOB lpSob)
{
    HGLOBAL hgbl  = (HGLOBAL) hStm;
    STMI   *pstmi = (STMI *)GlobalLock(hgbl);

    if (pstmi == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    if (lstrcmp (pstmi->rList.iword, lpWord))
    {
        // Not the word the result list was built for.
        GlobalUnlock(hgbl);
        return srcModuleError;
    }

    SRC src = GetOneResult (&(pstmi->rList), lpSob);

    GlobalUnlock(hgbl);

    return src;
}
|
|
|
|
// Unicode front end for enumerating every decomposition of a word.
//
//   hStm          session handle.
//   lpWord        NUL-terminated wide input word.
//   lpSob         [in/out] wide result buffer handed to the callback once
//                 per result; lpSob->sch is used as the size of the
//                 temporary multi-byte buffer.
//   lpfnCallBack  called with lpSob for every result found.
//
// The word is converted to the Wansung code page first and then validated
// with the same rule as the multi-byte entry points: it must be shorter
// than 45 bytes and consist only of byte pairs accepted by isHANGEUL.
// (The original measured the wide string with lstrlen, rejected precisely
// the Hangul syllable range and copied the wide word with a narrow
// lstrcpy.)  A rejected word is echoed into lpSob (num = 1, len = length in
// WCHARs) and srcInvalid is returned.
//
// Analysers run in the order NOUN, ALONE, VERB, then COMPOUND when VERB
// found nothing.
// NOTE(review): StemmerDecompose runs COMPOUND only when the whole result
// list is empty; here it keys on the VERB count alone and a COMPOUND hit
// replaces earlier results -- confirm which is intended.
//
// Returns NULL (0) after enumeration, srcInvalid as described,
// srcModuleError | srcInvalidID for a bad handle, or srcModuleError when a
// conversion or allocation fails.
WINSRC StemmerEnumDecomposeW (HSTM hStm, LPCWSTR lpWord, LPWDOB lpSob, LPFNDECOMPOSEW lpfnCallBack)
{
    // Multi-byte copy of the input word.
    int cbWord = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, NULL, 0, NULL, NULL);
    if (cbWord <= 0)
        return srcModuleError;

    LPSTR MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * cbWord);
    if (MultiByteIword == NULL)
        return srcModuleError;

    if (WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, MultiByteIword, cbWord, NULL, NULL) == 0)
    {
        LocalFree (MultiByteIword);
        return srcModuleError;
    }

    // Same acceptance rule as StemmerEnumDecompose, applied to the bytes
    // that will actually be analysed.
    int  len        = lstrlen (MultiByteIword);
    BOOL analysable = (len < 45);
    for (int k = 0; analysable && k < len; k += 2)
    {
        if (!isHANGEUL (MultiByteIword [k], MultiByteIword [k+1]))
            analysable = FALSE;
    }
    if (!analysable)
    {
        lpSob->num = 1;
        wcscpy (lpSob->wordlist, lpWord);
        lpSob->len = (WORD) wcslen (lpWord);
        LocalFree (MultiByteIword);
        return srcInvalid;
    }

    HGLOBAL hgbl  = (HGLOBAL) hStm;
    STMI   *pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
    {
        GlobalUnlock(hgbl);
        LocalFree (MultiByteIword);
        return srcModuleError | srcInvalidID;
    }

    BaseEngine BaseCheck;

    // Temporary multi-byte result buffer with the caller's capacity.
    DOB sob;
    sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
    if (sob.wordlist == NULL)
    {
        GlobalUnlock(hgbl);
        LocalFree (MultiByteIword);
        return srcModuleError;
    }
    sob.sch = lpSob->sch;

    char lrgsz [400];                            // one analyser's result text
    memset (pstmi->rList.lrgsz, NULLCHAR, 400);
    lstrcpy (pstmi->rList.iword, MultiByteIword);
    pstmi->rList.max = 0;

    int num = BaseCheck.NLP_BASE_NOUN (MultiByteIword, lrgsz);
    if (num > 0)
    {
        lstrcpy (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
        pstmi->rList.max = num;
    }

    num = BaseCheck.NLP_BASE_ALONE (MultiByteIword, lrgsz);
    if (num > 0)
    {
        lstrcat (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
        pstmi->rList.max += num;
    }

    num = BaseCheck.NLP_BASE_VERB (MultiByteIword, lrgsz);
    if (num > 0)
    {
        lstrcat (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
        pstmi->rList.max += num;
    }

    if (num == 0)
    {
        num = BaseCheck.NLP_BASE_COMPOUND (MultiByteIword, lrgsz);
        if (num > 0)
        {
            lstrcpy (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
            pstmi->rList.max = num;
        }
    }

    pstmi->rList.num  = 0;
    pstmi->rList.next = pstmi->rList.lrgsz;

    // Convert and report every result in turn.
    while (GetOneResult (&(pstmi->rList), &sob) == NULL)
    {
        int index  = 0;                 // WCHAR offset into lpSob->wordlist
        int index2 = 0;                 // byte offset into sob.wordlist

        for (int j = 0; j < sob.num; j++)
        {
            char *tmpstr = sob.wordlist + index2;
            int   cch    = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0);
            LPWSTR tmpwstr = (LPWSTR) LocalAlloc (LPTR, sizeof (WCHAR) * cch);
            if (tmpwstr == NULL)
            {
                GlobalUnlock(hgbl);
                LocalFree (MultiByteIword);
                LocalFree (sob.wordlist);
                return srcModuleError;
            }

            MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, tmpstr, -1, tmpwstr, cch);

            // wide word with terminator, then the 2-byte tag, then a NUL
            memcpy (lpSob->wordlist + index, tmpwstr, cch * sizeof (WCHAR));
            memcpy (lpSob->wordlist + index + cch, tmpstr + lstrlen (tmpstr) + 1, 2);
            memcpy (lpSob->wordlist + index + cch + 1, tmpwstr + cch - 1, sizeof (WCHAR));

            index  += (cch + 2);
            index2 += (lstrlen (tmpstr) + 4);
            LocalFree (tmpwstr);
        }

        lpSob->len = (WORD) index;
        lpSob->num = sob.num;
        lpfnCallBack (lpSob);
    }

    GlobalUnlock(hgbl);

    LocalFree (MultiByteIword);
    LocalFree (sob.wordlist);

    return NULL;
}
|
|
|
|
// Enumerates every decomposition of a multi-byte word.
//
//   hStm          session handle.
//   lpWord        NUL-terminated input word.
//   lpSob         [out] result buffer, refilled and passed to the callback
//                 once per result (see GetOneResult for the entry layout).
//   lpfnCallBack  called with lpSob for every result found.
//
// Words of 45 bytes or more, or containing a byte pair rejected by
// isHANGEUL, are echoed into lpSob (num = 1, len = byte length) and
// srcInvalid is returned.  Otherwise the analysers run in the order NOUN,
// ALONE, VERB, then COMPOUND when VERB found nothing.
// NOTE(review): StemmerDecompose runs COMPOUND only when the whole result
// list is empty; here it keys on the VERB count alone and a COMPOUND hit
// replaces earlier results -- confirm which is intended.
//
// Returns NULL (0) after enumeration, srcInvalid as described, or
// srcModuleError | srcInvalidID for a bad handle.
WINSRC StemmerEnumDecompose (HSTM hStm, LPCSTR lpWord, LPDOB lpSob, LPFNDECOMPOSE lpfnCallBack)
{
    int len = lstrlen ((char *) lpWord);

    // Decide up front whether the word can be analysed at all.
    BOOL reject = (len >= 45);
    if (!reject)
    {
        for (int pos = 0; pos < len; pos += 2)
        {
            if (!isHANGEUL (lpWord [pos], lpWord [pos+1]))
            {
                reject = TRUE;
                break;
            }
        }
    }
    if (reject)
    {
        lpSob->num = 1;
        lstrcpy ((LPSTR) lpSob->wordlist, (LPSTR) lpWord);
        lpSob->len = (WORD)len;
        return srcInvalid;
    }

    HGLOBAL hgbl  = (HGLOBAL) hStm;
    STMI   *pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    BaseEngine BaseCheck;
    RLIST *results = &(pstmi->rList);            // session result list

    char lrgsz [400];                            // one analyser's result text
    memset (results->lrgsz, NULLCHAR, 400);
    lstrcpy (results->iword, lpWord);

    // Nouns start the list ...
    int found = BaseCheck.NLP_BASE_NOUN (lpWord, lrgsz);
    results->max = 0;
    if (found > 0)
    {
        lstrcpy (results->lrgsz, lrgsz);
        for (int n = 0; n < found; n++)
            results->vbuf [n] = BaseCheck.vbuf [n];
        results->max = found;
    }

    // ... stand-alone words and predicates are appended to it.
    found = BaseCheck.NLP_BASE_ALONE (lpWord, lrgsz);
    if (found > 0)
    {
        lstrcat (results->lrgsz, lrgsz);
        for (int n = 0; n < found; n++)
            results->vbuf [results->max + n] = BaseCheck.vbuf [n];
        results->max += found;
    }

    found = BaseCheck.NLP_BASE_VERB (lpWord, lrgsz);
    if (found > 0)
    {
        lstrcat (results->lrgsz, lrgsz);
        for (int n = 0; n < found; n++)
            results->vbuf [results->max + n] = BaseCheck.vbuf [n];
        results->max += found;
    }

    // Compound analysis overwrites the list when the predicate pass found nothing.
    if (found == 0)
    {
        found = BaseCheck.NLP_BASE_COMPOUND (lpWord, lrgsz);
        if (found > 0)
        {
            lstrcpy (results->lrgsz, lrgsz);
            for (int n = 0; n < found; n++)
                results->vbuf [n] = BaseCheck.vbuf [n];
            results->max = found;
        }
    }

    // Rewind the cursor and report every result.
    results->num  = 0;
    results->next = results->lrgsz;

    while (GetOneResult (results, lpSob) == NULL)
    {
        lpfnCallBack (lpSob);
    }

    GlobalUnlock(hgbl);

    return NULL;
}
|
|
|
|
// Unicode front end of StemmerCompose: joins a stem and an ending.
//
//   hstm   session handle.
//   sib    wide input: silsa (stem), heosa (ending) and pos (part of speech).
//   rword  [out] receives the composed wide word; the caller must supply a
//          buffer large enough for it (no capacity is passed in).
//
// Both parts are converted with UWANSUNG_CODE_PAGE (the stem used to be
// converted with CP_ACP, which only matches on systems whose ANSI code page
// is that same page).  The multi-byte result buffer carries a few spare
// bytes because composition can insert a syllable in front of the ending
// (see CVCheckNP), making the result longer than stem + ending.
//
// Returns the StemmerCompose result, or srcModuleError when an allocation
// fails.
WINSRC StemmerComposeW (HSTM hstm, WCIB sib, LPWSTR rword)
{
    const int cbSlack = 4;      // room for one inserted double-byte syllable, plus margin
    CIB tmpsib;

    // Stem: at most two bytes per wide character, terminator included.
    int len = (int) ((wcslen (sib.silsa) + 1) * 2);
    tmpsib.silsa = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (tmpsib.silsa == NULL)
        return srcModuleError;

    len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) sib.silsa, -1, tmpsib.silsa, len, NULL, NULL);

    // Ending.
    int len2 = (int) ((wcslen (sib.heosa) + 1) * 2);
    tmpsib.heosa = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len2);
    if (tmpsib.heosa == NULL)
    {
        LocalFree (tmpsib.silsa);
        return srcModuleError;
    }

    len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) sib.heosa, -1, tmpsib.heosa, len2, NULL, NULL);

    // Zero-filled result buffer: stays an empty string if composition
    // returns before writing anything.
    LPSTR MultiByteRword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * (len + len2 + cbSlack));
    if (MultiByteRword == NULL)
    {
        LocalFree (tmpsib.silsa);
        LocalFree (tmpsib.heosa);
        return srcModuleError;
    }

    tmpsib.pos = sib.pos;
    SRC src = StemmerCompose (hstm, tmpsib, MultiByteRword);

    // Convert the composed word back, terminator included.
    len = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteRword, -1, NULL, 0);
    MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteRword, -1, (LPWSTR) rword, len);

    LocalFree (tmpsib.silsa);
    LocalFree (tmpsib.heosa);
    LocalFree (MultiByteRword);

    return src;
}
|
|
|
|
int CVCheckNP(char *stem, char *ending, BYTE action)
|
|
// Check vowel harmony for NOUN + Tossi. If the last letter of stem is RIEUR, that should seriously be considered.
|
|
{
|
|
int len = strlen (ending) + 1;
|
|
|
|
if ((action & 0x80) && (action & 0x40)) // CV = 11
|
|
return VALID;
|
|
|
|
if (!(action & 0x80) && (action & 0x40)) { // CV = 01
|
|
if (stem[0] >= __V_k)
|
|
return VALID;
|
|
if (stem[0] == __K_R && ending[0] == __K_R && ending[1] == __V_h)
|
|
// Tossi is "RO"(CV=01) and the last letter of stem is RIEUR.
|
|
return VALID;
|
|
if (ending[0] == __K_S && ending[1] == __V_j) {
|
|
// "SEO" --> "E SEO"
|
|
memmove (ending+2, ending, len);
|
|
ending [0] = __K_I;
|
|
ending [1] = __V_p;
|
|
return MORECHECK;
|
|
}
|
|
if (ending[0] == __K_N && ending[1] == __V_m && ending[2] == __K_N) {
|
|
// "NEUN" --> "EUN"
|
|
ending [0] = __K_I;
|
|
}
|
|
if (ending[0] == __K_G && ending[1] == __V_k) {
|
|
// "GA" --> "I"
|
|
ending[0] = __K_I;
|
|
ending[1] = __V_l;
|
|
return MORECHECK;
|
|
}
|
|
if (ending[0] == __K_I && ending[1] == __V_hk) {
|
|
// "WA" --> "GWA"
|
|
ending [0] = __K_G;
|
|
return MORECHECK;
|
|
}
|
|
if (ending [0] == __K_R) {
|
|
if (ending[1] == __V_m && ending[2] == __K_R) {
|
|
// "REUL" --> "EUL"
|
|
ending [0] = __K_I;
|
|
return INVALID;
|
|
}
|
|
if (ending[1] == __V_h) {
|
|
// "RO" --> "EU RO"
|
|
memmove (ending+2, ending, len);
|
|
ending [0] = __K_I;
|
|
ending [1] = __V_m;
|
|
return MORECHECK;
|
|
}
|
|
// add "I" to the first part of ending
|
|
memmove (ending+2, ending, len);
|
|
ending [0] = __K_I;
|
|
ending [1] = __V_l;
|
|
return MORECHECK;
|
|
}
|
|
if ((ending [0] == __K_N) ||
|
|
(ending [0] == __K_S && ending [1] == __V_l) || // "SI"
|
|
(ending [0] == __K_I && ending [1] == __V_u) || // "YEO"
|
|
(ending[0] == __K_I && ending[1] == __V_i && ending[2] == __K_M // "YA MAL RO" --> "I YA MAL RO"
|
|
&& ending[3] == __V_k && ending[4] == __K_R && ending[5] == __K_R && ending[6] == __V_h))
|
|
{
|
|
// Add "I" to the first part of ending
|
|
memmove (ending+2, ending, len);
|
|
ending [0] = __K_I;
|
|
ending [1] = __V_l;
|
|
return MORECHECK;
|
|
}
|
|
return MORECHECK;
|
|
}
|
|
|
|
// CV==10
|
|
if (stem[0] >= __V_k) {
|
|
if (ending [0] == __K_G) {
|
|
// "GWA" --> "WA"
|
|
ending [0] = __K_I;
|
|
return MORECHECK;
|
|
}
|
|
if (ending[1] == __V_l) {
|
|
if (len == 3) {
|
|
// "I" --> "GA"
|
|
ending [0] = __K_G;
|
|
ending [1] = __V_k;
|
|
return MORECHECK;
|
|
}
|
|
else {
|
|
// remove "I"
|
|
memmove (ending, ending+2, len-2);
|
|
return INVALID;
|
|
}
|
|
}
|
|
if (ending[1] == __V_k)
|
|
{
|
|
ending [1] = __V_i;
|
|
return MORECHECK;
|
|
}
|
|
if (ending[2] == __K_N) {
|
|
// "EUN" --> "NEUN"
|
|
ending [0] = __K_N;
|
|
return MORECHECK;
|
|
}
|
|
if (len == 4) {
|
|
// "EUL" --> "REUL"
|
|
ending [0] = __K_R;
|
|
return MORECHECK;
|
|
}
|
|
else {
|
|
// Remove "EU"
|
|
memmove (ending, ending+2, len-2);
|
|
return INVALID;
|
|
}
|
|
}
|
|
if (stem[0] == __K_R && ending[0] == __K_I && ending[1] == __V_m
|
|
&& ending[2] == __K_R && ending[3] == __V_h) {
|
|
// Remove "EU"
|
|
memmove (ending, ending+2, len-2);
|
|
return INVALID;
|
|
}
|
|
return VALID;
|
|
}
|
|
|
|
// Joins a stem (sib.silsa) and an ending (sib.heosa) into one word.
//
//   hstm   session handle (only validated here).
//   sib    stem, ending and part-of-speech bits (sib.pos & 0x0f00 selects
//          the composition rule).
//   rword  [out] receives the composed multi-byte word; the caller must
//          supply a buffer large enough for it (composition can insert a
//          syllable, so the result may be longer than stem + ending).
//
// Behaviour:
//   * If either part contains a byte pair rejected by isHANGEUL, the two
//     parts are simply concatenated and NULL (0) is returned.
//   * Noun / pronoun / number: the ending must be a particle for which
//     FindHeosaWord reports FINAL; it is adjusted by CVCheckNP and the
//     contracted-particle table before being appended.
//   * Verb / adjective / auxiliaries: the irregular composers are tried in
//     order, then the regular one.
//   * On any composition failure, and for other parts of speech, rword is
//     stem + ending and srcComposeError is returned.
//
// Fixes relative to the original:
//   * the session block is unlocked on the two "not Hangul" exits;
//   * `incode` is freed on the successful noun path as well;
//   * `tmptossi` is large enough for an ending that CVCheckNP lengthened by
//     one syllable, and for an empty ending.
//
// Returns NULL (0) on success, srcComposeError as described,
// srcModuleError | srcInvalidID for a bad handle, or srcModuleError when an
// allocation fails.
WINSRC StemmerCompose (HSTM hstm, CIB sib, LPSTR rword)
{
    HGLOBAL hgbl  = (HGLOBAL) hstm;
    STMI   *pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    lstrcpy (rword, (char *)sib.silsa);

    // Anything that is not pure Hangul is joined verbatim.
    int  i;
    BOOL allHangeul = TRUE;
    for (i = 0; allHangeul && sib.silsa [i] != 0; i += 2)
    {
        if (!isHANGEUL (sib.silsa [i], sib.silsa [i+1]))
            allHangeul = FALSE;
    }
    for (i = 0; allHangeul && sib.heosa [i] != 0; i += 2)
    {
        if (!isHANGEUL (sib.heosa [i], sib.heosa [i+1]))
            allHangeul = FALSE;
    }
    if (!allHangeul)
    {
        lstrcat (rword, sib.heosa);
        GlobalUnlock (hgbl);
        return NULL;
    }

    // Everything below leaves through Cleanup, so all locals are declared
    // (and the pointers cleared) before the first jump.
    CODECONVERT conv;
    SRC   src      = NULL;
    int   ret      = NOT_COMPOSED;
    BYTE  action;
    int   cbSilsa  = lstrlen (sib.silsa);
    int   cbHeosa  = lstrlen (sib.heosa);
    char *incode   = NULL;      // stem in internal code (result is built here for nouns)
    char *inheosa  = NULL;      // ending in internal code, as converted by HAN2INR
    LPSTR tmptossi = NULL;      // ending as multi-byte text
    char *inending = NULL;      // ending in internal code, as appended to the stem
    char *inrword  = NULL;      // stem (plus ending, for predicates) in internal code

    incode = (char *) LocalAlloc (LPTR, sizeof (char) * (cbSilsa*3+1 + cbHeosa*3+7));
    if (incode == NULL)   { src = srcModuleError; goto Cleanup; }

    inheosa = (char *) LocalAlloc (LPTR, sizeof (char) * (cbHeosa*3+7));
    if (inheosa == NULL)  { src = srcModuleError; goto Cleanup; }

    conv.HAN2INS (sib.silsa, incode, codeWanSeong);
    conv.HAN2INR (sib.heosa, inheosa, codeWanSeong);

    // +4: one inserted double-byte syllable, the terminator, and a spare byte.
    tmptossi = (LPSTR) LocalAlloc (LPTR, sizeof (char) * (cbHeosa*2 + 4));
    if (tmptossi == NULL) { src = srcModuleError; goto Cleanup; }

    inending = (char *) LocalAlloc (LPTR, sizeof (char) * (cbHeosa*3+7));
    if (inending == NULL) { src = srcModuleError; goto Cleanup; }

    inrword = (char *) LocalAlloc (LPTR, sizeof (char) * (cbSilsa*3+cbHeosa*3+6));
    if (inrword == NULL)  { src = srcModuleError; goto Cleanup; }

    switch (sib.pos & 0x0f00)
    {
        case POS_NOUN :
        case POS_PRONOUN :
        case POS_NUMBER :

            lstrcpy (tmptossi, sib.heosa);
            if (FindHeosaWord (inheosa, _TOSSI, &action) & FINAL)
            {
                conv.ReverseIN (inheosa, inending);
                conv.ReverseIN (incode, inrword);
                CVCheckNP (inrword, inending, action);

                conv.INS2HAN (inending, tmptossi, codeWanSeong);

                // we should check contraction tossi, for example, Nieun, Rieul
                for (i = 0; ContractTossi [i].contract != NULL; i++)
                {
                    if (lstrcmp (ContractTossi [i].contract, tmptossi) == 0)
                        conv.HAN2INS ((char *)tmptossi, inending, codeWanSeong);
                }

                lstrcat (incode, inending);
                conv.INS2HAN (incode, (char *)rword, codeWanSeong);
            }
            else
            {
                // Not a usable particle: plain concatenation, reported as an error.
                lstrcat (rword, tmptossi);
                src = srcComposeError;
            }
            break;

        case POS_VERB :
        case POS_ADJECTIVE :
        case POS_AUXVERB :
        case POS_AUXADJ :

            conv.HAN2INS ((char *)sib.heosa, inending, codeWanSeong);
            conv.HAN2INR ((char *)sib.silsa, incode, codeWanSeong);

            // The first composer that answers anything but NOT_COMPOSED decides.
            ret = Compose_RIEUL_Irregular (incode, inending);
            if (ret == NOT_COMPOSED)
                ret = Compose_HIEUH_Irregular (incode, inending);
            if (ret == NOT_COMPOSED)
                ret = Compose_PIEUP_Irregular (incode, inending);
            if (ret == NOT_COMPOSED)
                ret = Compose_TIEUT_Irregular (incode, inending);
            if (ret == NOT_COMPOSED)
                ret = Compose_SIOS_Irregular (incode, inending);

            if (ret == NOT_COMPOSED)
            {
                // The remaining composers only report whether they applied.
                if (!Compose_YEO_Irregular (incode, inending)
                 && !Compose_REO_REU_Irregular (incode, inending)
                 && !Compose_GEORA_Irregular (incode, inending))
                {
                    Compose_Regular (incode, inending);
                }
            }

            if (ret == COMPOSE_ERROR)
            {
                lstrcat (rword, sib.heosa);
                src = srcComposeError;
                break;
            }

            conv.ReverseIN (incode, inrword);
            lstrcat (inrword, inending);
            conv.INS2HAN (inrword, (char *)rword, codeWanSeong);
            break;

        default :
            lstrcat (rword, sib.heosa);
            src = srcComposeError;
            break;
    }

Cleanup:
    // LocalFree ignores NULL, so partially completed set-up is handled too.
    LocalFree (incode);
    LocalFree (inheosa);
    LocalFree (tmptossi);
    LocalFree (inending);
    LocalFree (inrword);
    GlobalUnlock (hgbl);

    return src;
}
|
|
|
|
// Ends a stemmer session and releases its session block.
//
//   hstm  session handle created by StemmerInit.
//
// The handle is locked once purely to verify that it is valid.  Returns
// NULL (0) on success, srcModuleError | srcInvalidID when the handle cannot
// be locked (nothing is freed in that case).
WINSRC StemmerTerminate(HSTM hstm)
{
    HGLOBAL hgbl = (HGLOBAL) hstm;

    if (GlobalLock(hgbl) == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError | srcInvalidID;
    }

    GlobalUnlock (hgbl);
    GlobalFree (hgbl);

    return NULL;                    //normal operation
}
|
|
|
|
// Placeholder: performs no work and ignores both arguments.
// Always returns NULL (0).
WINSRC StemmerOpenUdr (HSTM stmi, LPCSTR lpPathUdr)
{
    return NULL;
}
|
|
|
|
// Placeholder: performs no work and ignores its argument.
// Always returns NULL (0).
WINSRC StemmerCloseUdr (HSTM stmi)
{
    return NULL;
}
|
|
|
|
// Unicode front end of StemmerCompare.
//
//   hstm                  session handle.
//   lpStr1, lpStr2        NUL-terminated wide words to compare.
//   lpStem                [out] wide stem reported by StemmerCompare.
//   lpEnding1, lpEnding2  [out] wide endings of the first / second word.
//   pos                   [out] passed straight through to StemmerCompare.
//
// The caller must supply output buffers large enough for the converted
// text (no capacities are passed in).  Each output is converted with the
// length MultiByteToWideChar itself reports; the original passed
// sizeof(pointer) as the buffer length, so any output longer than a few
// characters failed to convert.  The multi-byte work buffers are
// zero-filled, so the outputs are empty strings when StemmerCompare returns
// without writing them.
//
// Returns the StemmerCompare result, or srcModuleError when a conversion or
// allocation fails.
WINSRC StemmerCompareW (HSTM hstm, LPCWSTR lpStr1, LPCWSTR lpStr2, LPWSTR lpStem, LPWSTR lpEnding1, LPWSTR lpEnding2, WORD *pos)
{
    // Everything leaves through Cleanup, so all locals are declared (and
    // the pointers cleared) before the first jump.
    LPSTR MultiByteStr1    = NULL;
    LPSTR MultiByteStr2    = NULL;
    LPSTR MultiByteStem    = NULL;
    LPSTR MultiByteEnding1 = NULL;
    LPSTR MultiByteEnding2 = NULL;
    SRC   src = srcModuleError;
    int   len1, len2, len, cch;

    // First word.
    len1 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr1, -1, NULL, 0, NULL, NULL);
    if (len1 <= 0)
        goto Cleanup;
    MultiByteStr1 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len1);
    if (MultiByteStr1 == NULL)
        goto Cleanup;
    len1 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr1, -1, MultiByteStr1, len1, NULL, NULL);

    // Second word.
    len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr2, -1, NULL, 0, NULL, NULL);
    if (len2 <= 0)
        goto Cleanup;
    MultiByteStr2 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len2);
    if (MultiByteStr2 == NULL)
        goto Cleanup;
    len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr2, -1, MultiByteStr2, len2, NULL, NULL);

    // Stem and endings get buffers as long as the longer input.
    len = len1 > len2 ? len1 : len2;

    MultiByteStem = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (MultiByteStem == NULL)
        goto Cleanup;

    MultiByteEnding1 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (MultiByteEnding1 == NULL)
        goto Cleanup;

    MultiByteEnding2 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (MultiByteEnding2 == NULL)
        goto Cleanup;

    src = StemmerCompare (hstm, MultiByteStr1, MultiByteStr2, MultiByteStem, MultiByteEnding1, MultiByteEnding2, pos);

    // Convert each output with the length it actually needs (terminator included).
    cch = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteStem, -1, NULL, 0);
    MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteStem, -1, lpStem, cch);

    cch = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteEnding1, -1, NULL, 0);
    MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteEnding1, -1, lpEnding1, cch);

    cch = MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteEnding2, -1, NULL, 0);
    MultiByteToWideChar (UWANSUNG_CODE_PAGE, 0, MultiByteEnding2, -1, lpEnding2, cch);

Cleanup:
    // LocalFree ignores NULL, so partially completed set-up is handled too.
    LocalFree (MultiByteStr1);
    LocalFree (MultiByteStr2);
    LocalFree (MultiByteStem);
    LocalFree (MultiByteEnding1);
    LocalFree (MultiByteEnding2);

    return src;
}
|
|
|
|
WINSRC StemmerCompare (HSTM hstm, LPCSTR lpStr1, LPCSTR lpStr2, LPSTR lpStem, LPSTR lpEnding1, LPSTR lpEnding2, WORD *pos)
|
|
{
|
|
// First, check the chosung of two strings
|
|
// if they are different, we may not use stemming.
|
|
CODECONVERT conv;
|
|
char inheosa1 [80], inheosa2 [80];
|
|
BYTE action;
|
|
|
|
char *incodeStr1 = new char [lstrlen (lpStr1) * 4 + 1];
|
|
char *incodeStr2 = new char [lstrlen (lpStr2) * 4 + 1];
|
|
conv.HAN2INS ((char *)lpStr1, incodeStr1, codeWanSeong);
|
|
conv.HAN2INS ((char *)lpStr2, incodeStr2, codeWanSeong);
|
|
|
|
if (incodeStr1 [0] != incodeStr2 [0])
|
|
return srcInvalid;
|
|
|
|
if (incodeStr1 [1] != incodeStr2 [1])
|
|
{
|
|
return srcInvalid;
|
|
}
|
|
|
|
delete incodeStr1;
|
|
delete incodeStr2;
|
|
|
|
STMI *pstmi;
|
|
HGLOBAL hgbl = (HGLOBAL) hstm;
|
|
|
|
pstmi = (STMI *)GlobalLock(hgbl);
|
|
if (pstmi == NULL)
|
|
{
|
|
GlobalUnlock(hgbl);
|
|
return srcModuleError | srcInvalidID;
|
|
}
|
|
|
|
BaseEngine BaseCheck;
|
|
|
|
char stem1[10][100], stem2[10][100], ending1[10][100], ending2[10][100], lrgsz [400];
|
|
int num1, num2, count;
|
|
WORD winfo [10];
|
|
if ((pstmi->Option & SO_NOUNPHRASE) && (pstmi->Option & (SO_NP_NOUN | SO_NP_PRONOUN | SO_NP_NUMBER | SO_NP_DEPENDENT)))
|
|
{
|
|
int num = BaseCheck.NLP_BASE_NOUN (lpStr1, lrgsz);
|
|
BOOL first = TRUE;
|
|
for (int i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
|
|
{
|
|
count = 0;
|
|
while (lrgsz [index+count] != '+' && lrgsz[index+count] != '\t')
|
|
count++;
|
|
|
|
if (first)
|
|
{
|
|
memcpy (stem1 [l], lrgsz+index, count);
|
|
stem1 [l][count] = '\0';
|
|
winfo [l] = BaseCheck.vbuf [i];
|
|
first = FALSE;
|
|
}
|
|
else
|
|
{
|
|
memcpy (ending1 [l]+index2, lrgsz+index, count);
|
|
index2 += count;
|
|
}
|
|
|
|
if (lrgsz[index+count] == '\t')
|
|
{
|
|
ending1 [l][index2] = '\0';
|
|
l++;
|
|
first = TRUE;
|
|
index2 = 0;
|
|
}
|
|
index += (count + 1);
|
|
}
|
|
num1 = l;
|
|
num = BaseCheck.NLP_BASE_NOUN (lpStr2, lrgsz);
|
|
for (i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
|
|
{
|
|
count = 0;
|
|
while (lrgsz [index+count] != '+' && lrgsz [index+count] != '\t')
|
|
count++;
|
|
|
|
if (first)
|
|
{
|
|
memcpy (stem2 [l], lrgsz+index, count);
|
|
stem2 [l][count] = '\0';
|
|
first = FALSE;
|
|
}
|
|
else
|
|
{
|
|
memcpy (ending2 [l]+index2, lrgsz+index, count);
|
|
index2 += count;
|
|
}
|
|
|
|
if (lrgsz[index+count] == '\t')
|
|
{
|
|
ending2 [l][index2] = '\0';
|
|
l++;
|
|
first = TRUE;
|
|
index2 = 0;
|
|
}
|
|
index += (count + 1);
|
|
}
|
|
num2 = l;
|
|
|
|
int j;
|
|
for (i = 0; i < num1; i++)
|
|
{
|
|
for (j = 0; j < num2; j++)
|
|
if (lstrcmp (stem1[i], stem2 [j]) == 0)
|
|
break;
|
|
if (j != num2)
|
|
break;
|
|
}
|
|
|
|
if (i != num1)
|
|
{
|
|
lstrcpy (lpStem, stem1 [i]);
|
|
lstrcpy (lpEnding1, ending1 [i]);
|
|
lstrcpy (lpEnding2, ending2 [j]);
|
|
*pos = winfo [i];
|
|
GlobalUnlock (hgbl);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (pstmi->Option & (SO_PREDICATE | SO_AUXILIARY))
|
|
{
|
|
int num = BaseCheck.NLP_BASE_VERB (lpStr1, lrgsz);
|
|
BOOL first = TRUE;
|
|
for (int i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
|
|
{
|
|
count = 0;
|
|
while (lrgsz [index+count] != '+' && lrgsz[index+count] != '\t')
|
|
count++;
|
|
|
|
if (first)
|
|
{
|
|
memcpy (stem1 [l], lrgsz+index, count);
|
|
stem1 [l][count] = '\0';
|
|
winfo [l] = BaseCheck.vbuf [i];
|
|
first = FALSE;
|
|
}
|
|
else
|
|
{
|
|
memcpy (ending1 [l]+index2, lrgsz+index, count);
|
|
index2 += count;
|
|
}
|
|
|
|
if (lrgsz[index+count] == '\t')
|
|
{
|
|
ending1 [l][index2] = '\0';
|
|
l++;
|
|
first = TRUE;
|
|
index2 = 0;
|
|
}
|
|
index += (count + 1);
|
|
}
|
|
num1 = l;
|
|
num = BaseCheck.NLP_BASE_VERB (lpStr2, lrgsz);
|
|
for (i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
|
|
{
|
|
count = 0;
|
|
while (lrgsz [index+count] != '+' && lrgsz [index+count] != '\t')
|
|
count++;
|
|
|
|
if (first)
|
|
{
|
|
memcpy (stem2 [l], lrgsz+index, count);
|
|
stem2 [l][count] = '\0';
|
|
first = FALSE;
|
|
}
|
|
else
|
|
{
|
|
memcpy (ending2 [l]+index2, lrgsz+index, count);
|
|
index2 += count;
|
|
}
|
|
|
|
if (lrgsz[index+count] == '\t')
|
|
{
|
|
ending2 [l][index2] = '\0';
|
|
l++;
|
|
first = TRUE;
|
|
index2 = 0;
|
|
}
|
|
index += (count + 1);
|
|
}
|
|
num2 = l;
|
|
|
|
int j;
|
|
for (i = 0; i < num1; i++)
|
|
{
|
|
for (j = 0; j < num2; j++)
|
|
if (lstrcmp (stem1[i], stem2 [j]) == 0)
|
|
break;
|
|
if (j != num2)
|
|
break;
|
|
}
|
|
|
|
if (i != num1)
|
|
{
|
|
lstrcpy (lpStem, stem1 [i]);
|
|
lstrcpy (lpEnding1, ending1 [i]);
|
|
lstrcpy (lpEnding2, ending2 [j]);
|
|
*pos = winfo [i];
|
|
GlobalUnlock (hgbl);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// for proper noun, for example, name
|
|
if (pstmi->Option & SO_NP_PROPER)
|
|
{
|
|
int len1 = lstrlen(lpStr1);
|
|
int len2 = lstrlen(lpStr2);
|
|
int shortlen = len1 > len2 ? len2 : len1;
|
|
if (strncmp (lpStr1, lpStr2, shortlen) == 0)
|
|
{
|
|
lstrcpy (lpStem, lpStr1);
|
|
lpStem [shortlen] = '\0';
|
|
char index [1];
|
|
index[0] = 'm';
|
|
|
|
CODECONVERT Conv;
|
|
BOOL res1 = TRUE, res2= TRUE;
|
|
|
|
lstrcpy (lpEnding1, lpStr1 + shortlen);
|
|
lstrcpy (lpEnding2, lpStr2 + shortlen);
|
|
if (lstrlen (lpEnding1))
|
|
{
|
|
Conv.HAN2INS ((char *)lpEnding1, inheosa1, codeWanSeong);
|
|
if (!(FindHeosaWord(inheosa1, _TOSSI, &action) & FINAL))
|
|
res1 = FALSE;
|
|
}
|
|
|
|
if (lstrlen (lpEnding2))
|
|
{
|
|
Conv.HAN2INS ((char *)lpEnding2, inheosa2, codeWanSeong);
|
|
if (!(FindHeosaWord(inheosa2, _TOSSI, &action) & FINAL))
|
|
res2 = FALSE;
|
|
}
|
|
|
|
if (res1 && res2)
|
|
{
|
|
*pos = POS_NOUN | PROPER_NOUN;
|
|
GlobalUnlock (hgbl);
|
|
return NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
GlobalUnlock (hgbl);
|
|
return srcInvalid;
|
|
}
|
|
|
|
// Wide-character front end of StemmerIsEnding.
//
//   hstm  : stemmer handle, passed through unchanged
//   lpStr : NUL-terminated wide string to test
//   flag  : passed through unchanged (see StemmerIsEnding)
//   found : passed through unchanged; written by StemmerIsEnding
//
// The string is converted to the UWANSUNG_CODE_PAGE multi-byte encoding in a
// temporary buffer which is released before returning.
//
// Returns srcModuleError when the conversion or the allocation fails,
// otherwise whatever StemmerIsEnding returns.
WINSRC StemmerIsEndingW (HSTM hstm, LPCWSTR lpStr, UINT flag, BOOL *found)
{
    // Sizing call: with cchWideChar == -1 the result includes the terminating NUL,
    // so any successful conversion yields at least 1.  A result of 0 means failure.
    int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
        return srcModuleError;

    LPSTR MultiByteStr = (LPSTR) LocalAlloc (LPTR, len);
    if (MultiByteStr == NULL)
        return srcModuleError;

    SRC src = srcModuleError;

    // Only hand the buffer on when it really holds a converted string.
    if (WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr, -1, MultiByteStr, len, NULL, NULL) > 0)
        src = StemmerIsEnding(hstm, MultiByteStr, flag, found);

    LocalFree (MultiByteStr);
    return src;
}
|
|
|
|
// Tests whether a string is a known tossi and/or ending.
//
//   hstm  : stemmer handle; it must be lockable, otherwise the call fails
//   lpStr : NUL-terminated multi-byte string to look up
//   flag  : IS_TOSSI, IS_ENDING or IS_TOSSI | IS_ENDING -- selects the tables searched
//   found : receives TRUE when the string was found, FALSE otherwise
//           (also FALSE on every error return)
//
// Returns 0 on success (whether or not the string was found),
// srcModuleError for a NULL argument, an unsupported flag or an allocation
// failure, and srcModuleError | srcInvalidID when the handle cannot be locked.
WINSRC StemmerIsEnding (HSTM hstm, LPCSTR lpStr, UINT flag, BOOL *found)
{
    if (lpStr == NULL || found == NULL)
        return srcModuleError;

    // Give the caller a defined answer on every path, including the error returns below.
    *found = FALSE;

    BOOL tossiCheck, endingCheck;

    switch (flag)
    {
        case IS_TOSSI :
            tossiCheck = TRUE;
            endingCheck = FALSE;
            break;
        case IS_ENDING :
            tossiCheck = FALSE;
            endingCheck = TRUE;
            break;
        case IS_TOSSI | IS_ENDING :
            tossiCheck = TRUE;
            endingCheck = TRUE;
            break;
        default :
            return srcModuleError;
    }

    // The STMI contents are not read here; locking only validates the handle.
    HGLOBAL hgbl = (HGLOBAL) hstm;
    STMI *pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
        return srcModuleError | srcInvalidID;   // nothing was locked, so nothing to unlock

    // Conversion buffer: 4 bytes per input byte plus the terminator.
    char *inheosa = (char *)LocalAlloc (LPTR, lstrlen(lpStr) * 4 + 1);
    if (inheosa == NULL)
    {
        GlobalUnlock(hgbl);
        return srcModuleError;
    }

    CODECONVERT Conv;
    BYTE action;

    Conv.HAN2INR ((char *)lpStr, inheosa, codeWanSeong);

    if (tossiCheck)
    {
        int res = FindHeosaWord(inheosa, _TOSSI, &action);
        if (res & FINAL)
        {
            *found = TRUE;
            endingCheck = FALSE;    // already found: skip the ending table
        }
    }

    if (endingCheck)
    {
        // NOTE(review): this test requires res to equal FINAL exactly, while the
        // tossi test above only requires the FINAL bit -- confirm the asymmetry is intended.
        int res = FindHeosaWord(inheosa, _ENDING, &action);
        if (res == FINAL)
            *found = TRUE;
    }

    LocalFree (inheosa);
    GlobalUnlock (hgbl);

    return 0;               // success
}
|
|
|
|
/*
|
|
BOOL WINAPI DllMain (HINSTANCE hDLL, DWORD dwReason, LPVOID lpReserved){
|
|
extern char TempJumpNum [], TempSujaNum [], TempBaseNum [], TempNumNoun [], TempSuffixOut [];
|
|
extern char bTemp [], TempETC [], TempDap [];
|
|
extern LenDict JumpNum;
|
|
extern LenDict SujaNum;
|
|
extern LenDict BaseNum;
|
|
extern LenDict NumNoun;
|
|
extern LenDict Suffix;
|
|
extern LenDict B_Dict;
|
|
extern LenDict T_Dict;
|
|
extern LenDict Dap;
|
|
|
|
switch(dwReason) {
|
|
case DLL_PROCESS_ATTACH :
|
|
JumpNum.InitLenDict(TempJumpNum, 5, 5);
|
|
SujaNum.InitLenDict(TempSujaNum, 8, 27);
|
|
BaseNum.InitLenDict(TempBaseNum, 5, 3);
|
|
NumNoun.InitLenDict(TempNumNoun, 8, 32);
|
|
Suffix.InitLenDict(TempSuffixOut, 8, 8);
|
|
B_Dict.InitLenDict(bTemp, 5, 1);
|
|
T_Dict.InitLenDict(TempETC, 10, 7);
|
|
Dap.InitLenDict(TempDap, 5, 1);
|
|
break ;
|
|
case DLL_THREAD_ATTACH:
|
|
break;
|
|
case DLL_THREAD_DETACH:
|
|
break;
|
|
case DLL_PROCESS_DETACH :
|
|
break ;
|
|
} //switch
|
|
|
|
return TRUE ;
|
|
}
|
|
*/
|