mirror of https://github.com/tongzx/nt5src
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
447 lines
7.5 KiB
447 lines
7.5 KiB
#include <windows.h>
|
|
#include <assert.h>
|
|
#include "PropNoun.H"
|
|
|
|
// Comparison callback for CharProb records (passed to bsearch in
// CProperNoun::IsAChineseName).
//
// item1, item2 - pointers to CharProb records.
//
// Returns -1, 0 or 1 when the dwUnicode field of item1 is respectively
// less than, equal to, or greater than that of item2.
int __cdecl CharCompare(
    const void *item1,
    const void *item2)
{
    PCharProb pLeft  = (PCharProb) item1;
    PCharProb pRight = (PCharProb) item2;

    if (pLeft->dwUnicode > pRight->dwUnicode) {
        return 1;
    }
    if (pLeft->dwUnicode < pRight->dwUnicode) {
        return -1;
    }
    return 0;
}
|
|
|
|
// Comparison callback for null-terminated wide strings (used with qsort in
// CProperNoun::InitData and bsearch in CProperNoun::IsAChineseName).
//
// item1, item2 - pointers to null-terminated WCHAR strings.
//
// The comparison is a raw memcmp over the byte length of the longer of the
// two strings, so the resulting order is by in-memory bytes, not by a
// locale-aware or numeric code-unit collation.
int __cdecl UnicodeCompare(
    const void *item1,
    const void *item2)
{
    int cbFirst  = lstrlenW((LPCWSTR) item1) * sizeof(WCHAR);
    int cbSecond = lstrlenW((LPCWSTR) item2) * sizeof(WCHAR);

    // Compare across the longer string so a shorter string's terminator
    // takes part in the comparison.
    int cbCompare = cbSecond;
    if (cbFirst > cbSecond) {
        cbCompare = cbFirst;
    }

    return memcmp(item1, item2, cbCompare);
}
|
|
|
|
// Comparison callback for EngName records (passed to bsearch in
// CProperNoun::IsAEnglishName).
//
// item1, item2 - pointers to EngName records.
//
// Records are ordered first by wPrevUnicode and, when those are equal,
// by wNextUnicode. Returns -1, 0 or 1.
int __cdecl EngNameCompare(
    const void *item1,
    const void *item2)
{
    PEngName pLeft  = (PEngName) item1;
    PEngName pRight = (PEngName) item2;

    // Primary key: wPrevUnicode.
    if (pLeft->wPrevUnicode > pRight->wPrevUnicode) {
        return 1;
    }
    if (pLeft->wPrevUnicode < pRight->wPrevUnicode) {
        return -1;
    }

    // Secondary key: wNextUnicode.
    if (pLeft->wNextUnicode > pRight->wNextUnicode) {
        return 1;
    }
    if (pLeft->wNextUnicode < pRight->wNextUnicode) {
        return -1;
    }

    return 0;
}
|
|
|
|
// Constructs a CProperNoun bound to the given module instance.
//
// hInstance - module handle later used by InitData() to look up resources.
//
// Only puts the members into a known empty state; no data is loaded until
// InitData() is called.
CProperNoun::CProperNoun(
    HINSTANCE hInstance) :
    m_dProperNameThreshold(FL_PROPER_NAME_THRESHOLD),   // cut-off used by IsAChineseName
    m_pCharProb(NULL),                                  // set by InitData()
    m_dwTotalCharProbNum(0),                            // set by InitData()
    m_pEngNameData(NULL),
    m_hProcessHeap(NULL),                               // set by InitData()
    m_hInstance(hInstance)
{
}
|
|
|
|
// Destructor. No cleanup is performed here.
CProperNoun::~CProperNoun()
{
}
|
|
|
|
// Loads the character-probability table from the module's resources and
// sorts the built-in surname table so it can be searched with bsearch.
//
// Returns TRUE on success. Returns FALSE if the "CNAME"/"BIN" resource
// cannot be found, loaded or locked; in that case the surname table is
// left unsorted.
BOOL CProperNoun::InitData()
{
    m_hProcessHeap = GetProcessHeap();

    // Locate the character-probability resource in the owning module.
    HRSRC hResInfo = FindResource(m_hInstance, TEXT("CNAME"), TEXT("BIN"));
    if (!hResInfo) {
        return FALSE;
    }

    // Load it and obtain a pointer to its bytes.
    HGLOBAL hResData = LoadResource(m_hInstance, hResInfo);
    if (!hResData) {
        return FALSE;
    }

    m_pCharProb = (PCharProb) LockResource(hResData);
    if (!m_pCharProb) {
        return FALSE;
    }

    // The resource is treated as a packed array of CharProb records.
    m_dwTotalCharProbNum = SizeofResource(m_hInstance, hResInfo) / sizeof(CharProb);

    // Disabled: loading of the English-name table. While this stays
    // disabled, m_pEngNameData keeps the value given by the constructor.
    //
    //  // Find resource
    //  hResource = FindResource(m_hInstance, TEXT("ENAME"),
    //      TEXT("BIN"));
    //  if (!hResource) { goto _exit; }
    //
    //  // Load resource
    //  hGlobal = LoadResource(m_hInstance, hResource);
    //  if (!hGlobal) { goto _exit; }
    //
    //  m_pEngNameData = (PEngNameData) LockResource(hGlobal);
    //  m_pEngNameData->pwUnicode = (PWORD) ((PBYTE) m_pEngNameData +
    //      sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
    //      sizeof(m_pEngNameData->dwTotalEngNamePairNum));
    //  m_pEngNameData->pEngNamePair = (PEngName) ((PBYTE) m_pEngNameData +
    //      sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
    //      sizeof(m_pEngNameData->dwTotalEngNamePairNum) +
    //      sizeof(m_pEngNameData->pwUnicode[0]) * m_pEngNameData->dwTotalEngUnicodeNum);
    //
    //  // m_pEngName = (PEngName) LockResource(hGlobal);
    //  // m_dwTotalEngNameNum = SizeofResource(m_hInstance, hResource) / sizeof(EngName);

    // Put the surname table into UnicodeCompare order for later bsearch calls.
    qsort(m_pwszSurname, m_dwTotalSurnameNum, sizeof(m_pwszSurname[0]), UnicodeCompare);

    return TRUE;
}
|
|
|
|
// Reports whether the given character run is recognised as a proper noun.
//
// lpwszChar - pointer to the characters to test.
// uCount    - number of characters at lpwszChar.
//
// Returns a non-zero value if IsAChineseName() accepts the run, or failing
// that, if IsAEnglishName() accepts it; otherwise returns zero. The English
// check is skipped when the Chinese check succeeds.
BOOL CProperNoun::IsAProperNoun(
    LPWSTR lpwszChar,
    UINT   uCount)
{
    if (IsAChineseName(lpwszChar, uCount)) {
        return TRUE;
    }

    return IsAEnglishName(lpwszChar, uCount) ? TRUE : FALSE;
}
|
|
|
|
// Decides whether a character run looks like a Chinese personal name.
//
// lpcwszChar - pointer to the characters to test.
// uCount     - number of characters at lpcwszChar.
//
// The first character must be present in the surname table m_pwszSurname
// (which InitData() sorts for bsearch). Each following character then
// contributes its probability from m_pCharProb, or
// FL_DEFAULT_CHAR_PROBABILITY when the character is not in that table.
// The run is accepted when the product of those probabilities is at least
// m_dProperNameThreshold.
//
// Returns TRUE when accepted, FALSE otherwise. A NULL pointer or a
// zero-length run is rejected without touching the input.
BOOL CProperNoun::IsAChineseName(
    LPCWSTR lpcwszChar,
    UINT    uCount)
{
    // Nothing to classify; also avoids reading lpcwszChar[0] out of bounds.
    if (lpcwszChar == NULL || uCount == 0) {
        return FALSE;
    }

    // Search key with the same shape as a surname table entry: the first
    // character followed by terminators. Kept on the stack so calls do not
    // share state.
    WCHAR wszSurname[3] = { 0 };
    wszSurname[0] = lpcwszChar[0];

    if (bsearch(wszSurname, m_pwszSurname, m_dwTotalSurnameNum,
                sizeof(m_pwszSurname[0]), UnicodeCompare) == NULL) {
        return FALSE;
    }

    // Multiply the per-character probabilities of everything after the surname.
    FLOAT    flProbability = 1;
    CharProb Key;

    for (UINT i = 1; i < uCount; ++i) {
        Key.dwUnicode = lpcwszChar[i];

        PCharProb pMatch = (PCharProb) bsearch(&Key, m_pCharProb,
            m_dwTotalCharProbNum, sizeof(m_pCharProb[0]), CharCompare);

        if (pMatch != NULL) {
            flProbability *= pMatch->flProbability;
        } else {
            flProbability *= (FLOAT) FL_DEFAULT_CHAR_PROBABILITY;
        }
    }

    if (flProbability >= m_dProperNameThreshold) {
        return TRUE;
    }

    return FALSE;
}
|
|
|
|
// Decides whether a character run matches an entry in the English-name
// pair table.
//
// lpwszChar - pointer to the characters to test.
// uCount    - number of characters at lpwszChar.
//
// The first and last characters of the run form an EngName key that is
// looked up in m_pEngNameData->pEngNamePair with EngNameCompare.
//
// Returns TRUE on a match. Returns FALSE when there is no match, when the
// input is NULL or empty, or when the English-name table has not been
// loaded (m_pEngNameData is NULL; the code that would load it in
// InitData() is currently commented out).
BOOL CProperNoun::IsAEnglishName(
    LPCWSTR lpwszChar,
    UINT    uCount)
{
    // Empty input: lpwszChar[uCount - 1] would index far out of bounds.
    if (lpwszChar == NULL || uCount == 0) {
        return FALSE;
    }

    // Table not available: nothing can match, and it must not be dereferenced.
    if (m_pEngNameData == NULL || m_pEngNameData->pEngNamePair == NULL) {
        return FALSE;
    }

    // EngNameCompare reads only these two fields of the key.
    EngName Key;
    Key.wPrevUnicode = lpwszChar[0];
    Key.wNextUnicode = lpwszChar[uCount - 1];

    // The element count of pEngNamePair is dwTotalEngNamePairNum;
    // dwTotalEngUnicodeNum sizes the separate pwUnicode array.
    if (bsearch(&Key, m_pEngNameData->pEngNamePair,
                m_pEngNameData->dwTotalEngNamePairNum, sizeof(EngName),
                EngNameCompare)) {
        return TRUE;
    }

    return FALSE;
}
|
|
|
|
// Table of single-character surnames checked by IsAChineseName().
//
// Each entry is a null-terminated wide string stored in a WCHAR[3] slot.
// InitData() sorts this table with qsort/UnicodeCompare before it is
// searched, so the order written here does not affect lookups.
//
// NOTE(review): three entries previously had a backslash before the closing
// quote, which escaped the quote and broke the literal. This looks like a
// leftover from a double-byte source encoding whose trail byte is 0x5C;
// confirm against the original file encoding.
WCHAR CProperNoun::m_pwszSurname[][3] = {
    L"丁", L"卜", L"于", L"尹", L"仇", L"元", L"卞", L"壬", L"尤", L"巴",
    L"戈", L"毛", L"牛", L"王", L"丘", L"仕", L"冉", L"包", L"司", L"史",
    L"田", L"白", L"伊", L"伍", L"任", L"光", L"向", L"安", L"戎", L"朴",
    L"朱", L"江", L"艾", L"衣", L"余", L"吳", L"呂", L"妙", L"孝", L"宋",
    L"岑", L"巫", L"李", L"杜", L"沈", L"汪", L"狄", L"谷", L"貝", L"辛",
    L"邢", L"卓", L"周", L"孟", L"季", L"官", L"屈", L"岳", L"念", L"昌",
    L"林", L"武", L"法", L"牧", L"盂", L"祁", L"芳", L"邵", L"邱", L"金",
    L"侯", L"俞", L"城", L"姜", L"姚", L"施", L"昱", L"柯", L"查", L"柳",
    L"段", L"洪", L"紀", L"胡", L"范", L"苗", L"計", L"郎", L"郁", L"革",
    L"韋", L"候", L"倪", L"凌", L"唐", L"夏", L"姬", L"孫", L"宮", L"席",
    L"徐", L"晏", L"桂", L"桑", L"桃", L"班", L"祖", L"祝", L"秦", L"耿",
    L"荊", L"袁", L"郝", L"院", L"陣", L"馬", L"高", L"屠", L"崔", L"康",
    L"庾", L"張", L"戚", L"扈", L"敖", L"啟", L"敏", L"曹", L"梁", L"梅",
    L"盛", L"莫", L"莊", L"莉", L"許", L"連", L"郭", L"陳", L"陸", L"陶",
    L"章", L"麥", L"麻", L"傅", L"凱", L"單", L"喻", L"喬", L"彭", L"揚",
    L"智", L"曾", L"朝", L"游", L"湯", L"焦", L"皓", L"程", L"童", L"舒",
    L"華", L"費", L"賀", L"買", L"辜", L"鄂", L"鈕", L"隋", L"陽", L"雲",
    L"馮", L"黃", L"黑", L"敬", L"楚", L"楊", L"溫", L"瑞", L"葉", L"葛",
    L"董", L"裘", L"詹", L"資", L"賈", L"鄒", L"雍", L"雷", L"嘉", L"廖",
    L"榮", L"熊", L"甄", L"緒", L"裴", L"褚", L"趙", L"鄞", L"齊", L"劉",
    L"樓", L"樊", L"歐", L"潘", L"滕", L"蔣", L"蔡", L"衛", L"鄭", L"鄧",
    L"魯", L"黎", L"冀", L"機", L"燕", L"盧", L"穆", L"翰", L"蕭", L"衡",
    L"賴", L"錢", L"閻", L"霍", L"靜", L"駱", L"鮑", L"龍", L"應", L"戴",
    L"璩", L"膺", L"薄", L"薜", L"薛", L"薇", L"謝", L"鍾", L"韓", L"瞿",
    L"簡", L"聶", L"薩", L"藍", L"豐", L"闕", L"顏", L"魏", L"龐", L"羅",
    L"藤", L"譚", L"關", L"隴", L"嚴", L"竇", L"藺", L"蘆", L"蘇", L"鐘",
    L"饒", L"顧", L"龔", L"巖", L"佘", L"卲", L"邾", L"涂", L"鄢"
};
|
|
|
|
// Number of entries in m_pwszSurname, derived from the size of the array
// so it tracks the table automatically.
DWORD CProperNoun::m_dwTotalSurnameNum =
    sizeof(m_pwszSurname) / sizeof(m_pwszSurname[0]);
|