// Source: mirror of https://github.com/tongzx/nt5src (unigram.h, 136 lines, 4.5 KiB).
/******************************************************************************\
 * FILE: unigram.h
 *
 * Public structures and library functions that are used to access the
 * unigram information.
 *
 * Note that the code to create the binary file is in mkuni, not in the
 * common library.
\******************************************************************************/

// Give the declarations in this header C linkage when it is included from
// C++.  The brace opened here is closed by the matching #ifdef __cplusplus
// block at the end of the header.
#ifdef __cplusplus
extern "C" {
#endif

/************************************************************************************************\
 * Public interface to unigram data.
\************************************************************************************************/

//
// Structures and types
//

// Structure giving access to a loaded copy of the unigram tables. We store the
|
|
// frequencies as scores that are -10 * log2(prob).
|
|
// Note we do a hack to keep the score values in one byte. We subtract an
|
|
// offset from the values. Values that overflow that range are truncated to fit.
|
|
typedef struct tagUNIGRAM_INFO {
|
|
WORD cScores; // Number of entries in score table.
|
|
WORD iRareScore; // Frequency of items not in freq. table.
|
|
BYTE iOffset; // Offset to add to scores.
|
|
BYTE spare[3]; // keep alignment.
|
|
BYTE *pScores; // Pointer to scores.
|
|
|
|
void *pLoadInfo1; // Handles needed to unload the data
|
|
void *pLoadInfo2;
|
|
void *pLoadInfo3;
|
|
} UNIGRAM_INFO;
|
|
|
|
//
// Functions.
//

// Load unigram information from a file.
|
|
BOOL UnigramLoadFile(LOCRUN_INFO *pLocRunInfo, UNIGRAM_INFO *pUnigramInfo, wchar_t *pPath);
|
|
|
|
// Unload runtime localization information that was loaded from a file.
|
|
BOOL UnigramUnloadFile(UNIGRAM_INFO *pUnigramInfo);
|
|
|
|
// Load unigram information from a resource.
|
|
// Note, don't need to unload resources.
|
|
BOOL UnigramLoadRes(
|
|
LOCRUN_INFO *pLocRunInfo,
|
|
UNIGRAM_INFO *pUnigramInfo,
|
|
HINSTANCE hInst,
|
|
int nResID,
|
|
int nType
|
|
);
|
|
|
|
// Load runtime localization information from an image already loaded into
|
|
// memory.
|
|
BOOL UnigramLoadPointer(LOCRUN_INFO *pLocRunInfo, UNIGRAM_INFO *pUnigramInfo, void *pData);
|
|
|
|
// Get unigram probability for a character. Character must be passed in as
|
|
// dense coded value. Warning: value returned as log2(prob)/10. I don't know
|
|
// why, but this is what the old code did!
|
|
float UnigramCost(
|
|
UNIGRAM_INFO *pUnigramInfo,
|
|
wchar_t dch
|
|
);
|
|
|
|
// Only declared when ZTRAIN is defined.
#ifdef ZTRAIN
// Takes a character (possibly folded) and returns the probability of that
// character occurring.
float UnigramCostFolded(
    LOCRUN_INFO     *pLocRunInfo,
    UNIGRAM_INFO    *pUnigramInfo,
    wchar_t         wFold
);
#endif

/************************************************************************************************\
 * Stuff to access binary unigram file, only used by common and mktable.
\************************************************************************************************/

// The format for the unigram file is:
//      Header:
//          DWORD       File type indicator.
//          DWORD       Size of header.
//          BYTE        Lowest version of this code that can read this file.
//          BYTE        Version of this code that wrote this file.
//          wchar_t[4]  Locale ID (3 characters plus null).
//          DWORD * 3   Locale signature
//          WORD        Number of entries in frequency table.
//          WORD        Frequency of items not in freq. table.
//          BYTE        Score offset (iOffset in UNIGRAM_HEADER).
//          BYTE        Reserved for future use.
//          DWORD * 2   Reserved for future use.
//      Frequency table:
//          BYTE        Frequency for dense code 0.
//          BYTE        Frequency for dense code 1.
//              .
//              .
//              .
//          BYTE        Frequency for dense code N.
//
// NOTE: Frequencies are stored as -10 * log2(prob)

//
// Constants
//

// Magic key that identifies unigram files; stored in the fileType member of
// the file header.
#define UNIGRAM_FILE_TYPE   0xFD8BA978

// Version information for file.
#define UNIGRAM_MIN_FILE_VERSION    0   // First version of code that can read this file.
#define UNIGRAM_CUR_FILE_VERSION    0   // Current version of code.
#define UNIGRAM_OLD_FILE_VERSION    0   // Oldest file version this code can read.

//
// Structures and types
//

// Structure to hold file header.
|
|
typedef struct tagUNIGRAM_HEADER {
|
|
DWORD fileType; // This should always be set to UNIGRAM_FILE_TYPE.
|
|
DWORD headerSize; // Size of the header.
|
|
BYTE minFileVer; // Earliest version of code that can read this file
|
|
BYTE curFileVer; // Current version of code that wrote the file.
|
|
wchar_t locale[4]; // Locale ID string.
|
|
DWORD adwSignature [3]; // Locale signature
|
|
WORD cScores; // Number of entries in score table.
|
|
WORD iRareScore; // Frequency of items not in freq. table.
|
|
BYTE iOffset;
|
|
BYTE reserved1;
|
|
DWORD reserved2[2];
|
|
} UNIGRAM_HEADER;
|
|
|
|
// Close the extern "C" block opened near the top of this header.
#ifdef __cplusplus
}
#endif