Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

190 lines
5.7 KiB

/*
SVMHANDLER.H
(c) copyright 1998 Microsoft Corp
Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection
Robert Rounthwaite ([email protected])
*/
#if _MSC_VER > 1000
#pragma once
#endif
#include <msoejunk.h>
#ifdef DEBUG
interface ILogFile;
#endif // DEBUG
enum boolop
{
boolopOr = 0,
boolopAnd
};
enum FeatureLocation
{
locNil = 0,
locBody = 1,
locSubj = 2,
locFrom = 3,
locTo = 4,
locSpecial = 5
};
const DOUBLE THRESH_DEFAULT = 0.90;
const DOUBLE THRESH_MOST = 0.99;
const DOUBLE THRESH_LEAST = 0.80;
typedef struct tagFEATURECOMP
{
FeatureLocation loc;
union
{
LPSTR pszFeature;
ULONG ulRuleNum; // used with locSpecial
};
// map feature to location in dst file/location in SVM output
// more than one feature component may map to the same location, combined with the op
ULONG ulFeature;
boolop bop; // first feature in group is alway bopOr
BOOL fPresent;
DWORD dwFlags;
USHORT cchFeature;
} FEATURECOMP, * PFEATURECOMP;
static const int CPBLIST_MAX = 256;
typedef struct tagBODYLIST
{
USHORT usItem;
USHORT iNext;
} BODYLIST, * PBODYLIST;
class CJunkFilter : public IOEJunkFilter
{
private:
enum
{
STATE_UNINIT = 0x00000000,
STATE_INITIALIZED = 0x00000001
};
private:
LONG m_cRef;
CRITICAL_SECTION m_cs;
DWORD m_dwState;
// Properties of the user
LPSTR m_pszFirstName;
ULONG m_cchFirstName;
LPSTR m_pszLastName;
ULONG m_cchLastName;
LPSTR m_pszCompanyName;
ULONG m_cchCompanyName;
#ifdef DEBUG
BOOL m_fJunkMailLogInit;
ILogFile * m_pILogFile;
#endif // DEBUG
public:
// Constructor/destructor
CJunkFilter();
~CJunkFilter();
// IUnknown members
STDMETHODIMP QueryInterface(REFIID riid, void ** ppvObject);
STDMETHODIMP_(ULONG) AddRef(void);
STDMETHODIMP_(ULONG) Release(void);
// IOEJunkFilter
STDMETHODIMP SetIdentity(LPCSTR pszFirstName, LPCSTR pszLastName, LPCSTR pszCompanyName);
STDMETHODIMP LoadDataFile(LPCSTR pszFilePath);
STDMETHODIMP SetSpamThresh(ULONG ulThresh);
STDMETHODIMP GetSpamThresh(ULONG * pulThresh);
STDMETHODIMP GetDefaultSpamThresh(DOUBLE * pdblThresh);
STDMETHODIMP CalcJunkProb(DWORD dwFlags, IMimePropertySet * pIMPropSet, IMimeMessage * pIMMsg, double * pdblProb);
// returns default value for SpamCutoff. read from SVM output file.
// should call FSetSVMDataLocation before calling this function
DOUBLE DblGetDefaultSpamCutoff(VOID){Assert(NULL != m_pszLOCPath); return m_dblDefaultThresh;}
// Calculates the probability that the current message (defined by the properties of the message) is spam.
// !Note! that the IN string params may be modified by the function.
// Returns the probability (0 to 1) that the message is spam in pdblSpamProb
// the boolean return is determined by comparing to the spam cutoff
// if the value of a boolean param is unknown use false, use 0 for unknown time.
BOOL FCalculateSpamProb(LPSTR pszFrom, LPSTR pszTo, LPSTR pszSubject, IStream * pIStmBody,
BOOL fDirectMessage, BOOL fHasAttach, FILETIME * pftMessageSent,
DOUBLE * pdblSpamProb, BOOL * pfIsSpam);
// Reads the default spam cutoff without parsing entire file
// Use GetDefaultSpamCutoff if using FSetSVMDataLocation;
static HRESULT HrReadDefaultSpamCutoff(LPSTR pszFullPath, DOUBLE * pdblDefCutoff);
private: // members
WORD m_rgiBodyList[CPBLIST_MAX];
BODYLIST * m_pblistBodyList;
USHORT m_cblistBodyList;
FEATURECOMP * m_rgfeaturecomps;
// weights from SVM output
DOUBLE * m_rgdblSVMWeights;
// Other SVM file variables
DOUBLE m_dblCC;
DOUBLE m_dblDD;
DOUBLE m_dblThresh;
DOUBLE m_dblDefaultThresh;
DOUBLE m_dblMostThresh;
DOUBLE m_dblLeastThresh;
// Counts
ULONG m_cFeatures;
ULONG m_cFeatureComps;
// is Feature present? -1 indicates not yet set, 0 indicates not present, 1 indicates present
ULONG * m_rgulFeatureStatus;
// Set via FSetSVMDataLocation() and SetSpamCutoff()
LPSTR m_pszLOCPath;
DOUBLE m_dblSpamCutoff;
// Properties of the message
LPSTR m_pszFrom;
LPSTR m_pszTo;
LPSTR m_pszSubject;
IStream * m_pIStmBody;
ULONG m_cbBody;
BOOL m_fDirectMessage;
FILETIME m_ftMessageSent;
BOOL m_fHasAttach;
// Cached special rule results used during spam calculations
BOOL m_fRule14;
BOOL m_fRule17;
private: // methods
HRESULT _HrReadSVMOutput(LPCSTR lpszFileName);
void _EvaluateFeatureComponents(VOID);
VOID _ProcessFeatureComponentPresence(VOID);
DOUBLE _DblDoSVMCalc(VOID);
BOOL _FInvokeSpecialRule(UINT iRuleNum);
VOID _HandleCaseSensitiveSpecialRules(VOID);
VOID _EvaluateBodyFeatures(VOID);
HRESULT _HrBuildBodyList(USHORT cBodyItems);
#ifdef DEBUG
HRESULT _HrCreateLogFile(VOID);
VOID _PrintFeatureToLog(ULONG ulIndex);
VOID _PrintSpecialFeatureToLog(UINT iRuleNum);
#endif // DEBUG
};