/* SVMHANDLER.H (c) copyright 1998 Microsoft Corp Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection Robert Rounthwaite (RobertRo@microsoft.com) */ #if _MSC_VER > 1000 #pragma once #endif #include #ifdef DEBUG interface ILogFile; #endif // DEBUG enum boolop { boolopOr = 0, boolopAnd }; enum FeatureLocation { locNil = 0, locBody = 1, locSubj = 2, locFrom = 3, locTo = 4, locSpecial = 5 }; const DOUBLE THRESH_DEFAULT = 0.90; const DOUBLE THRESH_MOST = 0.99; const DOUBLE THRESH_LEAST = 0.80; typedef struct tagFEATURECOMP { FeatureLocation loc; union { LPSTR pszFeature; ULONG ulRuleNum; // used with locSpecial }; // map feature to location in dst file/location in SVM output // more than one feature component may map to the same location, combined with the op ULONG ulFeature; boolop bop; // first feature in group is alway bopOr BOOL fPresent; DWORD dwFlags; USHORT cchFeature; } FEATURECOMP, * PFEATURECOMP; static const int CPBLIST_MAX = 256; typedef struct tagBODYLIST { USHORT usItem; USHORT iNext; } BODYLIST, * PBODYLIST; class CJunkFilter : public IOEJunkFilter { private: enum { STATE_UNINIT = 0x00000000, STATE_INITIALIZED = 0x00000001 }; private: LONG m_cRef; CRITICAL_SECTION m_cs; DWORD m_dwState; // Properties of the user LPSTR m_pszFirstName; ULONG m_cchFirstName; LPSTR m_pszLastName; ULONG m_cchLastName; LPSTR m_pszCompanyName; ULONG m_cchCompanyName; #ifdef DEBUG BOOL m_fJunkMailLogInit; ILogFile * m_pILogFile; #endif // DEBUG public: // Constructor/destructor CJunkFilter(); ~CJunkFilter(); // IUnknown members STDMETHODIMP QueryInterface(REFIID riid, void ** ppvObject); STDMETHODIMP_(ULONG) AddRef(void); STDMETHODIMP_(ULONG) Release(void); // IOEJunkFilter STDMETHODIMP SetIdentity(LPCSTR pszFirstName, LPCSTR pszLastName, LPCSTR pszCompanyName); STDMETHODIMP LoadDataFile(LPCSTR pszFilePath); STDMETHODIMP SetSpamThresh(ULONG ulThresh); STDMETHODIMP GetSpamThresh(ULONG * pulThresh); STDMETHODIMP GetDefaultSpamThresh(DOUBLE * pdblThresh); STDMETHODIMP CalcJunkProb(DWORD dwFlags, IMimePropertySet * pIMPropSet, IMimeMessage * pIMMsg, double * pdblProb); // returns default value for SpamCutoff. read from SVM output file. // should call FSetSVMDataLocation before calling this function DOUBLE DblGetDefaultSpamCutoff(VOID){Assert(NULL != m_pszLOCPath); return m_dblDefaultThresh;} // Calculates the probability that the current message (defined by the properties of the message) is spam. // !Note! that the IN string params may be modified by the function. // Returns the probability (0 to 1) that the message is spam in pdblSpamProb // the boolean return is determined by comparing to the spam cutoff // if the value of a boolean param is unknown use false, use 0 for unknown time. BOOL FCalculateSpamProb(LPSTR pszFrom, LPSTR pszTo, LPSTR pszSubject, IStream * pIStmBody, BOOL fDirectMessage, BOOL fHasAttach, FILETIME * pftMessageSent, DOUBLE * pdblSpamProb, BOOL * pfIsSpam); // Reads the default spam cutoff without parsing entire file // Use GetDefaultSpamCutoff if using FSetSVMDataLocation; static HRESULT HrReadDefaultSpamCutoff(LPSTR pszFullPath, DOUBLE * pdblDefCutoff); private: // members WORD m_rgiBodyList[CPBLIST_MAX]; BODYLIST * m_pblistBodyList; USHORT m_cblistBodyList; FEATURECOMP * m_rgfeaturecomps; // weights from SVM output DOUBLE * m_rgdblSVMWeights; // Other SVM file variables DOUBLE m_dblCC; DOUBLE m_dblDD; DOUBLE m_dblThresh; DOUBLE m_dblDefaultThresh; DOUBLE m_dblMostThresh; DOUBLE m_dblLeastThresh; // Counts ULONG m_cFeatures; ULONG m_cFeatureComps; // is Feature present? -1 indicates not yet set, 0 indicates not present, 1 indicates present ULONG * m_rgulFeatureStatus; // Set via FSetSVMDataLocation() and SetSpamCutoff() LPSTR m_pszLOCPath; DOUBLE m_dblSpamCutoff; // Properties of the message LPSTR m_pszFrom; LPSTR m_pszTo; LPSTR m_pszSubject; IStream * m_pIStmBody; ULONG m_cbBody; BOOL m_fDirectMessage; FILETIME m_ftMessageSent; BOOL m_fHasAttach; // Cached special rule results used during spam calculations BOOL m_fRule14; BOOL m_fRule17; private: // methods HRESULT _HrReadSVMOutput(LPCSTR lpszFileName); void _EvaluateFeatureComponents(VOID); VOID _ProcessFeatureComponentPresence(VOID); DOUBLE _DblDoSVMCalc(VOID); BOOL _FInvokeSpecialRule(UINT iRuleNum); VOID _HandleCaseSensitiveSpecialRules(VOID); VOID _EvaluateBodyFeatures(VOID); HRESULT _HrBuildBodyList(USHORT cBodyItems); #ifdef DEBUG HRESULT _HrCreateLogFile(VOID); VOID _PrintFeatureToLog(ULONG ulIndex); VOID _PrintSpecialFeatureToLog(UINT iRuleNum); #endif // DEBUG };