Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

190 lines
5.7 KiB

  1. /*
  2. SVMHANDLER.H
  3. (c) copyright 1998 Microsoft Corp
  4. Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection
  5. Robert Rounthwaite (RobertRo@microsoft.com)
  6. */
  7. #if _MSC_VER > 1000
  8. #pragma once
  9. #endif
  10. #include <msoejunk.h>
  // Forward declaration of the log-file interface. It is only needed in DEBUG
  // builds, where CJunkFilter holds an ILogFile pointer (m_pILogFile).
  #if defined(DEBUG)
  interface ILogFile;
  #endif // defined(DEBUG)
  // Boolean operator stored in FEATURECOMP::bop. When several feature
  // components map to the same feature, they are combined with this operator.
  enum boolop
  {
      boolopOr  = 0,
      boolopAnd = 1
  };
  // Location tag stored in FEATURECOMP::loc.
  // The numeric values are spelled out explicitly and must not be renumbered.
  // NOTE(review): the per-value meanings below are inferred from the names
  // only -- confirm against the code that evaluates feature components.
  enum FeatureLocation
  {
      locNil     = 0,   // no location
      locBody    = 1,   // presumably the message body
      locSubj    = 2,   // presumably the subject line
      locFrom    = 3,   // presumably the sender
      locTo      = 4,   // presumably the recipient(s)
      locSpecial = 5    // component is a numbered rule (FEATURECOMP::ulRuleNum)
  };
  28. const DOUBLE THRESH_DEFAULT = 0.90;
  29. const DOUBLE THRESH_MOST = 0.99;
  30. const DOUBLE THRESH_LEAST = 0.80;
  31. typedef struct tagFEATURECOMP
  32. {
  33. FeatureLocation loc;
  34. union
  35. {
  36. LPSTR pszFeature;
  37. ULONG ulRuleNum; // used with locSpecial
  38. };
  39. // map feature to location in dst file/location in SVM output
  40. // more than one feature component may map to the same location, combined with the op
  41. ULONG ulFeature;
  42. boolop bop; // first feature in group is alway bopOr
  43. BOOL fPresent;
  44. DWORD dwFlags;
  45. USHORT cchFeature;
  46. } FEATURECOMP, * PFEATURECOMP;
  // Number of entries in CJunkFilter::m_rgiBodyList.
  static int const CPBLIST_MAX = 256;
  48. typedef struct tagBODYLIST
  49. {
  50. USHORT usItem;
  51. USHORT iNext;
  52. } BODYLIST, * PBODYLIST;
  53. class CJunkFilter : public IOEJunkFilter
  54. {
  55. private:
  56. enum
  57. {
  58. STATE_UNINIT = 0x00000000,
  59. STATE_INITIALIZED = 0x00000001
  60. };
  61. private:
  62. LONG m_cRef;
  63. CRITICAL_SECTION m_cs;
  64. DWORD m_dwState;
  65. // Properties of the user
  66. LPSTR m_pszFirstName;
  67. ULONG m_cchFirstName;
  68. LPSTR m_pszLastName;
  69. ULONG m_cchLastName;
  70. LPSTR m_pszCompanyName;
  71. ULONG m_cchCompanyName;
  72. #ifdef DEBUG
  73. BOOL m_fJunkMailLogInit;
  74. ILogFile * m_pILogFile;
  75. #endif // DEBUG
  76. public:
  77. // Constructor/destructor
  78. CJunkFilter();
  79. ~CJunkFilter();
  80. // IUnknown members
  81. STDMETHODIMP QueryInterface(REFIID riid, void ** ppvObject);
  82. STDMETHODIMP_(ULONG) AddRef(void);
  83. STDMETHODIMP_(ULONG) Release(void);
  84. // IOEJunkFilter
  85. STDMETHODIMP SetIdentity(LPCSTR pszFirstName, LPCSTR pszLastName, LPCSTR pszCompanyName);
  86. STDMETHODIMP LoadDataFile(LPCSTR pszFilePath);
  87. STDMETHODIMP SetSpamThresh(ULONG ulThresh);
  88. STDMETHODIMP GetSpamThresh(ULONG * pulThresh);
  89. STDMETHODIMP GetDefaultSpamThresh(DOUBLE * pdblThresh);
  90. STDMETHODIMP CalcJunkProb(DWORD dwFlags, IMimePropertySet * pIMPropSet, IMimeMessage * pIMMsg, double * pdblProb);
  91. // returns default value for SpamCutoff. read from SVM output file.
  92. // should call FSetSVMDataLocation before calling this function
  93. DOUBLE DblGetDefaultSpamCutoff(VOID){Assert(NULL != m_pszLOCPath); return m_dblDefaultThresh;}
  94. // Calculates the probability that the current message (defined by the properties of the message) is spam.
  95. // !Note! that the IN string params may be modified by the function.
  96. // Returns the probability (0 to 1) that the message is spam in pdblSpamProb
  97. // the boolean return is determined by comparing to the spam cutoff
  98. // if the value of a boolean param is unknown use false, use 0 for unknown time.
  99. BOOL FCalculateSpamProb(LPSTR pszFrom, LPSTR pszTo, LPSTR pszSubject, IStream * pIStmBody,
  100. BOOL fDirectMessage, BOOL fHasAttach, FILETIME * pftMessageSent,
  101. DOUBLE * pdblSpamProb, BOOL * pfIsSpam);
  102. // Reads the default spam cutoff without parsing entire file
  103. // Use GetDefaultSpamCutoff if using FSetSVMDataLocation;
  104. static HRESULT HrReadDefaultSpamCutoff(LPSTR pszFullPath, DOUBLE * pdblDefCutoff);
  105. private: // members
  106. WORD m_rgiBodyList[CPBLIST_MAX];
  107. BODYLIST * m_pblistBodyList;
  108. USHORT m_cblistBodyList;
  109. FEATURECOMP * m_rgfeaturecomps;
  110. // weights from SVM output
  111. DOUBLE * m_rgdblSVMWeights;
  112. // Other SVM file variables
  113. DOUBLE m_dblCC;
  114. DOUBLE m_dblDD;
  115. DOUBLE m_dblThresh;
  116. DOUBLE m_dblDefaultThresh;
  117. DOUBLE m_dblMostThresh;
  118. DOUBLE m_dblLeastThresh;
  119. // Counts
  120. ULONG m_cFeatures;
  121. ULONG m_cFeatureComps;
  122. // is Feature present? -1 indicates not yet set, 0 indicates not present, 1 indicates present
  123. ULONG * m_rgulFeatureStatus;
  124. // Set via FSetSVMDataLocation() and SetSpamCutoff()
  125. LPSTR m_pszLOCPath;
  126. DOUBLE m_dblSpamCutoff;
  127. // Properties of the message
  128. LPSTR m_pszFrom;
  129. LPSTR m_pszTo;
  130. LPSTR m_pszSubject;
  131. IStream * m_pIStmBody;
  132. ULONG m_cbBody;
  133. BOOL m_fDirectMessage;
  134. FILETIME m_ftMessageSent;
  135. BOOL m_fHasAttach;
  136. // Cached special rule results used during spam calculations
  137. BOOL m_fRule14;
  138. BOOL m_fRule17;
  139. private: // methods
  140. HRESULT _HrReadSVMOutput(LPCSTR lpszFileName);
  141. void _EvaluateFeatureComponents(VOID);
  142. VOID _ProcessFeatureComponentPresence(VOID);
  143. DOUBLE _DblDoSVMCalc(VOID);
  144. BOOL _FInvokeSpecialRule(UINT iRuleNum);
  145. VOID _HandleCaseSensitiveSpecialRules(VOID);
  146. VOID _EvaluateBodyFeatures(VOID);
  147. HRESULT _HrBuildBodyList(USHORT cBodyItems);
  148. #ifdef DEBUG
  149. HRESULT _HrCreateLogFile(VOID);
  150. VOID _PrintFeatureToLog(ULONG ulIndex);
  151. VOID _PrintSpecialFeatureToLog(UINT iRuleNum);
  152. #endif // DEBUG
  153. };