Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1926 lines
53 KiB

  1. /*
  2. SVMHANDLER.CPP
  3. (c) copyright 1998 Microsoft Corp
  4. Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection
  5. Robert Rounthwaite (RobertRo@microsoft.com)
  6. */
  7. #include <pch.hxx>
  8. #include "junkeng.h"
  9. #include "junkutil.h"
  10. #include "parsestm.h"
  11. #include <iert.h>
  12. #include <math.h>
  13. #include <limits.h>
  14. class CBodyBuff
  15. {
  16. private:
  17. enum
  18. {
  19. CB_BODYBUFF_MAX = 4096
  20. };
  21. enum
  22. {
  23. BBF_CLEAR = 0x00000000,
  24. BBF_SET = 0x00000001,
  25. BBF_ALPHA = 0x00000002,
  26. BBF_NUM = 0x00000004,
  27. BBF_SPACE = 0x00000008,
  28. BBF_MASK = 0x0000000F
  29. };
  30. private:
  31. IStream * m_pIStream;
  32. ULONG m_cbStream;
  33. ULONG m_ibStream;
  34. BYTE m_rgbBuff[CB_BODYBUFF_MAX];
  35. ULONG m_cbBuffTotal;
  36. BYTE * m_pbBuffCurr;
  37. DWORD m_dwFlagsCurr;
  38. BYTE * m_pbBuffGood;
  39. BYTE * m_pbBuffPrev;
  40. DWORD m_dwFlagsPrev;
  41. public:
  42. CBodyBuff() : m_pIStream(NULL), m_cbStream(0), m_ibStream(0),
  43. m_cbBuffTotal(0), m_pbBuffCurr(m_rgbBuff), m_dwFlagsCurr(BBF_CLEAR),
  44. m_pbBuffGood(m_rgbBuff), m_pbBuffPrev(NULL), m_dwFlagsPrev(BBF_CLEAR) {}
  45. ~CBodyBuff() {SafeRelease(m_pIStream);}
  46. HRESULT HrInit(DWORD dwFlags, IStream * pIStream);
  47. HRESULT HrGetCurrChar(CHAR * pchNext);
  48. BOOL FDoMatch(FEATURECOMP * pfcomp);
  49. HRESULT HrMoveNext(VOID)
  50. {
  51. m_pbBuffPrev = m_pbBuffCurr;
  52. m_dwFlagsPrev = m_dwFlagsCurr;
  53. m_pbBuffCurr = (BYTE *) CharNext((LPSTR) m_pbBuffCurr);
  54. m_dwFlagsCurr = BBF_CLEAR;
  55. return S_OK;
  56. }
  57. private:
  58. HRESULT _HrFillBuffer(VOID);
  59. };
  60. static const LPSTR szCountFeatureComp = "FeatureComponentCount = ";
  61. static const LPSTR szDefaultThresh = "dThresh = ";
  62. static const LPSTR szMostThresh = "mThresh = ";
  63. static const LPSTR szLeastThresh = "lThresh = ";
  64. static const LPSTR szThresh = "Threshold = ";
  65. static const LPSTR szNumberofDim = "NumDim = ";
  66. #ifdef DEBUG
  67. static const LPSTR STR_REG_PATH_FLAT = "Software\\Microsoft\\Outlook Express";
  68. static const LPSTR szJunkMailPrefix = "JUNKMAIL";
  69. static const LPSTR szJunkMailLog = "JUNKMAIL.LOG";
  70. static const LPSTR LOG_TAGLINE = "Calculating Junk Mail for message: %s";
  71. static const LPSTR LOG_FIRSTNAME = "User's First Name: %s";
  72. static const LPSTR LOG_LASTNAME = "User's Last Name: %s";
  73. static const LPSTR LOG_COMPANYNAME = "User's Company Name: %s";
  74. static const LPSTR LOG_BODY = "Body contains: %s";
  75. static const LPSTR LOG_SUBJECT = "Subject contains: %s";
  76. static const LPSTR LOG_TO = "To line contains: %s";
  77. static const LPSTR LOG_FROM = "From line contains: %s";
  78. static const LPSTR LOG_FINAL = "Junk Mail percentage: %0.1d.%0.6d\r\n";
  79. #endif // DEBUG
  80. BOOL FReadDouble(LPSTR pszLine, LPSTR pszToken, DOUBLE * pdblVal);
  81. #ifdef DEBUG
  82. VOID PrintToLogFile(ILogFile * pILogFile, LPSTR pszTmpl, LPSTR pszArg);
  83. #endif // DEBUG
  84. HRESULT CBodyBuff::HrInit(DWORD dwFlags, IStream * pIStream)
  85. {
  86. HRESULT hr = S_OK;
  87. // Check incoming params
  88. if (NULL == pIStream)
  89. {
  90. hr = E_INVALIDARG;
  91. goto exit;
  92. }
  93. // Set the stream
  94. m_pIStream = pIStream;
  95. m_pIStream->AddRef();
  96. // Get the stream size
  97. hr = HrGetStreamSize(m_pIStream, &m_cbStream);
  98. if (FAILED(hr))
  99. {
  100. goto exit;
  101. }
  102. // Reset the stream to the beginning
  103. hr = HrRewindStream(m_pIStream);
  104. if (FAILED(hr))
  105. {
  106. goto exit;
  107. }
  108. // Start from the beginning
  109. m_ibStream = 0;
  110. exit:
  111. return hr;
  112. }
  113. HRESULT CBodyBuff::HrGetCurrChar(CHAR * pchNext)
  114. {
  115. HRESULT hr = S_OK;
  116. // Check incoming params
  117. Assert(NULL != pchNext);
  118. // Do we need to get any more characters?
  119. if (m_pbBuffCurr >= m_pbBuffGood)
  120. {
  121. // If we couldn't get any more characters
  122. if (S_OK != _HrFillBuffer())
  123. {
  124. hr = E_FAIL;
  125. goto exit;
  126. }
  127. }
  128. // Get the current char
  129. *pchNext = *m_pbBuffCurr;
  130. hr = S_OK;
  131. exit:
  132. return hr;
  133. }
  134. BOOL CBodyBuff::FDoMatch(FEATURECOMP * pfcomp)
  135. {
  136. BOOL fRet = FALSE;
  137. BYTE * pbSearch = NULL;
  138. ULONG cchSearch = 0;
  139. LPSTR pszMatch = NULL;
  140. DWORD dwFlags = 0;
  141. // Check incoming params
  142. Assert(NULL != pfcomp);
  143. Assert(NULL != pfcomp->pszFeature);
  144. Assert(0 != pfcomp->cchFeature);
  145. // Set up some locals
  146. cchSearch = pfcomp->cchFeature;
  147. // Do we need more characters for the match?
  148. // Include the character after the string, just in case
  149. // we have a match and need to check the character after
  150. // the string for a word break
  151. if ((cchSearch + 1) > (ULONG) (m_pbBuffGood - m_pbBuffCurr))
  152. {
  153. // Get more characters
  154. // If this fails, we still might be good, since
  155. // we might just have enough characters to do the
  156. // full match at the end of the stream.
  157. (VOID) _HrFillBuffer();
  158. // Could we get enough?
  159. if (cchSearch > (ULONG) (m_pbBuffGood - m_pbBuffCurr))
  160. {
  161. // No Match
  162. fRet = FALSE;
  163. goto exit;
  164. }
  165. }
  166. // Do match
  167. pbSearch = m_pbBuffCurr;
  168. pszMatch = pfcomp->pszFeature;
  169. while (0 != cchSearch--)
  170. {
  171. if (*(pszMatch++) != *(pbSearch++))
  172. {
  173. // No Match
  174. fRet = FALSE;
  175. goto exit;
  176. }
  177. }
  178. // Validate the match
  179. // Do we need to figure out if it starts with a word break?
  180. if (0 != (pfcomp->dwFlags & CT_START_SET))
  181. {
  182. dwFlags = pfcomp->dwFlags;
  183. }
  184. else
  185. {
  186. Assert(CT_END_SET != (dwFlags & CT_END_SET));
  187. dwFlags = m_dwFlagsCurr;
  188. }
  189. Assert(CT_START_SET == BBF_SET);
  190. Assert(CT_START_ALPHA == BBF_ALPHA);
  191. fRet = FMatchToken((NULL == m_pbBuffPrev),
  192. ((m_ibStream >= m_cbStream) && ((m_pbBuffCurr + pfcomp->cchFeature) >= m_pbBuffGood)),
  193. (LPCSTR) m_pbBuffPrev, &m_dwFlagsPrev, pfcomp->pszFeature,
  194. pfcomp->cchFeature, &dwFlags, (LPCSTR) (m_pbBuffCurr + pfcomp->cchFeature));
  195. // Save the changed flags
  196. pfcomp->dwFlags = dwFlags;
  197. // Cache the current character's state
  198. m_dwFlagsCurr = (dwFlags & BBF_MASK);
  199. exit:
  200. return fRet;
  201. }
  202. HRESULT CBodyBuff::_HrFillBuffer(VOID)
  203. {
  204. HRESULT hr = S_OK;
  205. LONG cbExtra = 0;
  206. ULONG cbRead = 0;
  207. ULONG cbToRead = 0;
  208. // If there isn't any more of the stream to grab
  209. if (m_ibStream >= m_cbStream)
  210. {
  211. hr = S_FALSE;
  212. goto exit;
  213. }
  214. // If this is the first time through, save nothing
  215. if (NULL == m_pbBuffPrev)
  216. {
  217. cbExtra = 0;
  218. }
  219. else
  220. {
  221. // How much space should I save?
  222. cbExtra = (ULONG) (m_cbBuffTotal - (m_pbBuffPrev - m_rgbBuff));
  223. Assert(cbExtra > 0);
  224. // Save the unused data
  225. MoveMemory(m_rgbBuff, m_pbBuffPrev, (int)min(cbExtra, sizeof(m_rgbBuff)));
  226. // Reset the current pointer
  227. m_pbBuffCurr = m_rgbBuff + (m_pbBuffCurr - m_pbBuffPrev);
  228. // Reset the previous pointer
  229. m_pbBuffPrev = m_rgbBuff;
  230. }
  231. // Read in more data
  232. cbToRead = (int)min(CB_BODYBUFF_MAX - cbExtra - 1, (LONG) (m_cbStream - m_ibStream));
  233. hr = m_pIStream->Read(m_rgbBuff + cbExtra, cbToRead, &cbRead);
  234. if ((FAILED(hr)) || (0 == cbRead))
  235. {
  236. // End of stream
  237. hr = S_FALSE;
  238. }
  239. else
  240. {
  241. hr = S_OK;
  242. }
  243. // Track the number of bytes read
  244. m_ibStream += cbRead;
  245. // Set the total buffer size
  246. m_cbBuffTotal = cbExtra + cbRead;
  247. // Terminate the buffer, just in case
  248. m_rgbBuff[m_cbBuffTotal] = '\0';
  249. // Uppercase the buffer
  250. m_pbBuffGood = m_rgbBuff + CharUpperBuff((CHAR *) m_rgbBuff, m_cbBuffTotal);
  251. exit:
  252. return hr;
  253. }
  254. HRESULT CJunkFilter::_HrBuildBodyList(USHORT cBodyItems)
  255. {
  256. HRESULT hr = S_OK;
  257. USHORT usIndex = 0;
  258. FEATURECOMP * pfcomp = NULL;
  259. USHORT iBodyList = 0;
  260. // Check incoming params
  261. if (0 == cBodyItems)
  262. {
  263. hr = E_INVALIDARG;
  264. goto exit;
  265. }
  266. Assert(USHRT_MAX > cBodyItems);
  267. // Make sure the old items are freed
  268. SafeMemFree(m_pblistBodyList);
  269. m_cblistBodyList = 0;
  270. // Initialize the list
  271. ZeroMemory(m_rgiBodyList, sizeof(m_rgiBodyList));
  272. // Allocate space to hold all of the items
  273. hr = HrAlloc((VOID **) &m_pblistBodyList, sizeof(*m_pblistBodyList) * (cBodyItems + 1));
  274. if (FAILED(hr))
  275. {
  276. goto exit;
  277. }
  278. // Initialize the body list
  279. ZeroMemory(m_pblistBodyList, sizeof(*m_pblistBodyList) * (cBodyItems + 1));
  280. // For each feature
  281. for (usIndex = 0, iBodyList = 1, pfcomp = m_rgfeaturecomps; usIndex < m_cFeatureComps; usIndex++, pfcomp++)
  282. {
  283. // If it's a body feature
  284. if (locBody == pfcomp->loc)
  285. {
  286. // Initialize it
  287. m_pblistBodyList[iBodyList].usItem = usIndex;
  288. // Add it to the list
  289. m_pblistBodyList[iBodyList].iNext = m_rgiBodyList[(UCHAR) (pfcomp->pszFeature[0])];
  290. m_rgiBodyList[(UCHAR) (pfcomp->pszFeature[0])] = iBodyList;
  291. // Move to the next body item
  292. iBodyList++;
  293. }
  294. }
  295. // Save the number of items
  296. m_cblistBodyList = cBodyItems + 1;
  297. // Set the return value
  298. hr = S_OK;
  299. exit:
  300. return hr;
  301. }
  302. /////////////////////////////////////////////////////////////////////////////
  303. // _FReadSVMOutput
  304. //
  305. // Read the SVM output from a file (".LKO file")
  306. /////////////////////////////////////////////////////////////////////////////
  307. HRESULT CJunkFilter::_HrReadSVMOutput(LPCSTR pszFileName)
  308. {
  309. HRESULT hr = S_OK;
  310. CParseStream parsestm;
  311. ULONG ulIndex = 0;
  312. LPSTR pszBuff = NULL;
  313. ULONG cchBuff = 0;
  314. LPSTR pszDummy = NULL;
  315. LPSTR pszDefThresh = NULL;
  316. ULONG cFeatureComponents = 0;
  317. LPSTR pszFeature = NULL;
  318. ULONG ulFeatureComp = 0;
  319. USHORT cBodyItems = 0;
  320. FEATURECOMP * pfeaturecomp = NULL;
  321. if ((NULL == pszFileName) || ('\0' == *pszFileName))
  322. {
  323. hr = E_INVALIDARG;
  324. goto exit;
  325. }
  326. // Get the parse stream
  327. hr = parsestm.HrSetFile(0, pszFileName);
  328. if (FAILED(hr))
  329. {
  330. goto exit;
  331. }
  332. // skip first two lines
  333. for (ulIndex = 0; ulIndex < 3; ulIndex++)
  334. {
  335. SafeMemFree(pszBuff);
  336. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  337. if (FAILED(hr))
  338. {
  339. goto exit;
  340. }
  341. }
  342. // parse 3rd line: only care about CC and DD
  343. if (FALSE == FReadDouble(pszBuff, "cc = ", &m_dblCC))
  344. {
  345. hr = E_FAIL;
  346. goto exit;
  347. }
  348. if (FALSE == FReadDouble(pszBuff, "dd = ", &m_dblDD))
  349. {
  350. hr = E_FAIL;
  351. goto exit;
  352. }
  353. SafeMemFree(pszBuff);
  354. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  355. if (FAILED(hr))
  356. {
  357. goto exit;
  358. }
  359. if (FALSE == FReadDouble(pszBuff, szDefaultThresh, &m_dblDefaultThresh))
  360. {
  361. m_dblDefaultThresh = THRESH_DEFAULT;
  362. }
  363. if (0 == m_dblSpamCutoff)
  364. {
  365. m_dblSpamCutoff = m_dblDefaultThresh;
  366. }
  367. if (FALSE == FReadDouble(pszBuff, szThresh, &m_dblThresh))
  368. {
  369. hr = E_FAIL;
  370. goto exit;
  371. }
  372. SafeMemFree(pszBuff);
  373. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  374. if (FAILED(hr))
  375. {
  376. goto exit;
  377. }
  378. if (FALSE == FReadDouble(pszBuff, szMostThresh, &m_dblMostThresh))
  379. {
  380. m_dblMostThresh = THRESH_MOST;
  381. }
  382. if (FALSE == FReadDouble(pszBuff, szLeastThresh, &m_dblLeastThresh))
  383. {
  384. m_dblLeastThresh = THRESH_LEAST;
  385. }
  386. SafeMemFree(pszBuff);
  387. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  388. if (FAILED(hr))
  389. {
  390. goto exit;
  391. }
  392. m_cFeatures = StrToInt(pszBuff + lstrlen(szNumberofDim));
  393. if (0 == m_cFeatures)
  394. {
  395. hr = E_FAIL;
  396. goto exit;
  397. }
  398. // We only support up to USHRT_MAX features
  399. if (m_cFeatures >= USHRT_MAX)
  400. {
  401. hr = E_OUTOFMEMORY;
  402. goto exit;
  403. }
  404. SafeMemFree(pszBuff);
  405. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  406. if (FAILED(hr))
  407. {
  408. goto exit;
  409. }
  410. pszDummy = StrStr(pszBuff, szCountFeatureComp);
  411. if (NULL != pszDummy)
  412. {
  413. pszDummy += lstrlen(szCountFeatureComp);
  414. cFeatureComponents = StrToInt(pszDummy);
  415. }
  416. if (cFeatureComponents < m_cFeatures)
  417. {
  418. cFeatureComponents = m_cFeatures * 2;
  419. }
  420. while (0 != lstrcmp(pszBuff, "Weights"))
  421. {
  422. SafeMemFree(pszBuff);
  423. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  424. if (FAILED(hr))
  425. {
  426. goto exit;
  427. }
  428. }
  429. SafeMemFree(m_rgdblSVMWeights);
  430. hr = HrAlloc((void **) &m_rgdblSVMWeights, sizeof(*m_rgdblSVMWeights) * m_cFeatures);
  431. if (FAILED(hr))
  432. {
  433. goto exit;
  434. }
  435. SafeMemFree(m_rgulFeatureStatus);
  436. hr = HrAlloc((void **) &m_rgulFeatureStatus, sizeof(*m_rgulFeatureStatus) * m_cFeatures);
  437. if (FAILED(hr))
  438. {
  439. goto exit;
  440. }
  441. FillMemory(m_rgulFeatureStatus, sizeof(*m_rgulFeatureStatus) * m_cFeatures, -1);
  442. SafeMemFree(m_rgfeaturecomps);
  443. hr = HrAlloc((void **) &m_rgfeaturecomps, sizeof(*m_rgfeaturecomps) * cFeatureComponents);
  444. if (FAILED(hr))
  445. {
  446. goto exit;
  447. }
  448. // Initialize the features
  449. ZeroMemory(m_rgfeaturecomps, sizeof(*m_rgfeaturecomps) * cFeatureComponents);
  450. for (ulIndex = 0; ulIndex < m_cFeatures; ulIndex++)
  451. {
  452. UINT uiLoc;
  453. USHORT cbStr;
  454. boolop bop;
  455. BOOL fContinue;
  456. BOOL fNegative;
  457. SafeMemFree(pszBuff);
  458. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  459. if (FAILED(hr))
  460. {
  461. goto exit;
  462. }
  463. // read the SVM weight
  464. pszDummy = pszBuff;
  465. fNegative = ('-' == *pszDummy);
  466. pszDummy++;
  467. m_rgdblSVMWeights[ulIndex] = StrToDbl(pszDummy, &pszDummy);
  468. if (FALSE != fNegative)
  469. {
  470. m_rgdblSVMWeights[ulIndex] *= -1;
  471. }
  472. pszDummy++; // skip the separator
  473. bop = boolopOr;
  474. fContinue = false;
  475. do
  476. {
  477. pfeaturecomp = &m_rgfeaturecomps[ulFeatureComp++];
  478. // Skip over white space
  479. UlStripWhitespace(pszDummy, TRUE, FALSE, NULL);
  480. // Location (or "special")
  481. uiLoc = StrToInt(pszDummy);
  482. pszDummy = StrStr(pszDummy, ":"); // skip the separator
  483. pszDummy++;
  484. pfeaturecomp->loc = (FeatureLocation)uiLoc;
  485. pfeaturecomp->ulFeature = ulIndex;
  486. pfeaturecomp->bop = bop;
  487. if (locBody == pfeaturecomp->loc)
  488. {
  489. cBodyItems++;
  490. }
  491. if (uiLoc == 5)
  492. {
  493. UINT uiRuleNumber = StrToInt(pszDummy);
  494. pszDummy += StrSpn(pszDummy, "0123456789");
  495. pfeaturecomp->ulRuleNum = uiRuleNumber;
  496. }
  497. else
  498. {
  499. cbStr = (USHORT) StrToInt(pszDummy);
  500. pszDummy = StrStr(pszDummy, ":");
  501. pszDummy++;
  502. // We only support strings up to USHRT_MAX
  503. if (cbStr >= USHRT_MAX)
  504. {
  505. hr = E_OUTOFMEMORY;
  506. goto exit;
  507. }
  508. hr = HrAlloc((void **) &pszFeature, sizeof(*pszFeature) * (cbStr + 1));
  509. if (FAILED(hr))
  510. {
  511. goto exit;
  512. }
  513. StrCpyN(pszFeature, pszDummy, cbStr + 1);
  514. pszDummy += cbStr;
  515. if ('\0' != *pszDummy)
  516. {
  517. pszDummy++; // skip the separator
  518. }
  519. pszFeature[cbStr] = '\0';
  520. Assert(cbStr == strlen(pszFeature));
  521. // Save off the string
  522. pfeaturecomp->pszFeature = pszFeature;
  523. pszFeature = NULL;
  524. pfeaturecomp->cchFeature = cbStr;
  525. }
  526. UlStripWhitespace(pszDummy, TRUE, FALSE, NULL);
  527. switch(*pszDummy)
  528. {
  529. case '|':
  530. bop = boolopOr;
  531. fContinue = TRUE;
  532. break;
  533. case '&':
  534. bop = boolopAnd;
  535. fContinue = TRUE;
  536. break;
  537. default:
  538. fContinue = FALSE;
  539. break;
  540. }
  541. pszDummy++;
  542. }
  543. while (fContinue);
  544. }
  545. m_cFeatureComps = ulFeatureComp;
  546. // Build up body items...
  547. hr = _HrBuildBodyList(cBodyItems);
  548. if (FAILED(hr))
  549. {
  550. goto exit;
  551. }
  552. hr = S_OK;
  553. exit:
  554. SafeMemFree(pszFeature);
  555. SafeMemFree(pszBuff);
  556. return hr;
  557. }
  558. /////////////////////////////////////////////////////////////////////////////
  559. // _FInvokeSpecialRule
  560. //
  561. // Invokes the special rule that is this FEATURECOMP.
  562. // Returns the state of the feature.
  563. /////////////////////////////////////////////////////////////////////////////
  564. BOOL CJunkFilter::_FInvokeSpecialRule(UINT iRuleNum)
  565. {
  566. BOOL fRet = FALSE;
  567. SYSTEMTIME stSent;
  568. CHAR rgchYear[6];
  569. ULONG cbSize = 0;
  570. DWORD dwDummy = 0;
  571. switch (iRuleNum)
  572. {
  573. case 1:
  574. fRet = FStreamStringSearch(m_pIStmBody, &dwDummy, m_pszFirstName, m_cchFirstName, 0);
  575. break;
  576. case 2:
  577. fRet = FStreamStringSearch(m_pIStmBody, &dwDummy, m_pszLastName, m_cchLastName, 0);
  578. break;
  579. case 3:
  580. fRet = FStreamStringSearch(m_pIStmBody, &dwDummy, m_pszCompanyName, m_cchCompanyName, 0);
  581. break;
  582. case 4:
  583. // year message received
  584. if (FALSE == FTimeEmpty(&m_ftMessageSent))
  585. {
  586. // Convert to system time so we can get the year
  587. SideAssert(FALSE != FileTimeToSystemTime(&m_ftMessageSent, &stSent));
  588. wnsprintf(rgchYear, ARRAYSIZE(rgchYear), "%d", stSent.wYear);
  589. dwDummy = CT_START_SET | CT_START_NUM | CT_END_SET | CT_END_NUM;
  590. fRet = FStreamStringSearch(m_pIStmBody, &dwDummy, rgchYear, lstrlen(rgchYear), SSF_CASESENSITIVE);
  591. }
  592. break;
  593. case 5:
  594. // message received in the wee hours (>= 7pm or <6am
  595. if (FALSE == FTimeEmpty(&m_ftMessageSent))
  596. {
  597. // Convert to system time so we can get the year
  598. SideAssert(FALSE != FileTimeToSystemTime(&m_ftMessageSent, &stSent));
  599. fRet = (stSent.wHour >= (7 + 12)) || (stSent.wHour < 6);
  600. }
  601. break;
  602. case 6:
  603. // message received on weekend
  604. if (FALSE == FTimeEmpty(&m_ftMessageSent))
  605. {
  606. // Convert to system time so we can get the year
  607. SideAssert(FALSE != FileTimeToSystemTime(&m_ftMessageSent, &stSent));
  608. fRet = ((0 == stSent.wDayOfWeek) || (6 == stSent.wDayOfWeek));
  609. }
  610. break;
  611. case 14:
  612. fRet = m_fRule14; // set in _HandleCaseSensitiveSpecialRules()
  613. break;
  614. case 15:
  615. fRet = FSpecialFeatureNonAlphaStm(m_pIStmBody);
  616. break;
  617. case 16:
  618. fRet = m_fDirectMessage;
  619. break;
  620. case 17:
  621. fRet = m_fRule17; // set in _HandleCaseSensitiveSpecialRules()
  622. break;
  623. case 18:
  624. fRet = FSpecialFeatureNonAlpha(m_pszSubject);
  625. break;
  626. case 19:
  627. fRet = ((NULL == m_pszTo) || ('\0' == *m_pszTo));
  628. break;
  629. case 20:
  630. fRet = m_fHasAttach;
  631. break;
  632. case 40:
  633. fRet = (m_cbBody >= 125);
  634. break;
  635. case 41:
  636. fRet = (m_cbBody >= 250);
  637. break;
  638. case 42:
  639. fRet = (m_cbBody >= 500);
  640. break;
  641. case 43:
  642. fRet = (m_cbBody >= 1000);
  643. break;
  644. case 44:
  645. fRet = (m_cbBody >= 2000);
  646. break;
  647. case 45:
  648. fRet = (m_cbBody >= 4000);
  649. break;
  650. case 46:
  651. fRet = (m_cbBody >= 8000);
  652. break;
  653. case 47:
  654. fRet = (m_cbBody >= 16000);
  655. break;
  656. default:
  657. AssertSz(FALSE, "unsupported special feature");
  658. break;
  659. }
  660. return fRet;
  661. }
  662. /////////////////////////////////////////////////////////////////////////////
  663. // _HandleCaseSensitiveSpecialRules
  664. //
  665. // Called from _EvaluateFeatureComponents().
  666. // Some special rules are case sensitive, so if they're present, we'll
  667. // evaluate them before we make the texts uppercase and cache the result
  668. // for when they are actually used.
  669. /////////////////////////////////////////////////////////////////////////////
  670. VOID CJunkFilter::_HandleCaseSensitiveSpecialRules()
  671. {
  672. ULONG ulIndex = 0;
  673. for (ulIndex = 0; ulIndex < m_cFeatureComps; ulIndex++)
  674. {
  675. if (m_rgfeaturecomps[ulIndex].loc == locSpecial)
  676. {
  677. switch (m_rgfeaturecomps[ulIndex].ulRuleNum)
  678. {
  679. case 14:
  680. m_fRule14 = FSpecialFeatureUpperCaseWordsStm(m_pIStmBody);
  681. break;
  682. case 17:
  683. m_fRule17 = FSpecialFeatureUpperCaseWords(m_pszSubject);
  684. break;
  685. default:
  686. break;
  687. }
  688. }
  689. }
  690. return;
  691. }
  692. VOID CJunkFilter::_EvaluateBodyFeatures(VOID)
  693. {
  694. CBodyBuff buffBody;
  695. CHAR chMatch = '\0';
  696. ULONG ulIndex = 0;
  697. FEATURECOMP * pfcomp = NULL;
  698. USHORT iBodyList = 0;
  699. // Check to see if we have work to do
  700. if (NULL == m_pIStmBody)
  701. {
  702. goto exit;
  703. }
  704. // Set the stream into the buffer
  705. if (FAILED(buffBody.HrInit(0, m_pIStmBody)))
  706. {
  707. goto exit;
  708. }
  709. // Initialize all the body features to no found
  710. for (iBodyList = 1; iBodyList < m_cblistBodyList; iBodyList++)
  711. {
  712. // Set it to not found
  713. m_rgfeaturecomps[m_pblistBodyList[iBodyList].usItem].fPresent = FALSE;
  714. }
  715. // While we have more bytes to read
  716. for (; S_OK == buffBody.HrGetCurrChar(&chMatch); buffBody.HrMoveNext())
  717. {
  718. // Search for a match through the feature list
  719. for (iBodyList = m_rgiBodyList[(UCHAR) chMatch]; 0 != iBodyList; iBodyList = m_pblistBodyList[iBodyList].iNext)
  720. {
  721. pfcomp = &(m_rgfeaturecomps[m_pblistBodyList[iBodyList].usItem]);
  722. // If we have a body item and it hasn't been found yet
  723. if (FALSE == pfcomp->fPresent)
  724. {
  725. // Could this item be a possible match???
  726. Assert(NULL != pfcomp->pszFeature);
  727. // Try to do the comparison
  728. pfcomp->fPresent = buffBody.FDoMatch(pfcomp);
  729. }
  730. }
  731. }
  732. exit:
  733. return;
  734. }
  735. /////////////////////////////////////////////////////////////////////////////
  736. // _EvaluateFeatureComponents
  737. //
  738. // Evaluates all of the feature components. Sets fPresent in each component
  739. // to true if the feature is present, false otherwise
  740. /////////////////////////////////////////////////////////////////////////////
  741. VOID CJunkFilter::_EvaluateFeatureComponents(VOID)
  742. {
  743. ULONG ulIndex = 0;
  744. FEATURECOMP * pfcomp = NULL;
  745. _HandleCaseSensitiveSpecialRules();
  746. if (NULL != m_pszFrom)
  747. {
  748. CharUpperBuff(m_pszFrom, lstrlen(m_pszFrom));
  749. }
  750. if (NULL != m_pszTo)
  751. {
  752. CharUpperBuff(m_pszTo, lstrlen(m_pszTo));
  753. }
  754. if (NULL != m_pszSubject)
  755. {
  756. CharUpperBuff(m_pszSubject, lstrlen(m_pszSubject));
  757. }
  758. for (ulIndex = 0; ulIndex < m_cFeatureComps; ulIndex++)
  759. {
  760. pfcomp = &m_rgfeaturecomps[ulIndex];
  761. switch(pfcomp->loc)
  762. {
  763. case locNil:
  764. Assert(locNil != pfcomp->loc);
  765. pfcomp->fPresent = FALSE;
  766. break;
  767. case locSubj:
  768. pfcomp->fPresent = FWordPresent(m_pszSubject, &(pfcomp->dwFlags), pfcomp->pszFeature, pfcomp->cchFeature, NULL);
  769. break;
  770. case locFrom:
  771. pfcomp->fPresent = FWordPresent(m_pszFrom, &(pfcomp->dwFlags), pfcomp->pszFeature, pfcomp->cchFeature, NULL);
  772. break;
  773. case locTo:
  774. pfcomp->fPresent = FWordPresent(m_pszTo, &(pfcomp->dwFlags), pfcomp->pszFeature, pfcomp->cchFeature, NULL);
  775. break;
  776. case locSpecial:
  777. pfcomp->fPresent = _FInvokeSpecialRule(pfcomp->ulRuleNum);
  778. break;
  779. }
  780. }
  781. }
  782. /////////////////////////////////////////////////////////////////////////////
  783. // ProcessFeatureComponentPresence
  784. //
  785. // Processes the presence (or absence) of the individual feature components,
  786. // setting the feature status of each feature (which may me made up of
  787. // multiple feature components).
  788. /////////////////////////////////////////////////////////////////////////////
  789. VOID CJunkFilter::_ProcessFeatureComponentPresence(VOID)
  790. {
  791. ULONG ulIndex = 0;
  792. FEATURECOMP * pfcomp = NULL;
  793. ULONG ulFeature = 0;
  794. for (ulIndex = 0; ulIndex < m_cFeatureComps; ulIndex++)
  795. {
  796. pfcomp = &m_rgfeaturecomps[ulIndex];
  797. ulFeature = pfcomp->ulFeature;
  798. if (-1 == m_rgulFeatureStatus[ulFeature]) // first feature of this feature
  799. {
  800. if (FALSE != pfcomp->fPresent)
  801. {
  802. m_rgulFeatureStatus[ulFeature] = 1;
  803. }
  804. else
  805. {
  806. m_rgulFeatureStatus[ulFeature] = 0;
  807. }
  808. }
  809. else
  810. {
  811. switch (pfcomp->bop)
  812. {
  813. case boolopOr:
  814. if (pfcomp->fPresent)
  815. {
  816. m_rgulFeatureStatus[ulFeature] = 1;
  817. }
  818. break;
  819. case boolopAnd:
  820. if (!pfcomp->fPresent)
  821. {
  822. m_rgulFeatureStatus[ulFeature] = 0;
  823. }
  824. break;
  825. default:
  826. Assert(FALSE);
  827. break;
  828. }
  829. }
  830. }
  831. }
  832. /////////////////////////////////////////////////////////////////////////////
  833. // _DblDoSVMCalc
  834. //
  835. // Does the actual support vector machine calculation.
  836. // Returns the probability that the message is spam
  837. /////////////////////////////////////////////////////////////////////////////
  838. DOUBLE CJunkFilter::_DblDoSVMCalc(VOID)
  839. {
  840. DOUBLE dblAccum;
  841. DOUBLE dblResult;
  842. ULONG ulIndex = 0;
  843. dblAccum = 0.0;
  844. for (ulIndex = 0; ulIndex < m_cFeatures; ulIndex++)
  845. {
  846. if (m_rgulFeatureStatus[ulIndex] == 1)
  847. {
  848. dblAccum += m_rgdblSVMWeights[ulIndex];
  849. #ifdef DEBUG
  850. if (NULL != m_pILogFile)
  851. {
  852. _PrintFeatureToLog(ulIndex);
  853. }
  854. #endif // DEBUG
  855. }
  856. else if (m_rgulFeatureStatus[ulIndex] != 0)
  857. {
  858. AssertSz(FALSE, "What happened here!");
  859. }
  860. }
  861. // Apply threshold;
  862. dblAccum -= m_dblThresh;
  863. // Apply sigmoid
  864. dblResult = (1 / (1 + exp((m_dblCC * dblAccum) + m_dblDD)));
  865. return dblResult;
  866. }
  867. /////////////////////////////////////////////////////////////////////////////
  868. // BCalculateSpamProb
  869. //
  870. // Calculates the probability that the current message is spam.
  871. // Returns the probability (0 to 1) that the message is spam in prSpamProb
  872. // the boolean return is determined by comparing to the spam cutoff
  873. /////////////////////////////////////////////////////////////////////////////
  874. BOOL CJunkFilter::FCalculateSpamProb(LPSTR pszFrom, LPSTR pszTo, LPSTR pszSubject, IStream * pIStmBody,
  875. BOOL fDirectMessage, BOOL fHasAttach, FILETIME * pftMessageSent,
  876. DOUBLE * pdblSpamProb, BOOL * pfIsSpam)
  877. {
  878. #ifdef DEBUG
  879. CHAR rgchBuff[1024];
  880. DWORD dwVal = 0;
  881. #endif // DEBUG
  882. m_pszFrom = pszFrom;
  883. m_pszTo = pszTo;
  884. m_pszSubject = pszSubject;
  885. m_pIStmBody = pIStmBody;
  886. m_fDirectMessage = fDirectMessage;
  887. m_fHasAttach = fHasAttach;
  888. m_ftMessageSent = *pftMessageSent;
  889. // Set the size of the body
  890. if ((NULL == m_pIStmBody) || (FAILED(HrGetStreamSize(m_pIStmBody, &m_cbBody))))
  891. {
  892. m_cbBody = 0;
  893. }
  894. #ifdef DEBUG
  895. // Get the logfile if we need it
  896. if (NULL == m_pILogFile)
  897. {
  898. _HrCreateLogFile();
  899. }
  900. if (NULL != m_pILogFile)
  901. {
  902. PrintToLogFile(m_pILogFile, LOG_TAGLINE, pszSubject);
  903. PrintToLogFile(m_pILogFile, LOG_FIRSTNAME, m_pszFirstName);
  904. PrintToLogFile(m_pILogFile, LOG_LASTNAME, m_pszLastName);
  905. PrintToLogFile(m_pILogFile, LOG_COMPANYNAME, m_pszCompanyName);
  906. }
  907. #endif // DEBUG
  908. _EvaluateBodyFeatures();
  909. _EvaluateFeatureComponents();
  910. _ProcessFeatureComponentPresence();
  911. *pdblSpamProb = _DblDoSVMCalc();
  912. #ifdef DEBUG
  913. if (NULL != m_pILogFile)
  914. {
  915. dwVal = ( DWORD ) ((*pdblSpamProb * 1000000) + 0.5);
  916. wnsprintf(rgchBuff, ARRAYSIZE(rgchBuff), LOG_FINAL, dwVal / 1000000, dwVal % 1000000);
  917. m_pILogFile->WriteLog(LOGFILE_DB, rgchBuff);
  918. m_pILogFile->WriteLog(LOGFILE_DB, "");
  919. }
  920. #endif // DEBUG
  921. *pfIsSpam = (*pdblSpamProb > m_dblSpamCutoff);
  922. return TRUE;
  923. }
  924. /////////////////////////////////////////////////////////////////////////////
  925. // BReadDefaultSpamCutoff
  926. //
  927. // Reads the default spam cutoff without parsing entire file
  928. // Use GetDefaultSpamCutoff if using HrSetSVMDataLocation;
  929. // static member function
  930. /////////////////////////////////////////////////////////////////////////////
  931. HRESULT CJunkFilter::HrReadDefaultSpamCutoff(LPSTR pszFullPath, DOUBLE * pdblDefCutoff)
  932. {
  933. HRESULT hr = S_OK;
  934. CParseStream parsestm;
  935. LPSTR pszBuff = NULL;
  936. ULONG cchBuff = 0;
  937. LPSTR pszDefThresh = NULL;
  938. ULONG ulIndex = 0;
  939. LPSTR pszDummy = NULL;
  940. if ((NULL == pszFullPath) || ('\0' == *pszFullPath) || (NULL == pdblDefCutoff))
  941. {
  942. hr = E_INVALIDARG;
  943. goto exit;
  944. }
  945. // Get the parse stream
  946. hr = parsestm.HrSetFile(0, pszFullPath);
  947. if (FAILED(hr))
  948. {
  949. goto exit;
  950. }
  951. // skip first three lines
  952. for (ulIndex = 0; ulIndex < 4; ulIndex++)
  953. {
  954. SafeMemFree(pszBuff);
  955. hr = parsestm.HrGetLine(0, &pszBuff, &cchBuff);
  956. if (FAILED(hr))
  957. {
  958. goto exit;
  959. }
  960. }
  961. // Find the default threshold
  962. pszDefThresh = StrStr(pszBuff, ::szDefaultThresh);
  963. if (NULL == pszDefThresh)
  964. {
  965. hr = E_FAIL;
  966. goto exit;
  967. }
  968. // Grab the value
  969. pszDefThresh += lstrlen(::szDefaultThresh);
  970. *pdblDefCutoff = StrToDbl(pszDefThresh, &pszDummy);
  971. // Set the proper return value
  972. hr = S_OK;
  973. exit:
  974. SafeMemFree(pszBuff);
  975. return hr;
  976. }
  977. /////////////////////////////////////////////////////////////////////////////
  978. // Constructor/destructor
  979. //
  980. /////////////////////////////////////////////////////////////////////////////
  981. CJunkFilter::CJunkFilter() : m_cRef(0), m_pszFirstName(NULL), m_cchFirstName(0), m_pszLastName(NULL),
  982. m_cchLastName(0), m_pszCompanyName(NULL), m_cchCompanyName(0), m_pblistBodyList(NULL),
  983. m_cblistBodyList(0), m_rgfeaturecomps(NULL), m_rgdblSVMWeights(NULL), m_dblCC(0), m_dblDD(0),
  984. m_dblThresh(-1), m_dblDefaultThresh(-1), m_dblMostThresh(0), m_dblLeastThresh(0), m_cFeatures(0),
  985. m_cFeatureComps(0), m_rgulFeatureStatus(0),
  986. m_pszLOCPath(NULL), m_dblSpamCutoff(0), m_pszFrom(NULL), m_pszTo(NULL), m_pszSubject(NULL),
  987. m_pIStmBody(NULL), m_cbBody(0), m_fDirectMessage(FALSE), m_fHasAttach(FALSE),
  988. m_fRule14(FALSE), m_fRule17(FALSE)
  989. {
  990. ZeroMemory(m_rgiBodyList, sizeof(m_rgiBodyList));
  991. ZeroMemory(&m_ftMessageSent, sizeof(m_ftMessageSent));
  992. InitializeCriticalSection(&m_cs);
  993. #ifdef DEBUG
  994. m_fJunkMailLogInit = FALSE;
  995. m_pILogFile = NULL;
  996. #endif // DEBUG
  997. }
  998. CJunkFilter::~CJunkFilter()
  999. {
  1000. ULONG ulIndex = 0;
  1001. SafeMemFree(m_pszFirstName);
  1002. SafeMemFree(m_pszLastName);
  1003. SafeMemFree(m_pszCompanyName);
  1004. #ifdef DEBUG
  1005. SafeRelease(m_pILogFile);
  1006. #endif // DEBUG
  1007. for (ulIndex = 0; ulIndex < m_cFeatureComps; ulIndex++)
  1008. {
  1009. if ((locNil != m_rgfeaturecomps[ulIndex].loc) && (locSpecial != m_rgfeaturecomps[ulIndex].loc))
  1010. {
  1011. SafeMemFree(m_rgfeaturecomps[ulIndex].pszFeature);
  1012. }
  1013. }
  1014. SafeMemFree(m_pblistBodyList);
  1015. m_cblistBodyList = 0;
  1016. ZeroMemory(m_rgiBodyList, sizeof(m_rgiBodyList));
  1017. SafeMemFree(m_rgdblSVMWeights);
  1018. SafeMemFree(m_rgulFeatureStatus);
  1019. SafeMemFree(m_rgfeaturecomps);
  1020. DeleteCriticalSection(&m_cs);
  1021. }
  1022. STDMETHODIMP_(ULONG) CJunkFilter::AddRef()
  1023. {
  1024. return ::InterlockedIncrement(&m_cRef);
  1025. }
  1026. STDMETHODIMP_(ULONG) CJunkFilter::Release()
  1027. {
  1028. LONG cRef = 0;
  1029. cRef = ::InterlockedDecrement(&m_cRef);
  1030. if (0 == cRef)
  1031. {
  1032. delete this;
  1033. return cRef;
  1034. }
  1035. return cRef;
  1036. }
  1037. STDMETHODIMP CJunkFilter::QueryInterface(REFIID riid, void ** ppvObject)
  1038. {
  1039. HRESULT hr = S_OK;
  1040. // Check the incoming params
  1041. if (NULL == ppvObject)
  1042. {
  1043. hr = E_INVALIDARG;
  1044. goto exit;
  1045. }
  1046. // Initialize outgoing param
  1047. *ppvObject = NULL;
  1048. if ((riid == IID_IUnknown) || (riid == IID_IOEJunkFilter))
  1049. {
  1050. *ppvObject = static_cast<IOEJunkFilter *>(this);
  1051. }
  1052. else
  1053. {
  1054. hr = E_NOINTERFACE;
  1055. goto exit;
  1056. }
  1057. reinterpret_cast<IUnknown *>(*ppvObject)->AddRef();
  1058. hr = S_OK;
  1059. exit:
  1060. return hr;
  1061. }
  1062. STDMETHODIMP CJunkFilter::SetIdentity(LPCSTR pszFirstName, LPCSTR pszLastName, LPCSTR pszCompanyName)
  1063. {
  1064. HRESULT hr = S_OK;
  1065. //Set the new first name
  1066. SafeMemFree(m_pszFirstName);
  1067. m_cchFirstName = 0;
  1068. if (NULL != pszFirstName)
  1069. {
  1070. m_pszFirstName = PszDupA(pszFirstName);
  1071. if (NULL == m_pszFirstName)
  1072. {
  1073. hr = E_OUTOFMEMORY;
  1074. goto exit;
  1075. }
  1076. m_cchFirstName = CharUpperBuff(m_pszFirstName, lstrlen(m_pszFirstName));
  1077. }
  1078. // Set the new last name
  1079. SafeMemFree(m_pszLastName);
  1080. m_cchLastName = 0;
  1081. if (NULL != pszLastName)
  1082. {
  1083. m_pszLastName = PszDupA(pszLastName);
  1084. if (NULL == m_pszLastName)
  1085. {
  1086. hr = E_OUTOFMEMORY;
  1087. goto exit;
  1088. }
  1089. m_cchLastName = CharUpperBuff(m_pszLastName, lstrlen(m_pszLastName));
  1090. }
  1091. // Set the new company name
  1092. SafeMemFree(m_pszCompanyName);
  1093. m_cchCompanyName = 0;
  1094. if (NULL != pszCompanyName)
  1095. {
  1096. m_pszCompanyName = PszDupA(pszCompanyName);
  1097. if (NULL == m_pszCompanyName)
  1098. {
  1099. hr = E_OUTOFMEMORY;
  1100. goto exit;
  1101. }
  1102. m_cchCompanyName = CharUpperBuff(m_pszCompanyName, lstrlen(m_pszCompanyName));
  1103. }
  1104. hr = S_OK;
  1105. exit:
  1106. return hr;
  1107. }
  1108. STDMETHODIMP CJunkFilter::LoadDataFile(LPCSTR pszFilePath)
  1109. {
  1110. HRESULT hr = S_OK;
  1111. if ((NULL == pszFilePath) || ('\0' == *pszFilePath))
  1112. {
  1113. hr = E_INVALIDARG;
  1114. goto exit;
  1115. }
  1116. hr = _HrReadSVMOutput(pszFilePath);
  1117. if (FAILED(hr))
  1118. {
  1119. AssertSz(FALSE, "Unable to successfully read filter params");
  1120. goto exit;
  1121. }
  1122. // Set the proper return value
  1123. hr = S_OK;
  1124. exit:
  1125. return hr;
  1126. }
  1127. STDMETHODIMP CJunkFilter::SetSpamThresh(ULONG ulThresh)
  1128. {
  1129. HRESULT hr = S_OK;
  1130. switch (ulThresh)
  1131. {
  1132. case STF_USE_MOST:
  1133. m_dblSpamCutoff = m_dblMostThresh;
  1134. break;
  1135. case STF_USE_MORE:
  1136. m_dblSpamCutoff = m_dblDefaultThresh + ((m_dblMostThresh - m_dblDefaultThresh) / 2);
  1137. break;
  1138. case STF_USE_DEFAULT:
  1139. m_dblSpamCutoff = m_dblDefaultThresh;
  1140. break;
  1141. case STF_USE_LESS:
  1142. m_dblSpamCutoff = m_dblDefaultThresh - ((m_dblDefaultThresh - m_dblLeastThresh) / 2);
  1143. break;
  1144. case STF_USE_LEAST:
  1145. m_dblSpamCutoff = m_dblLeastThresh;
  1146. break;
  1147. default:
  1148. hr = E_INVALIDARG;
  1149. goto exit;
  1150. }
  1151. hr = S_OK;
  1152. exit:
  1153. return hr;
  1154. }
  1155. STDMETHODIMP CJunkFilter::GetSpamThresh(ULONG * pulThresh)
  1156. {
  1157. HRESULT hr = S_OK;
  1158. ULONG ulThresh = 0;
  1159. // Check the incoming params
  1160. if (NULL == pulThresh)
  1161. {
  1162. hr = E_INVALIDARG;
  1163. goto exit;
  1164. }
  1165. // Initialize outgoing params
  1166. if (m_dblDefaultThresh == m_dblSpamCutoff)
  1167. {
  1168. ulThresh = STF_USE_DEFAULT;
  1169. }
  1170. else if (m_dblMostThresh == m_dblSpamCutoff)
  1171. {
  1172. ulThresh = STF_USE_MOST;
  1173. }
  1174. else if (m_dblLeastThresh == m_dblSpamCutoff)
  1175. {
  1176. ulThresh = STF_USE_LEAST;
  1177. }
  1178. else if (m_dblSpamCutoff > m_dblDefaultThresh)
  1179. {
  1180. ulThresh = STF_USE_MORE;
  1181. }
  1182. else
  1183. {
  1184. ulThresh = STF_USE_LESS;
  1185. }
  1186. hr = S_OK;
  1187. exit:
  1188. return hr;
  1189. }
  1190. STDMETHODIMP CJunkFilter::GetDefaultSpamThresh(DOUBLE * pdblThresh)
  1191. {
  1192. HRESULT hr = S_OK;
  1193. // Check the incoming params
  1194. if (NULL == pdblThresh)
  1195. {
  1196. hr = E_INVALIDARG;
  1197. goto exit;
  1198. }
  1199. // Initialize outgoing params
  1200. *pdblThresh = m_dblDefaultThresh * 100.0;
  1201. hr = S_OK;
  1202. exit:
  1203. return hr;
  1204. }
  1205. STDMETHODIMP CJunkFilter::CalcJunkProb(DWORD dwFlags, IMimePropertySet * pIMPropSet, IMimeMessage * pIMMsg, double * pdblProb)
  1206. {
  1207. HRESULT hr = S_OK;
  1208. BOOL fSpam = FALSE;
  1209. PROPVARIANT propvar = {0};
  1210. DWORD dwFlagsMsg = 0;
  1211. FILETIME ftMsgSent = {0};
  1212. LPSTR pszFrom = NULL;
  1213. LPSTR pszTo = NULL;
  1214. LPSTR pszSubject = NULL;
  1215. IStream * pIStmBody = NULL;
  1216. IStream * pIStmHtml = NULL;
  1217. BOOL fSentToMe = FALSE;
  1218. BOOL fHasAttachments = FALSE;
  1219. if ((NULL == pIMPropSet) || (NULL == pIMMsg))
  1220. {
  1221. hr = E_INVALIDARG;
  1222. goto exit;
  1223. }
  1224. // Get Message Flags
  1225. if (SUCCEEDED(pIMMsg->GetFlags(&dwFlagsMsg)))
  1226. {
  1227. fHasAttachments = (0 != (dwFlagsMsg & IMF_ATTACHMENTS));
  1228. }
  1229. // Was the message sent to me
  1230. fSentToMe = (0 != (dwFlags & CJPF_SENT_TO_ME));
  1231. // Get the from field
  1232. propvar.vt = VT_LPSTR;
  1233. hr = pIMPropSet->GetProp(PIDTOSTR(PID_HDR_FROM), NOFLAGS, &propvar);
  1234. if (SUCCEEDED(hr))
  1235. {
  1236. pszFrom = propvar.pszVal;
  1237. }
  1238. // Get the To field
  1239. propvar.vt = VT_LPSTR;
  1240. hr = pIMPropSet->GetProp(PIDTOSTR(PID_HDR_TO), NOFLAGS, &propvar);
  1241. if (SUCCEEDED(hr))
  1242. {
  1243. pszTo = propvar.pszVal;
  1244. }
  1245. // Try to Get the Plain Text Stream
  1246. if (FAILED(pIMMsg->GetTextBody(TXT_PLAIN, IET_DECODED, &pIStmBody, NULL)))
  1247. {
  1248. // Try to get the text version from the HTML stream
  1249. if ((FAILED(pIMMsg->GetTextBody(TXT_HTML, IET_DECODED, &pIStmHtml, NULL))) ||
  1250. (FAILED(HrConvertHTMLToPlainText(pIStmHtml, &pIStmBody))))
  1251. {
  1252. pIStmBody = NULL;
  1253. }
  1254. }
  1255. // Get the Subject field
  1256. propvar.vt = VT_LPSTR;
  1257. hr = pIMPropSet->GetProp(PIDTOSTR(PID_HDR_SUBJECT), NOFLAGS, &propvar);
  1258. if (SUCCEEDED(hr))
  1259. {
  1260. pszSubject = propvar.pszVal;
  1261. }
  1262. // Is this a direct message
  1263. // When was the message sent?
  1264. propvar.vt = VT_FILETIME;
  1265. hr = pIMPropSet->GetProp(PIDTOSTR(PID_ATT_SENTTIME), 0, &propvar);
  1266. if (SUCCEEDED(hr))
  1267. {
  1268. ftMsgSent = propvar.filetime;
  1269. }
  1270. FillMemory(m_rgulFeatureStatus, sizeof(*m_rgulFeatureStatus) * m_cFeatures, -1);
  1271. if (FALSE == FCalculateSpamProb(pszFrom, pszTo, pszSubject, pIStmBody,
  1272. fSentToMe, fHasAttachments, &ftMsgSent,
  1273. pdblProb, &fSpam))
  1274. {
  1275. hr = E_FAIL;
  1276. goto exit;
  1277. }
  1278. hr = (FALSE != fSpam) ? S_OK : S_FALSE;
  1279. exit:
  1280. SafeRelease(pIStmHtml);
  1281. SafeRelease(pIStmBody);
  1282. SafeMemFree(pszSubject);
  1283. SafeMemFree(pszTo);
  1284. SafeMemFree(pszFrom);
  1285. return hr;
  1286. }
  1287. ///////////////////////////////////////////////////////////////////////////////
  1288. //
  1289. // HrCreateJunkFilter
  1290. //
  1291. // This creates a junk filter.
  1292. //
  1293. // ppIRule - pointer to return the junk filter
  1294. //
  1295. // Returns: S_OK, on success
  1296. // E_OUTOFMEMORY, if can't create the Junk Filter object
  1297. //
  1298. ///////////////////////////////////////////////////////////////////////////////
  1299. HRESULT WINAPI HrCreateJunkFilter(DWORD dwFlags, IOEJunkFilter ** ppIJunkFilter)
  1300. {
  1301. CJunkFilter * pJunk = NULL;
  1302. HRESULT hr = S_OK;
  1303. // Check the incoming params
  1304. if (NULL == ppIJunkFilter)
  1305. {
  1306. hr = E_INVALIDARG;
  1307. goto exit;
  1308. }
  1309. // Initialize outgoing params
  1310. *ppIJunkFilter = NULL;
  1311. // Create the rules manager object
  1312. pJunk = new CJunkFilter;
  1313. if (NULL == pJunk)
  1314. {
  1315. hr = E_OUTOFMEMORY;
  1316. goto exit;
  1317. }
  1318. // Get the rules manager interface
  1319. hr = pJunk->QueryInterface(IID_IOEJunkFilter, (void **) ppIJunkFilter);
  1320. if (FAILED(hr))
  1321. {
  1322. goto exit;
  1323. }
  1324. pJunk = NULL;
  1325. // Set the proper return value
  1326. hr = S_OK;
  1327. exit:
  1328. if (NULL != pJunk)
  1329. {
  1330. delete pJunk;
  1331. }
  1332. return hr;
  1333. }
  1334. BOOL FReadDouble(LPSTR pszLine, LPSTR pszToken, DOUBLE * pdblVal)
  1335. {
  1336. BOOL fRet = FALSE;
  1337. LPSTR pszVal = NULL;
  1338. BOOL fNegative = FALSE;
  1339. // Search for token
  1340. pszVal = StrStr(pszLine, pszToken);
  1341. // If token isn't found then bail
  1342. if (NULL == pszVal)
  1343. {
  1344. fRet = FALSE;
  1345. goto exit;
  1346. }
  1347. // Skip over the token
  1348. pszVal += lstrlen(pszToken);
  1349. // Check to see if the value is negative
  1350. if ('-' == *pszVal)
  1351. {
  1352. fNegative = TRUE;
  1353. pszVal++;
  1354. }
  1355. // Read in value
  1356. *pdblVal = StrToDbl(pszVal, &pszVal);
  1357. // Negate the value if neccessary
  1358. if (FALSE != fNegative)
  1359. {
  1360. *pdblVal *= -1;
  1361. }
  1362. fRet = TRUE;
  1363. exit:
  1364. return fRet;
  1365. }
  1366. #ifdef DEBUG
  1367. static const LPSTR LOG_SPECIAL_BODY_FIRSTNAME = "Special: Body contains the First Name";
  1368. static const LPSTR LOG_SPECIAL_BODY_LASTNAME = "Special: Body contains the Last Name";
  1369. static const LPSTR LOG_SPECIAL_BODY_COMPANYNAME = "Special: Body contains the Company Name";
  1370. static const LPSTR LOG_SPECIAL_BODY_YEARRECVD = "Special: Body contains the year message received";
  1371. static const LPSTR LOG_SPECIAL_SENTTIME_WEEHRS = "Special: Sent time was between 7PM and 6AM";
  1372. static const LPSTR LOG_SPECIAL_SENTTIME_WKEND = "Special: Sent time was on the weekend (Sat or Sun)";
  1373. static const LPSTR LOG_SPECIAL_BODY_25PCTUPCWDS = "Special: Body contains 25% uppercase words out of the first 50 words";
  1374. static const LPSTR LOG_SPECIAL_BODY_8PCTNONALPHA = "Special: Body contains 8% non-alpha characters out of the first 200 characters";
  1375. static const LPSTR LOG_SPECIAL_SENT_DIRECT = "Special: Sent directly to user";
  1376. static const LPSTR LOG_SPECIAL_SUBJECT_25PCTUPCWDS = "Special: Subject contains 25% uppercase words out of the first 50 words";
  1377. static const LPSTR LOG_SPECIAL_SUBJECT_8PCTNONALPHA = "Special: Subject contains 8% non-alpha characters out of the first 200 characters";
  1378. static const LPSTR LOG_SPECIAL_TO_EMPTY = "Special: To line is empty";
  1379. static const LPSTR LOG_SPECIAL_HASATTACH = "Special: Message has an attachment";
  1380. static const LPSTR LOG_SPECIAL_BODY_GT125B = "Special: Body is greater than 125 Bytes";
  1381. static const LPSTR LOG_SPECIAL_BODY_GT250B = "Special: Body is greater than 250 Bytes";
  1382. static const LPSTR LOG_SPECIAL_BODY_GT500B = "Special: Body is greater than 500 Bytes";
  1383. static const LPSTR LOG_SPECIAL_BODY_GT1000B = "Special: Body is greater than 1000 Bytes";
  1384. static const LPSTR LOG_SPECIAL_BODY_GT2000B = "Special: Body is greater than 2000 Bytes";
  1385. static const LPSTR LOG_SPECIAL_BODY_GT4000B = "Special: Body is greater than 4000 Bytes";
  1386. static const LPSTR LOG_SPECIAL_BODY_GT8000B = "Special: Body is greater than 8000 Bytes";
  1387. static const LPSTR LOG_SPECIAL_BODY_GT16000B = "Special: Body is greater than 16000 Bytes";
  1388. VOID CJunkFilter::_PrintSpecialFeatureToLog(UINT iRuleNum)
  1389. {
  1390. Assert(NULL != m_pILogFile);
  1391. switch (iRuleNum)
  1392. {
  1393. case 1:
  1394. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_FIRSTNAME);
  1395. break;
  1396. case 2:
  1397. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_LASTNAME);
  1398. break;
  1399. case 3:
  1400. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_COMPANYNAME);
  1401. break;
  1402. case 4:
  1403. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_YEARRECVD);
  1404. break;
  1405. case 5:
  1406. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_SENTTIME_WEEHRS);
  1407. break;
  1408. case 6:
  1409. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_SENTTIME_WKEND);
  1410. break;
  1411. case 14:
  1412. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_25PCTUPCWDS);
  1413. break;
  1414. case 15:
  1415. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_8PCTNONALPHA);
  1416. break;
  1417. case 16:
  1418. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_SENT_DIRECT);
  1419. break;
  1420. case 17:
  1421. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_SUBJECT_25PCTUPCWDS);
  1422. break;
  1423. case 18:
  1424. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_SUBJECT_8PCTNONALPHA);
  1425. break;
  1426. case 19:
  1427. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_TO_EMPTY);
  1428. break;
  1429. case 20:
  1430. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_HASATTACH);
  1431. break;
  1432. case 40:
  1433. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT125B);
  1434. break;
  1435. case 41:
  1436. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT250B);
  1437. break;
  1438. case 42:
  1439. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT500B);
  1440. break;
  1441. case 43:
  1442. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT1000B);
  1443. break;
  1444. case 44:
  1445. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT2000B);
  1446. break;
  1447. case 45:
  1448. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT4000B);
  1449. break;
  1450. case 46:
  1451. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT8000B);
  1452. break;
  1453. case 47:
  1454. m_pILogFile->WriteLog(LOGFILE_DB, LOG_SPECIAL_BODY_GT16000B);
  1455. break;
  1456. default:
  1457. AssertSz(FALSE, "unsupported special feature");
  1458. break;
  1459. }
  1460. return;
  1461. }
  1462. VOID CJunkFilter::_PrintFeatureToLog(ULONG ulIndex)
  1463. {
  1464. LPSTR pszBuff = NULL;
  1465. LPSTR pszTag = NULL;
  1466. // Figure out which tag line to use
  1467. switch (m_rgfeaturecomps[ulIndex].loc)
  1468. {
  1469. case locNil:
  1470. goto exit;
  1471. break;
  1472. case locBody:
  1473. pszTag = LOG_BODY;
  1474. break;
  1475. case locSubj:
  1476. pszTag = LOG_SUBJECT;
  1477. break;
  1478. case locFrom:
  1479. pszTag = LOG_FROM;
  1480. break;
  1481. case locTo:
  1482. pszTag = LOG_TO;
  1483. break;
  1484. case locSpecial:
  1485. _PrintSpecialFeatureToLog(m_rgfeaturecomps[ulIndex].ulRuleNum);
  1486. goto exit;
  1487. break;
  1488. }
  1489. // Write out the feature to the log
  1490. PrintToLogFile(m_pILogFile, pszTag, m_rgfeaturecomps[ulIndex].pszFeature);
  1491. exit:
  1492. SafeMemFree(pszBuff);
  1493. return;
  1494. }
  1495. HRESULT CJunkFilter::_HrCreateLogFile(VOID)
  1496. {
  1497. HRESULT hr = S_OK;
  1498. LPSTR pszLogFile = NULL;
  1499. ULONG cbData = 0;
  1500. ILogFile * pILogFile = NULL;
  1501. DWORD dwData = 0;
  1502. if (FALSE != m_fJunkMailLogInit)
  1503. {
  1504. hr = S_FALSE;
  1505. goto exit;
  1506. }
  1507. m_fJunkMailLogInit = TRUE;
  1508. // Get the size of the path to Outlook Express
  1509. cbData = sizeof(dwData);
  1510. if ((ERROR_SUCCESS != SHGetValue(HKEY_LOCAL_MACHINE, STR_REG_PATH_FLAT, "JunkMailLog", NULL, (BYTE *) &dwData, &cbData)) ||
  1511. (0 == dwData))
  1512. {
  1513. hr = S_FALSE;
  1514. goto exit;
  1515. }
  1516. // Get the size of the path to Outlook Express
  1517. if (ERROR_SUCCESS != SHGetValue(HKEY_LOCAL_MACHINE, STR_REG_PATH_FLAT, "InstallRoot", NULL, NULL, &cbData))
  1518. {
  1519. hr = E_FAIL;
  1520. goto exit;
  1521. }
  1522. // How much room do we need to build up the path
  1523. cbData += lstrlen(szJunkMailLog) + 2;
  1524. // Allocate space to hold the path
  1525. hr = HrAlloc((VOID **) &pszLogFile, cbData);
  1526. if (FAILED(hr))
  1527. {
  1528. goto exit;
  1529. }
  1530. // Get the path to Outlook Express
  1531. ULONG cbBuffer = cbData;
  1532. if (ERROR_SUCCESS != SHGetValue(HKEY_LOCAL_MACHINE, STR_REG_PATH_FLAT, "InstallRoot", NULL, (BYTE *) pszLogFile, &cbBuffer))
  1533. {
  1534. hr = E_FAIL;
  1535. goto exit;
  1536. }
  1537. // Build up the path to the Junk DLL
  1538. if ('\\' != pszLogFile[lstrlen(pszLogFile)])
  1539. {
  1540. StrCatBuff(pszLogFile, "\\", cbData);
  1541. }
  1542. StrCatBuff(pszLogFile, szJunkMailLog, cbData);
  1543. hr = CreateLogFile(g_hInst, pszLogFile, szJunkMailPrefix, DONT_TRUNCATE, &pILogFile, FILE_SHARE_READ | FILE_SHARE_WRITE);
  1544. if (FAILED(hr))
  1545. {
  1546. goto exit;
  1547. }
  1548. SafeRelease(m_pILogFile);
  1549. m_pILogFile = pILogFile;
  1550. hr = S_OK;
  1551. exit:
  1552. SafeMemFree(pszLogFile);
  1553. return hr;
  1554. }
  1555. VOID PrintToLogFile(ILogFile * pILogFile, LPSTR pszTmpl, LPSTR pszArg)
  1556. {
  1557. LPSTR pszBuff = NULL;
  1558. ULONG cchBuff = 0;
  1559. Assert(NULL != pILogFile);
  1560. Assert(NULL != pszTmpl);
  1561. if (NULL == pszArg)
  1562. {
  1563. pszArg = "";
  1564. }
  1565. // Figure out the size of the resulting buffer
  1566. cchBuff = lstrlen(pszTmpl) + lstrlen(pszArg) + 2;
  1567. // Allocate the needed space
  1568. if (FAILED(HrAlloc((VOID **) &pszBuff, cchBuff * sizeof(*pszBuff))))
  1569. {
  1570. goto exit;
  1571. }
  1572. // Create the output string
  1573. wnsprintf(pszBuff, cchBuff, pszTmpl, pszArg);
  1574. // Print the buffer to the log file
  1575. pILogFile->WriteLog(LOGFILE_DB, pszBuff);
  1576. exit:
  1577. SafeMemFree(pszBuff);
  1578. return;
  1579. }
  1580. #endif // DEBUG