Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

704 lines
17 KiB

  1. /*
  2. SVMHANDLER.CPP
  3. (c) copyright 1998 Microsoft Corp
  4. Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection
  5. Robert Rounthwaite (RobertRo@microsoft.com)
  6. */
  7. #include <afx.h>
  8. #include <stdlib.h>
  9. #include <memory.h>
  10. #include <string.h>
  11. #include <math.h>
  12. #include <assert.h>
  13. #include "svmhandler.h"
  // UINT is used throughout this file for indices and element counts.
  14. typedef unsigned int UINT;
  // stoul / stod map to the wide- or narrow-character C runtime conversion
  // routines depending on whether _UNICODE is defined.
  15. #ifdef _UNICODE
  16. #define stoul wcstoul
  17. #define stod wcstod
  18. #else
  19. #define stoul strtoul
  20. #define stod strtod
  21. #endif
  // Label searched for when reading the feature-component count from the .LKO file.
  22. char *szCountFeatureComp = "FeatureComponentCount =";
  // Label searched for when reading the default spam threshold from the .LKO file.
  23. char *szDefaultThresh = "dThresh =";
  24. /////////////////////////////////////////////////////////////////////////////
  25. // ReadSVMOutput
  26. //
  27. // Read the SVM output from a file (".LKO file")
  28. /////////////////////////////////////////////////////////////////////////////
  29. bool MAILFILTER::ReadSVMOutput(LPCTSTR lpszFileName)
  30. {
  31. try
  32. {
  33. CStdioFile sfile(lpszFileName, CFile::modeRead);
  34. CString strBuf;
  35. int iBufPos;
  36. BOOL bComplete = false;
  37. UINT iSVMW; // index to rgrSVMWeights;
  38. UINT iFeatureComp = 0;
  39. int cFeatureComponents;
  40. // skip first two lines
  41. if ((!sfile.ReadString(strBuf)) ||
  42. (!sfile.ReadString(strBuf)) ||
  43. (!sfile.ReadString(strBuf)))
  44. {
  45. return false;
  46. }
  47. LPCTSTR szBuf = (LPCTSTR)strBuf;
  48. LPTSTR szBufPtr = NULL;
  49. // parse 3rd line: only care about CC and DD
  50. _rCC = stod(&((LPCTSTR)strBuf)[34], NULL);
  51. _rDD = stod(&((LPCTSTR)strBuf)[49], NULL);
  52. if (!sfile.ReadString(strBuf))
  53. {
  54. return false;
  55. }
  56. char *pszDefThresh = strstr(&((LPCTSTR)strBuf)[11], ::szDefaultThresh);
  57. assert(pszDefThresh != NULL);
  58. if (pszDefThresh == NULL)
  59. {
  60. return false;
  61. }
  62. pszDefThresh += strlen(::szDefaultThresh);
  63. _rDefaultThresh = stod(pszDefThresh, NULL);
  64. if (_rSpamCutoff == -1)
  65. {
  66. _rSpamCutoff = _rDefaultThresh;
  67. }
  68. _rThresh = stod(&((LPCTSTR)strBuf)[11], NULL);
  69. if (!sfile.ReadString(strBuf))
  70. {
  71. return false;
  72. }
  73. _cFeatures = stoul(&((LPCTSTR)strBuf)[8], NULL, 10);
  74. if (!sfile.ReadString(strBuf))
  75. {
  76. return false;
  77. }
  78. iBufPos = strBuf.Find(szCountFeatureComp) + strlen(szCountFeatureComp);
  79. cFeatureComponents = stoul(&((LPCTSTR)strBuf)[iBufPos], NULL, 10);
  80. if (cFeatureComponents < _cFeatures)
  81. cFeatureComponents = _cFeatures * 2;
  82. while (strBuf != "Weights")
  83. {
  84. if (!sfile.ReadString(strBuf)) // skip "Weights" line
  85. {
  86. return false;
  87. }
  88. }
  89. rgrSVMWeights = (REAL *)malloc(sizeof(REAL) * _cFeatures);
  90. _rgiFeatureStatus = (int *)malloc(sizeof(int) * _cFeatures);
  91. memset(_rgiFeatureStatus, -1, sizeof(int) * _cFeatures);
  92. rgfeaturecomps = (FeatureComponent *)malloc(sizeof(FeatureComponent) * cFeatureComponents);
  93. for (iSVMW = 0; iSVMW < _cFeatures; iSVMW++)
  94. {
  95. UINT uiLoc;
  96. UINT cbStr;
  97. boolop bop;
  98. char *szFeature;
  99. bool fContinue;
  100. if (!sfile.ReadString(strBuf))
  101. {
  102. return false;
  103. }
  104. // read the SVM weight
  105. rgrSVMWeights[iSVMW] = stod(strBuf, &szBufPtr);
  106. szBufPtr++; // skip the separator
  107. bop = boolopOr;
  108. fContinue = false;
  109. // load all of the feature components
  110. do
  111. {
  112. FeatureComponent *pfeaturecomp = &rgfeaturecomps[iFeatureComp++];
  113. // Location (or "special")
  114. uiLoc = stoul(szBufPtr, &szBufPtr, 10);
  115. szBufPtr++; // skip the separator
  116. pfeaturecomp->loc = (FeatureLocation)uiLoc;
  117. pfeaturecomp->iFeature = iSVMW;
  118. pfeaturecomp->bop = bop;
  119. if (uiLoc == 5) // special feature
  120. {
  121. UINT uiRuleNumber = stoul(szBufPtr, &szBufPtr, 10);
  122. szBufPtr++; // skip the separator
  123. pfeaturecomp->iRuleNum = uiRuleNumber;
  124. }
  125. else // it is a standard string component
  126. {
  127. cbStr = stoul(szBufPtr, &szBufPtr, 10);
  128. szBufPtr++;
  129. szFeature = (char *)malloc((cbStr + 1)*sizeof(char));
  130. memcpy(szFeature, szBufPtr, cbStr);
  131. szBufPtr += cbStr;
  132. if (*szBufPtr != '\0')
  133. {
  134. szBufPtr++; // skip the separator
  135. }
  136. szFeature[cbStr] = '\0';
  137. assert(strlen(szFeature) == cbStr);
  138. pfeaturecomp->szFeature = szFeature;
  139. }
  140. switch(*szBufPtr)
  141. {
  142. case '|':
  143. bop = boolopOr;
  144. fContinue = true;
  145. break;
  146. case '&':
  147. bop = boolopAnd;
  148. fContinue = true;
  149. break;
  150. default:
  151. fContinue = false;
  152. break;
  153. }
  154. szBufPtr++;
  155. }
  156. while (fContinue);
  157. }
  158. _cFeatureComps = iFeatureComp;
  159. }
  160. catch (CFileException *)
  161. {
  162. return false;
  163. }
  164. return true;
  165. }
  166. /////////////////////////////////////////////////////////////////////////////
  167. // SetSpamCutoff
  168. //
  169. // Sets the Spam cutoff percentage. Must be in range from 0 to 100
  170. /////////////////////////////////////////////////////////////////////////////
  171. bool MAILFILTER::SetSpamCutoff(REAL rCutoff)
  172. {
  173. if ((rCutoff >= 0) && (rCutoff <= 100))
  174. {
  175. _rSpamCutoff = rCutoff;
  176. return true;
  177. }
  178. else
  179. {
  180. return false;
  181. }
  182. }
  183. /////////////////////////////////////////////////////////////////////////////
  184. // GetSpamCutoff
  185. //
  186. // returns value set with SetSpamCutoff. Defaults == DefaultSpamCutoff
  187. // if no value has been set when SVM output file is read
  188. /////////////////////////////////////////////////////////////////////////////
  189. REAL MAILFILTER::GetSpamCutoff()
  190. {
  191. return _rSpamCutoff;
  192. }
  193. /////////////////////////////////////////////////////////////////////////////
  194. // GetDefaultSpamCutoff
  195. //
  196. // returns default value for SpamCutoff. read from SVM output file.
  197. // should call FSetSVMDataLocation before calling this function
  198. /////////////////////////////////////////////////////////////////////////////
  199. REAL MAILFILTER::GetDefaultSpamCutoff()
  200. {
  201. assert(!_strFName.IsEmpty());
  202. return _rDefaultThresh;
  203. }
  204. /////////////////////////////////////////////////////////////////////////////
  205. // FInvokeSpecialRule
  206. //
  207. // Invokes the special rule that is this FeatureComponent.
  208. // Returns the state of the feature.
  209. /////////////////////////////////////////////////////////////////////////////
  210. bool MAILFILTER::FInvokeSpecialRule(UINT iRuleNum)
  211. {
  212. switch (iRuleNum)
  213. {
  214. case 1:
  215. return FWordPresent(_szBody, _szFirstName);
  216. break;
  217. case 2:
  218. return FWordPresent(_szBody, _szLastName);
  219. break;
  220. case 3:
  221. return FWordPresent(_szBody, _szCompanyName);
  222. break;
  223. case 4:
  224. // year message received
  225. if (FTimeEmpty(_tMessageSent))
  226. {
  227. return false;
  228. }
  229. else
  230. {
  231. CTime time(_tMessageSent, -1);
  232. char szYear[6];
  233. wnsprintf(szYear, ARRAYSIZE(szYear), "%i", time.GetYear());
  234. return FWordPresent(_szBody, szYear);
  235. }
  236. break;
  237. case 5:
  238. // message received in the wee hours (>= 7pm or <6am
  239. if (FTimeEmpty(_tMessageSent))
  240. {
  241. return false;
  242. }
  243. else
  244. {
  245. CTime time(_tMessageSent, -1);
  246. return (time.GetHour() >= (7+12)) || (time.GetHour() < 6);
  247. }
  248. break;
  249. case 6:
  250. // message received on weekend
  251. if (FTimeEmpty(_tMessageSent))
  252. {
  253. return false;
  254. }
  255. else
  256. {
  257. CTime time(_tMessageSent, -1);
  258. return ((time.GetDayOfWeek() == 7) || (time.GetDayOfWeek() == 1));
  259. }
  260. break;
  261. case 14:
  262. return _bRule14; // set in HandleCaseSensitiveSpecialRules()
  263. break;
  264. case 15:
  265. return SpecialFeatureNonAlpha(_szBody);
  266. break;
  267. case 16:
  268. return _bDirectMessage;
  269. break;
  270. case 17:
  271. return _bRule17; // set in HandleCaseSensitiveSpecialRules()
  272. break;
  273. case 18:
  274. return SpecialFeatureNonAlpha(_szSubject);
  275. break;
  276. case 19:
  277. return (*_szTo=='\0');
  278. break;
  279. case 20:
  280. return _bHasAttach;
  281. break;
  282. case 40:
  283. return (strlen(_szBody) >= 125);
  284. case 41:
  285. return (strlen(_szBody) >= 250);
  286. case 42:
  287. return (strlen(_szBody) >= 500);
  288. case 43:
  289. return (strlen(_szBody) >= 1000);
  290. case 44:
  291. return (strlen(_szBody) >= 2000);
  292. case 45:
  293. return (strlen(_szBody) >= 4000);
  294. case 46:
  295. return (strlen(_szBody) >= 8000);
  296. case 47:
  297. return (strlen(_szBody) >= 16000);
  298. default:
  299. return false;
  300. //assert(false == "unsupported special feature");
  301. break;
  302. }
  303. return true;
  304. }
  305. /////////////////////////////////////////////////////////////////////////////
  306. // HandleCaseSensitiveSpecialRules
  307. //
  308. // Called from EvaluateFeatureComponents().
  309. // Some special rules are case sensitive, so if they're present, we'll
  310. // evaluate them before we make the texts uppercase and cache the result
  311. // for when they are actually used.
  312. /////////////////////////////////////////////////////////////////////////////
  313. void MAILFILTER::HandleCaseSensitiveSpecialRules()
  314. {
  315. for (UINT i = 0; i<_cFeatureComps; i++)
  316. {
  317. FeatureComponent *pfcomp = &rgfeaturecomps[i];
  318. if (pfcomp->loc == locSpecial)
  319. {
  320. switch (pfcomp->iRuleNum)
  321. {
  322. case 14:
  323. _bRule14 = SpecialFeatureUpperCaseWords(_szBody);
  324. break;
  325. case 17:
  326. _bRule17 = SpecialFeatureUpperCaseWords(_szSubject);
  327. break;
  328. default:
  329. ;// nothing
  330. }
  331. }
  332. }
  333. }
  334. /////////////////////////////////////////////////////////////////////////////
  335. // EvaluateFeatureComponents
  336. //
  337. // Evaluates all of the feature components. Sets fPresent in each component
  338. // to true if the feature is present, false otherwise
  339. /////////////////////////////////////////////////////////////////////////////
  340. void MAILFILTER::EvaluateFeatureComponents()
  341. {
  342. HandleCaseSensitiveSpecialRules();
  343. _strupr(_szFrom);
  344. _strupr(_szTo);
  345. _strupr(_szSubject);
  346. _strupr(_szBody);
  347. for (UINT i = 0; i<_cFeatureComps; i++)
  348. {
  349. FeatureComponent *pfcomp = &rgfeaturecomps[i];
  350. switch(pfcomp->loc)
  351. {
  352. case locNil:
  353. assert(pfcomp->loc != locNil);
  354. pfcomp->fPresent = false;
  355. break;
  356. case locBody:
  357. pfcomp->fPresent = FWordPresent(_szBody, pfcomp->szFeature);
  358. break;
  359. case locSubj:
  360. pfcomp->fPresent = FWordPresent(_szSubject, pfcomp->szFeature);
  361. break;
  362. case locFrom:
  363. pfcomp->fPresent = FWordPresent(_szFrom, pfcomp->szFeature);
  364. break;
  365. case locTo:
  366. pfcomp->fPresent = FWordPresent(_szTo, pfcomp->szFeature);
  367. break;
  368. case locSpecial:
  369. pfcomp->fPresent = FInvokeSpecialRule(pfcomp->iRuleNum);
  370. break;
  371. }
  372. }
  373. }
  374. /////////////////////////////////////////////////////////////////////////////
  375. // ProcessFeatureComponentPresence
  376. //
  377. // Processes the presence (or absence) of the individual feature components,
  378. // setting the feature status of each feature (which may me made up of
  379. // multiple feature components).
  380. /////////////////////////////////////////////////////////////////////////////
  381. void MAILFILTER::ProcessFeatureComponentPresence()
  382. {
  383. for (UINT i = 0; i < _cFeatureComps; i++)
  384. {
  385. FeatureComponent *pfcomp = &rgfeaturecomps[i];
  386. UINT iFeature = pfcomp->iFeature;
  387. if (_rgiFeatureStatus[iFeature] == -1) // first feature of this feature
  388. {
  389. if (pfcomp->fPresent)
  390. {
  391. _rgiFeatureStatus[iFeature] = 1;
  392. }
  393. else
  394. {
  395. _rgiFeatureStatus[iFeature] = 0;
  396. }
  397. }
  398. else
  399. {
  400. switch (pfcomp->bop)
  401. {
  402. case boolopOr:
  403. if (pfcomp->fPresent)
  404. {
  405. _rgiFeatureStatus[iFeature] = 1;
  406. }
  407. break;
  408. case boolopAnd:
  409. if (!pfcomp->fPresent)
  410. {
  411. _rgiFeatureStatus[iFeature] = 0;
  412. }
  413. break;
  414. default:
  415. assert(false);
  416. break;
  417. }
  418. }
  419. }
  420. }
  421. /////////////////////////////////////////////////////////////////////////////
  422. // RDoSVMCalc
  423. //
  424. // Does the actual support vector machine calculation.
  425. // Returns the probability that the message is spam
  426. /////////////////////////////////////////////////////////////////////////////
  427. REAL MAILFILTER::RDoSVMCalc()
  428. {
  429. REAL rAccum; // accumulator for result
  430. REAL rResult;
  431. rAccum = 0.0;
  432. for (UINT i = 0; i < _cFeatures; i++)
  433. {
  434. if (_rgiFeatureStatus[i] == 1)
  435. rAccum+=rgrSVMWeights[i];
  436. else if (_rgiFeatureStatus[i] != 0)
  437. assert(false);
  438. }
  439. // Apply threshold;
  440. rAccum -= _rThresh;
  441. // Apply sigmoid
  442. rResult = (1 / (1 + exp((_rCC * rAccum) + _rDD)));
  443. return rResult;
  444. }
  445. /*
  446. // for timing version
  447. #include <sys\\types.h>
  448. #include <sys\\timeb.h>
  449. */
  450. //#include "..\SpamLearner\MailIndexer.cpp"
  451. /////////////////////////////////////////////////////////////////////////////
  452. // BCalculateSpamProb
  453. //
  454. // Calculates the probability that the current message is spam.
  455. // Returns the probability (0 to 1) that the message is spam in prSpamProb
  456. // the boolean return is determined by comparing to the spam cutoff
  457. /////////////////////////////////////////////////////////////////////////////
  458. bool MAILFILTER::BCalculateSpamProb(/* IN params */
  459. char *szFrom,
  460. char *szTo,
  461. char *szSubject,
  462. char *szBody,
  463. bool bDirectMessage,
  464. bool bHasAttach,
  465. FILETIME tMessageSent,
  466. /* OUT params */
  467. REAL *prSpamProb,
  468. bool * pbIsSpam)
  469. {
  470. //_strFName = "d:\\test\\test.lko";
  471. //_strFName = "G:\\SPAM\\SPAM.lko";
  472. _szFrom = szFrom;
  473. _szTo = szTo;
  474. _szSubject = szSubject;
  475. _szBody = szBody;
  476. _bDirectMessage = bDirectMessage;
  477. _bHasAttach = bHasAttach;
  478. _tMessageSent = tMessageSent;
  479. EvaluateFeatureComponents();
  480. //ProcessMessage(_szFrom, _szTo, _szSubject, _szBody);
  481. ProcessFeatureComponentPresence();
  482. *prSpamProb = RDoSVMCalc();
  483. *pbIsSpam = (*prSpamProb>(_rSpamCutoff/100));
  484. return true;
  485. /* timing version
  486. _timeb start, finish;
  487. int ij = strlen(szBody);
  488. _ftime( &start );
  489. ReadSVMOutput("d:\\test\\test.lko");
  490. for (int i=0;i<1000;i++)
  491. {
  492. ProcessMessage(szFrom, szTo, szSubject, szBody);
  493. DetermineFeatureStatus(bDirectMessage);
  494. *pr = RDoSVMCalc();
  495. }
  496. _ftime( &finish );
  497. *pr = (finish.time-start.time + (finish.millitm-start.millitm)/1000.0);
  498. return true;
  499. */
  500. }
  501. /////////////////////////////////////////////////////////////////////////////
  502. // BReadDefaultSpamCutoff
  503. //
  504. // Reads the default spam cutoff without parsing entire file
  505. // Use GetDefaultSpamCutoff if using FSetSVMDataLocation;
  506. // static member function
  507. /////////////////////////////////////////////////////////////////////////////
  508. bool MAILFILTER::BReadDefaultSpamCutoff(char *szFullPath, REAL *prDefCutoff)
  509. {
  510. try
  511. {
  512. CStdioFile sfile(szFullPath, CFile::modeRead);
  513. CString strBuf;
  514. // skip first three lines
  515. if ((!sfile.ReadString(strBuf)) ||
  516. (!sfile.ReadString(strBuf)) ||
  517. (!sfile.ReadString(strBuf)) ||
  518. (!sfile.ReadString(strBuf)))
  519. {
  520. return false;
  521. }
  522. char *pszDefThresh = strstr(&((LPCTSTR)strBuf)[11], ::szDefaultThresh);
  523. assert(pszDefThresh != NULL);
  524. if (pszDefThresh == NULL)
  525. {
  526. return false;
  527. }
  528. pszDefThresh += strlen(::szDefaultThresh);
  529. *prDefCutoff = stod(pszDefThresh, NULL);
  530. if (*prDefCutoff < .9 ) // since the default has been shifted to 2 std dev, we only take it if it is greater than .9
  531. {
  532. *prDefCutoff = 0.9;
  533. }
  534. }
  535. catch (CFileException *)
  536. {
  537. return false;
  538. }
  539. return true;
  540. }
  541. /////////////////////////////////////////////////////////////////////////////
  542. // FSetSVMDataLocation
  543. //
  544. // Sets the location of the SVM Data file(.LKO file). Must be called before
  545. // calling any other methods
  546. // Data file must be present at time function is called
  547. // returns true if successful, false otherwise
  548. /////////////////////////////////////////////////////////////////////////////
  549. bool MAILFILTER::FSetSVMDataLocation(char *szFullPath)
  550. {
  551. if (_strFName != szFullPath)
  552. {
  553. _strFName = szFullPath;
  554. if (!ReadSVMOutput(_strFName))
  555. {
  556. #ifdef DEBUG
  557. char szErr[200];
  558. wnsprintf(szErr, ARRAYSIZE(szErr), "Unable to successfully read filter params from %s", _strFName);
  559. MessageBox(NULL, szErr, "Junk mail filter error", MB_APPLMODAL | MB_OK);
  560. #endif
  561. return false;
  562. }
  563. }
  564. return true;
  565. }
  566. /////////////////////////////////////////////////////////////////////////////
  567. // Property set methods
  568. //
  569. /////////////////////////////////////////////////////////////////////////////
  570. void MAILFILTER::SetFirstName(char *szFirstName)
  571. {
  572. SAFE_FREE( _szFirstName );
  573. if (szFirstName!=NULL)
  574. {
  575. _szFirstName = strdup(szFirstName);
  576. _strupr(_szFirstName);
  577. }
  578. else
  579. {
  580. _szFirstName = NULL;
  581. }
  582. }
  583. void MAILFILTER::SetLastName(char *szLastName)
  584. {
  585. SAFE_FREE( _szLastName );
  586. if (szLastName!=NULL)
  587. {
  588. _szLastName = strdup(szLastName);
  589. _strupr(_szLastName);
  590. }
  591. else
  592. {
  593. _szLastName = NULL;
  594. }
  595. }
  596. void MAILFILTER::SetCompanyName(char *szCompanyName)
  597. {
  598. SAFE_FREE( _szCompanyName );
  599. if (szCompanyName!=NULL)
  600. {
  601. _szCompanyName = strdup(szCompanyName);
  602. _strupr(_szCompanyName);
  603. }
  604. else
  605. {
  606. _szCompanyName = NULL;
  607. }
  608. }
  609. /////////////////////////////////////////////////////////////////////////////
  610. // Constructor/destructor
  611. //
  612. /////////////////////////////////////////////////////////////////////////////
  613. MAILFILTER::MAILFILTER()
  614. {
  615. _szFirstName = NULL;
  616. _szLastName = NULL;
  617. _szCompanyName = NULL;
  618. _rDefaultThresh = -1;
  619. _rThresh = -1;
  620. _cFeatureComps = 0;
  621. rgrSVMWeights = NULL;
  622. }
  623. MAILFILTER::~MAILFILTER()
  624. {
  625. SAFE_FREE( _szFirstName );
  626. SAFE_FREE( _szLastName );
  627. SAFE_FREE( _szCompanyName );
  628. for (unsigned int i=0;i<_cFeatureComps;i++)
  629. rgfeaturecomps[i].~FeatureComponent();
  630. SAFE_FREE( rgrSVMWeights );
  631. SAFE_FREE( _rgiFeatureStatus );
  632. SAFE_FREE( rgfeaturecomps );
  633. }