/* SVMHANDLER.CPP (c) copyright 1998 Microsoft Corp Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection Robert Rounthwaite (RobertRo@microsoft.com) */ #include #include #include #include #include #include #include "svmhandler.h" typedef unsigned int UINT; #ifdef _UNICODE #define stoul wcstoul #define stod wcstod #else #define stoul strtoul #define stod strtod #endif char *szCountFeatureComp = "FeatureComponentCount ="; char *szDefaultThresh = "dThresh ="; ///////////////////////////////////////////////////////////////////////////// // ReadSVMOutput // // Read the SVM output from a file (".LKO file") ///////////////////////////////////////////////////////////////////////////// bool MAILFILTER::ReadSVMOutput(LPCTSTR lpszFileName) { try { CStdioFile sfile(lpszFileName, CFile::modeRead); CString strBuf; int iBufPos; BOOL bComplete = false; UINT iSVMW; // index to rgrSVMWeights; UINT iFeatureComp = 0; int cFeatureComponents; // skip first two lines if ((!sfile.ReadString(strBuf)) || (!sfile.ReadString(strBuf)) || (!sfile.ReadString(strBuf))) { return false; } LPCTSTR szBuf = (LPCTSTR)strBuf; LPTSTR szBufPtr = NULL; // parse 3rd line: only care about CC and DD _rCC = stod(&((LPCTSTR)strBuf)[34], NULL); _rDD = stod(&((LPCTSTR)strBuf)[49], NULL); if (!sfile.ReadString(strBuf)) { return false; } char *pszDefThresh = strstr(&((LPCTSTR)strBuf)[11], ::szDefaultThresh); assert(pszDefThresh != NULL); if (pszDefThresh == NULL) { return false; } pszDefThresh += strlen(::szDefaultThresh); _rDefaultThresh = stod(pszDefThresh, NULL); if (_rSpamCutoff == -1) { _rSpamCutoff = _rDefaultThresh; } _rThresh = stod(&((LPCTSTR)strBuf)[11], NULL); if (!sfile.ReadString(strBuf)) { return false; } _cFeatures = stoul(&((LPCTSTR)strBuf)[8], NULL, 10); if (!sfile.ReadString(strBuf)) { return false; } iBufPos = strBuf.Find(szCountFeatureComp) + strlen(szCountFeatureComp); cFeatureComponents = stoul(&((LPCTSTR)strBuf)[iBufPos], NULL, 10); if (cFeatureComponents < _cFeatures) cFeatureComponents = _cFeatures * 2; while (strBuf != "Weights") { if (!sfile.ReadString(strBuf)) // skip "Weights" line { return false; } } rgrSVMWeights = (REAL *)malloc(sizeof(REAL) * _cFeatures); _rgiFeatureStatus = (int *)malloc(sizeof(int) * _cFeatures); memset(_rgiFeatureStatus, -1, sizeof(int) * _cFeatures); rgfeaturecomps = (FeatureComponent *)malloc(sizeof(FeatureComponent) * cFeatureComponents); for (iSVMW = 0; iSVMW < _cFeatures; iSVMW++) { UINT uiLoc; UINT cbStr; boolop bop; char *szFeature; bool fContinue; if (!sfile.ReadString(strBuf)) { return false; } // read the SVM weight rgrSVMWeights[iSVMW] = stod(strBuf, &szBufPtr); szBufPtr++; // skip the separator bop = boolopOr; fContinue = false; // load all of the feature components do { FeatureComponent *pfeaturecomp = &rgfeaturecomps[iFeatureComp++]; // Location (or "special") uiLoc = stoul(szBufPtr, &szBufPtr, 10); szBufPtr++; // skip the separator pfeaturecomp->loc = (FeatureLocation)uiLoc; pfeaturecomp->iFeature = iSVMW; pfeaturecomp->bop = bop; if (uiLoc == 5) // special feature { UINT uiRuleNumber = stoul(szBufPtr, &szBufPtr, 10); szBufPtr++; // skip the separator pfeaturecomp->iRuleNum = uiRuleNumber; } else // it is a standard string component { cbStr = stoul(szBufPtr, &szBufPtr, 10); szBufPtr++; szFeature = (char *)malloc((cbStr + 1)*sizeof(char)); memcpy(szFeature, szBufPtr, cbStr); szBufPtr += cbStr; if (*szBufPtr != '\0') { szBufPtr++; // skip the separator } szFeature[cbStr] = '\0'; assert(strlen(szFeature) == cbStr); pfeaturecomp->szFeature = szFeature; } switch(*szBufPtr) { case '|': bop = boolopOr; fContinue = true; break; case '&': bop = boolopAnd; fContinue = true; break; default: fContinue = false; break; } szBufPtr++; } while (fContinue); } _cFeatureComps = iFeatureComp; } catch (CFileException *) { return false; } return true; } ///////////////////////////////////////////////////////////////////////////// // SetSpamCutoff // // Sets the Spam cutoff percentage. Must be in range from 0 to 100 ///////////////////////////////////////////////////////////////////////////// bool MAILFILTER::SetSpamCutoff(REAL rCutoff) { if ((rCutoff >= 0) && (rCutoff <= 100)) { _rSpamCutoff = rCutoff; return true; } else { return false; } } ///////////////////////////////////////////////////////////////////////////// // GetSpamCutoff // // returns value set with SetSpamCutoff. Defaults == DefaultSpamCutoff // if no value has been set when SVM output file is read ///////////////////////////////////////////////////////////////////////////// REAL MAILFILTER::GetSpamCutoff() { return _rSpamCutoff; } ///////////////////////////////////////////////////////////////////////////// // GetDefaultSpamCutoff // // returns default value for SpamCutoff. read from SVM output file. // should call FSetSVMDataLocation before calling this function ///////////////////////////////////////////////////////////////////////////// REAL MAILFILTER::GetDefaultSpamCutoff() { assert(!_strFName.IsEmpty()); return _rDefaultThresh; } ///////////////////////////////////////////////////////////////////////////// // FInvokeSpecialRule // // Invokes the special rule that is this FeatureComponent. // Returns the state of the feature. ///////////////////////////////////////////////////////////////////////////// bool MAILFILTER::FInvokeSpecialRule(UINT iRuleNum) { switch (iRuleNum) { case 1: return FWordPresent(_szBody, _szFirstName); break; case 2: return FWordPresent(_szBody, _szLastName); break; case 3: return FWordPresent(_szBody, _szCompanyName); break; case 4: // year message received if (FTimeEmpty(_tMessageSent)) { return false; } else { CTime time(_tMessageSent, -1); char szYear[6]; wnsprintf(szYear, ARRAYSIZE(szYear), "%i", time.GetYear()); return FWordPresent(_szBody, szYear); } break; case 5: // message received in the wee hours (>= 7pm or <6am if (FTimeEmpty(_tMessageSent)) { return false; } else { CTime time(_tMessageSent, -1); return (time.GetHour() >= (7+12)) || (time.GetHour() < 6); } break; case 6: // message received on weekend if (FTimeEmpty(_tMessageSent)) { return false; } else { CTime time(_tMessageSent, -1); return ((time.GetDayOfWeek() == 7) || (time.GetDayOfWeek() == 1)); } break; case 14: return _bRule14; // set in HandleCaseSensitiveSpecialRules() break; case 15: return SpecialFeatureNonAlpha(_szBody); break; case 16: return _bDirectMessage; break; case 17: return _bRule17; // set in HandleCaseSensitiveSpecialRules() break; case 18: return SpecialFeatureNonAlpha(_szSubject); break; case 19: return (*_szTo=='\0'); break; case 20: return _bHasAttach; break; case 40: return (strlen(_szBody) >= 125); case 41: return (strlen(_szBody) >= 250); case 42: return (strlen(_szBody) >= 500); case 43: return (strlen(_szBody) >= 1000); case 44: return (strlen(_szBody) >= 2000); case 45: return (strlen(_szBody) >= 4000); case 46: return (strlen(_szBody) >= 8000); case 47: return (strlen(_szBody) >= 16000); default: return false; //assert(false == "unsupported special feature"); break; } return true; } ///////////////////////////////////////////////////////////////////////////// // HandleCaseSensitiveSpecialRules // // Called from EvaluateFeatureComponents(). // Some special rules are case sensitive, so if they're present, we'll // evaluate them before we make the texts uppercase and cache the result // for when they are actually used. ///////////////////////////////////////////////////////////////////////////// void MAILFILTER::HandleCaseSensitiveSpecialRules() { for (UINT i = 0; i<_cFeatureComps; i++) { FeatureComponent *pfcomp = &rgfeaturecomps[i]; if (pfcomp->loc == locSpecial) { switch (pfcomp->iRuleNum) { case 14: _bRule14 = SpecialFeatureUpperCaseWords(_szBody); break; case 17: _bRule17 = SpecialFeatureUpperCaseWords(_szSubject); break; default: ;// nothing } } } } ///////////////////////////////////////////////////////////////////////////// // EvaluateFeatureComponents // // Evaluates all of the feature components. Sets fPresent in each component // to true if the feature is present, false otherwise ///////////////////////////////////////////////////////////////////////////// void MAILFILTER::EvaluateFeatureComponents() { HandleCaseSensitiveSpecialRules(); _strupr(_szFrom); _strupr(_szTo); _strupr(_szSubject); _strupr(_szBody); for (UINT i = 0; i<_cFeatureComps; i++) { FeatureComponent *pfcomp = &rgfeaturecomps[i]; switch(pfcomp->loc) { case locNil: assert(pfcomp->loc != locNil); pfcomp->fPresent = false; break; case locBody: pfcomp->fPresent = FWordPresent(_szBody, pfcomp->szFeature); break; case locSubj: pfcomp->fPresent = FWordPresent(_szSubject, pfcomp->szFeature); break; case locFrom: pfcomp->fPresent = FWordPresent(_szFrom, pfcomp->szFeature); break; case locTo: pfcomp->fPresent = FWordPresent(_szTo, pfcomp->szFeature); break; case locSpecial: pfcomp->fPresent = FInvokeSpecialRule(pfcomp->iRuleNum); break; } } } ///////////////////////////////////////////////////////////////////////////// // ProcessFeatureComponentPresence // // Processes the presence (or absence) of the individual feature components, // setting the feature status of each feature (which may me made up of // multiple feature components). ///////////////////////////////////////////////////////////////////////////// void MAILFILTER::ProcessFeatureComponentPresence() { for (UINT i = 0; i < _cFeatureComps; i++) { FeatureComponent *pfcomp = &rgfeaturecomps[i]; UINT iFeature = pfcomp->iFeature; if (_rgiFeatureStatus[iFeature] == -1) // first feature of this feature { if (pfcomp->fPresent) { _rgiFeatureStatus[iFeature] = 1; } else { _rgiFeatureStatus[iFeature] = 0; } } else { switch (pfcomp->bop) { case boolopOr: if (pfcomp->fPresent) { _rgiFeatureStatus[iFeature] = 1; } break; case boolopAnd: if (!pfcomp->fPresent) { _rgiFeatureStatus[iFeature] = 0; } break; default: assert(false); break; } } } } ///////////////////////////////////////////////////////////////////////////// // RDoSVMCalc // // Does the actual support vector machine calculation. // Returns the probability that the message is spam ///////////////////////////////////////////////////////////////////////////// REAL MAILFILTER::RDoSVMCalc() { REAL rAccum; // accumulator for result REAL rResult; rAccum = 0.0; for (UINT i = 0; i < _cFeatures; i++) { if (_rgiFeatureStatus[i] == 1) rAccum+=rgrSVMWeights[i]; else if (_rgiFeatureStatus[i] != 0) assert(false); } // Apply threshold; rAccum -= _rThresh; // Apply sigmoid rResult = (1 / (1 + exp((_rCC * rAccum) + _rDD))); return rResult; } /* // for timing version #include #include */ //#include "..\SpamLearner\MailIndexer.cpp" ///////////////////////////////////////////////////////////////////////////// // BCalculateSpamProb // // Calculates the probability that the current message is spam. // Returns the probability (0 to 1) that the message is spam in prSpamProb // the boolean return is determined by comparing to the spam cutoff ///////////////////////////////////////////////////////////////////////////// bool MAILFILTER::BCalculateSpamProb(/* IN params */ char *szFrom, char *szTo, char *szSubject, char *szBody, bool bDirectMessage, bool bHasAttach, FILETIME tMessageSent, /* OUT params */ REAL *prSpamProb, bool * pbIsSpam) { //_strFName = "d:\\test\\test.lko"; //_strFName = "G:\\SPAM\\SPAM.lko"; _szFrom = szFrom; _szTo = szTo; _szSubject = szSubject; _szBody = szBody; _bDirectMessage = bDirectMessage; _bHasAttach = bHasAttach; _tMessageSent = tMessageSent; EvaluateFeatureComponents(); //ProcessMessage(_szFrom, _szTo, _szSubject, _szBody); ProcessFeatureComponentPresence(); *prSpamProb = RDoSVMCalc(); *pbIsSpam = (*prSpamProb>(_rSpamCutoff/100)); return true; /* timing version _timeb start, finish; int ij = strlen(szBody); _ftime( &start ); ReadSVMOutput("d:\\test\\test.lko"); for (int i=0;i<1000;i++) { ProcessMessage(szFrom, szTo, szSubject, szBody); DetermineFeatureStatus(bDirectMessage); *pr = RDoSVMCalc(); } _ftime( &finish ); *pr = (finish.time-start.time + (finish.millitm-start.millitm)/1000.0); return true; */ } ///////////////////////////////////////////////////////////////////////////// // BReadDefaultSpamCutoff // // Reads the default spam cutoff without parsing entire file // Use GetDefaultSpamCutoff if using FSetSVMDataLocation; // static member function ///////////////////////////////////////////////////////////////////////////// bool MAILFILTER::BReadDefaultSpamCutoff(char *szFullPath, REAL *prDefCutoff) { try { CStdioFile sfile(szFullPath, CFile::modeRead); CString strBuf; // skip first three lines if ((!sfile.ReadString(strBuf)) || (!sfile.ReadString(strBuf)) || (!sfile.ReadString(strBuf)) || (!sfile.ReadString(strBuf))) { return false; } char *pszDefThresh = strstr(&((LPCTSTR)strBuf)[11], ::szDefaultThresh); assert(pszDefThresh != NULL); if (pszDefThresh == NULL) { return false; } pszDefThresh += strlen(::szDefaultThresh); *prDefCutoff = stod(pszDefThresh, NULL); if (*prDefCutoff < .9 ) // since the default has been shifted to 2 std dev, we only take it if it is greater than .9 { *prDefCutoff = 0.9; } } catch (CFileException *) { return false; } return true; } ///////////////////////////////////////////////////////////////////////////// // FSetSVMDataLocation // // Sets the location of the SVM Data file(.LKO file). Must be called before // calling any other methods // Data file must be present at time function is called // returns true if successful, false otherwise ///////////////////////////////////////////////////////////////////////////// bool MAILFILTER::FSetSVMDataLocation(char *szFullPath) { if (_strFName != szFullPath) { _strFName = szFullPath; if (!ReadSVMOutput(_strFName)) { #ifdef DEBUG char szErr[200]; wnsprintf(szErr, ARRAYSIZE(szErr), "Unable to successfully read filter params from %s", _strFName); MessageBox(NULL, szErr, "Junk mail filter error", MB_APPLMODAL | MB_OK); #endif return false; } } return true; } ///////////////////////////////////////////////////////////////////////////// // Property set methods // ///////////////////////////////////////////////////////////////////////////// void MAILFILTER::SetFirstName(char *szFirstName) { SAFE_FREE( _szFirstName ); if (szFirstName!=NULL) { _szFirstName = strdup(szFirstName); _strupr(_szFirstName); } else { _szFirstName = NULL; } } void MAILFILTER::SetLastName(char *szLastName) { SAFE_FREE( _szLastName ); if (szLastName!=NULL) { _szLastName = strdup(szLastName); _strupr(_szLastName); } else { _szLastName = NULL; } } void MAILFILTER::SetCompanyName(char *szCompanyName) { SAFE_FREE( _szCompanyName ); if (szCompanyName!=NULL) { _szCompanyName = strdup(szCompanyName); _strupr(_szCompanyName); } else { _szCompanyName = NULL; } } ///////////////////////////////////////////////////////////////////////////// // Constructor/destructor // ///////////////////////////////////////////////////////////////////////////// MAILFILTER::MAILFILTER() { _szFirstName = NULL; _szLastName = NULL; _szCompanyName = NULL; _rDefaultThresh = -1; _rThresh = -1; _cFeatureComps = 0; rgrSVMWeights = NULL; } MAILFILTER::~MAILFILTER() { SAFE_FREE( _szFirstName ); SAFE_FREE( _szLastName ); SAFE_FREE( _szCompanyName ); for (unsigned int i=0;i<_cFeatureComps;i++) rgfeaturecomps[i].~FeatureComponent(); SAFE_FREE( rgrSVMWeights ); SAFE_FREE( _rgiFeatureStatus ); SAFE_FREE( rgfeaturecomps ); }