Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

704 lines
17 KiB

/*
SVMHANDLER.CPP
(c) copyright 1998 Microsoft Corp
Contains the class encapsulating the Support Vector Machine used to do on the fly spam detection
Robert Rounthwaite ([email protected])
*/
#include <afx.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include "svmhandler.h"
// Local alias for unsigned int.
typedef unsigned int UINT;
// Character-width-neutral names for the CRT string-to-number conversions:
// the wide-character functions are selected when _UNICODE is defined, the
// narrow-character ones otherwise.
#ifdef _UNICODE
#define stoul wcstoul
#define stod wcstod
#else
#define stoul strtoul
#define stod strtod
#endif
// Text that precedes the feature component count in the SVM output file
// (searched for by ReadSVMOutput).
char *szCountFeatureComp = "FeatureComponentCount =";
// Text that precedes the default threshold in the SVM output file
// (searched for by ReadSVMOutput and BReadDefaultSpamCutoff).
char *szDefaultThresh = "dThresh =";
/////////////////////////////////////////////////////////////////////////////
// ReadSVMOutput
//
// Read the SVM output from a file (".LKO file") and fill in the filter
// parameters: _rCC, _rDD, _rThresh, _rDefaultThresh, _cFeatures, the
// per-feature weights (rgrSVMWeights), the per-feature status array
// (_rgiFeatureStatus, initialised to -1) and the feature components
// (rgfeaturecomps / _cFeatureComps).
//
// lpszFileName - path of the file to read
//
// Returns true if the whole file was parsed. Returns false if the file
// cannot be opened or read, is truncated or malformed, or if an allocation
// fails.
//
// NOTE(review): buffers left over from an earlier load are overwritten
// without being released here - confirm whether callers ever load twice.
/////////////////////////////////////////////////////////////////////////////
bool MAILFILTER::ReadSVMOutput(LPCTSTR lpszFileName)
{
    try
    {
        CStdioFile sfile(lpszFileName, CFile::modeRead);
        CString strBuf;
        int iBufPos;
        UINT iSVMW; // index to rgrSVMWeights;
        UINT iFeatureComp = 0; // next free slot in rgfeaturecomps
        int cFeatureComponents;
        LPTSTR szBufPtr = NULL;

        // skip first two lines; the third line is left in strBuf
        if ((!sfile.ReadString(strBuf)) ||
            (!sfile.ReadString(strBuf)) ||
            (!sfile.ReadString(strBuf)))
        {
            return false;
        }

        // parse 3rd line: only care about CC and DD, which sit at fixed
        // columns. Reject a line too short to contain those columns.
        if (strBuf.GetLength() < 49)
        {
            return false;
        }
        _rCC = stod(&((LPCTSTR)strBuf)[34], NULL);
        _rDD = stod(&((LPCTSTR)strBuf)[49], NULL);

        // 4th line: threshold at column 11, followed by the default threshold
        if (!sfile.ReadString(strBuf))
        {
            return false;
        }
        if (strBuf.GetLength() < 11)
        {
            return false;
        }
        const char *pszDefThresh = strstr(&((LPCTSTR)strBuf)[11], ::szDefaultThresh);
        assert(pszDefThresh != NULL);
        if (pszDefThresh == NULL)
        {
            return false;
        }
        pszDefThresh += strlen(::szDefaultThresh);
        _rDefaultThresh = stod(pszDefThresh, NULL);
        if (_rSpamCutoff == -1)
        {
            // no cutoff has been chosen yet: adopt the file's default
            _rSpamCutoff = _rDefaultThresh;
        }
        _rThresh = stod(&((LPCTSTR)strBuf)[11], NULL);

        // 5th line: number of features at column 8
        if (!sfile.ReadString(strBuf))
        {
            return false;
        }
        if (strBuf.GetLength() < 8)
        {
            return false;
        }
        _cFeatures = stoul(&((LPCTSTR)strBuf)[8], NULL, 10);

        // 6th line: number of feature components
        if (!sfile.ReadString(strBuf))
        {
            return false;
        }
        iBufPos = strBuf.Find(szCountFeatureComp);
        if (iBufPos < 0)
        {
            // Count not present on this line: use 0 so that the estimate
            // below is applied instead of parsing from a bogus offset.
            cFeatureComponents = 0;
        }
        else
        {
            iBufPos += (int)strlen(szCountFeatureComp);
            cFeatureComponents = stoul(&((LPCTSTR)strBuf)[iBufPos], NULL, 10);
        }
        if (cFeatureComponents < _cFeatures)
            cFeatureComponents = _cFeatures * 2;

        // advance to the "Weights" line
        while (strBuf != "Weights")
        {
            if (!sfile.ReadString(strBuf))
            {
                return false;
            }
        }

        rgrSVMWeights = (REAL *)malloc(sizeof(REAL) * _cFeatures);
        _rgiFeatureStatus = (int *)malloc(sizeof(int) * _cFeatures);
        rgfeaturecomps = (FeatureComponent *)malloc(sizeof(FeatureComponent) * cFeatureComponents);
        if ((rgrSVMWeights == NULL) ||
            (_rgiFeatureStatus == NULL) ||
            (rgfeaturecomps == NULL))
        {
            return false;
        }
        memset(_rgiFeatureStatus, -1, sizeof(int) * _cFeatures);

        // one line per feature: weight, then one or more components joined
        // by '|' or '&'
        for (iSVMW = 0; iSVMW < _cFeatures; iSVMW++)
        {
            UINT uiLoc;
            UINT cbStr;
            boolop bop;
            char *szFeature;
            bool fContinue;

            if (!sfile.ReadString(strBuf))
            {
                return false;
            }
            // read the SVM weight
            rgrSVMWeights[iSVMW] = stod((LPCTSTR)strBuf, &szBufPtr);
            if (*szBufPtr != '\0')
            {
                szBufPtr++; // skip the separator
            }
            bop = boolopOr;
            fContinue = false;
            // load all of the feature components
            do
            {
                // The component array was sized from the file header (or an
                // estimate); never write past it.
                if (iFeatureComp >= (UINT)cFeatureComponents)
                {
                    return false;
                }
                FeatureComponent *pfeaturecomp = &rgfeaturecomps[iFeatureComp++];
                // Location (or "special")
                uiLoc = stoul(szBufPtr, &szBufPtr, 10);
                if (*szBufPtr != '\0')
                {
                    szBufPtr++; // skip the separator
                }
                pfeaturecomp->loc = (FeatureLocation)uiLoc;
                pfeaturecomp->iFeature = iSVMW;
                pfeaturecomp->bop = bop;
                if (uiLoc == 5) // special feature
                {
                    UINT uiRuleNumber = stoul(szBufPtr, &szBufPtr, 10);
                    if (*szBufPtr != '\0')
                    {
                        szBufPtr++; // skip the separator
                    }
                    pfeaturecomp->iRuleNum = uiRuleNumber;
                }
                else // it is a standard string component
                {
                    cbStr = stoul(szBufPtr, &szBufPtr, 10);
                    if (*szBufPtr != '\0')
                    {
                        szBufPtr++; // skip the separator
                    }
                    // the declared length must fit in what is left of the line
                    if (cbStr > strlen(szBufPtr))
                    {
                        return false;
                    }
                    szFeature = (char *)malloc((cbStr + 1)*sizeof(char));
                    if (szFeature == NULL)
                    {
                        return false;
                    }
                    memcpy(szFeature, szBufPtr, cbStr);
                    szBufPtr += cbStr;
                    if (*szBufPtr != '\0')
                    {
                        szBufPtr++; // skip the separator
                    }
                    szFeature[cbStr] = '\0';
                    assert(strlen(szFeature) == cbStr);
                    pfeaturecomp->szFeature = szFeature;
                }
                // an operator means another component of this feature follows
                switch(*szBufPtr)
                {
                case '|':
                    bop = boolopOr;
                    fContinue = true;
                    break;
                case '&':
                    bop = boolopAnd;
                    fContinue = true;
                    break;
                default:
                    fContinue = false;
                    break;
                }
                if (*szBufPtr != '\0')
                {
                    szBufPtr++;
                }
            }
            while (fContinue);
        }
        _cFeatureComps = iFeatureComp;
    }
    catch (CFileException *pEx)
    {
        // MFC exceptions caught by pointer must be released with Delete()
        pEx->Delete();
        return false;
    }
    return true;
}
/////////////////////////////////////////////////////////////////////////////
// SetSpamCutoff
//
// Stores rCutoff as the spam cutoff percentage when it lies in the range
// 0 to 100 (inclusive). Returns true if the value was stored, false if it
// was out of range (in which case the current cutoff is left untouched).
/////////////////////////////////////////////////////////////////////////////
bool MAILFILTER::SetSpamCutoff(REAL rCutoff)
{
    const bool fInRange = (rCutoff >= 0) && (rCutoff <= 100);
    if (fInRange)
    {
        _rSpamCutoff = rCutoff;
    }
    return fInRange;
}
/////////////////////////////////////////////////////////////////////////////
// GetSpamCutoff
//
// Returns the current spam cutoff (_rSpamCutoff): the value stored by
// SetSpamCutoff, or the default taken from the SVM output file if
// ReadSVMOutput found no cutoff set (-1) when the file was read.
/////////////////////////////////////////////////////////////////////////////
REAL MAILFILTER::GetSpamCutoff()
{
    const REAL rCutoff = _rSpamCutoff;
    return rCutoff;
}
/////////////////////////////////////////////////////////////////////////////
// GetDefaultSpamCutoff
//
// Returns the default spam cutoff that was read from the SVM output file.
// FSetSVMDataLocation should be called first; debug builds assert that a
// data file name has been recorded.
/////////////////////////////////////////////////////////////////////////////
REAL MAILFILTER::GetDefaultSpamCutoff()
{
    assert(!_strFName.IsEmpty());
    const REAL rDefault = _rDefaultThresh;
    return rDefault;
}
/////////////////////////////////////////////////////////////////////////////
// FInvokeSpecialRule
//
// Evaluates the special (non-keyword) rule numbered iRuleNum against the
// current message and returns whether the rule fires. Rule numbers that
// are not handled below yield false.
/////////////////////////////////////////////////////////////////////////////
bool MAILFILTER::FInvokeSpecialRule(UINT iRuleNum)
{
    switch (iRuleNum)
    {
    // 1-3: the body mentions the configured first/last/company name
    case 1:
        return FWordPresent(_szBody, _szFirstName);
    case 2:
        return FWordPresent(_szBody, _szLastName);
    case 3:
        return FWordPresent(_szBody, _szCompanyName);

    case 4:
    {
        // the body mentions the year taken from the message time
        if (FTimeEmpty(_tMessageSent))
        {
            return false;
        }
        CTime time(_tMessageSent, -1);
        char szYear[6];
        wnsprintf(szYear, ARRAYSIZE(szYear), "%i", time.GetYear());
        return FWordPresent(_szBody, szYear);
    }

    case 5:
    {
        // message time falls in the wee hours (>= 7pm or < 6am)
        if (FTimeEmpty(_tMessageSent))
        {
            return false;
        }
        CTime time(_tMessageSent, -1);
        const int nHour = time.GetHour();
        return (nHour >= (7+12)) || (nHour < 6);
    }

    case 6:
    {
        // message time falls on a weekend (day-of-week 7 or 1)
        if (FTimeEmpty(_tMessageSent))
        {
            return false;
        }
        CTime time(_tMessageSent, -1);
        const int nDayOfWeek = time.GetDayOfWeek();
        return (nDayOfWeek == 7) || (nDayOfWeek == 1);
    }

    case 14:
        return _bRule14; // cached by HandleCaseSensitiveSpecialRules()
    case 15:
        return SpecialFeatureNonAlpha(_szBody);
    case 16:
        return _bDirectMessage;
    case 17:
        return _bRule17; // cached by HandleCaseSensitiveSpecialRules()
    case 18:
        return SpecialFeatureNonAlpha(_szSubject);
    case 19:
        return (*_szTo=='\0'); // empty To line
    case 20:
        return _bHasAttach;

    // 40-47: body length thresholds, doubling from 125 (rule 40) up to
    // 16000 (rule 47)
    case 40:
    case 41:
    case 42:
    case 43:
    case 44:
    case 45:
    case 46:
    case 47:
        return strlen(_szBody) >= ((size_t)125 << (iRuleNum - 40));

    default:
        //assert(false == "unsupported special feature");
        break;
    }
    return false;
}
/////////////////////////////////////////////////////////////////////////////
// HandleCaseSensitiveSpecialRules
//
// Called from EvaluateFeatureComponents().
// Some special rules are case sensitive, so if they're present, we'll
// evaluate them before we make the texts uppercase and cache the result
// for when they are actually used.
/////////////////////////////////////////////////////////////////////////////
void MAILFILTER::HandleCaseSensitiveSpecialRules()
{
for (UINT i = 0; i<_cFeatureComps; i++)
{
FeatureComponent *pfcomp = &rgfeaturecomps[i];
if (pfcomp->loc == locSpecial)
{
switch (pfcomp->iRuleNum)
{
case 14:
_bRule14 = SpecialFeatureUpperCaseWords(_szBody);
break;
case 17:
_bRule17 = SpecialFeatureUpperCaseWords(_szSubject);
break;
default:
;// nothing
}
}
}
}
/////////////////////////////////////////////////////////////////////////////
// EvaluateFeatureComponents
//
// Evaluates all of the feature components. Sets fPresent in each component
// to true if the feature is present, false otherwise
/////////////////////////////////////////////////////////////////////////////
void MAILFILTER::EvaluateFeatureComponents()
{
HandleCaseSensitiveSpecialRules();
_strupr(_szFrom);
_strupr(_szTo);
_strupr(_szSubject);
_strupr(_szBody);
for (UINT i = 0; i<_cFeatureComps; i++)
{
FeatureComponent *pfcomp = &rgfeaturecomps[i];
switch(pfcomp->loc)
{
case locNil:
assert(pfcomp->loc != locNil);
pfcomp->fPresent = false;
break;
case locBody:
pfcomp->fPresent = FWordPresent(_szBody, pfcomp->szFeature);
break;
case locSubj:
pfcomp->fPresent = FWordPresent(_szSubject, pfcomp->szFeature);
break;
case locFrom:
pfcomp->fPresent = FWordPresent(_szFrom, pfcomp->szFeature);
break;
case locTo:
pfcomp->fPresent = FWordPresent(_szTo, pfcomp->szFeature);
break;
case locSpecial:
pfcomp->fPresent = FInvokeSpecialRule(pfcomp->iRuleNum);
break;
}
}
}
/////////////////////////////////////////////////////////////////////////////
// ProcessFeatureComponentPresence
//
// Processes the presence (or absence) of the individual feature components,
// setting the feature status of each feature (which may me made up of
// multiple feature components).
/////////////////////////////////////////////////////////////////////////////
void MAILFILTER::ProcessFeatureComponentPresence()
{
for (UINT i = 0; i < _cFeatureComps; i++)
{
FeatureComponent *pfcomp = &rgfeaturecomps[i];
UINT iFeature = pfcomp->iFeature;
if (_rgiFeatureStatus[iFeature] == -1) // first feature of this feature
{
if (pfcomp->fPresent)
{
_rgiFeatureStatus[iFeature] = 1;
}
else
{
_rgiFeatureStatus[iFeature] = 0;
}
}
else
{
switch (pfcomp->bop)
{
case boolopOr:
if (pfcomp->fPresent)
{
_rgiFeatureStatus[iFeature] = 1;
}
break;
case boolopAnd:
if (!pfcomp->fPresent)
{
_rgiFeatureStatus[iFeature] = 0;
}
break;
default:
assert(false);
break;
}
}
}
}
/////////////////////////////////////////////////////////////////////////////
// RDoSVMCalc
//
// Does the actual support vector machine calculation: adds up the weights
// of all features whose status is 1, subtracts the threshold and passes the
// result through a sigmoid parameterised by _rCC and _rDD.
// Returns the probability that the message is spam.
/////////////////////////////////////////////////////////////////////////////
REAL MAILFILTER::RDoSVMCalc()
{
    REAL rSum = 0.0; // running total of the weights of present features

    for (UINT iFeat = 0; iFeat < _cFeatures; ++iFeat)
    {
        switch (_rgiFeatureStatus[iFeat])
        {
        case 1:
            rSum += rgrSVMWeights[iFeat];
            break;
        case 0:
            break; // feature absent: contributes nothing
        default:
            assert(false); // status should have been resolved to 0 or 1
            break;
        }
    }

    // Apply threshold
    rSum -= _rThresh;

    // Apply sigmoid
    const REAL rProb = (1 / (1 + exp((_rCC * rSum) + _rDD)));
    return rProb;
}
/*
// for timing version
#include <sys\\types.h>
#include <sys\\timeb.h>
*/
//#include "..\SpamLearner\MailIndexer.cpp"
/////////////////////////////////////////////////////////////////////////////
// BCalculateSpamProb
//
// Calculates the probability that the current message is spam.
// Returns the probability (0 to 1) that the message is spam in prSpamProb
// the boolean return is determined by comparing to the spam cutoff
/////////////////////////////////////////////////////////////////////////////
bool MAILFILTER::BCalculateSpamProb(/* IN params */
char *szFrom,
char *szTo,
char *szSubject,
char *szBody,
bool bDirectMessage,
bool bHasAttach,
FILETIME tMessageSent,
/* OUT params */
REAL *prSpamProb,
bool * pbIsSpam)
{
//_strFName = "d:\\test\\test.lko";
//_strFName = "G:\\SPAM\\SPAM.lko";
_szFrom = szFrom;
_szTo = szTo;
_szSubject = szSubject;
_szBody = szBody;
_bDirectMessage = bDirectMessage;
_bHasAttach = bHasAttach;
_tMessageSent = tMessageSent;
EvaluateFeatureComponents();
//ProcessMessage(_szFrom, _szTo, _szSubject, _szBody);
ProcessFeatureComponentPresence();
*prSpamProb = RDoSVMCalc();
*pbIsSpam = (*prSpamProb>(_rSpamCutoff/100));
return true;
/* timing version
_timeb start, finish;
int ij = strlen(szBody);
_ftime( &start );
ReadSVMOutput("d:\\test\\test.lko");
for (int i=0;i<1000;i++)
{
ProcessMessage(szFrom, szTo, szSubject, szBody);
DetermineFeatureStatus(bDirectMessage);
*pr = RDoSVMCalc();
}
_ftime( &finish );
*pr = (finish.time-start.time + (finish.millitm-start.millitm)/1000.0);
return true;
*/
}
/////////////////////////////////////////////////////////////////////////////
// BReadDefaultSpamCutoff
//
// Reads the default spam cutoff without parsing entire file
// Use GetDefaultSpamCutoff if using FSetSVMDataLocation;
// static member function
/////////////////////////////////////////////////////////////////////////////
bool MAILFILTER::BReadDefaultSpamCutoff(char *szFullPath, REAL *prDefCutoff)
{
try
{
CStdioFile sfile(szFullPath, CFile::modeRead);
CString strBuf;
// skip first three lines
if ((!sfile.ReadString(strBuf)) ||
(!sfile.ReadString(strBuf)) ||
(!sfile.ReadString(strBuf)) ||
(!sfile.ReadString(strBuf)))
{
return false;
}
char *pszDefThresh = strstr(&((LPCTSTR)strBuf)[11], ::szDefaultThresh);
assert(pszDefThresh != NULL);
if (pszDefThresh == NULL)
{
return false;
}
pszDefThresh += strlen(::szDefaultThresh);
*prDefCutoff = stod(pszDefThresh, NULL);
if (*prDefCutoff < .9 ) // since the default has been shifted to 2 std dev, we only take it if it is greater than .9
{
*prDefCutoff = 0.9;
}
}
catch (CFileException *)
{
return false;
}
return true;
}
/////////////////////////////////////////////////////////////////////////////
// FSetSVMDataLocation
//
// Sets the location of the SVM Data file(.LKO file). Must be called before
// calling any other methods
// Data file must be present at time function is called
//
// The file is only read when szFullPath differs from the location already
// recorded. If reading fails the recorded location is cleared, so that a
// later call with the same path tries again instead of reporting success
// for a file that was never loaded.
//
// returns true if successful, false otherwise
/////////////////////////////////////////////////////////////////////////////
bool MAILFILTER::FSetSVMDataLocation(char *szFullPath)
{
    if (_strFName != szFullPath)
    {
        _strFName = szFullPath;
        if (!ReadSVMOutput(_strFName))
        {
#ifdef DEBUG
            char szErr[200];
            // a CString must be cast explicitly when passed through "..."
            wnsprintf(szErr, ARRAYSIZE(szErr), "Unable to successfully read filter params from %s", (LPCTSTR)_strFName);
            MessageBox(NULL, szErr, "Junk mail filter error", MB_APPLMODAL | MB_OK);
#endif
            // do not remember a location whose data could not be loaded
            _strFName.Empty();
            return false;
        }
    }
    return true;
}
/////////////////////////////////////////////////////////////////////////////
// Property set methods
//
/////////////////////////////////////////////////////////////////////////////
// Stores an upper-cased private copy of szFirstName in _szFirstName,
// releasing any previous value. Passing NULL clears the stored name.
// If the copy cannot be allocated the stored name ends up NULL.
void MAILFILTER::SetFirstName(char *szFirstName)
{
    char *szCopy = NULL;
    if (szFirstName!=NULL)
    {
        // copy before freeing: the caller may pass the current value
        szCopy = strdup(szFirstName);
        if (szCopy != NULL)
        {
            _strupr(szCopy);
        }
    }
    SAFE_FREE( _szFirstName );
    _szFirstName = szCopy;
}
// Stores an upper-cased private copy of szLastName in _szLastName,
// releasing any previous value. Passing NULL clears the stored name.
// If the copy cannot be allocated the stored name ends up NULL.
void MAILFILTER::SetLastName(char *szLastName)
{
    char *szCopy = NULL;
    if (szLastName!=NULL)
    {
        // copy before freeing: the caller may pass the current value
        szCopy = strdup(szLastName);
        if (szCopy != NULL)
        {
            _strupr(szCopy);
        }
    }
    SAFE_FREE( _szLastName );
    _szLastName = szCopy;
}
// Stores an upper-cased private copy of szCompanyName in _szCompanyName,
// releasing any previous value. Passing NULL clears the stored name.
// If the copy cannot be allocated the stored name ends up NULL.
void MAILFILTER::SetCompanyName(char *szCompanyName)
{
    char *szCopy = NULL;
    if (szCompanyName!=NULL)
    {
        // copy before freeing: the caller may pass the current value
        szCopy = strdup(szCompanyName);
        if (szCopy != NULL)
        {
            _strupr(szCopy);
        }
    }
    SAFE_FREE( _szCompanyName );
    _szCompanyName = szCopy;
}
/////////////////////////////////////////////////////////////////////////////
// Constructor/destructor
//
/////////////////////////////////////////////////////////////////////////////
// Puts the filter into a well-defined "nothing loaded" state.
MAILFILTER::MAILFILTER()
{
    _szFirstName = NULL;
    _szLastName = NULL;
    _szCompanyName = NULL;
    _rDefaultThresh = -1;
    _rThresh = -1;
    // -1 means "no cutoff chosen yet"; ReadSVMOutput tests for this value
    // to decide whether to adopt the default cutoff from the data file.
    _rSpamCutoff = -1;
    _cFeatures = 0;
    _cFeatureComps = 0;
    // The destructor frees these unconditionally, so they must not be left
    // uninitialised when no data file is ever loaded.
    rgrSVMWeights = NULL;
    _rgiFeatureStatus = NULL;
    rgfeaturecomps = NULL;
    // cached results of the case sensitive special rules
    _bRule14 = false;
    _bRule17 = false;
}
// Releases everything the filter owns: the feature components, the SVM
// arrays and the stored name strings.
MAILFILTER::~MAILFILTER()
{
    // The component array is raw malloc'ed storage, so each element's
    // destructor is run by hand before the array itself is freed.
    for (unsigned int iComp = 0; iComp != _cFeatureComps; ++iComp)
    {
        FeatureComponent *pfcomp = &rgfeaturecomps[iComp];
        pfcomp->~FeatureComponent();
    }
    SAFE_FREE( rgfeaturecomps );
    SAFE_FREE( _rgiFeatureStatus );
    SAFE_FREE( rgrSVMWeights );

    SAFE_FREE( _szCompanyName );
    SAFE_FREE( _szLastName );
    SAFE_FREE( _szFirstName );
}