/******************************************************************************

Copyright (c) 2000 Microsoft Corporation

Module Name:
    SearchResults.cpp

Abstract:
    This file contains the implementation of the keyword search.

Revision History:
    Davide Massarenti (Dmassare)  05/28/2000
        created

******************************************************************************/

#include "stdafx.h"

////////////////////////////////////////////////////////////////////////////////

Taxonomy::KeywordSearch::Token::Token()
{
    m_type  = TOKEN_INVALID; // TOKEN            m_type;
                             // MPC::wstring     m_strToken;
                             // WeightedMatchSet m_results;
                             //
    m_left  = NULL;          // Token*           m_left;
    m_right = NULL;          // Token*           m_right;
}

Taxonomy::KeywordSearch::Token::~Token()
{
    if(m_left ) delete m_left;
    if(m_right) delete m_right;
}

//////////////////////////////////////////////////

bool Taxonomy::KeywordSearch::Token::HasNOT()
{
    if(m_type == TOKEN_NOT) return true;

    if(m_left  && m_left ->HasNOT()) return true;
    if(m_right && m_right->HasNOT()) return true;

    return false;
}

bool Taxonomy::KeywordSearch::Token::HasExplicitOperators()
{
    switch(m_type)
    {
    case TOKEN_NOT:
    case TOKEN_AND:
    case TOKEN_OR : return true;
    }

    if(m_left  && m_left ->HasExplicitOperators()) return true;
    if(m_right && m_right->HasExplicitOperators()) return true;

    return false;
}

void Taxonomy::KeywordSearch::Token::AddHit( /*[in]*/ long ID       ,
                                             /*[in]*/ long priority )
{
    //
    // Insert the topic with weight zero if it's not already present, then add this hit's priority to the accumulated weight.
    //
    std::pair<WeightedMatchIter,bool> ins = m_results.insert( WeightedMatchSet::value_type( ID, 0 ) );

    ins.first->second += priority;
}

HRESULT Taxonomy::KeywordSearch::Token::ExecuteText( /*[in]*/ LPCWSTR      szKeyword  ,
                                                     /*[in]*/ RS_Keywords* rsKeywords ,
                                                     /*[in]*/ RS_Matches*  rsMatches  )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::Token::ExecuteText" );

    HRESULT hr;
    bool    fFound;

    __MPC_EXIT_IF_METHOD_FAILS(hr, rsKeywords->Seek_ByName( szKeyword, &fFound ));
    if(fFound)
    {
        __MPC_EXIT_IF_METHOD_FAILS(hr, rsMatches->Seek_ByKeyword( rsKeywords->m_ID_keyword, &fFound ));
        while(fFound)
        {
            AddHit( rsMatches->m_ID_topic, rsMatches->m_lPriority );

            __MPC_EXIT_IF_METHOD_FAILS(hr, rsMatches->Move( 0, JET_MoveNext, &fFound ));
        }
    }

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    __HCP_FUNC_EXIT(hr);
}

HRESULT Taxonomy::KeywordSearch::Token::Execute( /*[in]*/ MatchSet&    setAllTheTopics ,
                                                 /*[in]*/ Updater&     updater         ,
                                                 /*[in]*/ RS_Keywords* rsKeywords      ,
                                                 /*[in]*/ RS_Matches*  rsMatches       )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::Token::Execute" );

    HRESULT hr;

    if(m_type == TOKEN_TEXT)
    {
        MPC::WStringList lst;
        MPC::WStringIter it;

        __MPC_EXIT_IF_METHOD_FAILS(hr, ExecuteText( m_strToken.c_str(), rsKeywords, rsMatches ));

        //
        // Also look up the synonyms of the keyword and run them as well.
        //
        __MPC_EXIT_IF_METHOD_FAILS(hr, updater.LocateSynonyms( m_strToken.c_str(), lst, /*fMatchOwner*/false ));

        for(it=lst.begin(); it!=lst.end(); it++)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, ExecuteText( it->c_str(), rsKeywords, rsMatches ));
        }
    }

    if(m_type == TOKEN_AND_IMPLICIT ||
       m_type == TOKEN_AND           )
    {
        WeightedMatchSet* master;
        WeightedMatchSet* slave;
        WeightedMatchIter it;

        if(m_left  == NULL ||
           m_right == NULL  )
        {
            __MPC_SET_ERROR_AND_EXIT(hr, S_OK);
        }

        __MPC_EXIT_IF_METHOD_FAILS(hr, m_left->Execute( setAllTheTopics, updater, rsKeywords, rsMatches ));
        if(m_left->m_results.size() == 0)
        {
            __MPC_SET_ERROR_AND_EXIT(hr, S_OK);
        }

        __MPC_EXIT_IF_METHOD_FAILS(hr, m_right->Execute( setAllTheTopics, updater, rsKeywords, rsMatches ));
        if(m_right->m_results.size() == 0)
        {
            __MPC_SET_ERROR_AND_EXIT(hr, S_OK);
        }

        //
        // Select the shorter result set for the outer loop (that loop is linear).
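        // Iterating the smaller set and probing the larger one with find() keeps the
        // intersection cost near O(min * log(max)) instead of O(left * right),
        // assuming WeightedMatchSet is an ordered associative container.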
        //
        if(m_left->m_results.size() < m_right->m_results.size())
        {
            master = &m_left ->m_results;
            slave  = &m_right->m_results;
        }
        else
        {
            master = &m_right->m_results;
            slave  = &m_left ->m_results;
        }

        for(it=master->begin(); it!=master->end(); it++)
        {
            if(slave->find( it->first ) != slave->end())
            {
                AddHit( it->first, it->second );
            }
        }
    }

    if(m_type == TOKEN_OR)
    {
        WeightedMatchIter it;

        if(m_left)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, m_left->Execute( setAllTheTopics, updater, rsKeywords, rsMatches ));

            for(it=m_left->m_results.begin(); it!=m_left->m_results.end(); it++)
            {
                AddHit( it->first, it->second );
            }
        }

        if(m_right)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, m_right->Execute( setAllTheTopics, updater, rsKeywords, rsMatches ));

            for(it=m_right->m_results.begin(); it!=m_right->m_results.end(); it++)
            {
                AddHit( it->first, it->second );
            }
        }
    }

    if(m_type == TOKEN_NOT)
    {
        MatchIter it;

        if(m_left)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, m_left->Execute( setAllTheTopics, updater, rsKeywords, rsMatches ));
        }

        //
        // Emit every topic NOT matched by the operand (with weight zero).
        //
        for(it=setAllTheTopics.begin(); it!=setAllTheTopics.end(); it++)
        {
            if(m_left == NULL || m_left->m_results.find( *it ) == m_left->m_results.end())
            {
                AddHit( *it, 0 );
            }
        }
    }

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    __HCP_FUNC_EXIT(hr);
}

void Taxonomy::KeywordSearch::Token::CollectKeywords( /*[in/out]*/ MPC::WStringList& lst ) const
{
    if(m_type == TOKEN_TEXT) lst.push_back( m_strToken );

    if(m_left ) m_left ->CollectKeywords( lst );
    if(m_right) m_right->CollectKeywords( lst );
}

HRESULT Taxonomy::KeywordSearch::Token::Stringify( /*[out]*/ MPC::wstring& strNewQuery )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::Token::Stringify" );

    HRESULT hr;

    if(m_type == TOKEN_TEXT)
    {
        strNewQuery = m_strToken;
    }
    else
    {
        if(m_left)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, m_left->Stringify( strNewQuery ));

            if(m_right)
            {
                MPC::wstring strTmp;

                __MPC_EXIT_IF_METHOD_FAILS(hr, m_right->Stringify( strTmp ));

                if(strTmp.size())
                {
                    strNewQuery += L" ";
                    strNewQuery += strTmp;
                }
            }
        }
        else if(m_right)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, m_right->Stringify( strNewQuery ));
        }
    }

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    __HCP_FUNC_EXIT(hr);
}

////////////////////////////////////////////////////////////////////////////////

LPCWSTR Taxonomy::KeywordSearch::SkipWhite( /*[in]*/ LPCWSTR szStr )
{
    while(iswspace( *szStr )) szStr++;

    return szStr;
}

bool Taxonomy::KeywordSearch::IsNotString( /*[in]*/ LPCWSTR szSrc  ,
                                           /*[in]*/ WCHAR   cQuote )
{
    WCHAR c;

    //
    // Scan forward until whitespace, the quote character or the end of the input.
    //
    while((c = *++szSrc) && !iswspace( c ) && c != cQuote);

    return (c != cQuote);
}

bool Taxonomy::KeywordSearch::IsQueryChar( WCHAR c )
{
    if(iswspace( c ) ||
       iswcntrl( c ) ||
       c == '"'      ||
       c == '('      ||
       c == ')'       )
    {
        return false;
    }

    return true;
}

////////////////////////////////////////

void Taxonomy::KeywordSearch::RemoveStopSignsAtEnd( /*[in]*/ LPWSTR szText )
{
    MPC::wstring       strCmp;
    Taxonomy::WordIter itEnd  = m_setStopSignsAtEnd->end();
    LPWSTR             szEnd  = szText + wcslen( szText );

    //
    // Walk backwards from the end, blanking out trailing stop signs.
    //
    while(szEnd > szText)
    {
        strCmp = *--szEnd;

        if(m_setStopSignsAtEnd->find( strCmp ) != itEnd)
        {
            szEnd[0] = ' ';
        }
        else
        {
            break;
        }
    }
}

void Taxonomy::KeywordSearch::RemoveStopSignsWithoutContext( /*[in]*/ LPWSTR szText )
{
    WCHAR              c;
    MPC::wstring       strCmp;
    Taxonomy::WordIter itEnd  = m_setStopSignsWithoutContext->end();

    //
    // Replace every context-free stop sign with a space.
    //
    while((c = *szText++))
    {
        strCmp = c;

        if(m_setStopSignsWithoutContext->find( strCmp ) != itEnd)
        {
            szText[-1] = ' ';
        }
    }
}

void Taxonomy::KeywordSearch::CopyAndEliminateExtraWhiteSpace( /*[in]*/ LPCWSTR szSrc, /*[out]*/ LPWSTR szDst )
{
    bool  fWhitespace = false;
    WCHAR c;

    szSrc = SkipWhite( szSrc );

    while((c = *szSrc++))
    {
        if(iswspace(c))
        {
            if(fWhitespace == false)
            {
                *szDst++ = ' ';
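                // A single separator has been emitted for this whitespace run; the flag
                // below makes sure the remaining whitespace characters are dropped.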
                fWhitespace = true;
            }
        }
        else
        {
            *szDst++ = c;

            fWhitespace = false;
        }
    }

    //
    // Drop a trailing separator, if any, and terminate the string.
    //
    if(fWhitespace) szDst[-1] = 0;
    else            szDst[ 0] = 0;
}

Taxonomy::KeywordSearch::TOKEN Taxonomy::KeywordSearch::NextToken( /*[in/out]*/ LPCWSTR& szSrc   ,
                                                                   /*[out]   */ LPWSTR   szToken )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::NextToken" );

    TOKEN   token = TOKEN_INVALID;
    LPCWSTR szPtr = SkipWhite( szSrc );
    LPWSTR  szDst = szToken;
    WCHAR   c;

    //
    // End of query?
    //
    c = *szPtr;
    if(c == 0)
    {
        token = TOKEN_EMPTY;
        __MPC_FUNC_LEAVE;
    }

    //
    // Now deal with a quoted string, e.g. "Quoted String"; the quotes themselves are not part of the token.
    //
    if(c == '"')
    {
        WCHAR cQuote = c;

        while((c = *++szPtr) && c != cQuote)
        {
            *szDst++ = c;
        }
        if(c) szPtr++; // Skip past the closing quote.

        token = TOKEN_TEXT;
        __MPC_FUNC_LEAVE;
    }

    //
    // This is a special-case operator: '||' is a synonym for OR.
    //
    if(c == '|')
    {
        if(szPtr[1] != '|')
        {
            token = TOKEN_INVALID;
            __MPC_FUNC_LEAVE;
        }

        szPtr += 2;

        token = TOKEN_OR;
        __MPC_FUNC_LEAVE;
    }

    //
    // Single-character tokens we accept are '(', ')', '+', '&' and '!'; return them as is and adjust szPtr.
    //
    if(c == '(') { szPtr++; token = TOKEN_PAREN_OPEN ; __MPC_FUNC_LEAVE; }
    if(c == ')') { szPtr++; token = TOKEN_PAREN_CLOSE; __MPC_FUNC_LEAVE; }
    //
    if(c == '+') { szPtr++; token = TOKEN_OR         ; __MPC_FUNC_LEAVE; }
    //
    if(c == '&') { szPtr++; token = TOKEN_AND        ; __MPC_FUNC_LEAVE; }
    //
    if(c == '!') { szPtr++; token = TOKEN_NOT        ; __MPC_FUNC_LEAVE; }

    //
    // Deal with alphanumerics:
    //
    // KW-A, 0-A, Abcdedd, ABC2_WE are all taken as a single query term.
    //
    if(IsQueryChar( c ))
    {
        while(c)
        {
            szPtr++;

            *szDst++ = c;

            if(IsQueryChar( c = *szPtr )) continue;

            //
            // We are not done yet: if the stop character was a quote character, we need to find out whether a string comes after.
            //
            if(c == '"' && IsNotString( szPtr, c )) continue;

            break;
        }
        *szDst = 0;

        //
        // If the word is one of the configured operator names (NOT/AND/OR), return the operator instead of a text token.
        //
        {
            MPC::wstring strCmp( szToken );

            if(m_setOpNOT->find( strCmp ) != m_setOpNOT->end()) { token = TOKEN_NOT; __MPC_FUNC_LEAVE; }
            if(m_setOpAND->find( strCmp ) != m_setOpAND->end()) { token = TOKEN_AND; __MPC_FUNC_LEAVE; }
            if(m_setOpOR ->find( strCmp ) != m_setOpOR ->end()) { token = TOKEN_OR ; __MPC_FUNC_LEAVE; }
        }

        token = TOKEN_TEXT;
        __MPC_FUNC_LEAVE;
    }

    __HCP_FUNC_CLEANUP;

    szSrc  = szPtr;
    *szDst = 0;

    __HCP_FUNC_EXIT(token);
}

////////////////////////////////////////////////////////////////////////////////

HRESULT Taxonomy::KeywordSearch::AllocateQuery( /*[in] */ const MPC::wstring& strQuery ,
                                                /*[out]*/ LPWSTR&             szInput  ,
                                                /*[out]*/ LPWSTR&             szOutput )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::AllocateQuery" );

    HRESULT hr;

    szInput  = new WCHAR[strQuery.size()+2];
    szOutput = new WCHAR[strQuery.size()+2];
    if(szInput == NULL || szOutput == NULL)
    {
        __MPC_SET_ERROR_AND_EXIT(hr, E_OUTOFMEMORY);
    }

    wcscpy( szInput, strQuery.c_str() );

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    __HCP_FUNC_EXIT(hr);
}

HRESULT Taxonomy::KeywordSearch::PreprocessQuery( /*[in/out]*/ MPC::wstring& strQuery )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::PreprocessQuery" );

    HRESULT hr;
    LPWSTR  szInput  = NULL;
    LPWSTR  szOutput = NULL;

    __MPC_EXIT_IF_METHOD_FAILS(hr, AllocateQuery( strQuery, szInput, szOutput ));

    RemoveStopSignsAtEnd           ( szInput           );
    RemoveStopSignsWithoutContext  ( szInput           );
    CopyAndEliminateExtraWhiteSpace( szInput, szOutput );

    strQuery = szOutput;

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    if(szInput ) delete [] szInput;
    if(szOutput) delete [] szOutput;

    __HCP_FUNC_EXIT(hr);
}

////////////////////////////////////////////////////////////////////////////////
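//
// Parse() builds a binary expression tree out of the token stream:
//
//  - TOKEN_TEXT terms and parenthesized subexpressions act as values;
//  - two adjacent values are joined by a TOKEN_AND_IMPLICIT node;
//  - an explicit AND/OR takes the tree built so far as its left child and the
//    next value as its right child ("rotate result"), so operators associate
//    left-to-right with equal precedence;
//  - NOT is unary: it occupies a value position and the value that follows it
//    becomes its operand (stored in m_left).
//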
HRESULT Taxonomy::KeywordSearch::Parse( /*[in/out]*/ LPCWSTR& szInput  ,
                                        /*[in]    */ LPWSTR   szTmpBuf ,
                                        /*[in]    */ bool     fSubExpr ,
                                        /*[out]   */ Token*&  res      )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::Parse" );

    HRESULT hr;
    Token*  obj         = NULL;
    Token*  objOp       = NULL;
    Token*  objDangling = NULL;

    while(1)
    {
        TOKEN token = NextToken( szInput, szTmpBuf );

        if(token == TOKEN_EMPTY) break;

        if(token == TOKEN_INVALID)
        {
            __MPC_SET_ERROR_AND_EXIT(hr, E_INVALIDARG);
        }

        //
        // Skip stop words.
        //
        if(token == TOKEN_TEXT && m_setStopWords->find( szTmpBuf ) != m_setStopWords->end()) continue;

        if(token == TOKEN_PAREN_CLOSE)
        {
            if(fSubExpr) break;

            __MPC_SET_ERROR_AND_EXIT(hr, E_INVALIDARG);
        }

        if(token == TOKEN_PAREN_OPEN)
        {
            __MPC_EXIT_IF_METHOD_FAILS(hr, Parse( szInput, szTmpBuf, true, obj ));

            //
            // Empty subexpression? Not allowed...
            //
            if(obj == NULL) __MPC_SET_ERROR_AND_EXIT(hr, E_INVALIDARG);

            //
            // Let's treat a subexpression as a value.
            //
            token = TOKEN_TEXT;
        }
        else
        {
            __MPC_EXIT_IF_ALLOC_FAILS(hr, obj, new Token());

            obj->m_type     = token;
            obj->m_strToken = szTmpBuf;
        }

        if(token == TOKEN_TEXT ||
           token == TOKEN_NOT   )
        {
            if(res == NULL)       // First token...
            {
                res = obj;
            }
            else if(objDangling)  // Last token of an operator...
            {
                if(objDangling->m_type == TOKEN_NOT) objDangling->m_left  = obj;
                else                                 objDangling->m_right = obj;
            }
            else                  // Implicit AND...
            {
                __MPC_EXIT_IF_ALLOC_FAILS(hr, objOp, new Token());

                objOp->m_type  = TOKEN_AND_IMPLICIT;
                objOp->m_left  = res;
                objOp->m_right = obj;

                res   = objOp;
                objOp = NULL;
            }

            objDangling = (obj->m_type == TOKEN_NOT) ? obj : NULL;
            obj         = NULL;
        }
        else
        {
            //
            // What's left are binary operators.
            //
            if(res == NULL || objDangling)
            {
                //
                // We need a left part...
                //
                __MPC_SET_ERROR_AND_EXIT(hr, E_INVALIDARG);
            }

            //
            // Rotate result.
            //
            obj->m_left = res;

            res         = obj;
            objDangling = obj;
            obj         = NULL;
        }
    }

    //
    // Let's make sure operators have the associated data.
    //
    if(objDangling)
    {
        __MPC_SET_ERROR_AND_EXIT(hr, E_INVALIDARG);
    }

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    if(obj  ) delete obj;
    if(objOp) delete objOp;

    __HCP_FUNC_EXIT(hr);
}

HRESULT Taxonomy::KeywordSearch::GenerateResults( /*[in]*/ Token*                     obj       ,
                                                  /*[in]*/ CPCHQueryResultCollection* pColl     ,
                                                  /*[in]*/ MPC::WStringUCSet&         setURLs   ,
                                                  /*[in]*/ Taxonomy::MatchSet*        psetNodes )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::GenerateResults" );

    HRESULT           hr;
    WeightedMatchIter it;
    bool              fFound;

    for(it=obj->m_results.begin(); it!=obj->m_results.end(); it++)
    {
        __MPC_EXIT_IF_METHOD_FAILS(hr, m_rsTopics->Seek_SingleTopic( it->first, &fFound ));
        if(fFound)
        {
            MPC::wstringUC strTopicURL = m_rsTopics->m_strURI;

            //
            // Skip topics whose URL has already been emitted.
            //
            if(setURLs.find( strTopicURL ) == setURLs.end())
            {
                CComPtr<CPCHQueryResult> item;
                CPCHQueryResult::Payload data;

                //
                // Not under a node? Skip it.
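                // (psetNodes, when supplied, holds the taxonomy nodes under the requested
                //  subsite; it is built in KeywordSearch::Execute.)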
                //
                if(psetNodes && psetNodes->find( m_rsTopics->m_ID_node ) == psetNodes->end()) continue;

                __MPC_EXIT_IF_METHOD_FAILS(hr, pColl->CreateItem( &item ));

                __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.ExpandURL( m_rsTopics->m_strURI ));

                data.m_bstrTitle       = m_rsTopics->m_strTitle      .c_str();
                data.m_bstrTopicURL    = m_rsTopics->m_strURI        .c_str();
                data.m_bstrDescription = m_rsTopics->m_strDescription.c_str();
                data.m_lType           = m_rsTopics->m_lType                 ;
                data.m_lPriority       = it->second;

                item->Initialize( data );

                setURLs.insert( strTopicURL );
            }
        }
    }

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    __HCP_FUNC_EXIT(hr);
}

////////////////////////////////////////////////////////////////////////////////

Taxonomy::KeywordSearch::KeywordSearch( /*[in]*/ Updater& updater ) : m_updater( updater )
{
                                          // Updater& m_updater;
                                          //
    m_setStopSignsAtEnd          = NULL;  // WordSet* m_setStopSignsAtEnd;
    m_setStopSignsWithoutContext = NULL;  // WordSet* m_setStopSignsWithoutContext;
    m_setStopWords               = NULL;  // WordSet* m_setStopWords;
    m_setOpNOT                   = NULL;  // WordSet* m_setOpNOT;
    m_setOpAND                   = NULL;  // WordSet* m_setOpAND;
    m_setOpOR                    = NULL;  // WordSet* m_setOpOR;
}

Taxonomy::KeywordSearch::~KeywordSearch()
{
}

HRESULT Taxonomy::KeywordSearch::Execute( /*[in]*/ LPCWSTR                    szQuery   ,
                                          /*[in]*/ LPCWSTR                    szSubsite ,
                                          /*[in]*/ CPCHQueryResultCollection* pColl     ,
                                          /*[in]*/ MPC::WStringList*          lst       )
{
    __HCP_FUNC_ENTRY( "Taxonomy::KeywordSearch::Execute" );

    HRESULT             hr;
    MPC::wstring        strCleanedQuery;
    MPC::WStringUCSet   setURLs;
    Taxonomy::MatchSet  setNodes;
    Taxonomy::MatchSet* psetNodes      = NULL;
    Token*              mainQuery      = NULL;
    Token*              stringifyQuery = NULL;
    LPWSTR              szInput        = NULL;
    LPWSTR              szOutput       = NULL;
    LPCWSTR             szToken;

    //
    // Initialize the database stuff.
    //
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetWordSet( UPDATER_SET_STOPSIGNS            , &m_setStopSignsWithoutContext ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetWordSet( UPDATER_SET_STOPSIGNS_ATENDOFWORD, &m_setStopSignsAtEnd          ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetWordSet( UPDATER_SET_STOPWORDS            , &m_setStopWords               ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetWordSet( UPDATER_SET_OPERATOR_NOT         , &m_setOpNOT                   ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetWordSet( UPDATER_SET_OPERATOR_AND         , &m_setOpAND                   ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetWordSet( UPDATER_SET_OPERATOR_OR          , &m_setOpOR                    ));

    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetTopics  ( &m_rsTopics   ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetKeywords( &m_rsKeywords ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.GetMatches ( &m_rsMatches  ));

    //
    // Parse the query.
    //
    __MPC_EXIT_IF_METHOD_FAILS(hr, PreprocessQuery( strCleanedQuery = szQuery                     ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, AllocateQuery  ( strCleanedQuery, szInput, szOutput            ));
    __MPC_EXIT_IF_METHOD_FAILS(hr, Parse          ( szToken = szInput, szOutput, false, mainQuery ));

    if(mainQuery)
    {
        MatchSet  setAllTheTopics;
        MatchIter it;
        bool      fFound;

        if(STRINGISPRESENT(szSubsite))
        {
            long ID_node;

            __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.LocateTaxonomyNode( ID_node, szSubsite, /*fLookForFather*/false ));

            __MPC_EXIT_IF_METHOD_FAILS(hr, m_updater.LocateSubNodes( ID_node, /*fRecurse*/true, /*fOnlyVisible*/false, setNodes ));

            setNodes.insert( ID_node ); // Add the node itself.

            psetNodes = &setNodes;
        }

        if(mainQuery->HasNOT())
        {
            //
            // Unfortunately, with the NOT operator we need to load all the topics...
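            // (NOT is evaluated as "every topic not matched by its operand", so the full
            //  universe of topic IDs has to be materialized first.)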
            //
            __MPC_EXIT_IF_METHOD_FAILS(hr, m_rsTopics->Move( 0, JET_MoveFirst, &fFound ));
            while(fFound)
            {
                setAllTheTopics.insert( m_rsTopics->m_ID_topic );

                __MPC_EXIT_IF_METHOD_FAILS(hr, m_rsTopics->Move( 0, JET_MoveNext, &fFound ));
            }
        }
        else if(mainQuery->HasExplicitOperators() == false && mainQuery->m_type != TOKEN_TEXT)
        {
            //
            // No explicit operators and more than one term: let's try to "stringify" the query,
            // i.e. run the terms joined by spaces as a single multi-word keyword first.
            //
            MPC::wstring strNewQuery;

            __MPC_EXIT_IF_METHOD_FAILS(hr, mainQuery->Stringify( strNewQuery ));

            __MPC_EXIT_IF_ALLOC_FAILS(hr, stringifyQuery, new Token());

            stringifyQuery->m_type     = TOKEN_TEXT;
            stringifyQuery->m_strToken = strNewQuery;

            __MPC_EXIT_IF_METHOD_FAILS(hr, stringifyQuery->Execute( setAllTheTopics, m_updater, m_rsKeywords, m_rsMatches ));

            if(lst) stringifyQuery->CollectKeywords( *lst );

            __MPC_EXIT_IF_METHOD_FAILS(hr, GenerateResults( stringifyQuery, pColl, setURLs, psetNodes ));
        }

        __MPC_EXIT_IF_METHOD_FAILS(hr, mainQuery->Execute( setAllTheTopics, m_updater, m_rsKeywords, m_rsMatches ));

        if(lst) mainQuery->CollectKeywords( *lst );

        __MPC_EXIT_IF_METHOD_FAILS(hr, GenerateResults( mainQuery, pColl, setURLs, psetNodes ));
    }

    hr = S_OK;

    __HCP_FUNC_CLEANUP;

    if(mainQuery     ) delete mainQuery;
    if(stringifyQuery) delete stringifyQuery;

    if(szInput ) delete [] szInput;
    if(szOutput) delete [] szOutput;

    __HCP_FUNC_EXIT(hr);
}
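
////////////////////////////////////////////////////////////////////////////////
//
// Usage sketch (illustrative only; the setup below is hypothetical -- in the
// Help Center code the Taxonomy::Updater and the result collection are created
// and owned by the surrounding query engine):
//
//   Taxonomy::Updater&                 updater = ...; // assumed to be already bound to a database
//   CComPtr<CPCHQueryResultCollection> pColl;         // assumed to be created by the caller
//
//   Taxonomy::KeywordSearch ks( updater );
//
//   // Topics matching "printer" but not "network" ('!' is the built-in NOT operator).
//   HRESULT hr = ks.Execute( L"printer !network", /*szSubsite*/NULL, pColl, /*lst*/NULL );
//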