|
|
/*---------------------------------------------------------------------------
File: MonitorRunning.cpp
Comments: This is the entry point for a thread which will periodically try to connect to the agents that the monitor thinks are running, to see if they are really still running.
This will keep the monitor from getting into a state where it thinks agents are still running, when they are not.
(c) Copyright 1999, Mission Critical Software, Inc., All Rights Reserved Proprietary and confidential to Mission Critical Software, Inc.
REVISION LOG ENTRY Revision By: Christy Boles
--------------------------------------------------------------------------- */ #include "stdafx.h"
#include "DetDlg.h"
#include "Common.hpp"
#include "AgRpcUtl.h"
#include "Monitor.h"
#include "ServList.hpp"
#include "ResStr.h"
//#include "..\AgtSvc\AgSvc.h"
#include "AgSvc.h"
/*#import "\bin\McsEADCTAgent.tlb" no_namespace , named_guids
//#import "\bin\McsVarSetMin.tlb" no_namespace */
//#import "Engine.tlb" no_namespace , named_guids //already #imported via DetDlg.h
#import "VarSet.tlb" no_namespace rename("property", "aproperty")
//----------------------------------------------------------------------------
// Function:   TryConnectAgent
//
// Synopsis:   Binds to the agent service on the node's server, obtains the
//             agent interface and queries the status of the node's job.
//             Depending on bSignalToShutdown, it then either tells the agent
//             it is OK to shut down, or records the outcome on the node
//             (message text, finished / query-failed state, time stamp) and
//             asks the list window to refresh the entry.
//
// Arguments:
//
// node               the server node whose agent is to be contacted
// bSignalToShutdown  TRUE: only signal the agent to shut down (best effort),
//                    the node status is not updated
//                    FALSE: update the node status from the query result
// dwMilliSeconds     auto shut down timeout handed to the agent; the caller
//                    is expected to query the agent again within this time
//
// Returns:    0 when bSignalToShutdown is TRUE; otherwise the result of the
//             bind / CoInitialize / RPC query steps (0 on success)
//
// Modifies:   node (status, message text, time stamp) when bSignalToShutdown
//             is FALSE
//----------------------------------------------------------------------------
DWORD TryConnectAgent(
    TServerNode * node,
    BOOL          bSignalToShutdown,   // indicates whether we want to signal the agent to shut down
    DWORD         dwMilliSeconds       // indicates the auto shut down timeout
                                       // we should query the agent again by this time
    )
{
    DWORD        rc;
    HRESULT      hr;
    HANDLE       hBinding = NULL;
    WCHAR      * sBinding = NULL;
    WCHAR        server[MAX_PATH];
    IUnknown   * pUnk = NULL;
    IVarSetPtr   pVarSet;
    IDCTAgentPtr pAgent;
    _bstr_t      jobID;
    BOOL         bQueryFailed = TRUE;
    BOOL         bFinished = FALSE;
    CString      status;
    BOOL         bCoInitialized = FALSE;

    // build the "\\server" form of the machine name for the binding
    // NOTE(review): the copy is unbounded; this assumes the server name fits
    // in MAX_PATH - 3 characters -- confirm against TServerNode
    server[0] = L'\\';
    server[1] = L'\\';
    UStrCpy(server+2,node->GetServer());

    rc = EaxBindCreate(server,&hBinding,&sBinding,TRUE);
    if ( ! rc )
    {
        hr = CoInitialize(NULL);
        if ( SUCCEEDED(hr) )
        {
            bCoInitialized = TRUE;
            rc = DoRpcQuery(hBinding,&pUnk);
        }
        else
        {
            rc = hr;
        }

        if ( ! rc && pUnk )
        {
            try
            {
                // we got an interface pointer to the agent: try to query it
                pAgent = pUnk;
                pUnk->Release();
                pUnk = NULL;
                jobID = node->GetJobID();

                hr = pAgent->raw_QueryJobStatus(jobID,&pUnk);
                if ( SUCCEEDED(hr) )
                {
                    // set the auto shut down for the agent so that in case we
                    // lose connection to it, it will shut down automatically
                    // usually, we should call this function again by that time
                    pAgent->raw_SetAutoShutDown(dwMilliSeconds);
                    bQueryFailed = FALSE;
                    pVarSet = pUnk;
                    // the smart pointer holds its own reference now; guard
                    // against an agent that reports success without returning
                    // an interface (pVarSet->get below then throws and the
                    // query is treated as failed)
                    if ( pUnk )
                    {
                        pUnk->Release();
                        pUnk = NULL;
                    }
                    _bstr_t text = pVarSet->get(GET_BSTR(DCTVS_JobStatus));

                    if ( !UStrICmp(text,GET_STRING(IDS_DCT_Status_Completed)))
                    {
                        bFinished = TRUE;
                    }
                    else if (!UStrICmp(text,GET_STRING(IDS_DCT_Status_Completed_With_Errors)))
                    {
                        node->SetSeverity(2);
                        bFinished = TRUE;
                    }
                }
                else
                {
                    // the out parameter is not meaningful when the call failed
                    pUnk = NULL;
                }
            }
            catch ( ... )
            {
                // the DCOM connection didn't work
                // This means we can't tell whether the agent is running or not
                bQueryFailed = TRUE;
            }

            // if an exception skipped one of the Release calls above, drop the
            // raw reference here so it is not leaked
            if ( pUnk )
            {
                pUnk->Release();
                pUnk = NULL;
            }
        }
        else
        {
            if ( rc == E_NOTIMPL )
            {
                status.LoadString(IDS_CantMonitorOnNt351);
            }
            else
            {
                status.LoadString(IDS_CannotConnectToAgent);
            }
            bQueryFailed = TRUE;
        }
        EaxBindDestroy(&hBinding,&sBinding);
    }

    // if trying to signal the agent to shut down, we will do our best
    if (bSignalToShutdown)
    {
        if (pAgent)
            pAgent->raw_SignalOKToShutDown();
        rc = 0;
    }
    else
    {
        node->SetMessageText(status.GetBuffer(0));
        if ( bFinished )
        {
            node->SetFinished();
        }
        else if ( bQueryFailed )
        {
            node->SetQueryFailed(TRUE);
        }

        // update the server entry in the list window
        HWND  listWnd;
        WCHAR sTime[32];
        gData.GetListWindow(&listWnd);
        node->SetTimeStamp(gTTime.FormatIsoLcl( gTTime.Now( NULL ), sTime ));
        SendMessage(listWnd,DCT_UPDATE_ENTRY,NULL,(LPARAM)node);
    }

    // the COM smart pointers must give up their interfaces BEFORE COM is
    // uninitialized; leaving it to their destructors would call Release()
    // after CoUninitialize()
    if (pVarSet)
        pVarSet.Release();
    if (pAgent)
        pAgent.Release();

    if (bCoInitialized)
        CoUninitialize();

    return rc;
}
// Convenience alias for a pointer to a TServerNode.
typedef TServerNode * PSERVERNODE;
//----------------------------------------------------------------------------
// Function:   IsFileReady
//
// Synopsis:   Tests whether an existing file can be opened for reading while
//             allowing other openers read access only (FILE_SHARE_READ).
//             The open fails if the file does not exist or cannot be opened
//             with that share mode, e.g. while it is still open for writing.
//
// Arguments:
//
// filename    the name of the file to be checked; NULL is treated as
//             "not ready"
//
// Returns:    TRUE if the file could be opened; otherwise FALSE
//
// Modifies:   nothing (the handle is closed again immediately)
//----------------------------------------------------------------------------
BOOL IsFileReady(WCHAR* filename)
{
    BOOL bReady = FALSE;

    if (filename != NULL)
    {
        // OPEN_EXISTING: never create the file, only probe for it
        HANDLE hFile = CreateFile(filename,
                                  GENERIC_READ,
                                  FILE_SHARE_READ,
                                  NULL,
                                  OPEN_EXISTING,
                                  FILE_ATTRIBUTE_NORMAL,
                                  NULL);
        if (hFile != INVALID_HANDLE_VALUE)
        {
            CloseHandle(hFile);
            bReady = TRUE;
        }
    }

    return bReady;
}
//----------------------------------------------------------------------------
// Function:   MonitorRunningAgent
//
// Synopsis:   This thread entry function is responsible for monitoring the agent represented
//             by arg (which is cast to a TServerNode pointer).
//             A brief outline of the monitoring logic:
//              a.  We set up a FindFirstChangeNotification (last write) to look for results
//                  on the remote machine
//              b.  Start the agent query interval at 1 minute.
//              c.  Use CreateFile to test whether results are present (using FILE_SHARE_READ to make
//                  sure the writing is done)
//                  This also makes sure we don't lose any last write before the notification is set up
//              d.  If result present, query the agent; if it is no longer running or the query
//                  failed, go to f, otherwise wait on notification for 1 minute (as we don't
//                  fully trust notification)
//                  If result not present and the query interval has been reached, query agent
//                  to see if it is finished
//                      if finished, go to f
//                      if not finished, wait on notification for 1 minute
//              e.  If timeout:
//                      if query interval has been reached, query agent (in case results cannot be written)
//                          if finished, go to f
//                          if alive, double query interval (maxes out at 20 min), go to c
//                          if the query failed, retry after 1 minute, up to a maximum number
//                          of retries, then go to f
//                  if notification, go to c.
//              f.  pull result
//
// Arguments:
//
// arg         this is the argument for thread entry point function; it is cast to
//             a TServerNode pointer
//
// Returns:    always returns 0 as the status will be reflected in pNode
//
// Modifies:   the state of the TServerNode passed in (result / query / finished flags,
//             time stamp); copies result files into the local result directory
//
//----------------------------------------------------------------------------
DWORD __stdcall MonitorRunningAgent(void * arg)
{
    DWORD rc = 0;
    BOOL bDone = FALSE;
    TServerNode* pNode = (TServerNode*) arg;

    const DWORD dwMaxTimeout = 1200000;          // 20 minutes
    const DWORD dwConversionFactor = 10000;      // 1 millisecond / 100 nanoseconds
    const DWORD dwNotificationTimeout = 60000;   // 1 minute
    const DWORD dwRetryTimeout = 60000;          // 1 minute
    DWORD dwAgentQueryTimeout = 60000;           // 1 minute

    // the elapsed-time checks below compare FILETIME values, which count
    // 100-nanosecond intervals, so every millisecond timeout has to be
    // converted before it is added to a time stamp
    ULARGE_INTEGER uliAgentQueryTimeout;
    uliAgentQueryTimeout.QuadPart = (ULONGLONG) dwAgentQueryTimeout * dwConversionFactor;
    ULARGE_INTEGER uliRetryTimeout;
    uliRetryTimeout.QuadPart = (ULONGLONG) dwRetryTimeout * dwConversionFactor;

    // sanity check, we should not pass in NULL in the first place
    _ASSERT(pNode != NULL);
    if (pNode == NULL)
        return 0;

    BOOL bAccntRefExpected = pNode->IsAccountReferenceResultExpected();
    BOOL bJoinDomainWithRename = pNode->IsJoinDomainWithRename();
    HANDLE hFindChange = INVALID_HANDLE_VALUE;
    ULARGE_INTEGER uliPreviousTime;
    ULARGE_INTEGER uliCurrentTime;
    _bstr_t remoteResultPath, jobFilename;
    _bstr_t remoteResultFilename, resultFilename;
    _bstr_t remoteSecrefsFilename, secrefsFilename;
    _bstr_t statusFilename;
    WCHAR resultPath[MAX_PATH];
    gData.GetResultDir(resultPath);

    // the following variables are for retry logic in case that agent query fails
    // for "Join Domain with Rename" case, we use 5 retries to make sure joining domain could
    // finish (usually, it takes under one minute but depending on the network condition and
    // CPU usage of computers involved, it could take longer than one minute).  Allowing five
    // retries should cover it pretty well
    // for other purpose, we use 2 retries.
    const DWORD dwMaxNumOfQueryRetries = (bJoinDomainWithRename) ? 5 : 2;  // maximum number of retries
    DWORD dwNumOfQueryRetries = 0;   // number of retries so far
    BOOL bResultReady = FALSE;       // indicates whether the file is ready on the remote machine

    try
    {
        // prepare the remote and local result file names (both .result and .secrefs files)
        remoteResultPath = pNode->GetRemoteResultPath();
        jobFilename = pNode->GetJobFile();
        remoteResultFilename = remoteResultPath + jobFilename + L".result";
        resultFilename = _bstr_t(resultPath) + jobFilename + L".result";
        if (bAccntRefExpected)
        {
            remoteSecrefsFilename = remoteResultPath + jobFilename + L".secrefs";
            secrefsFilename = _bstr_t(resultPath) + jobFilename + L".secrefs";
        }

        if (bJoinDomainWithRename)
            statusFilename = remoteResultPath + pNode->GetJobID();

        // start monitoring
        // the following are the ways to get out of the while loop
        //   a.  results have shown up in the remote directory and either
        //       the agent has finished or we cannot query it
        //   b.  results have not shown up and either we cannot query the agent
        //       after certain number of retries (dwMaxNumOfQueryRetries)
        //       or the agent has completed
        GetSystemTimeAsFileTime((FILETIME*)&uliPreviousTime);  // we need to get a starting time for the timeout
        do
        {
            // listen to the central control as well: if we're signaled to be done, let's do so
            gData.GetDone(&bDone);
            if (bDone)
                break;

            // if someone else (detail dialog) has detected the status of the agent, we don't need to keep monitoring
            if (!pNode->IsRunning())
            {
                // check whether we have results back
                if (IsFileReady(remoteResultFilename)
                    && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
                    bResultReady = TRUE;
                break;
            }

            // if the notification has not been set up, we should try to set up
            if (hFindChange == INVALID_HANDLE_VALUE)
            {
                hFindChange = FindFirstChangeNotification(remoteResultPath, FALSE, FILE_NOTIFY_CHANGE_LAST_WRITE);
            }

            //
            // let's check result files if we have not gotten results yet
            //
            if (bResultReady == FALSE)
            {
                // check whether the .result and .secrefs files are ready
                if (IsFileReady(remoteResultFilename)
                    && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
                    bResultReady = TRUE;
            }

            // now query the agent status
            if (bResultReady)
            {
                rc = TryConnectAgent(pNode, FALSE, dwAgentQueryTimeout + dwNotificationTimeout);
                if (!pNode->IsRunning() || pNode->QueryFailed())
                {
                    // if something is wrong or the agent is not running anymore
                    // let's get out of the loop
                    break;
                }
                dwNumOfQueryRetries = 0;  // reset the number of retries so far to zero
            }
            else if (bJoinDomainWithRename)
            {
                // if it is the "join domain with rename" case, we want to take a look
                // at status file as well
                if (IsFileReady(statusFilename))
                {
                    pNode->QueryStatusFromFile(statusFilename);
                    // just in case, we check result files again
                    if (IsFileReady(remoteResultFilename)
                        && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
                        bResultReady = TRUE;
                    break;
                }
            }

            // figure out the elapsed time to see whether you should query the agent
            GetSystemTimeAsFileTime((FILETIME*)&uliCurrentTime);

            // if somehow the time has been set back significantly or
            // the timeout period has elapsed
            // we should query the agent
            // note: in the retry case, we use the (converted) retry timeout instead of
            //       uliAgentQueryTimeout since we do not want to wait too long before a retry
            BOOL bNeedToQueryAgent =
                (uliCurrentTime.QuadPart <= uliPreviousTime.QuadPart
                 || (dwNumOfQueryRetries > 0
                     && uliPreviousTime.QuadPart + uliRetryTimeout.QuadPart <= uliCurrentTime.QuadPart)
                 || uliPreviousTime.QuadPart + uliAgentQueryTimeout.QuadPart <= uliCurrentTime.QuadPart);

            if (bNeedToQueryAgent)
            {
                // reset the timeout for querying agent
                // if not in the retry case, we double the timeout
                // otherwise, we use the same timeout value
                if (dwNumOfQueryRetries == 0)
                {
                    dwAgentQueryTimeout += dwAgentQueryTimeout;
                    // if it hits the maximum timeout, it is set to the maximum value
                    if (dwAgentQueryTimeout > dwMaxTimeout)
                        dwAgentQueryTimeout = dwMaxTimeout;
                    uliAgentQueryTimeout.QuadPart = (ULONGLONG) dwAgentQueryTimeout * dwConversionFactor;
                }
                uliPreviousTime = uliCurrentTime;
                rc = TryConnectAgent(pNode, FALSE, dwAgentQueryTimeout + dwNotificationTimeout);

                // if it is the "join domain with rename" case and we are getting ERROR_ACCESS_DENIED
                // or RPC_S_SERVER_UNAVAILABLE, we should check the status file
                if (bJoinDomainWithRename
                    && (rc == ERROR_ACCESS_DENIED || rc == RPC_S_SERVER_UNAVAILABLE))
                {
                    pNode->QueryStatusFromFile(statusFilename);
                }

                if (pNode->QueryFailed())
                {
                    if (dwNumOfQueryRetries < dwMaxNumOfQueryRetries)
                    {
                        // in retry mode, we need to use the original timeout value
                        dwNumOfQueryRetries++;
                        pNode->SetQueryFailed(FALSE);
                    }
                    else
                    {
                        // we have retried enough times, let's break out of the loop
                        break;
                    }
                }
                else if (!pNode->IsRunning())
                {
                    // if something is wrong or the agent is not running anymore
                    // let's get out of the loop
                    // but first check the result files again if they are not ready yet
                    if (!bResultReady
                        && IsFileReady(remoteResultFilename)
                        && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
                        bResultReady = TRUE;
                    break;
                }
                else
                {
                    // reset the number of query of retries to zero
                    dwNumOfQueryRetries = 0;
                }
            }

            // wait for the notification or sleep for one minute
            // this is to make agent monitoring thread as robust as possible
            if (hFindChange != INVALID_HANDLE_VALUE)
            {
                // if the notification is set up, let's wait on it
                WaitForSingleObject(hFindChange, dwNotificationTimeout);
            }
            else
            {
                // if the notification is not set up, let's sleep for one minute
                Sleep(dwNotificationTimeout);
            }

            // find the next notification
            if (hFindChange != INVALID_HANDLE_VALUE)
            {
                // this part is to make sure the code is robust
                if (!FindNextChangeNotification(hFindChange))
                {
                    FindCloseChangeNotification(hFindChange);
                    hFindChange = INVALID_HANDLE_VALUE;
                }
            }
        } while (!bDone);

        //
        // pull the result
        //
        pNode->SetHasResult(FALSE);

        if (bResultReady)
        {
            // make sure we copy all needed files over
            if (CopyFile(remoteResultFilename,resultFilename,FALSE)
                && (!pNode->IsAccountReferenceResultExpected()
                    || CopyFile(remoteSecrefsFilename,secrefsFilename,FALSE)))
            {
                // mark that we have the result
                pNode->SetHasResult(TRUE);
            }
        }

        // we should always mark that we have tried to pull the result
        // we do this after we tried to pull results so that the result monitoring thread
        // can handle it correctly
        pNode->SetResultPullingTried(TRUE);

        // finally, we signal the agent to shut down
        // however in the "join domain with rename" case, since we already lost contact
        // with the agent, we should not attempt to call TryConnectAgent
        if (!pNode->QueryFailed() && !bJoinDomainWithRename)
        {
            // tell the agent to shut down in 1 minute just in case
            // note: by using TRUE here, the status will not be updated
            TryConnectAgent(pNode, TRUE, 60000);
        }

        // if we cannot query the agent, we assume it has finished
        if (pNode->QueryFailed())
        {
            if (bResultReady)
            {
                // if bResultReady is TRUE, we will clean the Agent_Status_QueryFailed bit
                pNode->SetQueryFailed(FALSE);
            }
            pNode->SetFinished();
        }

        // one more update
        HWND listWnd;
        WCHAR sTime[32];
        gData.GetListWindow(&listWnd);
        pNode->SetTimeStamp(gTTime.FormatIsoLcl( gTTime.Now( NULL ), sTime ));
        SendMessage(listWnd,DCT_UPDATE_ENTRY,NULL,(LPARAM)pNode);
    }
    catch (_com_error&)
    {
        // a _com_error raised by the _bstr_t operations or COM wrappers above:
        // mark the node as failed and flag that we ran out of resources to monitor it
        pNode->SetFailed();
        pNode->SetOutOfResourceToMonitor(TRUE);
    }

    // clean up
    if (hFindChange != INVALID_HANDLE_VALUE)
        FindCloseChangeNotification(hFindChange);

    pNode->SetDoneMonitoring(TRUE);
    return 0;
}
|