Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

535 lines
20 KiB

  1. /*---------------------------------------------------------------------------
  2. File: MonitorRunning.cpp
  3. Comments: This is the entry point for a thread which will periodically try to connect
  4. to the agents that the monitor thinks are running, to see if they are really still running.
  5. This will keep the monitor from getting into a state where it thinks agents
  6. are still running, when they are not.
  7. (c) Copyright 1999, Mission Critical Software, Inc., All Rights Reserved
  8. Proprietary and confidential to Mission Critical Software, Inc.
  9. REVISION LOG ENTRY
  10. Revision By: Christy Boles
  11. ---------------------------------------------------------------------------
  12. */
  13. #include "stdafx.h"
  14. #include "DetDlg.h"
  15. #include "Common.hpp"
  16. #include "AgRpcUtl.h"
  17. #include "Monitor.h"
  18. #include "ServList.hpp"
  19. #include "ResStr.h"
  20. //#include "..\AgtSvc\AgSvc.h"
  21. #include "AgSvc.h"
  22. /*#import "\bin\McsEADCTAgent.tlb" no_namespace , named_guids
  23. //#import "\bin\McsVarSetMin.tlb" no_namespace */
  24. //#import "Engine.tlb" no_namespace , named_guids //already #imported via DetDlg.h
  25. #import "VarSet.tlb" no_namespace rename("property", "aproperty")
  26. DWORD
  27. TryConnectAgent(
  28. TServerNode * node,
  29. BOOL bSignalToShutdown, // indicates whether we want to signal the agent to shut down
  30. DWORD dwMilliSeconds // indicates the auto shut down timeout
  31. // we should query the agent again by this time
  32. )
  33. {
  34. DWORD rc;
  35. HRESULT hr;
  36. HANDLE hBinding = NULL;
  37. WCHAR * sBinding = NULL;
  38. WCHAR server[MAX_PATH];
  39. IUnknown * pUnk = NULL;
  40. IVarSetPtr pVarSet;
  41. IDCTAgentPtr pAgent;
  42. _bstr_t jobID;
  43. BOOL bSuccess = FALSE;
  44. BOOL bQueryFailed = TRUE;
  45. BOOL bFinished = FALSE;
  46. CString status;
  47. BOOL bCoInitialized = FALSE;
  48. server[0] = L'\\';
  49. server[1] = L'\\';
  50. UStrCpy(server+2,node->GetServer());
  51. rc = EaxBindCreate(server,&hBinding,&sBinding,TRUE);
  52. if ( ! rc )
  53. {
  54. hr = CoInitialize(NULL);
  55. if ( SUCCEEDED(hr) )
  56. {
  57. bCoInitialized = TRUE;
  58. rc = DoRpcQuery(hBinding,&pUnk);
  59. }
  60. else
  61. {
  62. rc = hr;
  63. }
  64. if ( ! rc && pUnk )
  65. {
  66. try {
  67. // we got an interface pointer to the agent: try to query it
  68. pAgent = pUnk;
  69. pUnk->Release();
  70. pUnk = NULL;
  71. jobID = node->GetJobID();
  72. hr = pAgent->raw_QueryJobStatus(jobID,&pUnk);
  73. if ( SUCCEEDED(hr) )
  74. {
  75. // set the auto shut down for the agent so in case we don't
  76. // lose connection to it it will shut down automatically
  77. // usually, we should call this function again by that time
  78. pAgent->raw_SetAutoShutDown(dwMilliSeconds);
  79. bQueryFailed = FALSE;
  80. pVarSet = pUnk;
  81. pUnk->Release();
  82. _bstr_t text = pVarSet->get(GET_BSTR(DCTVS_JobStatus));
  83. if ( !UStrICmp(text,GET_STRING(IDS_DCT_Status_Completed)))
  84. {
  85. bFinished = TRUE;
  86. }
  87. else if (!UStrICmp(text,GET_STRING(IDS_DCT_Status_Completed_With_Errors)))
  88. {
  89. node->SetSeverity(2);
  90. bFinished = TRUE;
  91. }
  92. }
  93. }
  94. catch ( ... )
  95. {
  96. // the DCOM connection didn't work
  97. // This means we can't tell whether the agent is running or not
  98. bQueryFailed = TRUE;
  99. }
  100. }
  101. else
  102. {
  103. if ( rc == E_NOTIMPL )
  104. {
  105. status.LoadString(IDS_CantMonitorOnNt351);
  106. }
  107. else
  108. {
  109. status.LoadString(IDS_CannotConnectToAgent);
  110. }
  111. bQueryFailed = TRUE;
  112. }
  113. EaxBindDestroy(&hBinding,&sBinding);
  114. }
  115. // if trying to signal the agent to shut down, we will do our best
  116. if (bSignalToShutdown)
  117. {
  118. if (pAgent)
  119. pAgent->raw_SignalOKToShutDown();
  120. rc = 0;
  121. }
  122. else
  123. {
  124. node->SetMessageText(status.GetBuffer(0));
  125. if ( bFinished )
  126. {
  127. node->SetFinished();
  128. }
  129. else if ( bQueryFailed )
  130. {
  131. node->SetQueryFailed(TRUE);
  132. }
  133. // update the server entry in the list window
  134. HWND listWnd;
  135. WCHAR sTime[32];
  136. gData.GetListWindow(&listWnd);
  137. node->SetTimeStamp(gTTime.FormatIsoLcl( gTTime.Now( NULL ), sTime ));
  138. SendMessage(listWnd,DCT_UPDATE_ENTRY,NULL,(LPARAM)node);
  139. }
  140. if (bCoInitialized)
  141. CoUninitialize();
  142. return rc;
  143. }
  144. typedef TServerNode * PSERVERNODE;
  145. //----------------------------------------------------------------------------
  146. // Function: IsFileReady
  147. //
  148. // Synopsis: This function checks if a file exists and no other
  149. // process is trying to write to it
  150. //
  151. // Arguments:
  152. //
  153. // filename the name of file to be checked
  154. //
  155. // Returns: returns TRUE if the file is ready; otherwise, returns FALSE
  156. //
  157. // Modifies:
  158. //----------------------------------------------------------------------------
  159. BOOL IsFileReady(WCHAR* filename)
  160. {
  161. if (filename == NULL)
  162. return FALSE;
  163. HANDLE hResult = CreateFile((WCHAR*)filename,
  164. GENERIC_READ,
  165. FILE_SHARE_READ,
  166. NULL,
  167. OPEN_EXISTING,
  168. FILE_ATTRIBUTE_NORMAL,
  169. NULL);
  170. if (hResult != INVALID_HANDLE_VALUE)
  171. {
  172. CloseHandle(hResult);
  173. return TRUE;
  174. }
  175. else
  176. return FALSE;
  177. }
  178. //----------------------------------------------------------------------------
  179. // Function: MonitorRunningAgent
  180. //
  181. // Synopsis: This thread entry function is responsible for monitoring the agent represented
  182. // by arg (will be casted into a TServerNode pointer).
  183. // A brief monitoring logic is as follows:
  184. // a. We set up a FindFirstChangeNotification (last write) to look for results
  185. // on the remote machine
  186. // b. Start the agent query interval to 1 minute.
  187. // c. Use CreateFile to test whether results are present (using FILE_SHARE_READ to make
  188. // sure the writing is done)
  189. // This also makes sure we don't lose any last write before the notification is set up
  190. // d. If result present, wait on notification for 1 minute (as we don't fully trust notification)
  191. // If result not present, query agent to see if it is finished
  192. // if finised, go to g
  193. // if not finished, wait on notification for 1 minute
  194. // e. If timeout:
  195. // if query interval has been reached, query agent (in case results cannot be written)
  196. // if finished, go to g
  197. // if alive, double query interval (maxes out at 20 min), go to c
  198. // if notification, go to c.
  199. // g. pull result
  200. //
  201. // Arguments:
  202. //
  203. // arg this is the argument for thread entry point function; will be casted into
  204. // a TServerNode pointer
  205. //
  206. // Returns: always return 0 as the status will be reflected in pNode
  207. //
  208. // Modifies:
  209. //
  210. //----------------------------------------------------------------------------
  211. DWORD __stdcall
  212. MonitorRunningAgent(void * arg)
  213. {
  214. DWORD rc = 0;
  215. BOOL bDone = FALSE;
  216. TServerNode* pNode = (TServerNode*) arg;
  217. const DWORD dwMaxTimeout = 1200000; // 20 minutes
  218. const DWORD dwConversionFactor = 10000; // 1 millisecond / 100 nanoseconds
  219. const DWORD dwNotificationTimeout = 60000; // 1 minute
  220. const DWORD dwRetryTimeout = 60000; // 1 minute
  221. DWORD dwAgentQueryTimeout = 60000; // 1 minute
  222. ULARGE_INTEGER uliAgentQueryTimeout;
  223. uliAgentQueryTimeout.QuadPart = (ULONGLONG) dwAgentQueryTimeout * dwConversionFactor;
  224. // sanity check, we should not pass in NULL in the first place
  225. _ASSERT(pNode != NULL);
  226. if (pNode == NULL)
  227. return 0;
  228. BOOL bAccntRefExpected = pNode->IsAccountReferenceResultExpected();
  229. BOOL bJoinDomainWithRename = pNode->IsJoinDomainWithRename();
  230. HANDLE hFindChange = INVALID_HANDLE_VALUE;
  231. ULARGE_INTEGER uliPreviousTime;
  232. ULARGE_INTEGER uliCurrentTime;
  233. _bstr_t remoteResultPath, jobFilename;
  234. _bstr_t remoteResultFilename, resultFilename;
  235. _bstr_t remoteSecrefsFilename, secrefsFilename;
  236. _bstr_t statusFilename;
  237. WCHAR resultPath[MAX_PATH];
  238. gData.GetResultDir(resultPath);
  239. // the following variables are for retry logic in case that agent query fails
  240. // for "Join Domain with Rename" case, we use 5 retries to make sure joining domain could
  241. // finish (usually, it takes under one minute but depending on the network condition and
  242. // CPU usage of computers involved, it could take longer than one minute). Allowing five
  243. // retries should cover it pretty well
  244. // for other purpose, we use 2 retries.
  245. const DWORD dwMaxNumOfQueryRetries = (bJoinDomainWithRename) ? 5 : 2; // maximum number of retries
  246. DWORD dwNumOfQueryRetries = 0; // number of retries so far
  247. BOOL bResultReady = FALSE; // indicates whether the file is ready on the remote machine
  248. try
  249. {
  250. // prepare the remote and local result file names (both .result and .secrefs files)
  251. remoteResultPath = pNode->GetRemoteResultPath();
  252. jobFilename = pNode->GetJobFile();
  253. remoteResultFilename = remoteResultPath + jobFilename + L".result";
  254. resultFilename = _bstr_t(resultPath) + jobFilename + L".result";
  255. if (bAccntRefExpected)
  256. {
  257. remoteSecrefsFilename = remoteResultPath + jobFilename + L".secrefs";
  258. secrefsFilename = _bstr_t(resultPath) + jobFilename + L".secrefs";
  259. }
  260. if (bJoinDomainWithRename)
  261. statusFilename = remoteResultPath + pNode->GetJobID();
  262. HANDLE hResult; // file handle to result file
  263. // start monitoring
  264. // the following are the ways to get out of the while loop
  265. // a. results have shown up in the remote directory and either
  266. // the agent has finished or we cannot query it
  267. // b. results have not shown up and either we cannot query the agent
  268. // after certain number of retries (dwMaxNumOfQueryRetries)
  269. // or the agent has completed
  270. GetSystemTimeAsFileTime((FILETIME*)&uliPreviousTime); // we need to get a starting time for the timeout
  271. do
  272. {
  273. // listen to the central control as well: if we're signaled to be done, let's do so
  274. gData.GetDone(&bDone);
  275. if (bDone)
  276. break;
  277. // if someone else (detail dialog) has detected the status of the agent, we don't need to keep monitoring
  278. if (!pNode->IsRunning())
  279. {
  280. // check whether we have results back
  281. if (IsFileReady(remoteResultFilename)
  282. && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
  283. bResultReady = TRUE;
  284. break;
  285. }
  286. // if the notification has not been set up, we should try to set up
  287. if (hFindChange == INVALID_HANDLE_VALUE)
  288. {
  289. hFindChange = FindFirstChangeNotification(remoteResultPath, FALSE, FILE_NOTIFY_CHANGE_LAST_WRITE);
  290. }
  291. //
  292. // let's check result files if we have not gotten results yet
  293. //
  294. if (bResultReady == FALSE)
  295. {
  296. // check whether the .result and .secrefs files are ready
  297. if (IsFileReady(remoteResultFilename)
  298. && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
  299. bResultReady = TRUE;
  300. }
  301. // now query the agent status
  302. if (bResultReady)
  303. {
  304. rc = TryConnectAgent(pNode, FALSE, dwAgentQueryTimeout + dwNotificationTimeout);
  305. if (!pNode->IsRunning() || pNode->QueryFailed())
  306. {
  307. // if something is wrong or the agent is not running anymore
  308. // let's get out of the loop
  309. break;
  310. }
  311. dwNumOfQueryRetries = 0; // reset the number of retries so far to zero
  312. }
  313. else if (bJoinDomainWithRename)
  314. {
  315. // if it is the "join domain with rename" case, we want to take a look
  316. // at status file as well
  317. if (IsFileReady(statusFilename))
  318. {
  319. pNode->QueryStatusFromFile(statusFilename);
  320. // just in case, we check result files again
  321. if (IsFileReady(remoteResultFilename)
  322. && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
  323. bResultReady = TRUE;
  324. break;
  325. }
  326. }
  327. // figure out the elapsed time to see whether you should query the agent
  328. GetSystemTimeAsFileTime((FILETIME*)&uliCurrentTime);
  329. BOOL bNeedToQueryAgent = FALSE;
  330. // if somehow the time has been set back significantly or
  331. // the timeout period has elapsed
  332. // we should query the agent
  333. // note: in the retry case, we use dwRetryTimeout instead of uliAgentQueryTimeout
  334. // since if we do not want to wait too long before a retry
  335. if (uliCurrentTime.QuadPart <= uliPreviousTime.QuadPart
  336. || (dwNumOfQueryRetries > 0
  337. && uliPreviousTime.QuadPart + dwRetryTimeout <= uliCurrentTime.QuadPart)
  338. || uliPreviousTime.QuadPart + uliAgentQueryTimeout.QuadPart <= uliCurrentTime.QuadPart)
  339. {
  340. bNeedToQueryAgent = TRUE;
  341. }
  342. if (bNeedToQueryAgent)
  343. {
  344. // reset the timeout for querying agent
  345. // if not in the retry case, we double the timeout
  346. // otherwise, we use the same timeout value
  347. if (dwNumOfQueryRetries == 0)
  348. {
  349. dwAgentQueryTimeout += dwAgentQueryTimeout;
  350. // if it hits the maximum timeout, it is set to the maximum value
  351. if (dwAgentQueryTimeout > dwMaxTimeout)
  352. dwAgentQueryTimeout = dwMaxTimeout;
  353. uliAgentQueryTimeout.QuadPart = (ULONGLONG) dwAgentQueryTimeout * dwConversionFactor;
  354. }
  355. uliPreviousTime = uliCurrentTime;
  356. rc = TryConnectAgent(pNode, FALSE, dwAgentQueryTimeout + dwNotificationTimeout);
  357. // if it is the "join domain with rename" case and we are getting ERROR_ACCESS_DENIED
  358. // or RPC_S_SERVER_UNAVAILABLE, we should check the status file
  359. if (bJoinDomainWithRename
  360. && (rc == ERROR_ACCESS_DENIED || rc == RPC_S_SERVER_UNAVAILABLE))
  361. {
  362. pNode->QueryStatusFromFile(statusFilename);
  363. }
  364. if (pNode->QueryFailed())
  365. {
  366. if (dwNumOfQueryRetries < dwMaxNumOfQueryRetries)
  367. {
  368. // in retry mode, we need to use the original timeout value
  369. dwNumOfQueryRetries++;
  370. pNode->SetQueryFailed(FALSE);
  371. }
  372. else
  373. {
  374. // we have retried enough times, let's break out of the loop
  375. break;
  376. }
  377. }
  378. else if (!pNode->IsRunning())
  379. {
  380. // if something is wrong or the agent is not running anymore
  381. // let's get out of the loop
  382. // but first check the result files again if they are not ready yet
  383. if (!bResultReady && IsFileReady(remoteResultFilename)
  384. && (!bAccntRefExpected || IsFileReady(remoteSecrefsFilename)))
  385. bResultReady = TRUE;
  386. break;
  387. }
  388. else
  389. {
  390. // reset the number of query of retries to zero
  391. dwNumOfQueryRetries = 0;
  392. }
  393. }
  394. // wait for the notification or sleep for one minute
  395. // this is to make agent monitoring thread as robust as possible
  396. if (hFindChange != INVALID_HANDLE_VALUE)
  397. {
  398. // if the notification is set up, let's wait on it
  399. WaitForSingleObject(hFindChange, dwNotificationTimeout);
  400. }
  401. else
  402. {
  403. // if the notification is not set up, let's sleep for one minute
  404. Sleep(dwNotificationTimeout);
  405. }
  406. // find the next notification
  407. if (hFindChange != INVALID_HANDLE_VALUE)
  408. {
  409. // this part is to make sure the code is robust
  410. if (!FindNextChangeNotification(hFindChange))
  411. {
  412. FindCloseChangeNotification(hFindChange);
  413. hFindChange = INVALID_HANDLE_VALUE;
  414. }
  415. }
  416. } while (!bDone);
  417. //
  418. // pull the result
  419. //
  420. pNode->SetHasResult(FALSE);
  421. if (bResultReady)
  422. {
  423. // make sure we copy all needed files over
  424. if (CopyFile(remoteResultFilename,resultFilename,FALSE)
  425. && (!pNode->IsAccountReferenceResultExpected()
  426. || (pNode->IsAccountReferenceResultExpected()
  427. && CopyFile(remoteSecrefsFilename,secrefsFilename,FALSE))))
  428. {
  429. // mark that we have the result
  430. pNode->SetHasResult(TRUE);
  431. }
  432. }
  433. // we should always mark that we have tried to pull the result
  434. // we do this after we tried to pull results so that the result monitoring thread
  435. // can handle it correctly
  436. pNode->SetResultPullingTried(TRUE);
  437. // finally, we signal the agent to shut down
  438. // however in the "join domain with rename" case, since we already lost contact
  439. // with the agent, we should not attempt to call TryConnectAgent
  440. if (!pNode->QueryFailed() && !bJoinDomainWithRename)
  441. {
  442. // tell the agent to shut down in 1 minute just in case
  443. // note: by using TRUE here, the status will not be updated
  444. TryConnectAgent(pNode, TRUE, 60000);
  445. }
  446. // if we cannot query the agent, we assume it has finished
  447. if (pNode->QueryFailed())
  448. {
  449. if (bResultReady)
  450. {
  451. // if bResultReady is TRUE, we will clean the Agent_Status_QueryFailed bit
  452. pNode->SetQueryFailed(FALSE);
  453. }
  454. pNode->SetFinished();
  455. }
  456. // one more update
  457. HWND listWnd;
  458. WCHAR sTime[32];
  459. gData.GetListWindow(&listWnd);
  460. pNode->SetTimeStamp(gTTime.FormatIsoLcl( gTTime.Now( NULL ), sTime ));
  461. SendMessage(listWnd,DCT_UPDATE_ENTRY,NULL,(LPARAM)pNode);
  462. }
  463. catch (_com_error& e)
  464. {
  465. pNode->SetFailed();
  466. pNode->SetOutOfResourceToMonitor(TRUE);
  467. }
  468. // clean up
  469. if (hFindChange != INVALID_HANDLE_VALUE)
  470. FindCloseChangeNotification(hFindChange);
  471. pNode->SetDoneMonitoring(TRUE);
  472. return 0;
  473. }