/*++ Copyright (c) 1992 Microsoft Corporation Module Name: crash.cxx Abstract: Contains code concerned with recovery actions that are taken when a service crashes. This file contains the following functions: ScQueueRecoveryAction CCrashRecord::IncrementCount CRestartContext::Perform CRebootMessageContext::Perform CRebootContext::Perform CRunCommandContext::Perform Author: Anirudh Sahni (anirudhs) 02-Dec-1996 Environment: User Mode -Win32 Revision History: 22-Oct-1998 jschwart Convert SCM to use NT thread pool APIs 02-Dec-1996 AnirudhS Created --*/ // // INCLUDES // #include "precomp.hxx" #include // needed for other lm headers #include // NERR_Success #include // NetSessionEnum #include // NetMessageBufferSend #include // NetApiBufferFree #include // ACTION_TYPE_INVALID #include // CWorkItemContext #include "smartp.h" // CHeapPtr #include "scconfig.h" // ScReadFailureActions, etc. #include "depend.h" // ScStartServiceAndDependencies #include "account.h" // ScLogonService #include "scseclib.h" // ScCreateAndSetSD #include "start.h" // ScAllowInteractiveServices, ScInitStartupInfo #include "resource.h" // IDS_SC_ACTION_BASE // // Defines and Typedefs // #define FILETIMES_PER_SEC ((__int64) 10000000) // (1 second)/(100 ns) #define LENGTH(array) (sizeof(array)/sizeof((array)[0])) // // Globals // // // Local Function Prototypes // VOID ScLogRecoveryFailure( IN SC_ACTION_TYPE ActionType, IN LPCWSTR ServiceDisplayName, IN DWORD Error ); inline LPWSTR LocalDup( LPCWSTR String ) { LPWSTR Dup = (LPWSTR) LocalAlloc(0, WCSSIZE(String)); if (Dup != NULL) { wcscpy(Dup, String); } return Dup; } // // Callback context for restarting a service // class CRestartContext : public CWorkItemContext { DECLARE_CWorkItemContext public: CRestartContext( IN LPSERVICE_RECORD ServiceRecord ) : _ServiceRecord(ServiceRecord) { ServiceRecord->UseCount++; SC_LOG2(USECOUNT, "CRestartContext: %ws increment " "USECOUNT=%lu\n", ServiceRecord->ServiceName, ServiceRecord->UseCount); } ~CRestartContext() { CServiceRecordExclusiveLock RLock; ScDecrementUseCountAndDelete(_ServiceRecord); } private: LPSERVICE_RECORD _ServiceRecord; }; // // Callback context for broadcasting a reboot message // class CRebootMessageContext : public CWorkItemContext { DECLARE_CWorkItemContext public: CRebootMessageContext( IN LPWSTR RebootMessage, IN DWORD Delay, IN LPWSTR DisplayName ) : _RebootMessage(RebootMessage), _Delay(Delay), _DisplayName(LocalDup(DisplayName)) { } ~CRebootMessageContext() { LocalFree(_RebootMessage); } private: LPWSTR _RebootMessage; DWORD _Delay; LPWSTR _DisplayName; }; // // Callback context for a reboot // (The service name is used only for logging) // class CRebootContext : public CWorkItemContext { DECLARE_CWorkItemContext public: CRebootContext( IN DWORD ActionDelay, IN LPWSTR DisplayName ) : _Delay(ActionDelay), _DisplayName(DisplayName) { } ~CRebootContext() { LocalFree(_DisplayName); } private: DWORD _Delay; LPWSTR _DisplayName; }; // // Callback context for running a recovery command // class CRunCommandContext : public CWorkItemContext { DECLARE_CWorkItemContext public: CRunCommandContext( IN LPSERVICE_RECORD ServiceRecord, IN LPWSTR FailureCommand ) : _ServiceRecord(ServiceRecord), _FailureCommand(FailureCommand) { // // The service record is used to get the // account name to run the command in. // ServiceRecord->UseCount++; SC_LOG2(USECOUNT, "CRunCommandContext: %ws increment " "USECOUNT=%lu\n", ServiceRecord->ServiceName, ServiceRecord->UseCount); } ~CRunCommandContext() { LocalFree(_FailureCommand); CServiceRecordExclusiveLock RLock; ScDecrementUseCountAndDelete(_ServiceRecord); } private: LPSERVICE_RECORD _ServiceRecord; LPWSTR _FailureCommand; }; /****************************************************************************/ VOID ScQueueRecoveryAction( IN LPSERVICE_RECORD ServiceRecord ) /*++ Routine Description: Arguments: Return Value: none. --*/ { SC_ACTION_TYPE ActionType = SC_ACTION_NONE; DWORD ActionDelay = 0; DWORD FailNum = 1; NTSTATUS ntStatus; // // See if there is any recovery action configured for this service. // HKEY Key = NULL; { DWORD ResetPeriod = INFINITE; LPSERVICE_FAILURE_ACTIONS_WOW64 psfa = NULL; DWORD Error = ScOpenServiceConfigKey( ServiceRecord->ServiceName, KEY_READ, FALSE, // don't create if missing &Key ); if (Error == ERROR_SUCCESS) { Error = ScReadFailureActions(Key, &psfa); } if (Error != ERROR_SUCCESS) { SC_LOG(ERROR, "Couldn't read service's failure actions, %lu\n", Error); } else if (psfa != NULL && psfa->cActions > 0) { ResetPeriod = psfa->dwResetPeriod; } // // Allocate a crash record for the service. // Increment the service's crash count, subject to the reset period // we just read from the registry (INFINITE if we read none). // if (ServiceRecord->CrashRecord == NULL) { ServiceRecord->CrashRecord = new CCrashRecord; } if (ServiceRecord->CrashRecord == NULL) { SC_LOG0(ERROR, "Couldn't allocate service's crash record\n"); // // NOTE: We still continue, taking the failure count to be 1. // (The crash record is used only in the "else" clause.) // } else { FailNum = ServiceRecord->CrashRecord->IncrementCount(ResetPeriod); } // // Figure out which recovery action we're going to take. // if (psfa != NULL && psfa->cActions > 0) { SC_ACTION * lpsaActions = (SC_ACTION *) ((LPBYTE) psfa + psfa->dwsaActionsOffset); DWORD i = min(FailNum, psfa->cActions); ActionType = lpsaActions[i - 1].Type; ActionDelay = lpsaActions[i - 1].Delay; if (ACTION_TYPE_INVALID(ActionType)) { SC_LOG(ERROR, "Service has invalid action type %lu\n", ActionType); ActionType = SC_ACTION_NONE; } } LocalFree(psfa); } // // Log an event about this service failing, and about the proposed // recovery action. // if (ActionType != SC_ACTION_NONE) { WCHAR wszActionString[50]; if (!LoadString(GetModuleHandle(NULL), IDS_SC_ACTION_BASE + ActionType, wszActionString, LENGTH(wszActionString))) { SC_LOG(ERROR, "LoadString failed %lu\n", GetLastError()); wszActionString[0] = L'\0'; } SC_LOG2(ERROR, "The following recovery action will be taken in %d ms: %ws.\n", ActionDelay, wszActionString); ScLogEvent(NEVENT_SERVICE_CRASH, ServiceRecord->DisplayName, FailNum, ActionDelay, ActionType, wszActionString); } else { ScLogEvent(NEVENT_SERVICE_CRASH_NO_ACTION, ServiceRecord->DisplayName, FailNum); } // // Queue a work item that will actually carry out the action after the // delay has elapsed. // switch (ActionType) { case SC_ACTION_NONE: break; case SC_ACTION_RESTART: { CRestartContext * pCtx = new CRestartContext(ServiceRecord); if (pCtx == NULL) { SC_LOG0(ERROR, "Couldn't allocate restart context\n"); break; } ntStatus = pCtx->AddDelayedWorkItem(ActionDelay, WT_EXECUTEONLYONCE); if (!NT_SUCCESS(ntStatus)) { SC_LOG(ERROR, "Couldn't add restart work item 0x%x\n", ntStatus); delete pCtx; } break; } case SC_ACTION_REBOOT: { // // Get the reboot message for the service, if any // LPWSTR RebootMessage = NULL; ScReadRebootMessage(Key, &RebootMessage); if (RebootMessage != NULL) { // // Broadcast the message to all users. Do this in a separate // thread so that we can release our exclusive lock on the // service database quickly. // CRebootMessageContext * pCtx = new CRebootMessageContext( RebootMessage, ActionDelay, ServiceRecord->DisplayName ); if (pCtx == NULL) { SC_LOG0(ERROR, "Couldn't allocate restart context\n"); LocalFree(RebootMessage); break; } ntStatus = pCtx->AddWorkItem(WT_EXECUTEONLYONCE); if (!NT_SUCCESS(ntStatus)) { SC_LOG(ERROR, "Couldn't add restart work item 0x%x\n", ntStatus); delete pCtx; } } else { // // Queue a work item to perform the reboot after the delay has // elapsed. // (CODEWORK Share this code with CRebootMessageContext::Perform) // LPWSTR DisplayNameCopy = LocalDup(ServiceRecord->DisplayName); CRebootContext * pCtx = new CRebootContext( ActionDelay, DisplayNameCopy ); if (pCtx == NULL) { SC_LOG0(ERROR, "Couldn't allocate reboot context\n"); LocalFree(DisplayNameCopy); } else { ntStatus = pCtx->AddWorkItem(WT_EXECUTEONLYONCE); if (!NT_SUCCESS(ntStatus)) { SC_LOG(ERROR, "Couldn't add reboot work item 0x%x\n", ntStatus); delete pCtx; } } } } break; case SC_ACTION_RUN_COMMAND: { // // Get the failure command for the service, if any // CHeapPtr FailureCommand; ScReadFailureCommand(Key, &FailureCommand); if (FailureCommand == NULL) { SC_LOG0(ERROR, "Asked to run a failure command, but found " "none for this service\n"); ScLogRecoveryFailure( SC_ACTION_RUN_COMMAND, ServiceRecord->DisplayName, ERROR_NO_RECOVERY_PROGRAM ); break; } // // Replace %1% in the failure command with the failure count. // (FormatMessage is *useless* for this purpose because it AV's // if the failure command contains a %2, %3 etc.!) // UNICODE_STRING Formatted; { UNICODE_STRING Unformatted; RtlInitUnicodeString(&Unformatted, FailureCommand); Formatted.Length = 0; Formatted.MaximumLength = Unformatted.MaximumLength + 200; Formatted.Buffer = (LPWSTR) LocalAlloc(0, Formatted.MaximumLength); if (Formatted.Buffer == NULL) { SC_LOG(ERROR, "Couldn't allocate formatted string, %lu\n", GetLastError()); break; } WCHAR Environment[30]; wsprintf(Environment, L"1=%lu%c", FailNum, L'\0'); NTSTATUS ntstatus = RtlExpandEnvironmentStrings_U( Environment, &Unformatted, &Formatted, NULL); if (!NT_SUCCESS(ntstatus)) { SC_LOG(ERROR, "RtlExpandEnvironmentStrings_U failed %#lx\n", ntstatus); wcscpy(Formatted.Buffer, FailureCommand); } } CRunCommandContext * pCtx = new CRunCommandContext(ServiceRecord, Formatted.Buffer); if (pCtx == NULL) { SC_LOG0(ERROR, "Couldn't allocate RunCommand context\n"); LocalFree(Formatted.Buffer); break; } ntStatus = pCtx->AddDelayedWorkItem(ActionDelay, WT_EXECUTEONLYONCE); if (!NT_SUCCESS(ntStatus)) { SC_LOG(ERROR, "Couldn't add RunCommand work item 0x%x\n", ntStatus); delete pCtx; } } break; default: SC_ASSERT(0); } if (Key != NULL) { ScRegCloseKey(Key); } } DWORD CCrashRecord::IncrementCount( DWORD ResetSeconds ) /*++ Routine Description: Increments a service's crash count. Arguments: ResetSeconds - Length, in seconds, of a period of no crashes after which the crash count should be reset to zero. Return Value: The service's new crash count. --*/ { __int64 SecondLastCrashTime = _LastCrashTime; GetSystemTimeAsFileTime((FILETIME *) &_LastCrashTime); if (ResetSeconds == INFINITE || SecondLastCrashTime + ResetSeconds * FILETIMES_PER_SEC > _LastCrashTime) { _Count++; } else { SC_LOG(CONFIG_API, "More than %lu seconds have elapsed since last " "crash, resetting crash count.\n", ResetSeconds); _Count = 1; } SC_LOG(CONFIG_API, "Service's crash count is now %lu\n", _Count); return _Count; } VOID CRestartContext::Perform( IN BOOLEAN fWaitStatus ) /*++ Routine Description: --*/ { // // Make sure we were called because of a timeout // SC_ASSERT(fWaitStatus == TRUE); SC_LOG(CONFIG_API, "Restarting %ws service...\n", _ServiceRecord->ServiceName); RemoveDelayedWorkItem(); // // CODEWORK Allow arguments to the service. // DWORD status = ScStartServiceAndDependencies(_ServiceRecord, 0, NULL, FALSE); if (status == NO_ERROR) { status = _ServiceRecord->StartError; SC_LOG(CONFIG_API, "ScStartServiceAndDependencies succeeded, StartError = %lu\n", status); } else { SC_LOG(CONFIG_API, "ScStartServiceAndDependencies failed, %lu\n", status); // // Should we treat ERROR_SERVICE_ALREADY_RUNNING as a success? // No, because it could alert the administrator to a less-than- // optimal system configuration wherein something else is // restarting the service. // ScLogRecoveryFailure( SC_ACTION_RESTART, _ServiceRecord->DisplayName, status ); } delete this; } VOID CRebootMessageContext::Perform( IN BOOLEAN fWaitStatus ) /*++ Routine Description: --*/ { // // Broadcast the reboot message to all users // SESSION_INFO_0 * Buffer = NULL; DWORD EntriesRead = 0, TotalEntries = 0; NTSTATUS ntStatus; NET_API_STATUS Status = NetSessionEnum( NULL, // servername NULL, // UncClientName NULL, // username 0, // level (LPBYTE *) &Buffer, 0xFFFFFFFF, // prefmaxlen &EntriesRead, &TotalEntries, NULL // resume_handle ); if (EntriesRead > 0) { SC_ASSERT(EntriesRead == TotalEntries); SC_ASSERT(Status == NERR_Success); WCHAR ComputerName[MAX_COMPUTERNAME_LENGTH + 1]; DWORD nSize = LENGTH(ComputerName); if (!GetComputerName(ComputerName, &nSize)) { SC_LOG(ERROR, "GetComputerName failed! %lu\n", GetLastError()); } else { DWORD MsgLen = (DWORD) WCSSIZE(_RebootMessage); for (DWORD i = 0; i < EntriesRead; i++) { Status = NetMessageBufferSend( NULL, // servername Buffer[i].sesi0_cname, // msgname ComputerName, // fromname (LPBYTE) _RebootMessage,// buf MsgLen // buflen ); if (Status != NERR_Success) { SC_LOG2(ERROR, "NetMessageBufferSend to %ws failed %lu\n", Buffer[i].sesi0_cname, Status); } } } } else if (Status != NERR_Success) { SC_LOG(ERROR, "NetSessionEnum failed %lu\n", Status); } if (Buffer != NULL) { NetApiBufferFree(Buffer); } // // Queue a work item to perform the reboot after the delay has elapsed. // Note: We're counting the delay from the time that the broadcast finished. // CRebootContext * pCtx = new CRebootContext(_Delay, _DisplayName); if (pCtx == NULL) { SC_LOG0(ERROR, "Couldn't allocate reboot context\n"); } else { _DisplayName = NULL; // pCtx will free it ntStatus = pCtx->AddWorkItem(WT_EXECUTEONLYONCE); if (!NT_SUCCESS(ntStatus)) { SC_LOG(ERROR, "Couldn't add reboot work item 0x%x\n", ntStatus); delete pCtx; } } delete this; } VOID CRebootContext::Perform( IN BOOLEAN fWaitStatus ) /*++ Routine Description: --*/ { SC_LOG0(CONFIG_API, "Rebooting machine...\n"); // Write an event log entry? // // Enable our shutdown privilege. Since we are shutting down, don't // bother doing it for only the current thread and don't bother // disabling it afterwards. // BOOLEAN WasEnabled; NTSTATUS Status = RtlAdjustPrivilege( SE_SHUTDOWN_PRIVILEGE, TRUE, // enable FALSE, // this thread only? - No &WasEnabled); if (!NT_SUCCESS(Status)) { SC_LOG(ERROR, "RtlAdjustPrivilege failed! %#lx\n", Status); SC_ASSERT(0); } else { WCHAR wszShutdownText[128]; WCHAR wszPrintableText[128 + MAX_SERVICE_NAME_LENGTH]; if (LoadString(GetModuleHandle(NULL), IDS_SC_REBOOT_MESSAGE, wszShutdownText, LENGTH(wszShutdownText))) { wsprintf(wszPrintableText, wszShutdownText, _DisplayName); } else { // // If LoadString failed, it probably means the buffer // is too small to hold the localized string // SC_LOG(ERROR, "LoadString failed! %lu\n", GetLastError()); SC_ASSERT(FALSE); wszShutdownText[0] = L'\0'; } if (!InitiateSystemShutdownEx(NULL, // machine name wszPrintableText, // reboot message _Delay / 1000, // timeout in seconds TRUE, // force apps closed TRUE, // reboot SHTDN_REASON_MAJOR_SOFTWARE | SHTDN_REASON_MINOR_UNSTABLE)) { DWORD dwError = GetLastError(); // // If two services fail simultaneously and both are configured // to reboot the machine, InitiateSystemShutdown will fail all // calls past the first with ERROR_SHUTDOWN_IN_PROGRESS. We // don't want to log an event in this case. // if (dwError != ERROR_SHUTDOWN_IN_PROGRESS) { SC_LOG(ERROR, "InitiateSystemShutdown failed! %lu\n", dwError); ScLogRecoveryFailure( SC_ACTION_REBOOT, _DisplayName, dwError ); } } } delete this; } VOID CRunCommandContext::Perform( IN BOOLEAN fWaitStatus ) /*++ Routine Description: CODEWORK Share this code with ScLogonAndStartImage --*/ { // // Make sure we were called because of a timeout // SC_ASSERT(fWaitStatus == TRUE); DWORD status = NO_ERROR; HANDLE Token = NULL; PSID ServiceSid = NULL; // SID is returned only if not LocalSystem LPWSTR AccountName = NULL; SECURITY_ATTRIBUTES SaProcess; // Process security info (used only if not LocalSystem) STARTUPINFOW StartupInfo; PROCESS_INFORMATION ProcessInfo; RemoveDelayedWorkItem(); // // Get the Account Name for the service. A NULL Account Name means the // service is configured to run in the LocalSystem account. // status = ScLookupServiceAccount( _ServiceRecord->ServiceName, &AccountName ); // We only need to log on if it's not the LocalSystem account if (AccountName != NULL) { // // CODEWORK: Keep track of recovery EXEs spawned so we can // load/unload the user profile for the process. // status = ScLogonService( _ServiceRecord->ServiceName, AccountName, &Token, NULL, &ServiceSid ); if (status != NO_ERROR) { SC_LOG(ERROR, "CRunCommandContext: ScLogonService failed, %lu\n", status); goto Clean0; } SaProcess.nLength = sizeof(SECURITY_ATTRIBUTES); SaProcess.bInheritHandle = FALSE; SC_ACE_DATA AceData[] = { {ACCESS_ALLOWED_ACE_TYPE, 0, 0, PROCESS_ALL_ACCESS, &ServiceSid}, {ACCESS_ALLOWED_ACE_TYPE, 0, 0, PROCESS_SET_INFORMATION | PROCESS_TERMINATE | SYNCHRONIZE, &LocalSystemSid} }; NTSTATUS ntstatus = ScCreateAndSetSD( AceData, // AceData LENGTH(AceData), // AceCount NULL, // OwnerSid (optional) NULL, // GroupSid (optional) &SaProcess.lpSecurityDescriptor // pNewDescriptor ); LocalFree(ServiceSid); if (! NT_SUCCESS(ntstatus)) { SC_LOG(ERROR, "CRunCommandContext: ScCreateAndSetSD failed %#lx\n", ntstatus); status = RtlNtStatusToDosError(ntstatus); goto Clean1; } SC_LOG2(CONFIG_API,"CRunCommandContext: about to spawn recovery program in account %ws: %ws\n", AccountName, _FailureCommand); // // Impersonate the user so we don't give access to // EXEs that have been locked down for the account. // if (!ImpersonateLoggedOnUser(Token)) { status = GetLastError(); SC_LOG1(ERROR, "ScLogonAndStartImage: ImpersonateLoggedOnUser failed %d\n", status); goto Clean2; } // // Spawn the Image Process // ScInitStartupInfo(&StartupInfo, FALSE); if (!CreateProcessAsUserW( Token, // logon token NULL, // lpApplicationName _FailureCommand, // lpCommandLine &SaProcess, // process' security attributes NULL, // first thread's security attributes FALSE, // whether new process inherits handles CREATE_NEW_CONSOLE, // creation flags NULL, // environment block NULL, // current directory &StartupInfo, // startup info &ProcessInfo // process info )) { status = GetLastError(); SC_LOG(ERROR, "CRunCommandContext: CreateProcessAsUser failed %lu\n", status); RevertToSelf(); goto Clean2; } RevertToSelf(); } else { // // It's the LocalSystem account // // // If the process is to be interactive, set the appropriate flags. // BOOL bInteractive = FALSE; if (AccountName == NULL && _ServiceRecord->ServiceStatus.dwServiceType & SERVICE_INTERACTIVE_PROCESS) { bInteractive = ScAllowInteractiveServices(); if (!bInteractive) { // // Write an event to indicate that an interactive service // was started, but the system is configured to not allow // services to be interactive. // ScLogEvent(NEVENT_SERVICE_NOT_INTERACTIVE, _ServiceRecord->DisplayName); } } ScInitStartupInfo(&StartupInfo, bInteractive); SC_LOG1(CONFIG_API,"CRunCommandContext: about to spawn recovery program in " "the LocalSystem account: %ws\n", _FailureCommand); // // Spawn the Image Process // if (!CreateProcessW( NULL, // lpApplicationName _FailureCommand, // lpCommandLine NULL, // process' security attributes NULL, // first thread's security attributes FALSE, // whether new process inherits handles CREATE_NEW_CONSOLE, // creation flags NULL, // environment block NULL, // current directory &StartupInfo, // startup info &ProcessInfo // process info )) { status = GetLastError(); SC_LOG(ERROR, "CRunCommandContext: CreateProcess failed %lu\n", status); goto Clean2; } } SC_LOG0(CONFIG_API, "Recovery program spawned successfully.\n"); CloseHandle(ProcessInfo.hThread); CloseHandle(ProcessInfo.hProcess); Clean2: if (AccountName != NULL) { RtlDeleteSecurityObject(&SaProcess.lpSecurityDescriptor); } Clean1: if (AccountName != NULL) { CloseHandle(Token); } Clean0: if (status != NO_ERROR) { ScLogRecoveryFailure( SC_ACTION_RUN_COMMAND, _ServiceRecord->DisplayName, status ); } delete this; } VOID ScLogRecoveryFailure( IN SC_ACTION_TYPE ActionType, IN LPCWSTR ServiceDisplayName, IN DWORD Error ) /*++ Routine Description: --*/ { WCHAR wszActionString[50]; if (!LoadString(GetModuleHandle(NULL), IDS_SC_ACTION_BASE + ActionType, wszActionString, LENGTH(wszActionString))) { SC_LOG(ERROR, "LoadString failed %lu\n", GetLastError()); wszActionString[0] = L'\0'; } ScLogEvent( NEVENT_SERVICE_RECOVERY_FAILED, ActionType, wszActionString, (LPWSTR) ServiceDisplayName, Error ); }