Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2007 lines
67 KiB

/*++
Copyright (c) 2001 Microsoft Corporation
Module Name:
Monitor.c
Abstract:
Routines for interfacing with the Resource Monitor process
Author:
John Vert (jvert) 3-Jan-1996
Revision History:
--*/
#include "fmp.h"
#define LOG_MODULE MONITOR
//
// Global data
//
// Critical section for the monitor code.
// NOTE(review): presumably the lock taken by FmpAcquireMonitorLock /
// FmpReleaseMonitorLock, which are defined elsewhere - confirm.
CRITICAL_SECTION FmpMonitorLock;
// Head of the list of monitors. FmpCreateMonitor links each new RESMON onto
// it through its leMonitor field and FmpShutdownMonitor unlinks it again.
LIST_ENTRY g_leFmpMonitorListHead;
// Resource DLL deadlock detection state. These are not referenced in the
// portion of the file reviewed here.
// NOTE(review): the meanings below are inferred from the names only - confirm
// against the deadlock detection routines before relying on them.
BOOL g_fFmEnableResourceDllDeadlockDetection = FALSE;   // presumably: detection enabled
DWORD g_dwFmResourceDllDeadlockTimeout = 0;             // presumably: per-call timeout
DWORD g_cResourceDllDeadlocks = 0;                      // presumably: deadlocks seen so far
DWORD g_dwLastResourceDllDeadlockTick = 0;              // presumably: tick count of last deadlock
DWORD g_dwFmResourceDllDeadlockPeriod = 0;              // presumably: counting window
DWORD g_dwFmResourceDllDeadlockThreshold = 0;           // presumably: max deadlocks per window
//
// Local function prototypes
//
// Thread routine started by FmpCreateMonitor; receives the RESMON pointer
// and loops on RmNotifyChanges.
DWORD
FmpRmNotifyThread(
IN LPVOID lpThreadParameter
);
// Reads the resource monitor's dynamic RPC endpoint from the registry into a
// LocalAlloc'ed string returned through ppResmonDynamicEndpoint.
DWORD
FmpGetResmonDynamicEndpoint(
OUT LPWSTR *ppResmonDynamicEndpoint
);
PRESMON
FmpCreateMonitor(
    LPWSTR DebugPrefix,
    BOOL SeparateMonitor
    )
/*++

Routine Description:

    Creates a new resource monitor process and initiates the RPC
    communication with it.

Arguments:

    DebugPrefix - Optional debugger command. When it is non-empty and
        CsDebugResmon is not set, the command is launched with
        "-p <resmon PID>" after the resource monitor has signalled that
        its initialization is complete.

    SeparateMonitor - Not referenced by this routine.

Return Value:

    Pointer to the resource monitor structure if successful.

    NULL otherwise. The failure status is handed to SetLastError.

--*/
{
#define FM_INITIAL_RESMON_COMMAND_LINE_SIZE 256
#define DOUBLE_QUOTE TEXT( "\"" )
#define DEBUGGER_OPTION TEXT( " -d" )
#define SPACE TEXT ( " " )

    SECURITY_ATTRIBUTES Security;
    HANDLE WaitArray[2];
    HANDLE Event = NULL;
    HANDLE FileMapping = NULL;
    STARTUPINFO StartupInfo;
    PROCESS_INFORMATION ProcessInfo;
    PROCESS_INFORMATION DebugInfo;
    BOOL Success;
    TCHAR *Binding;
    DWORD Status;
    PRESMON Monitor;
    DWORD ThreadId;
    DWORD Retry = 1;
    DWORD creationFlags;
    LPWSTR lpszResmonAppName = NULL;
    LPWSTR lpszResmonCmdLine = NULL;
    DWORD cchCmdLineBufSize = FM_INITIAL_RESMON_COMMAND_LINE_SIZE;
    LPWSTR pResmonDynamicEndpoint = NULL;

    //
    // Recover any DLL files left partially upgraded.
    //
    FmpRecoverResourceDLLFiles ();

    Monitor = LocalAlloc(LMEM_ZEROINIT, sizeof(RESMON));
    if (Monitor == NULL) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] Failed to allocate a Monitor structure.\n");
        return(NULL);
    }

    Monitor->Shutdown = FALSE;
    Monitor->Signature = FMP_RESMON_SIGNATURE;

    //
    // Create an event and a file mapping object to be passed to
    // the Resource Monitor process. The event is for the Resource
    // Monitor to signal its initialization is complete. The file
    // mapping is for creating the shared memory region between
    // the Resource Monitor and the cluster manager. Both objects are
    // created with the same inheritable security attributes.
    //
    Security.nLength = sizeof(Security);
    Security.lpSecurityDescriptor = NULL;
    Security.bInheritHandle = TRUE;

    Event = CreateEvent(&Security,
                        TRUE,
                        FALSE,
                        NULL);
    if (Event == NULL) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] Failed to create a ResMon event, error %1!u!.\n",
                      Status);
        goto create_failed;
    }

    FileMapping = CreateFileMapping(INVALID_HANDLE_VALUE,
                                    &Security,
                                    PAGE_READWRITE,
                                    0,
                                    sizeof(MONITOR_STATE),
                                    NULL);
    if (FileMapping == NULL) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] File Mapping for ResMon failed, error = %1!u!.\n",
                      Status);
        goto create_failed;
    }

    //
    // Create our own read/write view of the shared memory section.
    //
    Monitor->SharedState = MapViewOfFile(FileMapping,
                                         FILE_MAP_READ | FILE_MAP_WRITE,
                                         0,
                                         0,
                                         0);
    if (Monitor->SharedState == NULL) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] Mapping shared state for ResMon failed, error %1!u!.\n",
                      Status);
        goto create_failed;
    }

    ZeroMemory( Monitor->SharedState, sizeof(MONITOR_STATE) );

    //
    // A DebugPrefix debugger is going to be started below, so flag that in
    // the shared state. The flag is cleared again once the debugger launch
    // has been attempted.
    //
    if ( !CsDebugResmon && DebugPrefix != NULL && *DebugPrefix != UNICODE_NULL ) {
        Monitor->SharedState->ResmonStop = TRUE;
    }

    //
    // Get the resource monitor expanded app name. This should be passed to CreateProcess to
    // avoid Trojan exe based security attacks (see Writing Secure Code p.419)
    //
    lpszResmonAppName = ClRtlExpandEnvironmentStrings( TEXT("%windir%\\cluster\\resrcmon.exe") );
    if ( lpszResmonAppName == NULL )
    {
        Status = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL,
                      "[FM] Unable to expand env strings in resmon app name, error %1!u!.\n",
                      Status);
        goto create_failed;
    }

    //
    // There are a few command line options that can be given to the resource monitor from the
    // cluster service. These are
    //
    // (1) Options given by the cluster service with no input from the user
    //     This looks like "resrcmon.exe -e Event -m Filemapping -p ClussvcPID"
    //
    // (2) Options given by the cluster service with input from the user. There are 2 different
    //     cases:
    //     (2.1) "resrcmon.exe -e Event -m Filemapping -p ClussvcPID -d"
    //           This option tells the resmon to wait for a debugger to be attached. Once the
    //           user attaches a debugger, the resmon will continue with its init.
    //
    //     (2.2) "resrcmon.exe -e Event -m Filemapping -p ClussvcPID -d "debugger command""
    //           This option tells the resmon to create the process with the specified
    //           "debugger command". An example of a debugger command would be "ntsd -g -G".
    //
    // (3) The admin sets the DebugPrefix property for the resource type.
    //     In this case, the cluster service will first create the resource monitor process and
    //     then create the debugger process specified by the DebugPrefix property passing it the
    //     PID of the resmon as an argument. The debugger can then attach to that PID.
    //
    // Case 1: format the base command line, doubling the buffer until it fits.
    //
    while ( TRUE )
    {
        DWORD cchFormatLimit;

        lpszResmonCmdLine = LocalAlloc ( LMEM_FIXED, cchCmdLineBufSize * sizeof ( WCHAR ) );
        if ( lpszResmonCmdLine == NULL )
        {
            Status = GetLastError();
            ClRtlLogPrint(LOG_CRITICAL,
                          "[FM] Unable to alloc memory for cmd line, error %1!u!.\n",
                          Status);
            goto create_failed;
        }

        //
        // Hold back room for a terminator plus a possible " -d" so that the plain
        // "-debugresmon" case below never needs a reallocation. The character at the
        // limit is set to NULL up front because _snwprintf does not terminate the
        // buffer when the output exactly fills it.
        //
        cchFormatLimit = cchCmdLineBufSize - ( wcslen( DEBUGGER_OPTION ) + 1 );
        lpszResmonCmdLine [ cchFormatLimit ] = UNICODE_NULL;

        if ( _snwprintf( lpszResmonCmdLine,
                         cchFormatLimit,
                         TEXT("\"%ws\" -e %d -m %d -p %d"),
                         lpszResmonAppName,
                         Event,
                         FileMapping,
                         GetCurrentProcessId() ) > 0 )
        {
            break;
        }

        LocalFree ( lpszResmonCmdLine );
        lpszResmonCmdLine = NULL;

        if ( Retry == 9 )
        {
            Status = ERROR_INVALID_PARAMETER;
            ClRtlLogPrint(LOG_CRITICAL,
                          "[FM] Command line is too big, error %1!u!.\n",
                          Status);
            goto create_failed;
        }

        cchCmdLineBufSize *= 2;
        Retry ++;
    }// while

    Retry = 0;

    if ( CsDebugResmon ) {
        //
        // Case 2.1: append " -d". Room for it was reserved above.
        // Wcsncat will ALWAYS NULL terminate the destination buffer.
        //
        wcsncat( lpszResmonCmdLine,
                 DEBUGGER_OPTION,
                 cchCmdLineBufSize -
                 ( wcslen ( lpszResmonCmdLine ) + 1 ) );

        if ( CsResmonDebugCmd ) {
            //
            // Case 2.2: append  "debugger command"  (space, quote, command, quote).
            //
            DWORD cchCmdLineSize = wcslen( lpszResmonCmdLine );
            DWORD cchDebugCmdSize = wcslen( CsResmonDebugCmd );
            //
            // make sure our buffer is large enough; include 2 double quotes
            // the space and a NULL terminator
            //
            DWORD cchAdditionalChars = 2 * wcslen( DOUBLE_QUOTE ) + wcslen( SPACE ) + 1;
            DWORD cchRequired = cchCmdLineSize + cchDebugCmdSize + cchAdditionalChars;
            BOOL fAppendDebugCmd = TRUE;

            if ( cchCmdLineBufSize < cchRequired ) {
                //
                // The previously allocated buffer is small. So, reallocate.
                //
                LPWSTR lpszResmonDebugCmd = ( LPWSTR ) LocalAlloc( LMEM_FIXED,
                                                                   cchRequired * sizeof( WCHAR ) );

                if ( lpszResmonDebugCmd != NULL ) {
                    cchCmdLineBufSize = cchRequired;
                    //
                    // lstrcpyn will NULL terminate the buffer in all cases, so we don't
                    // have to explicitly NULL terminate the buffer
                    //
                    lstrcpyn( lpszResmonDebugCmd, lpszResmonCmdLine, cchCmdLineBufSize );
                    LocalFree ( lpszResmonCmdLine );
                    lpszResmonCmdLine = lpszResmonDebugCmd;
                } else {
                    //
                    // Carry on with the "-d" only command line.
                    //
                    ClRtlLogPrint(LOG_UNUSUAL,
                                  "[FM] Unable to allocate space for debug command line\n");
                    fAppendDebugCmd = FALSE;
                }
            }

            if ( fAppendDebugCmd ) {
                wcsncat( lpszResmonCmdLine,
                         SPACE,
                         cchCmdLineBufSize -
                         ( wcslen ( lpszResmonCmdLine ) + 1 ) );
                wcsncat( lpszResmonCmdLine,
                         DOUBLE_QUOTE,
                         cchCmdLineBufSize -
                         ( wcslen ( lpszResmonCmdLine ) + 1 ) );
                wcsncat( lpszResmonCmdLine,
                         CsResmonDebugCmd,
                         cchCmdLineBufSize -
                         ( wcslen ( lpszResmonCmdLine ) + 1 ) );
                wcsncat( lpszResmonCmdLine,
                         DOUBLE_QUOTE,
                         cchCmdLineBufSize -
                         ( wcslen ( lpszResmonCmdLine ) + 1 ) );
            }
        }
    }

    //
    // Acquire the monitor lock so as to ensure consistency of the resmon RPC EP that is set
    // in the registry. Every path below releases it exactly once.
    //
    FmpAcquireMonitorLock();

    //
    // Attempt to start ResMon process.
    //
retry_resmon_start:
    ZeroMemory(&StartupInfo, sizeof(StartupInfo));
    StartupInfo.cb = sizeof(StartupInfo);
    creationFlags = DETACHED_PROCESS; // so ctrl-c won't kill it

    Success = CreateProcess(lpszResmonAppName, // Must be supplied for security
                            lpszResmonCmdLine, // Command line
                            NULL,
                            NULL,
                            FALSE, // Inherit handles
                            creationFlags,
                            NULL,
                            NULL,
                            &StartupInfo,
                            &ProcessInfo);
    if (!Success) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] Failed to create resmon process, error %1!u!.\n",
                      Status);
        FmpReleaseMonitorLock();
        CL_LOGFAILURE(Status);
        goto create_failed;
    } else if ( CsDebugResmon && !CsResmonDebugCmd ) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[FM] Waiting for debugger to connect to resmon process %1!u!\n",
                      ProcessInfo.dwProcessId);
    }

    CloseHandle(ProcessInfo.hThread); // don't need this

    //
    // Record the process handle right away so that every failure path from
    // here on terminates the child and closes the handle in create_failed
    // instead of leaking them.
    //
    Monitor->Process = ProcessInfo.hProcess;

    //
    // Wait for the ResMon process to terminate, or for it to signal
    // its startup event.
    //
    WaitArray[0] = Event;
    WaitArray[1] = Monitor->Process;

    Status = WaitForMultipleObjects(2,
                                    WaitArray,
                                    FALSE,
                                    INFINITE);
    if (Status == WAIT_FAILED) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] Wait for ResMon to start failed, error %1!u!.\n",
                      Status);
        FmpReleaseMonitorLock();
        goto create_failed;
    }

    if (Status == ( WAIT_OBJECT_0 + 1 )) {
        if ( ++Retry > 1 ) {
            //
            // The resource monitor terminated prematurely.
            //
            GetExitCodeProcess(Monitor->Process, &Status);
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[FM] ResMon terminated prematurely, error %1!u!.\n",
                          Status);
            FmpReleaseMonitorLock();
            goto create_failed;
        } else {
            //
            // One more attempt. Drop the handle of the process that exited
            // before ProcessInfo is overwritten by the next CreateProcess.
            //
            CloseHandle( Monitor->Process );
            Monitor->Process = NULL;
            goto retry_resmon_start;
        }
    } else {
        //
        // Get the resmon dynamic EP from the registry.
        //
        Status = FmpGetResmonDynamicEndpoint ( &pResmonDynamicEndpoint );

        //
        // Release the monitor lock now that you have read the resmon EP.
        //
        FmpReleaseMonitorLock();

        if ( Status != ERROR_SUCCESS )
        {
            ClRtlLogPrint(LOG_CRITICAL,
                          "[FM] Unable get resmon dynamic EP, error %1!u!.\n",
                          Status);
            goto create_failed;
        }

        //
        // The resource monitor has successfully initialized
        //
        CL_ASSERT(Status == 0);

        //
        // invoke the DebugPrefix process only if we're not already debugging
        // the resmon process
        //
        if ( CsDebugResmon && DebugPrefix && *DebugPrefix != UNICODE_NULL ) {
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[FM] -debugresmon overrides DebugPrefix property\n");
        }

        if ( !CsDebugResmon && ( DebugPrefix != NULL ) && ( *DebugPrefix != UNICODE_NULL )) {
            WCHAR DebugLine[512];

            //
            // Case 3: start the debugger named by the DebugPrefix property and hand
            // it the PID of the resmon so that it can attach to it.
            //
            DebugLine[ RTL_NUMBER_OF( DebugLine ) - 1 ] = UNICODE_NULL;
            _snwprintf( DebugLine,
                        RTL_NUMBER_OF( DebugLine ) - 1,
                        TEXT("\"%ws\" -p %d"),
                        DebugPrefix,
                        ProcessInfo.dwProcessId );

            ZeroMemory(&StartupInfo, sizeof(StartupInfo));
            StartupInfo.cb = sizeof(StartupInfo);
            StartupInfo.lpDesktop = TEXT("WinSta0\\Default");

            Success = CreateProcess( DebugPrefix, // Must supply app name
                                     DebugLine, // Cmd line arguments
                                     NULL,
                                     NULL,
                                     FALSE, // Inherit handles
                                     CREATE_NEW_CONSOLE,
                                     NULL,
                                     NULL,
                                     &StartupInfo,
                                     &DebugInfo );

            Monitor->SharedState->ResmonStop = FALSE;

            if ( !Success ) {
                //
                // Failing to start the debugger is logged but is not fatal.
                //
                Status = GetLastError();
                ClRtlLogPrint(LOG_UNUSUAL,
                              "[FM] ResMon debug start failed, error %1!u!.\n",
                              Status);
            } else {
                CloseHandle(DebugInfo.hThread); // don't need this
                CloseHandle(DebugInfo.hProcess); // don't need this
            }
        }
    }

    CloseHandle(Event);
    CloseHandle(FileMapping);
    Event = NULL;
    FileMapping = NULL;

    //
    // Initiate RPC with resource monitor process.
    //
    Status = RpcStringBindingCompose(TEXT("e76ea56d-453f-11cf-bfec-08002be23f2f"),
                                     TEXT("ncalrpc"),
                                     NULL,
                                     pResmonDynamicEndpoint, // Dynamic EP string
                                     NULL,
                                     &Binding);
    if (Status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] ResMon RPC binding compose failed, error %1!u!.\n",
                      Status);
        goto create_failed;
    }

    Status = RpcBindingFromStringBinding(Binding, &Monitor->Binding);
    RpcStringFree(&Binding);
    if (Status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] ResMon RPC binding creation failed, error %1!u!.\n",
                      Status);
        goto create_failed;
    }

    //
    // Set the binding level on the binding handle.
    //
    Status = RpcBindingSetAuthInfoW(Monitor->Binding,
                                    NULL,
                                    RPC_C_AUTHN_LEVEL_PKT_PRIVACY,
                                    RPC_C_AUTHN_WINNT,
                                    NULL,
                                    RPC_C_AUTHZ_NAME);
    if ( Status != RPC_S_OK ) {
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] Failed to set RPC auth level, error %1!d!\n", Status );
        goto create_failed;
    }

    //
    // One reference is dropped by FmpShutdownMonitor and one by the notify
    // thread when it exits. The count must be in place before the thread
    // exists, otherwise the thread's InterlockedDecrement could race with
    // this assignment.
    //
    Monitor->RefCount = 2;

    //
    // Start notification thread.
    //
    Monitor->NotifyThread = CreateThread(NULL,
                                         0,
                                         FmpRmNotifyThread,
                                         Monitor,
                                         0,
                                         &ThreadId);
    if (Monitor->NotifyThread == NULL) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[FM] Creation of notify thread for ResMon failed, error %1!u!.\n",
                      Status);
        goto create_failed;
    }

    LocalFree ( lpszResmonAppName );
    LocalFree ( lpszResmonCmdLine );
    LocalFree ( pResmonDynamicEndpoint );

    //
    // Insert the new entry into the monitor list
    //
    InitializeListHead ( &Monitor->leMonitor );
    FmpAcquireMonitorLock ();
    InsertTailList ( &g_leFmpMonitorListHead, &Monitor->leMonitor );
    FmpReleaseMonitorLock ();

    //
    // Check if deadlock detection on resource dlls is enabled and if so update the
    // monitor. We should only log failures in this function and not affect the
    // monitor creation itself.
    //
    FmpCheckAndUpdateMonitorForDeadlockDetection ( Monitor );

    return(Monitor);

create_failed:
    //
    // Whack the process and close the handle if it was spawned already.
    // If the process has already exited the terminate call simply fails.
    //
    if ( Monitor->Process != NULL ) {
        TerminateProcess( Monitor->Process, 1 );
        CloseHandle( Monitor->Process );
    }

    //
    // Wait for the notify thread to exit, but just a little bit.
    //
    if ( Monitor->NotifyThread != NULL ) {
        WaitForSingleObject( Monitor->NotifyThread,
                             FM_RPC_TIMEOUT*2 ); // Increased timeout to try to ensure RPC completes
        CloseHandle( Monitor->NotifyThread );
        Monitor->NotifyThread = NULL;
    }

    //
    // Unmap view of shared file.
    //
    if ( Monitor->SharedState ) UnmapViewOfFile( Monitor->SharedState );

    //
    // Free the RPC binding handle
    //
    if ( Monitor->Binding != NULL ) {
        RpcBindingFree( &Monitor->Binding );
    }

    LocalFree( Monitor );

    if ( FileMapping != NULL ) {
        CloseHandle( FileMapping );
    }

    if ( Event != NULL ) {
        CloseHandle( Event );
    }

    LocalFree ( lpszResmonAppName );
    LocalFree ( lpszResmonCmdLine );
    LocalFree ( pResmonDynamicEndpoint );

    //
    // Set the last error after all cleanup so the caller sees the real failure.
    //
    SetLastError(Status);

    return(NULL);
} // FmpCreateMonitor
VOID
FmpShutdownMonitor(
    IN PRESMON Monitor
    )
/*++

Routine Description:

    Asks a Resource Monitor process to clean up and terminate, then
    releases what the cluster service holds for it: the process handle,
    the RPC binding, the notify thread handle, the shared memory view and
    the entry in the monitor list. The state of the resources hosted by
    the monitor is not changed here.

Arguments:

    Monitor - The resource monitor to shut down.

Return Value:

    None.

--*/
{
    DWORD waitResult;
    BOOL fShutdownAlreadyStarted;
    PVOID caller, callersCaller;

    CL_ASSERT(Monitor != NULL);

    //
    // Only the first caller gets to run the shutdown sequence.
    //
    FmpAcquireMonitorLock();
    fShutdownAlreadyStarted = Monitor->Shutdown ? TRUE : FALSE;
    if ( !fShutdownAlreadyStarted ) {
        Monitor->Shutdown = TRUE;
    }
    FmpReleaseMonitorLock();

    if ( fShutdownAlreadyStarted ) {
        return;
    }

    //
    // RPC to the server process to tell it to shutdown.
    //
    RmShutdownProcess(Monitor->Binding);

    //
    // Give the process a chance to exit on its own so that it can clean up
    // its resources; kill it if it does not exit in time.
    //
    if ( Monitor->Process != NULL ) {
        waitResult = WaitForSingleObject(Monitor->Process, FM_MONITOR_SHUTDOWN_TIMEOUT);
        if ( waitResult != WAIT_OBJECT_0 ) {
            ClRtlLogPrint(LOG_ERROR,"[FM] Failed to shutdown resource monitor.\n");
            TerminateProcess( Monitor->Process, 1 );
        }
        CloseHandle(Monitor->Process);
        Monitor->Process = NULL;
    }

    RpcBindingFree(&Monitor->Binding);

    //
    // Wait for the notify thread to exit, but just a little bit. The
    // timeout is doubled to try to let the outstanding RPC complete. The
    // thread is deliberately not terminated if the wait times out, since
    // terminating threads on NT can cause real problems.
    //
    if ( Monitor->NotifyThread != NULL ) {
        (void) WaitForSingleObject(Monitor->NotifyThread,
                                   FM_RPC_TIMEOUT*2);
        CloseHandle(Monitor->NotifyThread);
        Monitor->NotifyThread = NULL;
    }

    //
    // Clean up shared memory mapping
    //
    UnmapViewOfFile(Monitor->SharedState);

    //
    // Remove this entry from the monitor list
    //
    FmpAcquireMonitorLock ();
    RemoveEntryList ( &Monitor->leMonitor );
    FmpReleaseMonitorLock ();

    //
    // Drop our reference. Whoever drops the last one frees the structure.
    //
    if ( InterlockedDecrement(&Monitor->RefCount) != 0 ) {
        return;
    }

    RtlGetCallersAddress(
        &caller,
        &callersCaller );
    ClRtlLogPrint(LOG_NOISE,
                  "[FMY] Freeing monitor structure (1) %1!lx!, caller %2!lx!, callerscaller %3!lx!\n",
                  Monitor, caller, callersCaller );
    LocalFree(Monitor);

    return;
} // FmpShutdownMonitor
DWORD
FmpRmNotifyThread(
    IN LPVOID lpThreadParameter
    )
/*++

Routine Description:

    Thread routine that receives resource monitor notifications and posts
    them to the FM. It runs until the resource monitor reports shutdown,
    the RPC fails, or a notification cannot be posted.

Arguments:

    lpThreadParameter - Pointer to resource monitor structure.

Return Value:

    Always 0.

--*/
{
    PRESMON Monitor = lpThreadParameter;
    RM_NOTIFY_KEY NotifyKey;
    DWORD NotifyEvent;
    DWORD Status;
    CLUSTER_RESOURCE_STATE CurrentState;
    BOOL Success;

    for ( ;; ) {
        //
        // Block in the resource monitor until there is something to report.
        // A FALSE return indicates that shutdown is occurring.
        //
        try {
            Success = RmNotifyChanges(Monitor->Binding,
                                      &NotifyKey,
                                      &NotifyEvent,
                                      (LPDWORD)&CurrentState);
        } except (I_RpcExceptionFilter(RpcExceptionCode())) {
            //
            // RPC communications failure, treat it as a shutdown.
            //
            Status = GetExceptionCode();
            ClRtlLogPrint(LOG_NOISE,
                          "[FM] NotifyChanges got an RPC failure, %1!u!.\n",
                          Status);
            Success = FALSE;
        }

        if ( !Success ) {
            //
            // Nothing more to do if this is an expected shutdown. Otherwise
            // the resource monitor died underneath us.
            //
            if ( !FmpShutdown && !Monitor->Shutdown ) {
                ClRtlLogPrint(LOG_ERROR,
                              "[FM] Resource monitor terminated!\n");
                ClRtlLogPrint(LOG_ERROR,
                              "[FM] Last resource monitor state: %1!u!, resource %2!u!.\n",
                              Monitor->SharedState->State,
                              Monitor->SharedState->ActiveResource);
                CsLogEvent(LOG_UNUSUAL, FM_EVENT_RESMON_DIED);

                //
                // If this resource monitor has deadlocked, try to handle that
                // deadlock. The fact that resmon gave this specific state value
                // means that deadlock detection was enabled in that monitor.
                //
                if ( Monitor->SharedState->State == RmonDeadlocked )
                {
                    FmpHandleMonitorDeadlock ( Monitor );
                }

                //
                // Use a worker thread to start new resource monitor(s). If
                // that cannot even be started, halt the cluster service.
                //
                if (FmpCreateMonitorRestartThread(Monitor))
                    CsInconsistencyHalt(ERROR_INVALID_STATE);
            }
            break;
        }

        if ( !FmpPostNotification(NotifyKey, NotifyEvent, CurrentState) ) {
            break;
        }
    }

    ClRtlLogPrint(LOG_NOISE,"[FM] RmNotifyChanges returned\n");

    //
    // Drop this thread's reference. Whoever drops the last one frees the
    // structure.
    //
    if ( InterlockedDecrement( &Monitor->RefCount ) == 0 ) {
        ClRtlLogPrint(LOG_NOISE,
                      "[FMY] Freeing monitor structure (2) %1!lx!\n",
                      Monitor );
        LocalFree( Monitor );
    }

    return(0);
} // FmpRmNotifyThread
BOOL
FmpFindMonitorResource(
    IN PRESMON OldMonitor,
    IN PMONITOR_RESOURCE_ENUM *PtrEnumResource,
    IN PFM_RESOURCE Resource,
    IN LPCWSTR Name
    )
/*++

Routine Description:

    Enumeration callback that picks out the resources hosted by the old
    resource monitor and collects them in the enum structure. When the
    enum asks for it (fCreateMonitors), a resource that had been created
    in the old monitor is first recreated under a new resource monitor,
    and only successfully recreated resources are collected.

Arguments:

    OldMonitor - pointer to the old resource monitor structure.

    PtrEnumResource - pointer to a pointer to a resource enum structure.
        Updated if the structure has to be grown.

    Resource - the current resource being enumerated.

    Name - name of the current resource.

Return Value:

    TRUE - if we should continue enumeration.

    FALSE - otherwise (the enum structure could not be grown).

Notes:

    Nothing in the old resource monitor structure should be used.

--*/
{
    PMONITOR_RESOURCE_ENUM resourceList = *PtrEnumResource;
    PMONITOR_RESOURCE_ENUM grownList;
    DWORD createStatus;
    DWORD wasBlockingQuorum;

    //
    // Resources hosted by some other monitor are of no interest.
    //
    if ( Resource->Monitor != OldMonitor ) {
        return(TRUE);
    }

    if ( resourceList->fCreateMonitors != FALSE ) {
        //
        // If this is not the quorum resource and it is blocking the
        // quorum resource, then fix it up now.
        //
        wasBlockingQuorum = InterlockedExchange( &Resource->BlockingQuorum, 0 );
        if ( wasBlockingQuorum ) {
            ClRtlLogPrint(LOG_NOISE,
                          "[FM] RestartMonitor: call InterlockedDecrement on gdwQuoBlockingResources, Resource %1!ws!\n",
                          OmObjectId(Resource));
            InterlockedDecrement(&gdwQuoBlockingResources);
        }

        //
        // Only a resource that had previously been created in Resmon gets
        // recreated (and collected).
        //
        if ( !( Resource->Flags & RESOURCE_CREATED ) ) {
            return(TRUE);
        }

        // Note - this will create a new resource monitor as needed.
        createStatus = FmpRmCreateResource(Resource);
        if ( createStatus != ERROR_SUCCESS ) {
            ClRtlLogPrint(LOG_ERROR,"[FM] Failed to restart resource %1!ws!. Error %2!u!.\n",
                          Name, createStatus );
            return(TRUE);
        }
    }

    //
    // Record the resource, growing the list first when it is full.
    //
    if ( resourceList->CurrentIndex >= resourceList->EntryCount ) {
        grownList = LocalReAlloc( resourceList,
                                  MONITOR_RESOURCE_SIZE( resourceList->EntryCount +
                                                         ENUM_GROW_SIZE ),
                                  LMEM_MOVEABLE );
        if ( grownList == NULL ) {
            ClRtlLogPrint(LOG_ERROR,
                          "[FM] Failed re-allocating resource enum to restart resource monitor!\n");
            return(FALSE);
        }
        grownList->EntryCount += ENUM_GROW_SIZE;
        resourceList = grownList;
        *PtrEnumResource = grownList;
    }

    resourceList->Entry[resourceList->CurrentIndex] = Resource;
    resourceList->CurrentIndex += 1;

    return(TRUE);
} // FmpFindMonitorResource
BOOL
FmpRestartMonitor(
    IN PRESMON OldMonitor,
    IN BOOL fCreateResourcesOnly,
    OUT OPTIONAL PMONITOR_RESOURCE_ENUM *ppMonitorResourceEnum
    )
/*++

Routine Description:

    Shuts down the old monitor, creates a new default monitor process if
    the old one was the default, and recreates under a new monitor the
    resources that were attached to the old monitor process. Restarts
    those resources if requested to do so (see second parameter).

Arguments:

    OldMonitor - pointer to the old resource monitor structure.

    fCreateResourcesOnly - Create but do not start any resources

    ppMonitorResourceEnum - Resources hosted in the old monitor. Set to
        NULL on every path that returns before the list is complete. When
        a list is returned the caller must free it with LocalFree.

Return Value:

    TRUE if successful (or if the FM is shutting down, in which case
    nothing is done).

    FALSE otherwise.

Notes:

    This routine drops one reference on the old monitor structure when
    done, and frees the structure if that was the last reference.

--*/
{
    DWORD enumSize;
    DWORD i;
    DWORD status;
    PMONITOR_RESOURCE_ENUM enumResource;
    PFM_RESOURCE resource;
    DWORD dwOldBlockingFlag;

    //
    // Never leave the OUT parameter undefined, whichever way we return.
    //
    if ( ARGUMENT_PRESENT ( ppMonitorResourceEnum ) )
    {
        *ppMonitorResourceEnum = NULL;
    }

    FmpAcquireMonitorLock();

    if ( FmpShutdown ) {
        FmpReleaseMonitorLock();
        return(TRUE);
    }

    enumSize = MONITOR_RESOURCE_SIZE( ENUM_GROW_SIZE );
    enumResource = LocalAlloc( LMEM_ZEROINIT, enumSize );
    if ( enumResource == NULL ) {
        ClRtlLogPrint(LOG_ERROR,
                      "[FM] Failed allocating resource enum to restart resource monitor!\n");
        FmpReleaseMonitorLock();
        CsInconsistencyHalt(ERROR_NOT_ENOUGH_MEMORY);
        return(FALSE);
    }

    enumResource->EntryCount = ENUM_GROW_SIZE;

    //
    // Issue preoffline notifications only if the resources should be created and brought online.
    // Else, that will be done by FmpHandleResourceRestartOnMonitorCrash.
    //
    if ( fCreateResourcesOnly == FALSE )
    {
        enumResource->CurrentIndex = 0;
        enumResource->fCreateMonitors = FALSE;

        //
        // Enumerate all resources controlled by the old resource monitor so that we can invoke the
        // handlers registered for those resources. Both preoffline and postoffline handlers are
        // invoked prior to monitor shutdown so that the assumption made about underlying resource
        // access (such as quorum disk access) remain valid in a graceful monitor shutdown case.
        // We would issue a specific shutdown command in the case of a graceful shutdown occurring
        // as a part of resource DLL upgrade.
        //
        OmEnumObjects( ObjectTypeResource,
                       (OM_ENUM_OBJECT_ROUTINE)FmpFindMonitorResource,
                       OldMonitor,
                       &enumResource );

        for ( i = 0; i < enumResource->CurrentIndex; i++ ) {
            resource = enumResource->Entry[i];
            if ( ( resource->PersistentState == ClusterResourceOnline ) &&
                 ( resource->Group->OwnerNode == NmLocalNode ) ) {
                OmNotifyCb( resource, NOTIFY_RESOURCE_PREOFFLINE );
                OmNotifyCb( resource, NOTIFY_RESOURCE_POSTOFFLINE );
            }
        }
    }

    FmpShutdownMonitor( OldMonitor );

    if ( FmpDefaultMonitor == OldMonitor ) {
        FmpDefaultMonitor = FmpCreateMonitor(NULL, FALSE);
        if ( FmpDefaultMonitor == NULL ) {
            //
            // Pick up the failure code before the cleanup calls below get
            // a chance to overwrite the thread's last error value.
            //
            status = GetLastError();
            LocalFree( enumResource );
            FmpReleaseMonitorLock();
            CsInconsistencyHalt(status);
            return(FALSE);
        }
    }

    enumResource->CurrentIndex = 0;
    enumResource->fCreateMonitors = TRUE;

    //
    // Enumerate all resources controlled by the old resource monitor,
    // and connect them into the new resource monitor.
    //
    OmEnumObjects( ObjectTypeResource,
                   (OM_ENUM_OBJECT_ROUTINE)FmpFindMonitorResource,
                   OldMonitor,
                   &enumResource );

    //
    // If you are not requested to restart any resources, bail
    //
    if ( fCreateResourcesOnly == TRUE )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpRestartMonitor: Skip restarting resources...\n");
        goto FnExit;
    }

    //
    // First set each resource in the list to the Offline state.
    //
    for ( i = 0; i < enumResource->CurrentIndex; i++ ) {
        resource = enumResource->Entry[i];
        //
        // If the resource is owned by the local system, then do it.
        //
        if ( resource->Group->OwnerNode == NmLocalNode ) {
            resource->State = ClusterResourceOffline;

            //
            // If this is not the quorum resource and it is blocking the
            // quorum resource, then fix it up now.
            //
            dwOldBlockingFlag = InterlockedExchange( &resource->BlockingQuorum, 0 );
            if ( dwOldBlockingFlag ) {
                ClRtlLogPrint(LOG_NOISE,
                              "[FM] RestartMonitor: call InterlockedDecrement on gdwQuoBlockingResources, Resource %1!ws!\n",
                              OmObjectId(resource));
                InterlockedDecrement(&gdwQuoBlockingResources);
            }
        }
    }

    //
    // Find the quorum resource - if present bring online first.
    //
    for ( i = 0; i < enumResource->CurrentIndex; i++ ) {
        resource = enumResource->Entry[i];
        //
        // If the resource is owned by the local system and is the
        // quorum resource, then do it.
        //
        if ( (resource->Group->OwnerNode == NmLocalNode) &&
             resource->QuorumResource ) {
            FmpRestartResourceTree( resource );
        }
    }

    //
    // Now restart the rest of the resources in the list.
    //
    for ( i = 0; i < enumResource->CurrentIndex; i++ ) {
        resource = enumResource->Entry[i];
        //
        // If the resource is owned by the local system, then do it.
        //
        if ( (resource->Group->OwnerNode == NmLocalNode) &&
             !resource->QuorumResource ) {
            FmpRestartResourceTree( resource );
        }
    }

FnExit:
    FmpReleaseMonitorLock();

    //
    // If the caller has requested for the enumerated resource list, give it. It is the responsibility
    // of the caller to free the list.
    //
    if ( ARGUMENT_PRESENT ( ppMonitorResourceEnum ) )
    {
        *ppMonitorResourceEnum = enumResource;
    } else
    {
        LocalFree( enumResource );
    }

    //
    // Don't delete the old monitor block until we've reset the resources
    // to point to the new resource monitor block.
    // Better to get an RPC failure, rather than some form of ACCVIO.
    //
    if ( InterlockedDecrement( &OldMonitor->RefCount ) == 0 ) {
#if 0
        PVOID caller, callersCaller;
        RtlGetCallersAddress(
            &caller,
            &callersCaller );
        ClRtlLogPrint(LOG_NOISE,
                      "[FMY] Freeing monitor structure (3) %1!lx!, caller %2!lx!, callerscaller %3!lx!\n",
                      OldMonitor, caller, callersCaller );
#endif
        LocalFree( OldMonitor );
    }

    return(TRUE);
} // FmpRestartMonitor
/****
@func DWORD | FmpCreateMonitorRestartThread| This creates a new
thread to restart a monitor.
@parm IN PRESMON | pMonitor| Pointer to the resource monitor that
needs to be restarted.
@comm A monitor needs to be started in a separate thread as it
decrements the gquoblockingrescount for resources therein.
This cannot be done by fmpworkerthread because that causes
deadlocks if other items, like failure handling, being
processed by the fmpworkerthread are waiting for work that
will done by the items, like restart monitor, still in queue.
A reference is taken on the monitor on behalf of the new thread.
It is given back here if the thread cannot be created.
@rdesc Returns a result code. ERROR_SUCCESS on success, otherwise the
error from CreateThread.
****/
DWORD FmpCreateMonitorRestartThread(
    IN PRESMON pMonitor
    )
{
    HANDLE hThread;
    DWORD dwThreadId;
    DWORD dwStatus = ERROR_SUCCESS;

    ClRtlLogPrint(LOG_NOISE,
                  "[FM] FmpCreateMonitorRestartThread: Entry\r\n");

    //reference the resource
    //the thread will dereference it
    InterlockedIncrement( &pMonitor->RefCount );

    hThread = CreateThread( NULL, 0, FmpHandleMonitorCrash,
                            pMonitor, 0, &dwThreadId );

    if ( hThread == NULL )
    {
        dwStatus = GetLastError();
        //
        // No thread exists to give the reference back, so do it here. The
        // caller still holds its own reference, so this is never the last
        // one and nothing has to be freed.
        //
        InterlockedDecrement( &pMonitor->RefCount );
        CL_UNEXPECTED_ERROR(dwStatus);
    }
    else
    {
        // The thread runs on its own; its handle is not needed.
        CloseHandle(hThread);
    }

    ClRtlLogPrint(LOG_NOISE,
                  "[FM] FmpCreateMonitorRestartThread: Exit, status %1!u!\r\n",
                  dwStatus);

    return(dwStatus);
}
BOOL
FmpHandleMonitorCrash(
    IN PRESMON pCrashedMonitor
    )
/*++

Routine Description:

    Handle the crash of a resource monitor. The resources that were hosted
    in the crashed monitor are recreated (but not started) through
    FmpRestartMonitor, and then the restart of each one owned by this node
    is handled, quorum resource first.

Arguments:

    pCrashedMonitor - Pointer to the crashed monitor.

Return Value:

    TRUE if the list of hosted resources was obtained and processed.

    FALSE if FmpRestartMonitor returned no list.

--*/
{
    PMONITOR_RESOURCE_ENUM pEnumResourcesHosted = NULL;
    DWORD i;
    PFM_RESOURCE pResource, pExchangedResource;
    BOOL fStatus = TRUE;

    FmpRestartMonitor ( pCrashedMonitor,         // Crashed monitor
                        TRUE,                    // Just create resources
                        &pEnumResourcesHosted ); // Resources hosted in old monitor

    if ( pEnumResourcesHosted == NULL )
    {
        fStatus = FALSE;
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmpHandleMonitorCrash: No resources in crashed monitor\n");
        goto FnExit;
    }

    //
    // Acquire the quorum change lock to make sure the quorum resource is not changed
    // from under us.
    //
    ACQUIRE_SHARED_LOCK ( gQuoChangeLock );

    //
    // Make sure the quorum resource is first in the enumerated list, so that it can be brought
    // online first. This is needed because no resource can go online until the quorum resource
    // does.
    //
    for ( i = 0; i < pEnumResourcesHosted->CurrentIndex; i++ )
    {
        if ( pEnumResourcesHosted->Entry[i] == gpQuoResource )
        {
            //
            // If the quorum resource is already first in the list, bail.
            //
            if ( i == 0 ) break;

            //
            // Swap the quorum resource with the first resource in the list.
            //
            pExchangedResource = pEnumResourcesHosted->Entry[0];
            pEnumResourcesHosted->Entry[0] = gpQuoResource;
            pEnumResourcesHosted->Entry[i] = pExchangedResource;
            ClRtlLogPrint(LOG_NOISE, "[FM] FmpHandleMonitorCrash: Move quorum resource %1!ws! into first position in list\n",
                          OmObjectName(gpQuoResource));
            break;
        }
    } // for

    //
    // Handle the restart of each resource
    //
    for ( i = 0; i < pEnumResourcesHosted->CurrentIndex; i++ )
    {
        pResource = pEnumResourcesHosted->Entry[i];

        //
        // Order of locks gQuoChangeLock -> Group lock should be ok (see fm\fminit.c)
        //
        FmpAcquireLocalResourceLock ( pResource );

        //
        // If this is the owner node, take some action.
        //
        if ( pResource->Group->OwnerNode == NmLocalNode )
        {
            FmpHandleResourceRestartOnMonitorCrash ( pResource );
        } // if

        FmpReleaseLocalResourceLock ( pResource );
    } // for

    RELEASE_LOCK ( gQuoChangeLock );

FnExit:
    //
    // The list belongs to us once FmpRestartMonitor has returned it.
    // LocalFree accepts NULL, so the early exit needs no special case.
    //
    LocalFree ( pEnumResourcesHosted );

    return ( fStatus );
}// FmpHandleMonitorCrash
DWORD
FmpGetResmonDynamicEndpoint(
    OUT LPWSTR *ppResmonDynamicEndpoint
    )
/*++

Routine Description:

    Read the resource monitor dynamic endpoint from the registry and then
    delete the value. A failure to delete the value is logged but is not
    treated as an error.

Arguments:

    ppResmonDynamicEndpoint - On success, receives a buffer obtained from LocalAlloc
        holding the NUL-terminated endpoint string. On failure it is set to NULL.

Return Value:

    ERROR_SUCCESS on success.

    A Win32 error code otherwise.

--*/
{
    HKEY        hParamsKey = NULL;
    DWORD       dwStatus, dwSize = 0, dwType;

    //
    //  NULL out return parameter
    //
    *ppResmonDynamicEndpoint = NULL;

    //
    //  Open key to SYSTEM\CurrentControlSet\Services\ClusSvc\Parameters
    //
    dwStatus = RegOpenKey ( HKEY_LOCAL_MACHINE,
                            CLUSREG_KEYNAME_CLUSSVC_PARAMETERS,
                            &hParamsKey );

    if ( dwStatus != ERROR_SUCCESS )
    {
        ClRtlLogPrint(LOG_CRITICAL, "[FM] Error in opening cluster service params key, status %1!u!\n",
                      dwStatus);
        goto FnExit;
    }

    //
    //  Get the size of the EP name string. Use the same (wide) API variant as the
    //  data query below so that the size and the data agree.
    //
    dwStatus = RegQueryValueExW( hParamsKey,
                                 CLUSREG_NAME_SVC_PARAM_RESMON_EP,
                                 0,
                                 &dwType,
                                 NULL,
                                 &dwSize );

    if ( dwStatus != ERROR_SUCCESS )
    {
        ClRtlLogPrint(LOG_CRITICAL, "[FM] Error in querying %1!ws! value size, status %2!u!\n",
                      CLUSREG_NAME_SVC_PARAM_RESMON_EP,
                      dwStatus);
        goto FnExit;
    }

    //
    //  Allocate room for one extra character. The registry does not guarantee that
    //  string data was stored with a terminating NUL, so one is added below.
    //
    *ppResmonDynamicEndpoint = ( LPWSTR ) LocalAlloc( LMEM_FIXED,
                                                      dwSize + sizeof ( **ppResmonDynamicEndpoint ) );

    if ( *ppResmonDynamicEndpoint == NULL )
    {
        dwStatus = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL, "[FM] Error in memory allocation for resmon EP string, status %1!u!\n",
                      dwStatus);
        goto FnExit;
    }

    //
    //  Get the EP name string. Only dwSize bytes are offered to the API so the extra
    //  character reserved above is never overwritten by registry data.
    //
    dwStatus = RegQueryValueExW( hParamsKey,
                                 CLUSREG_NAME_SVC_PARAM_RESMON_EP,
                                 0,
                                 &dwType,
                                 ( LPBYTE ) *ppResmonDynamicEndpoint,
                                 &dwSize );

    if ( dwStatus != ERROR_SUCCESS )
    {
        ClRtlLogPrint(LOG_CRITICAL, "[FM] Error in querying %1!ws! value, status %2!u!\n",
                      CLUSREG_NAME_SVC_PARAM_RESMON_EP,
                      dwStatus);
        goto FnExit;
    }

    //
    //  Force termination. dwSize now holds the number of bytes actually copied, which
    //  is no larger than what was offered, so this index is inside the allocation. If
    //  the byte count is odd, the trailing partial character is overwritten.
    //
    ( *ppResmonDynamicEndpoint ) [ dwSize / sizeof ( **ppResmonDynamicEndpoint ) ] = 0;

    //
    //  Delete the value, but this operation is not fatal if it doesn't succeed
    //
    dwStatus = RegDeleteValue ( hParamsKey, CLUSREG_NAME_SVC_PARAM_RESMON_EP );

    if ( dwStatus != ERROR_SUCCESS )
    {
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] Error in deleting %1!ws! value, status %2!u!\n",
                      CLUSREG_NAME_SVC_PARAM_RESMON_EP,
                      dwStatus);
        dwStatus = ERROR_SUCCESS;
    }

    ClRtlLogPrint(LOG_NOISE, "[FM] Resmon LRPC EP name is %1!ws!\n", *ppResmonDynamicEndpoint);

FnExit:
    //
    //  On any failure, release whatever was allocated and hand back NULL.
    //
    if ( dwStatus != ERROR_SUCCESS )
    {
        LocalFree ( *ppResmonDynamicEndpoint );
        *ppResmonDynamicEndpoint = NULL;
    }

    if ( hParamsKey ) RegCloseKey ( hParamsKey );

    return ( dwStatus );
} // FmpGetResmonDynamicEndpoint
VOID
FmpHandleResourceRestartOnMonitorCrash(
    IN PFM_RESOURCE pResource
    )
/*++

Routine Description:

    Force failure processing for a resource whose resource monitor crashed.

Arguments:

    pResource - Pointer to the resource to be restarted.

Return Value:

    None.

Comments:

    This routine always reports ClusterResourceOnline as the previous state when
    it hands the failure on for processing, regardless of the state recorded in
    the resource structure. The intent is to force the failure to be processed
    even when the recorded state would make it look like a "failed -> failed"
    non-transition, e.g.

        1. The current state of the resource is failed.
        2. FM is trying to terminate the resource.
        3. The resource dll gets stuck in terminate.
        4. Resource monitor detects a deadlock and terminates itself.
        5. A new failure notification is generated by the monitor crash.

    The quorum resource is processed inline on the calling thread via
    FmpProcessResourceEvents; every other resource is posted as a work item.
    The caller (FmpHandleMonitorCrash) holds gQuoChangeLock and the local
    resource lock across this call.

--*/
{
    BOOL    fIsQuorumResource;

    ClRtlLogPrint (LOG_NOISE, "[FM] FmpHandleResourceRestartOnMonitorCrash: Processing resource %1!ws!\n",
                   OmObjectName ( pResource ) );

    //
    //  A resource that is online, or in one of the pending states, is marked failed.
    //  Resources in any other state are left as they are.
    //
    if ( ( pResource->State == ClusterResourceOnline ) ||
         ( pResource->State > ClusterResourcePending ) )
    {
        FmpPropagateResourceState( pResource, ClusterResourceFailed );
    }

    //
    //  Run the synchronous notification callbacks before the blocking-quorum
    //  bookkeeping below is touched.
    //
    FmpCallResourceNotifyCb( pResource, ClusterResourceFailed );

    //
    //  gQuoChangeLock is held by the caller, so gpQuoResource is stable here.
    //
    fIsQuorumResource = ( pResource == gpQuoResource );

    //
    //  Clear the per-resource blocking flag. For a non-quorum resource that had the
    //  flag set, also give back its contribution to the global blocking count.
    //
    if ( fIsQuorumResource )
    {
        InterlockedExchange( &pResource->BlockingQuorum, 0 );
    }
    else if ( InterlockedExchange( &pResource->BlockingQuorum, 0 ) != 0 )
    {
        ClRtlLogPrint(LOG_NOISE,
                      "[FM] FmpHandleResourceRestartOnMonitorCrash: call InterlockedDecrement on gdwQuoBlockingResources, Resource %1!ws!\n",
                      OmObjectName(pResource));
        InterlockedDecrement( &gdwQuoBlockingResources );
    }

    //
    //  Nothing more to do if the group is in the middle of a move.
    //
    if ( ( pResource->Group->MovingList != NULL ) ||
         ( pResource->Group->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL ) )
    {
        ClRtlLogPrint (LOG_NOISE, "[FM] FmpHandleResourceRestartOnMonitorCrash: Take no action on resource %1!ws! since group is moving\n",
                       OmObjectName ( pResource ) );
        return;
    }

    if ( fIsQuorumResource )
    {
        //
        //  Quorum resource: process the failure right here instead of queueing it.
        //
        FmpProcessResourceEvents ( pResource,
                                   ClusterResourceFailed,       // New state
                                   ClusterResourceOnline );     // Old state, reported as online
        return;
    }

    //
    //  Non-quorum resource: take a reference and queue the failure as a work item.
    //
    OmReferenceObject ( pResource );

    FmpPostWorkItem( FM_EVENT_RES_RESOURCE_FAILED,
                     pResource,
                     ClusterResourceOnline );                   // Old state, reported as online
}// FmpHandleResourceRestartOnMonitorCrash
VOID
FmCheckIsDeadlockDetectionEnabled(
    )
/*++

Routine Description:

    Read the resource dll deadlock detection settings from the cluster parameters
    key and publish them in the FM globals. If any query fails for a reason other
    than the value being absent, or the enable value is neither 0 nor 1, the
    globals are left untouched.

Arguments:

    None.

Return Value:

    None.

--*/
{
    DWORD   dwQueryStatus;
    DWORD   dwEnableValue = 0;
    BOOL    fEnable = FALSE;
    DWORD   dwNewTimeout = CLUSTER_RESOURCE_DLL_DEFAULT_DEADLOCK_TIMEOUT_SECS;
    DWORD   dwNewPeriod = CLUSTER_RESOURCE_DLL_DEFAULT_DEADLOCK_PERIOD_SECS;
    DWORD   dwNewThreshold = CLUSTER_RESOURCE_DLL_DEFAULT_DEADLOCK_THRESHOLD;
    DWORD   dwOldTimeout;
    DWORD   dwOldPeriod;
    DWORD   dwOldThreshold;
    BOOL    fWasEnabled;

    if ( !FmpInitialized ) return;

    //
    //  Is the feature switched on at all? An absent value means "off".
    //
    dwQueryStatus = DmQueryDword( DmClusterParametersKey,
                                  CLUSREG_NAME_CLUS_ENABLE_RESOURCE_DLL_DEADLOCK_DETECTION,
                                  &dwEnableValue,
                                  NULL );

    if ( dwQueryStatus == ERROR_SUCCESS )
    {
        if ( dwEnableValue == 1 )
        {
            fEnable = TRUE;
        }
        else if ( dwEnableValue != 0 )
        {
            ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmCheckIsDeadlockDetectionEnabled: Illegal value set %2!u! for property %1!ws!, ignoring\n",
                          CLUSREG_NAME_CLUS_ENABLE_RESOURCE_DLL_DEADLOCK_DETECTION,
                          dwEnableValue);
            return;
        }
    }
    else if ( dwQueryStatus != ERROR_FILE_NOT_FOUND )
    {
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmCheckIsDeadlockDetectionEnabled: Unable to query cluster property %1!ws!, status %2!u!\n",
                      CLUSREG_NAME_CLUS_ENABLE_RESOURCE_DLL_DEADLOCK_DETECTION,
                      dwQueryStatus);
        return;
    }

    //
    //  The tuning values are only read when the feature is on. Each local starts
    //  out holding the default and is passed as both the output and the default
    //  argument; an absent value is not an error.
    //
    if ( fEnable )
    {
        dwQueryStatus = DmQueryDword( DmClusterParametersKey,
                                      CLUSREG_NAME_CLUS_RESOURCE_DLL_DEADLOCK_TIMEOUT,
                                      &dwNewTimeout,
                                      &dwNewTimeout );

        if ( ( dwQueryStatus != ERROR_SUCCESS ) &&
             ( dwQueryStatus != ERROR_FILE_NOT_FOUND ) )
        {
            ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmCheckIsDeadlockDetectionEnabled: Unable to query cluster property %1!ws!, status %2!u!\n",
                          CLUSREG_NAME_CLUS_RESOURCE_DLL_DEADLOCK_TIMEOUT,
                          dwQueryStatus);
            return;
        }

        dwQueryStatus = DmQueryDword( DmClusterParametersKey,
                                      CLUSREG_NAME_CLUS_RESOURCE_DLL_DEADLOCK_THRESHOLD,
                                      &dwNewThreshold,
                                      &dwNewThreshold );

        if ( ( dwQueryStatus != ERROR_SUCCESS ) &&
             ( dwQueryStatus != ERROR_FILE_NOT_FOUND ) )
        {
            ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmCheckIsDeadlockDetectionEnabled: Unable to query cluster property %1!ws!, status %2!u!\n",
                          CLUSREG_NAME_CLUS_RESOURCE_DLL_DEADLOCK_THRESHOLD,
                          dwQueryStatus);
            return;
        }

        dwQueryStatus = DmQueryDword( DmClusterParametersKey,
                                      CLUSREG_NAME_CLUS_RESOURCE_DLL_DEADLOCK_PERIOD,
                                      &dwNewPeriod,
                                      &dwNewPeriod );

        if ( ( dwQueryStatus != ERROR_SUCCESS ) &&
             ( dwQueryStatus != ERROR_FILE_NOT_FOUND ) )
        {
            ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmCheckIsDeadlockDetectionEnabled: Unable to query cluster property %1!ws!, status %2!u!\n",
                          CLUSREG_NAME_CLUS_RESOURCE_DLL_DEADLOCK_PERIOD,
                          dwQueryStatus);
            return;
        }
    }

    //
    //  Publish the settings. The monitor lock is held so the globals change as a
    //  unit; FM is known to be initialized at this point, so the lock is usable.
    //
    FmpAcquireMonitorLock ();

    dwOldTimeout    = g_dwFmResourceDllDeadlockTimeout;
    fWasEnabled     = g_fFmEnableResourceDllDeadlockDetection;
    dwOldPeriod     = g_dwFmResourceDllDeadlockPeriod;
    dwOldThreshold  = g_dwFmResourceDllDeadlockThreshold;

    g_fFmEnableResourceDllDeadlockDetection = fEnable;

    if ( !g_fFmEnableResourceDllDeadlockDetection )
    {
        //
        //  Zero the timeout so that a later re-enable is seen as a timeout change
        //  and pushes the value out to all monitors again.
        //
        g_dwFmResourceDllDeadlockTimeout = 0;
    }
    else
    {
        g_dwFmResourceDllDeadlockTimeout = dwNewTimeout;
        g_dwFmResourceDllDeadlockPeriod = dwNewPeriod;
        g_dwFmResourceDllDeadlockThreshold = dwNewThreshold;
    }

    if ( fWasEnabled != g_fFmEnableResourceDllDeadlockDetection )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmCheckIsDeadlockDetectionEnabled: Deadlock detection %1!ws!\n",
                      (g_fFmEnableResourceDllDeadlockDetection ? L"enabled" : L"disabled"));
    }

    if ( g_fFmEnableResourceDllDeadlockDetection )
    {
        //
        //  Monitors are only told about the timeout, and only when it changed.
        //
        if ( dwOldTimeout != g_dwFmResourceDllDeadlockTimeout )
        {
            ClRtlLogPrint(LOG_NOISE, "[FM] FmCheckIsDeadlockDetectionEnabled: Deadlock timeout = %1!u! secs\n",
                          dwNewTimeout);
            FmpCheckAndUpdateMonitorForDeadlockDetection( NULL );
        }

        //
        //  Period and threshold changes are just logged.
        //
        if ( dwOldPeriod != g_dwFmResourceDllDeadlockPeriod )
        {
            ClRtlLogPrint(LOG_NOISE, "[FM] FmCheckIsDeadlockDetectionEnabled: Deadlock period = %1!u! secs\n",
                          dwNewPeriod);
        }

        if ( dwOldThreshold != g_dwFmResourceDllDeadlockThreshold )
        {
            ClRtlLogPrint(LOG_NOISE, "[FM] FmCheckIsDeadlockDetectionEnabled: Deadlock threshold = %1!u!\n",
                          dwNewThreshold);
        }
    }

    FmpReleaseMonitorLock ();
}// FmCheckIsDeadlockDetectionEnabled
VOID
FmpCheckAndUpdateMonitorForDeadlockDetection(
    IN PRESMON pMonitor OPTIONAL
    )
/*++

Routine Description:

    If deadlock detection is enabled, send the current deadlock timeout to a
    resource monitor. When no monitor is supplied, every monitor on the FM
    monitor list is updated.

Arguments:

    pMonitor - The monitor to be updated. OPTIONAL

Return Value:

    None.

--*/
{
    DWORD           dwUpdateStatus;
    PLIST_ENTRY     pLink;
    PRESMON         pCurrentMonitor;

    if ( !FmpInitialized ) return;

    FmpAcquireMonitorLock ();

    //
    //  Only talk to the monitors when deadlock detection is turned on.
    //
    if ( g_fFmEnableResourceDllDeadlockDetection != FALSE )
    {
        if ( !ARGUMENT_PRESENT ( pMonitor ) )
        {
            //
            //  No specific monitor given: walk the whole monitor list.
            //
            for ( pLink = g_leFmpMonitorListHead.Flink;
                  pLink != &g_leFmpMonitorListHead;
                  pLink = pLink->Flink )
            {
                pCurrentMonitor = CONTAINING_RECORD ( pLink,
                                                      RESMON,
                                                      leMonitor );

                dwUpdateStatus = RmUpdateDeadlockDetectionParams ( pCurrentMonitor->Binding,
                                                                   g_dwFmResourceDllDeadlockTimeout );

                //
                //  Remember what was sent to this monitor, but only if it took the update.
                //  The saved value may lag behind the global since updates are lazy.
                //
                if ( dwUpdateStatus == ERROR_SUCCESS )
                {
                    pCurrentMonitor->dwDeadlockTimeoutSecs = g_dwFmResourceDllDeadlockTimeout;
                }

                ClRtlLogPrint(LOG_NOISE, "[FM] FmpCheckAndUpdateMonitorForDeadlockDetection: Updated monitor with a deadlock timeout of %1!u! secs, status %2!u!\n",
                              g_dwFmResourceDllDeadlockTimeout,
                              dwUpdateStatus);
            }// for
        }
        else
        {
            //
            //  Update just the monitor that was passed in.
            //
            dwUpdateStatus = RmUpdateDeadlockDetectionParams ( pMonitor->Binding,
                                                               g_dwFmResourceDllDeadlockTimeout );

            ClRtlLogPrint(LOG_NOISE, "[FM] FmpCheckAndUpdateMonitorForDeadlockDetection: Updated monitor with a deadlock timeout of %1!u! secs, status %2!u!\n",
                          g_dwFmResourceDllDeadlockTimeout,
                          dwUpdateStatus);

            //
            //  Remember what was sent to this monitor, but only if it took the update.
            //
            if ( dwUpdateStatus == ERROR_SUCCESS )
            {
                pMonitor->dwDeadlockTimeoutSecs = g_dwFmResourceDllDeadlockTimeout;
            }
        }
    }

    FmpReleaseMonitorLock ();
} // FmpCheckAndUpdateMonitorForDeadlockDetection
VOID
FmpHandleMonitorDeadlock(
    IN PRESMON pMonitor
    )
/*++

Routine Description:

    Handle the deadlock of a monitor. The global deadlock count is bumped, and
    if more than g_dwFmResourceDllDeadlockThreshold deadlocks have been counted
    inside the current g_dwFmResourceDllDeadlockPeriod window, MM is asked to
    stop the clusnet heartbeats.

Arguments:

    pMonitor - The monitor that deadlocked. Only its saved deadlock timeout is
        read here, for logging.

Return Value:

    None.

Comments:

    This function is called only when FM positively knows the resmon deadlocked.

--*/
{
    DWORD       dwCurrentTickCount;
    DWORD       dwElapsedMsecs;

    FmpAcquireMonitorLock ();

    g_cResourceDllDeadlocks ++;

    ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpHandleMonitorDeadlock: Deadlock detected, count = %1!u!, last deadlock tick = %2!u!, deadlock timeout = %3!u! secs\n",
                  g_cResourceDllDeadlocks,
                  g_dwLastResourceDllDeadlockTick,
                  pMonitor->dwDeadlockTimeoutSecs);

    ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpHandleMonitorDeadlock: Deadlock threshold = %1!u!, deadlock period = %2!u! secs\n",
                  g_dwFmResourceDllDeadlockThreshold,
                  g_dwFmResourceDllDeadlockPeriod);

    //
    //  Start a new counting window if more than g_dwFmResourceDllDeadlockPeriod seconds
    //  have gone by since the window was last started. In that case this deadlock becomes
    //  the first one of the new window.
    //
    //  The unsigned subtraction gives the right elapsed time across a tick count wrap.
    //  The period is converted to milliseconds only when the product fits in a DWORD; a
    //  period too large to convert can never be exceeded by a DWORD elapsed time, so the
    //  window is simply treated as still open instead of letting the multiply wrap around.
    //
    dwCurrentTickCount = GetTickCount ();
    dwElapsedMsecs = dwCurrentTickCount - g_dwLastResourceDllDeadlockTick;

    if ( ( g_dwFmResourceDllDeadlockPeriod <= ( 0xFFFFFFFF / 1000 ) ) &&
         ( dwElapsedMsecs > g_dwFmResourceDllDeadlockPeriod * 1000 ) )
    {
        g_cResourceDllDeadlocks = 1;
        g_dwLastResourceDllDeadlockTick = dwCurrentTickCount;
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpHandleMonitorDeadlock: Resetting monitor deadlock count, deadlock tick = %1!u!\n",
                      dwCurrentTickCount);
    }

    //
    //  We crossed the tolerable threshold. Give up.
    //
    if ( g_cResourceDllDeadlocks > g_dwFmResourceDllDeadlockThreshold )
    {
        //
        //  Log before acting. NOTE(review): presumably stopping the heartbeats leads to
        //  this node being taken down, in which case a message logged afterwards could be
        //  lost -- confirm against MMStopClussvcClusnetHb.
        //
        ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpHandleMonitorDeadlock: Inform MM to stop clusnet heartbeats\n");
        MMStopClussvcClusnetHb ();
    }

    FmpReleaseMonitorLock ();

    return;
}// FmpHandleMonitorDeadlock