|
|
/*++
Copyright (c) 2000 Microsoft Corporation
Module Name:
recovery.c
Abstract:
Handles node down events
Author:
Ahmed Mohamed (ahmedm) 12, 01, 2000
Revision History:
--*/
#include "gs.h"
#include "gsp.h"
#include <stdio.h>
extern gs_nid_t GsLocalNodeId; extern int GsMaxNodeId; extern int GsMinNodeId; extern gs_group_t GsGroupTable[];
// Node down event
void GspRsFree(gs_recovery_state_t *rs) { // free recovery state
gs_rblk_t *p;
while (p = rs->rs_list) { rs->rs_list = p->next; free((char *)p); }
GsEventFree(rs->rs_event); free((char *)rs); }
void GspPhase1NodeDown(ULONG set) { gs_group_t *gd; int i, j;
for (i = 0; i < GsGroupTableSize; i++) { gd = &GsGroupTable[i]; if (gd->g_state == GS_GROUP_STATE_FREE) { continue; } GsLockEnter(gd->g_lock); if (gd->g_mset & set) { gd->g_mset &= ~set; gd->g_curview++; gd->g_state |= GS_GROUP_FLAGS_RECOVERY; gd->g_sz = 0; for (j = gd->g_mset; j > 0; j = j >> 1) { if (j & 0x1) gd->g_sz++; } if (set & (1 << gd->g_mid)) {
for (j = GsMinNodeId; j != GsMaxNodeId; j++) { if (gd->g_mset & (1 << j)) { break; } } // elect a new master
gd->g_mid = (gs_memberid_t) j; gd->g_state |= GS_GROUP_FLAGS_NEWMASTER; }
recovery_log(("Phase1 mask %x gid %d mid %d mset %x sz %d\n", set, gd->g_id, gd->g_mid, gd->g_mset, gd->g_sz));
if (gd->g_rs != NULL) { set |= gd->g_rs->rs_dset; GsEventSignal(gd->g_rs->rs_event); GspRsFree(gd->g_rs); } gd->g_rs = (gs_recovery_state_t *) malloc(sizeof(*gd->g_rs)); assert(gd->g_rs != NULL); GsManualEventInit(gd->g_rs->rs_event); gd->g_rs->rs_sz = 0; gd->g_rs->rs_list = NULL; gd->g_rs->rs_epoch = gd->g_curview; gd->g_rs->rs_dset = set; gd->g_rs->rs_mset = gd->g_mset; if (gd->g_mid != gd->g_nid) { // we are not master, reset our mset to self and master only
gd->g_rs->rs_mset = (1 << gd->g_nid) | (1 << gd->g_mid); } } else if (gd->g_mset == 0 && (set & (1 << gd->g_mid))) { // no one is participating in this group and the sole owner dead
// remove the group and free it
GsCloseGroup(gd); }
GsLockExit(gd->g_lock); }
}
void GspRsAddSequence(gs_recovery_state_t *rs, gs_sequence_t mseq, int delta) { gs_rblk_t *p, **q;
for (p = rs->rs_list; p != NULL; p = p->next) { if (p->mseq == mseq) { p->have += delta; recovery_log(("Found seq %d cnt %d\n", mseq, p->have)); return; } } // if we get here that means the sequence is missing
p = (gs_rblk_t *) malloc(sizeof(*p)); if (p == NULL) { err_log(("GspRsAddSeq: unable to allocate memory!\n")); exit(1); }
p->mseq = mseq; p->have = delta;
recovery_log(("Add seq %d cnt %d\n", mseq, p->have));
rs->rs_sz++; q = &rs->rs_list; while (*q != NULL) { if ((*q)->mseq > mseq) { p->next = *q; *q = p; return; } q = &(*q)->next; }
p->next = *q; *q = p; }
void GspPhase2NodeDown(ULONG set) {
gs_group_t *gd; int i, j;
for (i = 0; i < GsGroupTableSize; i++) { gd = &GsGroupTable[i]; if (!(gd->g_state & GS_GROUP_FLAGS_RECOVERY)) { continue; } GsLockEnter(gd->g_lock); if (gd->g_state & GS_GROUP_FLAGS_RECOVERY) { gs_msg_t *p; extern void GspDumpQueue(gs_group_t*);
recovery_log(("Phase2 queue\n")); GspDumpQueue(gd); recovery_log(("Expect gid %d <%d, %d>\n", gd->g_id, gd->g_recv.r_mseq, gd->g_recv.r_bnum));
// walk recv queue and replay messages from dead members
for (p = gd->g_recv.r_head; p != NULL; p = p->m_next) { if (set & (1 << p->m_hdr.h_sid)) { // tag message as if we got a reply
p->m_hdr.h_flags |= GS_FLAGS_REPLY; if (p->m_hdr.h_type != GS_MSG_TYPE_UCAST){ p->m_hdr.h_flags |= GS_FLAGS_REPLAY; msg_mcast(gd->g_mset, &p->m_hdr, p->m_buf, p->m_hdr.h_len); } // check of unclosed continued sends
if (p->m_hdr.h_flags & GS_FLAGS_CONTINUED) { gs_msg_t *q; q = p->m_next; if (q == NULL || q->m_hdr.h_mseq != p->m_hdr.h_mseq || q->m_hdr.h_bnum != p->m_hdr.h_bnum+1) {
q = msg_alloc(NULL, 0); if (q == NULL) { err_log(("Unable to allocate memory!\n")); halt(1); } memcpy(&q->m_hdr, &p->m_hdr, sizeof(p->m_hdr)); q->m_hdr.h_type = GS_MSG_TYPE_ABORT; q->m_hdr.h_len = 0; q->m_hdr.h_bnum++; q->m_hdr.h_flags = GS_FLAGS_LAST;
// insert abort msg
q->m_next = p->m_next; p->m_next = q; } } } } // walk recv queue and build msg of sequences we have
for (p = gd->g_recv.r_head; p != NULL; p = p->m_next) { if (p->m_hdr.h_mseq != GS_MSG_TYPE_UCAST) GspRsAddSequence(gd->g_rs, p->m_hdr.h_mseq, 1); }
// send msg of sequences to master
if (gd->g_mid != gd->g_nid) { gs_rblk_t *p; gs_sequence_t *list; int k; gs_msg_hdr_t hdr;
recovery_log(("Sending sequence state to master %d\n", gd->g_mid));
list = (gs_sequence_t *) malloc(sizeof(*list) * gd->g_rs->rs_sz); if (list == NULL) { err_log(("Unable to allocate memory during recovery\n")); exit(1); } k = 0; for (p = gd->g_rs->rs_list; p != NULL; p = p->next) { list[k] = p->mseq; k++; } assert(k == gd->g_rs->rs_sz); k = k * sizeof(*list);
hdr.h_len = (UINT16) k; hdr.h_type = GS_MSG_TYPE_RECOVERY; hdr.h_sid = (gs_memberid_t)gd->g_nid; hdr.h_mid = (gs_memberid_t) gd->g_mid; hdr.h_gid = gd->g_id; hdr.h_viewnum = gd->g_curview; hdr.h_mseq = gd->g_recv.r_mseq; hdr.h_lseq = gd->g_send.s_mseq; msg_send(gd->g_mid, &hdr, (const char *) list, k); free((char *)list); } else { // add current sequence to dispatch
GspRsAddSequence(gd->g_rs, gd->g_recv.r_mseq, 0); }
// handle send path
for (j = 0; j < gd->g_send.s_wsz; j++) { gs_context_t *ctx = &gd->g_send.s_ctxpool[j]; if (ctx->ctx_id != GS_CONTEXT_INVALID_ID && ctx->ctx_msg != NULL){ recovery_log(("phase2 gid %d ctx %d mask %x\n", gd->g_id, ctx->ctx_id, ctx->ctx_mask)); if (set & ctx->ctx_mask) { int k, n;
recovery_log(("phase2 complete gid %d ctx %d\n", gd->g_id, ctx->ctx_id)); for (n = 0, k = set; k != 0; k = k >> 1, n++) { if (k & 0x1) { GspProcessReply(gd, ctx, n, NULL, 0, STATUS_HOST_UNREACHABLE); } } } } } // clear this node bit
gd->g_rs->rs_mset &= ~(1 << gd->g_nid); if (gd->g_rs->rs_mset == 0) { void GspComputeState(gs_group_t *gd);
GspComputeState(gd); } } GsLockExit(gd->g_lock); } }
void GspSyncState(gs_group_t *gd, gs_msg_t *msg, gs_sequence_t *list, int sz);
void GspComputeState(gs_group_t *gd) {
int k; gs_sequence_t *list; gs_rblk_t *p, *last = NULL; gs_msg_t *msg;
recovery_log(("Compute missing sequences gid %d\n", gd->g_id));
// compute missing sequences
list = (gs_sequence_t *) malloc(sizeof(*list) * gd->g_rs->rs_sz); if (list == NULL) { err_log(("Unable to allocate memory during computestate\n")); exit(1); } k = 0; for (p = gd->g_rs->rs_list; p != NULL; p = p->next) { recovery_log(("rs list sequence %d\n", p->mseq)); if (p->have == 0) { recovery_log(("Skip sequence %d\n", p->mseq)); list[k] = p->mseq; k++; } last = p; } // compute next starting mseq
gd->g_global_seq = last != NULL ? last->mseq+1 : gd->g_recv.r_mseq;
k = k * sizeof(*list);
msg = msg_alloc((char *)list, k); assert(msg != NULL);
msg->m_hdr.h_len = (UINT16) k; msg->m_hdr.h_type = GS_MSG_TYPE_SYNC; msg->m_hdr.h_flags = GS_FLAGS_LAST; msg->m_hdr.h_sid = (gs_memberid_t) gd->g_nid; msg->m_hdr.h_mid = (gs_memberid_t) gd->g_mid; msg->m_hdr.h_cid = (gs_cookie_t) -1; msg->m_hdr.h_gid = gd->g_id; msg->m_hdr.h_viewnum = gd->g_curview; msg->m_hdr.h_mseq = gd->g_global_seq++; msg->m_hdr.h_lseq = gd->g_send.s_lseq; msg->m_hdr.h_bnum = 0; *((ULONG *)msg->m_hdr.h_tag) = gd->g_rs->rs_dset;
// send missing sequence list to other nodes
msg_mcast(gd->g_mset, &msg->m_hdr, (const char *) list, k);
recovery_log(("Next starting sequence is %d\n", gd->g_global_seq));
// handle self
GspSyncState(gd, msg, list, k / sizeof(*list));
free((char *)list);
}
void GspRecoveryMsgHandler(gs_msg_t *rmsg)
{ gs_msg_hdr_t *hdr; gs_group_t *gd; hdr = &rmsg->m_hdr;
gd = GspLookupGroup(hdr->h_gid); // accept messages only if in a valid view
if (gd && rmsg->m_hdr.h_viewnum == gd->g_curview) { gs_sequence_t *list; int sz, k;
list = (gs_sequence_t *) rmsg->m_buf; sz = rmsg->m_hdr.h_len / sizeof(*list);
GsLockEnter(gd->g_lock);
// make sure group is in recovery mode
assert(gd->g_state & GS_GROUP_FLAGS_RECOVERY); assert(gd->g_mid == gd->g_nid);
// add current sequence to dispatch
GspRsAddSequence(gd->g_rs, hdr->h_mseq, 0); // insert sequences into have list
for (k = 0; k < sz; k++) { GspRsAddSequence(gd->g_rs, list[k], 1); }
// clear this node bit
gd->g_rs->rs_mset &= ~(1 << hdr->h_sid); if (gd->g_rs->rs_mset == 0) { GspComputeState(gd); }
GsLockExit(gd->g_lock); }
msg_free(rmsg);
}
void GspSyncState(gs_group_t *gd, gs_msg_t *msg, gs_sequence_t *list, int sz)
{ int k;
// make sure group is in recovery mode
assert(gd->g_state & GS_GROUP_FLAGS_RECOVERY); assert(gd->g_mid != gd->g_nid); assert(gd->g_mid == hdr->h_sid);
// mark missing sequences
for (k = 0; k < sz; k++) { gs_msg_t *p;
recovery_log(("Missing sequence %d\n", list[k]));
p = msg_alloc(NULL, 0); if (p == NULL) { err_log(("Unable to allocate memory during syncstate!\n")); halt(1); }
p->m_hdr.h_sid = gd->g_nid; p->m_hdr.h_gid = gd->g_id; p->m_hdr.h_cid = (gs_cookie_t) -1; p->m_hdr.h_type = GS_MSG_TYPE_SKIP; p->m_hdr.h_mseq = list[k]; p->m_hdr.h_lseq = gd->g_send.s_lseq; p->m_hdr.h_bnum = 0; p->m_hdr.h_flags = GS_FLAGS_LAST;
GspOrderInsert(gd, p, p, p->m_hdr.h_mseq, 0); }
// set startview to curview
gd->g_startview = gd->g_curview; // clear recovery state
gd->g_state &= ~GS_GROUP_FLAGS_RECOVERY; // free recovery state
GsEventSignal(gd->g_rs->rs_event); GspRsFree(gd->g_rs); gd->g_rs = NULL;
// insert msg into dispatch queue at proper order
GspOrderInsert(gd, msg, msg, msg->m_hdr.h_mseq, 0); GspDispatch(gd); #if 0
// xxx: need to understand this again
if (gd->g_recv.r_last != NULL) { GspCleanQueue(gd, last_mseq); } #endif
// restart any pending sends
if (gd->g_send.s_waitqueue != NULL && (gd->g_state & GS_GROUP_FLAGS_NEWMASTER)) { recovery_log(("resend: gs %x s %x\n", gd, gd->g_send.s_waitqueue)); GspAllocateSequence(gd); } gd->g_state &= ~GS_GROUP_FLAGS_NEWMASTER; }
void GspSyncMsgHandler(gs_msg_t *msg) { gs_msg_hdr_t *hdr; gs_group_t *gd; hdr = &msg->m_hdr;
gd = GspLookupGroup(hdr->h_gid); // accept messages only if in a valid view
if (gd && msg->m_hdr.h_viewnum == gd->g_curview) { gs_sequence_t *list; int sz;
list = (gs_sequence_t *) msg->m_buf; sz = msg->m_hdr.h_len / sizeof(*list);
GsLockEnter(gd->g_lock);
// clear this node bit
gd->g_rs->rs_mset &= ~(1 << hdr->h_sid); assert(gd->g_rs->rs_mset == 0);
GspSyncState(gd, msg, list, sz);
GsLockExit(gd->g_lock); } else { msg_free(msg); }
}
|