/*++ Copyright (c) 2000 Microsoft Corporation Module Name: recovery.c Abstract: Handles node down events Author: Ahmed Mohamed (ahmedm) 12, 01, 2000 Revision History: --*/ #include "gs.h" #include "gsp.h" #include extern gs_nid_t GsLocalNodeId; extern int GsMaxNodeId; extern int GsMinNodeId; extern gs_group_t GsGroupTable[]; // Node down event void GspRsFree(gs_recovery_state_t *rs) { // free recovery state gs_rblk_t *p; while (p = rs->rs_list) { rs->rs_list = p->next; free((char *)p); } GsEventFree(rs->rs_event); free((char *)rs); } void GspPhase1NodeDown(ULONG set) { gs_group_t *gd; int i, j; for (i = 0; i < GsGroupTableSize; i++) { gd = &GsGroupTable[i]; if (gd->g_state == GS_GROUP_STATE_FREE) { continue; } GsLockEnter(gd->g_lock); if (gd->g_mset & set) { gd->g_mset &= ~set; gd->g_curview++; gd->g_state |= GS_GROUP_FLAGS_RECOVERY; gd->g_sz = 0; for (j = gd->g_mset; j > 0; j = j >> 1) { if (j & 0x1) gd->g_sz++; } if (set & (1 << gd->g_mid)) { for (j = GsMinNodeId; j != GsMaxNodeId; j++) { if (gd->g_mset & (1 << j)) { break; } } // elect a new master gd->g_mid = (gs_memberid_t) j; gd->g_state |= GS_GROUP_FLAGS_NEWMASTER; } recovery_log(("Phase1 mask %x gid %d mid %d mset %x sz %d\n", set, gd->g_id, gd->g_mid, gd->g_mset, gd->g_sz)); if (gd->g_rs != NULL) { set |= gd->g_rs->rs_dset; GsEventSignal(gd->g_rs->rs_event); GspRsFree(gd->g_rs); } gd->g_rs = (gs_recovery_state_t *) malloc(sizeof(*gd->g_rs)); assert(gd->g_rs != NULL); GsManualEventInit(gd->g_rs->rs_event); gd->g_rs->rs_sz = 0; gd->g_rs->rs_list = NULL; gd->g_rs->rs_epoch = gd->g_curview; gd->g_rs->rs_dset = set; gd->g_rs->rs_mset = gd->g_mset; if (gd->g_mid != gd->g_nid) { // we are not master, reset our mset to self and master only gd->g_rs->rs_mset = (1 << gd->g_nid) | (1 << gd->g_mid); } } else if (gd->g_mset == 0 && (set & (1 << gd->g_mid))) { // no one is participating in this group and the sole owner dead // remove the group and free it GsCloseGroup(gd); } GsLockExit(gd->g_lock); } } void GspRsAddSequence(gs_recovery_state_t *rs, gs_sequence_t mseq, int delta) { gs_rblk_t *p, **q; for (p = rs->rs_list; p != NULL; p = p->next) { if (p->mseq == mseq) { p->have += delta; recovery_log(("Found seq %d cnt %d\n", mseq, p->have)); return; } } // if we get here that means the sequence is missing p = (gs_rblk_t *) malloc(sizeof(*p)); if (p == NULL) { err_log(("GspRsAddSeq: unable to allocate memory!\n")); exit(1); } p->mseq = mseq; p->have = delta; recovery_log(("Add seq %d cnt %d\n", mseq, p->have)); rs->rs_sz++; q = &rs->rs_list; while (*q != NULL) { if ((*q)->mseq > mseq) { p->next = *q; *q = p; return; } q = &(*q)->next; } p->next = *q; *q = p; } void GspPhase2NodeDown(ULONG set) { gs_group_t *gd; int i, j; for (i = 0; i < GsGroupTableSize; i++) { gd = &GsGroupTable[i]; if (!(gd->g_state & GS_GROUP_FLAGS_RECOVERY)) { continue; } GsLockEnter(gd->g_lock); if (gd->g_state & GS_GROUP_FLAGS_RECOVERY) { gs_msg_t *p; extern void GspDumpQueue(gs_group_t*); recovery_log(("Phase2 queue\n")); GspDumpQueue(gd); recovery_log(("Expect gid %d <%d, %d>\n", gd->g_id, gd->g_recv.r_mseq, gd->g_recv.r_bnum)); // walk recv queue and replay messages from dead members for (p = gd->g_recv.r_head; p != NULL; p = p->m_next) { if (set & (1 << p->m_hdr.h_sid)) { // tag message as if we got a reply p->m_hdr.h_flags |= GS_FLAGS_REPLY; if (p->m_hdr.h_type != GS_MSG_TYPE_UCAST){ p->m_hdr.h_flags |= GS_FLAGS_REPLAY; msg_mcast(gd->g_mset, &p->m_hdr, p->m_buf, p->m_hdr.h_len); } // check of unclosed continued sends if (p->m_hdr.h_flags & GS_FLAGS_CONTINUED) { gs_msg_t *q; q = p->m_next; if (q == NULL || q->m_hdr.h_mseq != p->m_hdr.h_mseq || q->m_hdr.h_bnum != p->m_hdr.h_bnum+1) { q = msg_alloc(NULL, 0); if (q == NULL) { err_log(("Unable to allocate memory!\n")); halt(1); } memcpy(&q->m_hdr, &p->m_hdr, sizeof(p->m_hdr)); q->m_hdr.h_type = GS_MSG_TYPE_ABORT; q->m_hdr.h_len = 0; q->m_hdr.h_bnum++; q->m_hdr.h_flags = GS_FLAGS_LAST; // insert abort msg q->m_next = p->m_next; p->m_next = q; } } } } // walk recv queue and build msg of sequences we have for (p = gd->g_recv.r_head; p != NULL; p = p->m_next) { if (p->m_hdr.h_mseq != GS_MSG_TYPE_UCAST) GspRsAddSequence(gd->g_rs, p->m_hdr.h_mseq, 1); } // send msg of sequences to master if (gd->g_mid != gd->g_nid) { gs_rblk_t *p; gs_sequence_t *list; int k; gs_msg_hdr_t hdr; recovery_log(("Sending sequence state to master %d\n", gd->g_mid)); list = (gs_sequence_t *) malloc(sizeof(*list) * gd->g_rs->rs_sz); if (list == NULL) { err_log(("Unable to allocate memory during recovery\n")); exit(1); } k = 0; for (p = gd->g_rs->rs_list; p != NULL; p = p->next) { list[k] = p->mseq; k++; } assert(k == gd->g_rs->rs_sz); k = k * sizeof(*list); hdr.h_len = (UINT16) k; hdr.h_type = GS_MSG_TYPE_RECOVERY; hdr.h_sid = (gs_memberid_t)gd->g_nid; hdr.h_mid = (gs_memberid_t) gd->g_mid; hdr.h_gid = gd->g_id; hdr.h_viewnum = gd->g_curview; hdr.h_mseq = gd->g_recv.r_mseq; hdr.h_lseq = gd->g_send.s_mseq; msg_send(gd->g_mid, &hdr, (const char *) list, k); free((char *)list); } else { // add current sequence to dispatch GspRsAddSequence(gd->g_rs, gd->g_recv.r_mseq, 0); } // handle send path for (j = 0; j < gd->g_send.s_wsz; j++) { gs_context_t *ctx = &gd->g_send.s_ctxpool[j]; if (ctx->ctx_id != GS_CONTEXT_INVALID_ID && ctx->ctx_msg != NULL){ recovery_log(("phase2 gid %d ctx %d mask %x\n", gd->g_id, ctx->ctx_id, ctx->ctx_mask)); if (set & ctx->ctx_mask) { int k, n; recovery_log(("phase2 complete gid %d ctx %d\n", gd->g_id, ctx->ctx_id)); for (n = 0, k = set; k != 0; k = k >> 1, n++) { if (k & 0x1) { GspProcessReply(gd, ctx, n, NULL, 0, STATUS_HOST_UNREACHABLE); } } } } } // clear this node bit gd->g_rs->rs_mset &= ~(1 << gd->g_nid); if (gd->g_rs->rs_mset == 0) { void GspComputeState(gs_group_t *gd); GspComputeState(gd); } } GsLockExit(gd->g_lock); } } void GspSyncState(gs_group_t *gd, gs_msg_t *msg, gs_sequence_t *list, int sz); void GspComputeState(gs_group_t *gd) { int k; gs_sequence_t *list; gs_rblk_t *p, *last = NULL; gs_msg_t *msg; recovery_log(("Compute missing sequences gid %d\n", gd->g_id)); // compute missing sequences list = (gs_sequence_t *) malloc(sizeof(*list) * gd->g_rs->rs_sz); if (list == NULL) { err_log(("Unable to allocate memory during computestate\n")); exit(1); } k = 0; for (p = gd->g_rs->rs_list; p != NULL; p = p->next) { recovery_log(("rs list sequence %d\n", p->mseq)); if (p->have == 0) { recovery_log(("Skip sequence %d\n", p->mseq)); list[k] = p->mseq; k++; } last = p; } // compute next starting mseq gd->g_global_seq = last != NULL ? last->mseq+1 : gd->g_recv.r_mseq; k = k * sizeof(*list); msg = msg_alloc((char *)list, k); assert(msg != NULL); msg->m_hdr.h_len = (UINT16) k; msg->m_hdr.h_type = GS_MSG_TYPE_SYNC; msg->m_hdr.h_flags = GS_FLAGS_LAST; msg->m_hdr.h_sid = (gs_memberid_t) gd->g_nid; msg->m_hdr.h_mid = (gs_memberid_t) gd->g_mid; msg->m_hdr.h_cid = (gs_cookie_t) -1; msg->m_hdr.h_gid = gd->g_id; msg->m_hdr.h_viewnum = gd->g_curview; msg->m_hdr.h_mseq = gd->g_global_seq++; msg->m_hdr.h_lseq = gd->g_send.s_lseq; msg->m_hdr.h_bnum = 0; *((ULONG *)msg->m_hdr.h_tag) = gd->g_rs->rs_dset; // send missing sequence list to other nodes msg_mcast(gd->g_mset, &msg->m_hdr, (const char *) list, k); recovery_log(("Next starting sequence is %d\n", gd->g_global_seq)); // handle self GspSyncState(gd, msg, list, k / sizeof(*list)); free((char *)list); } void GspRecoveryMsgHandler(gs_msg_t *rmsg) { gs_msg_hdr_t *hdr; gs_group_t *gd; hdr = &rmsg->m_hdr; gd = GspLookupGroup(hdr->h_gid); // accept messages only if in a valid view if (gd && rmsg->m_hdr.h_viewnum == gd->g_curview) { gs_sequence_t *list; int sz, k; list = (gs_sequence_t *) rmsg->m_buf; sz = rmsg->m_hdr.h_len / sizeof(*list); GsLockEnter(gd->g_lock); // make sure group is in recovery mode assert(gd->g_state & GS_GROUP_FLAGS_RECOVERY); assert(gd->g_mid == gd->g_nid); // add current sequence to dispatch GspRsAddSequence(gd->g_rs, hdr->h_mseq, 0); // insert sequences into have list for (k = 0; k < sz; k++) { GspRsAddSequence(gd->g_rs, list[k], 1); } // clear this node bit gd->g_rs->rs_mset &= ~(1 << hdr->h_sid); if (gd->g_rs->rs_mset == 0) { GspComputeState(gd); } GsLockExit(gd->g_lock); } msg_free(rmsg); } void GspSyncState(gs_group_t *gd, gs_msg_t *msg, gs_sequence_t *list, int sz) { int k; // make sure group is in recovery mode assert(gd->g_state & GS_GROUP_FLAGS_RECOVERY); assert(gd->g_mid != gd->g_nid); assert(gd->g_mid == hdr->h_sid); // mark missing sequences for (k = 0; k < sz; k++) { gs_msg_t *p; recovery_log(("Missing sequence %d\n", list[k])); p = msg_alloc(NULL, 0); if (p == NULL) { err_log(("Unable to allocate memory during syncstate!\n")); halt(1); } p->m_hdr.h_sid = gd->g_nid; p->m_hdr.h_gid = gd->g_id; p->m_hdr.h_cid = (gs_cookie_t) -1; p->m_hdr.h_type = GS_MSG_TYPE_SKIP; p->m_hdr.h_mseq = list[k]; p->m_hdr.h_lseq = gd->g_send.s_lseq; p->m_hdr.h_bnum = 0; p->m_hdr.h_flags = GS_FLAGS_LAST; GspOrderInsert(gd, p, p, p->m_hdr.h_mseq, 0); } // set startview to curview gd->g_startview = gd->g_curview; // clear recovery state gd->g_state &= ~GS_GROUP_FLAGS_RECOVERY; // free recovery state GsEventSignal(gd->g_rs->rs_event); GspRsFree(gd->g_rs); gd->g_rs = NULL; // insert msg into dispatch queue at proper order GspOrderInsert(gd, msg, msg, msg->m_hdr.h_mseq, 0); GspDispatch(gd); #if 0 // xxx: need to understand this again if (gd->g_recv.r_last != NULL) { GspCleanQueue(gd, last_mseq); } #endif // restart any pending sends if (gd->g_send.s_waitqueue != NULL && (gd->g_state & GS_GROUP_FLAGS_NEWMASTER)) { recovery_log(("resend: gs %x s %x\n", gd, gd->g_send.s_waitqueue)); GspAllocateSequence(gd); } gd->g_state &= ~GS_GROUP_FLAGS_NEWMASTER; } void GspSyncMsgHandler(gs_msg_t *msg) { gs_msg_hdr_t *hdr; gs_group_t *gd; hdr = &msg->m_hdr; gd = GspLookupGroup(hdr->h_gid); // accept messages only if in a valid view if (gd && msg->m_hdr.h_viewnum == gd->g_curview) { gs_sequence_t *list; int sz; list = (gs_sequence_t *) msg->m_buf; sz = msg->m_hdr.h_len / sizeof(*list); GsLockEnter(gd->g_lock); // clear this node bit gd->g_rs->rs_mset &= ~(1 << hdr->h_sid); assert(gd->g_rs->rs_mset == 0); GspSyncState(gd, msg, list, sz); GsLockExit(gd->g_lock); } else { msg_free(msg); } }