windows-server-2003/base/cluster/resdll/ndquorum/crs.c

/*++

Copyright (c) 2001  Microsoft Corporation

Module Name:

    crs.c

Abstract:

    Implements Consistency Replica Set Algorithm

Author:

    Ahmed Mohamed (ahmedm) 1-Jan-2001

Revision History:

--*/
#include <nt.h>
#include <ntdef.h>
#include <ntrtl.h>
#include <nturtl.h>

#include <windows.h>
#include <stdio.h>
#include <assert.h>

#define QFS_DBG
#include "crs.h"

#include "fsutil.h"

// Allocate 'size' bytes of committed read/write memory with VirtualAlloc.
// Callers in this file treat a NULL result as an out-of-memory condition.
#define xmalloc(size)  VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE)

// Release a region previously obtained through xmalloc.
#define xfree(buffer) VirtualFree(buffer, 0, MEM_RELEASE) 

// Non-zero when two records carry the same <seq, epoch, state> header triple.
// Each argument is expanded three times, so pass side-effect free expressions.
#define CrspEqual(r1,r2)        ((r1)->hdr.seq == (r2)->hdr.seq && \
                                 (r1)->hdr.epoch == (r2)->hdr.epoch && \
                                 (r1)->hdr.state == (r2)->hdr.state)

// Forced quorum size override; starts out as 0xffff and is only changed
// through CrsSetForcedQuorumSize in this file.
// NOTE(review): presumably consumed by the CRS_QUORUM macro in crs.h -- confirm.
DWORD CrsForcedQuorumSize = 0xffff;

// Store a new forced quorum size. No locking is performed around the update.
void
WINAPI
CrsSetForcedQuorumSize(DWORD size)
{
    CrsForcedQuorumSize = size;
}

VOID
CrsForceClose(CrsInfo_t *p)
/*
    Emergency-termination helper: unlocks the crs.log file and closes its
    handle without taking the replica's critical section. A NULL replica, or
    one whose handle is already invalid, is left untouched. Failures of the
    unlock/close calls are only logged; the handle field is invalidated
    regardless.
*/
{
    if (p == NULL) {
        CrsLog(("CrsForceClose: Exiting...\n"));
        return;
    }

    CrsLog(("CrsForceClose: fh 0x%x, nid %d\n", p->fh, p->lid));

    // nothing left to release
    if (p->fh == INVALID_HANDLE_VALUE) {
        return;
    }

    // drop the whole-file lock first, then the handle itself
    if (UnlockFile(p->fh, 0, 0, (DWORD)-1, (DWORD)-1) == FALSE) {
        CrsLog(("CrsForceClose: UnlockFile(0x%x) returns %d\n", p->fh, GetLastError()));
    }

    if (CloseHandle(p->fh) == FALSE) {
        CrsLog(("CrsForceClose: CloseHandle(0x%x) returns %d\n", p->fh, GetLastError()));
    }

    p->fh = INVALID_HANDLE_VALUE;
}

//
// CrspFindLast:
//      Load the first 'logsz' bytes of the log file into p->buf and locate
//      the newest record, i.e. the one with the highest <epoch, seq> pair.
//      On success p->last_record, p->seq and p->epoch describe that record.
//
//      Returns ERROR_INVALID_HANDLE when the file is not open, the Win32
//      error of a failed seek/read, ERROR_BAD_LENGTH on a short read,
//      ERROR_BAD_FORMAT when a record lacks CRS_TAG, and
//      ERROR_INTERNAL_ERROR when a record other than the newest one is
//      neither committed nor aborted.
//
DWORD
CrspFindLast(CrsInfo_t *p, DWORD logsz)
{
    CrsRecord_t *cur, *last_rec;
    DWORD nread, off;

    if (p->fh == INVALID_HANDLE_VALUE) {
        CrsLog(("CrspFindLast: Invalid file handle. Exiting...\n"));
        return ERROR_INVALID_HANDLE;
    }

    // rewind and pull the whole log into memory in one read
    if (SetFilePointer(p->fh, 0, NULL, FILE_BEGIN) == INVALID_SET_FILE_POINTER) {
        return GetLastError();
    }

    if (!ReadFile(p->fh, p->buf, logsz, &nread, NULL)) {
        return GetLastError();
    }

    if (nread != logsz) {
        CrsLog(("Crs%d: failed to load complete file, read %d expected %d\n",
                p->lid,
                nread, logsz));
        return ERROR_BAD_LENGTH;
    }

    // No check of max_records against the byte count is made here.

    CrsLog(("Crs%d: loaded %d bytes, %d records\n", p->lid,
           nread, p->max_records));

    // pass 1: validate tags and remember the record with the largest
    // <epoch, seq>; ties keep the earliest slot
    last_rec = NULL;
    for (off = 0, cur = p->buf; off < logsz; off += CRS_RECORD_SZ, cur++) {
        if (cur->hdr.tag != CRS_TAG) {
            CrsLog(("crs%d: Bad record %d, got %x expected %x\n",
                    p->lid,
                    off/CRS_RECORD_SZ, cur->hdr.tag, CRS_TAG));
            return ERROR_BAD_FORMAT;
        }

        if (last_rec == NULL) {
            last_rec = cur;
        } else if (cur->hdr.epoch > last_rec->hdr.epoch) {
            last_rec = cur;
        } else if (cur->hdr.epoch == last_rec->hdr.epoch &&
                   cur->hdr.seq > last_rec->hdr.seq) {
            last_rec = cur;
        }
    }
    ASSERT(last_rec);

    // pass 2: only the newest record may still be undecided
    for (off = 0, cur = p->buf; off < logsz; off += CRS_RECORD_SZ, cur++) {
        if ((cur->hdr.state & (CRS_COMMIT | CRS_ABORT)) == 0 && cur != last_rec) {
            CrsLog(("crs:%d Bad record %d state %x expected commit|abort\n",
                    p->lid, off/CRS_RECORD_SZ, cur->hdr.state));
            return ERROR_INTERNAL_ERROR;
        }
    }

    // publish the result
    p->last_record = (int) (last_rec - p->buf);
    p->seq = last_rec->hdr.seq;
    p->epoch = last_rec->hdr.epoch;

    return ERROR_SUCCESS;
}

// Write out the single sector that contains record 'offset'.
#define CrspFlush(p,offset)     CrspWrite(p,offset, CRS_SECTOR_SZ)

//
// CrspWrite:
//      Write 'length' bytes of the in-memory log to the file, starting at the
//      beginning of the sector that holds record index 'offset'. The caller
//      is expected to hold the replica lock.
//
//      p->pending is cleared up front and only set again when the write
//      fails with ERROR_UNEXP_NET_ERR, so that a later flush retries it.
//
//      Returns ERROR_SUCCESS, ERROR_INVALID_HANDLE when the file is closed,
//      ERROR_BAD_LENGTH on a short write, or the Win32 error of the failed
//      seek/write.
//
static
DWORD
CrspWrite(CrsInfo_t *p, int offset, DWORD length)
{
    DWORD       n;
    int         sector;

    if (p->fh == INVALID_HANDLE_VALUE) {
        CrsLog(("CrspWrite: Invalid file handle. Exiting...\n"));
        return ERROR_INVALID_HANDLE;
    }

    p->pending = FALSE;

    // keep the record index around for the trace, then round down to the
    // sector that contains it
    n = (DWORD) offset;
    ASSERT(offset < p->max_records);
    sector = offset / CRS_RECORDS_PER_SECTOR;

    CrsLog(("Crs%d: flush %d bytes record %d -> %d,%d\n", p->lid,
            length, n,
            sector, sector*CRS_SECTOR_SZ));

    n = SetFilePointer(p->fh, sector * CRS_SECTOR_SZ, NULL, FILE_BEGIN);
    if (n == INVALID_SET_FILE_POINTER) {
        return GetLastError();
    }

    n = 0;
    if (!WriteFile(p->fh, (PVOID) &p->buf[sector*CRS_RECORDS_PER_SECTOR], length, &n, NULL)) {
        n = GetLastError();
        CrsLog(("Crs%d: flush record %d failed err %d\n", p->lid, sector, n));
        if (n == ERROR_UNEXP_NET_ERR) {
            // repeat the write one more time
            p->pending = TRUE;
        }
        return n;
    }

    if (n != length) {
        CrsLog(("Write count mismatch, wrote %d, expected %d\n", n, length));
        return ERROR_BAD_LENGTH;
    }

    return ERROR_SUCCESS;
}

//
// CrspAppendRecord:
//      Tag 'rr', copy it into the slot after p->last_record (wrapping at
//      p->max_records) and flush the sector holding that slot. When the
//      current last record is the final one of its sector, that sector is
//      flushed first. The caller is expected to hold the replica lock.
//
//      On success the address of the in-memory copy is stored in *rec when
//      'rec' is not NULL. On a failed flush of the new record,
//      p->last_record is moved back to its previous value.
//
//      Returns ERROR_SUCCESS or the error reported by the flush.
//
static
DWORD
CrspAppendRecord(CrsInfo_t *p, CrsRecord_t *rr, CrsRecord_t **rec)
{
    CrsRecord_t *slot;
    DWORD status;

    // every record written to the log carries the tag
    rr->hdr.tag = CRS_TAG;

    // leaving a sector: push it out before moving on
    if ((p->last_record & CRS_SECTOR_MASK) == CRS_SECTOR_MASK) {
        status = CrspFlush(p, p->last_record);
        if (status != ERROR_SUCCESS) {
            return status;
        }
    }

    // step to the next slot, wrapping around at the end of the log
    p->last_record++;
    if (p->last_record == p->max_records) {
        p->last_record = 0;
    }

    CrsLog(("Crs%d: append record %d epoch %I64d seq %I64d state %x\n",
            p->lid, p->last_record,
            rr->hdr.epoch, rr->hdr.seq, rr->hdr.state));

    slot = &p->buf[p->last_record];
    memcpy((PVOID) slot, (PVOID) rr, CRS_RECORD_SZ);

    // make the new record durable right away
    status = CrspFlush(p, p->last_record);
    if (status != ERROR_SUCCESS) {
        // undo the advance so the log position is unchanged for the caller
        if (p->last_record == 0) {
            p->last_record = p->max_records;
        }
        p->last_record--;
        return status;
    }

    if (rec != NULL) {
        *rec = slot;
    }

    return status;
}

// CrspNextLogRecord:
//      Copy one record of info's log into lrec, under info->lock.
//
//      seq == NULL          : lrec receives the last record.
//      seq is the last one  : lrec receives it when this_flag is TRUE,
//                             otherwise ERROR_HANDLE_EOF is returned.
//      seq is found earlier : lrec receives that record when this_flag is
//                             TRUE, otherwise the record in the following
//                             slot (wrapping to slot 0).
//      seq is not in the log: ERROR_NOT_FOUND.
//
//      Records are matched on <epoch, seq>. A successful lookup that yields
//      a record with epoch 0 is reported as ERROR_HANDLE_EOF (empty log).
//      ERROR_INVALID_PARAMETER is returned when info or lrec is NULL.
//      seq and lrec may refer to the same buffer; seq is fully consulted
//      before lrec is written.
DWORD
CrspNextLogRecord(CrsInfo_t *info, CrsRecord_t *seq,
                  CrsRecord_t *lrec, BOOLEAN this_flag)
{
    CrsRecord_t *tail, *hit;
    DWORD status = ERROR_SUCCESS;
    int idx;

    if (info == NULL || lrec == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    EnterCriticalSection(&info->lock);
    tail = &info->buf[info->last_record];

    if (seq == NULL) {
        CrsLog(("Crs%d: last record %d %I64d %I64d\n",
                info->lid, info->last_record, tail->hdr.epoch, tail->hdr.seq));

        // caller just wants the last record
        memcpy(lrec, tail, CRS_RECORD_SZ);

    } else if (seq->hdr.epoch == tail->hdr.epoch &&
               seq->hdr.seq == tail->hdr.seq) {

        CrsLog(("Crs%d: reached last record %d %I64d %I64d, %I64d %I64d\n",
                info->lid, info->last_record,
                tail->hdr.epoch, tail->hdr.seq,
                seq->hdr.epoch, seq->hdr.seq));

        if (this_flag == TRUE) {
            // the requested record is the last one, hand it back
            memcpy(lrec, tail, CRS_RECORD_SZ);
        } else {
            // nothing follows the last record
            status = ERROR_HANDLE_EOF;
        }

    } else {

        CrsLog(("Crs%d: last record %d %I64d %I64d search %I64d %I64d\n",
                info->lid, info->last_record,
                tail->hdr.epoch, tail->hdr.seq,
                seq->hdr.epoch, seq->hdr.seq));

        // linear scan rather than indexing, because seq restarts whenever
        // the epoch advances
        hit = NULL;
        for (idx = 0; idx < info->max_records; idx++) {
            if (seq->hdr.epoch == info->buf[idx].hdr.epoch &&
                seq->hdr.seq == info->buf[idx].hdr.seq) {
                hit = &info->buf[idx];
                break;
            }
        }

        if (hit == NULL) {
            status = ERROR_NOT_FOUND;
        } else {
            if (this_flag == FALSE) {
                // caller wants the successor of the matching record
                idx++;
                if (idx >= info->max_records) {
                    idx = 0;
                }
                hit = &info->buf[idx];
            }
            CrsLog(("Crs%d: search found %d %I64d, %I64d\n", info->lid,
                   hit - info->buf, hit->hdr.epoch, hit->hdr.seq));
            memcpy(lrec, hit, CRS_RECORD_SZ);
        }
    }

    LeaveCriticalSection(&info->lock);

    // an epoch of zero marks an unused slot, i.e. the log is empty
    if (status == ERROR_SUCCESS && lrec->hdr.epoch == 0) {
        status = ERROR_HANDLE_EOF;
    }

    return status;
}

// CrspReplay:
//      Bring one replica (rr->info) in line with the master replica
//      (rr->minfo). 'rec' is a CrsRecoveryBlk_t carrying nid, mid, info and
//      minfo. In this file it is invoked from CrsStart.
//
//      The work is done in three stages:
//        1. undo:   while our last record is missing from, or in a different
//                   state than, the master log, undo it and step back.
//        2. replay: walk the master log forward from the matching record and
//                   replay or append each record.
//        3. copy:   if either stage gave up, ask the callback for a full
//                   copy and overwrite our log with the master's.
//
//      Returns STATUS_SUCCESS/ERROR_SUCCESS when the replica was caught up,
//      otherwise the failing status.
//
// Call into fs with <undo, replay, query, disable, enable, done>
//      undo: pass replica in recovery due to a conflict
//      replay: replica is missing change, if replay fails with abort, we
//              do a full copy; otherwise we issue a skip record
//      query: ask replica if record was completed or not
//      done: signal end of recovery and pass in new wset, rset
// we silently handle <abort(skip) and epoch records>
//      abort: add a skip record
//      epoch records: just log it as is
DWORD
CrspReplay(LPVOID rec)
{
    CrsRecoveryBlk_t *rr;
    CrsInfo_t *info, *minfo;
    CrsRecord_t *p, *q;
    CrsRecord_t lrec, mlrec;
    DWORD err;

    rr = (CrsRecoveryBlk_t *) rec;
    info = rr->info;
    minfo = rr->minfo;

    CrsLog(("CrsReplay%d mid %d, lid %d leader_id %d\n",
            rr->nid, rr->mid, info->lid, info->leader_id));

    // for now force a full copy. It seems sometimes I get into a bad state, when we
    // get the time, we can reenable this and find out exactly the corner cases that
    // cause us to be out of sync.
    // NOTE(review): with "#if 1" the incremental undo/replay path below IS
    // compiled in; the unconditional full copy is the "#else" branch. The
    // comment above looks stale -- confirm.
#if 1
    // Stage 1: undo our trailing records until the last one agrees with the
    // master. Leaves p == NULL when a full copy is required, otherwise p
    // points at the master's copy of our last record.
    do {
        p = NULL;
        // read last record
        err = CrspNextLogRecord(info, NULL, &lrec, FALSE);
        if (err != ERROR_SUCCESS) {
            CrsLog(("CrsReplay%d: unable to read last record %d\n",
                    info->lid, err));
            break;
        }

        // find our last record in master replica
        q = &lrec;
        p = &mlrec;
        err = CrspNextLogRecord(minfo, q, p, TRUE);
        // if found and consistent with master, no undo
        if (err == ERROR_SUCCESS && p->hdr.state == q->hdr.state) {
            CrsLog(("CrsReplay%d: last record %I64d, %I64d consistent %x %x\n",
                    info->lid, q->hdr.epoch, q->hdr.seq,
                    p->hdr.state, q->hdr.state));
            break;
        }

        if (err != ERROR_SUCCESS) {
            CrsLog(("CrsReplay%d: missing lrec %I64d, %I64d in disk %d, err %d\n",
                    info->lid, q->hdr.epoch, q->hdr.seq, minfo->lid, err));
        } else {
            CrsLog(("CrsReplay%d: undo last record %I64d, %I64d %x needs %x\n",
                    info->lid, q->hdr.epoch, q->hdr.seq,
                    q->hdr.state, p->hdr.state));
            ASSERT(p->hdr.state & (CRS_COMMIT|CRS_ABORT));
        }

        // last record is in conflict, we must undo it first
        if (!(q->hdr.state & CRS_EPOCH)) {
            // if we found this record in master and a conflict is detected,
            // we undo it. Otherwise, we need to do a full copy
            // (err is left at its failure value in that case)
            if (err == ERROR_SUCCESS) {
                ASSERT(p->hdr.state & (CRS_COMMIT|CRS_ABORT));
                ASSERT(q->hdr.state & CRS_PREPARE);
                err = info->callback(info->callback_arg,
                                     rr->nid, q,
                                     CRS_ACTION_UNDO, rr->mid);
            }
        } else {
            // A missing epoch record doesn't mean we are old. A regroup
            // could have happened but no new data records got added. We
            // undo it, and continue;
            err = STATUS_SUCCESS;
        }

        if (err == STATUS_SUCCESS) {
            // update current record, sequence, epoch
            // (the slot is cleared in memory only; nothing is flushed here)
            info->buf[info->last_record].hdr.state = 0;
            info->buf[info->last_record].hdr.epoch = 0;
            info->buf[info->last_record].hdr.seq = 0;
            if (info->last_record == 0) {
                info->last_record = info->max_records;
            }
            info->last_record--;
            info->seq = info->buf[info->last_record].hdr.seq;
            info->epoch = info->buf[info->last_record].hdr.epoch;
            CrsLog(("CrsReplay%d: new last record %d %I64d, %I64d\n",
                    info->lid, info->last_record, info->epoch, info->seq));
        } else {
            // can't undo it, do full copy and readjust our log
            CrsLog(("CrsReplay%d: Unable to undo record %I64d, %I64d\n",
                    info->lid, q->hdr.epoch, q->hdr.seq));
            p = NULL;
        }
    } while (err == STATUS_SUCCESS && info->state == CRS_STATE_RECOVERY);

                   
    // Stage 2: walk the master log forward. p and &mlrec are the same buffer
    // after the first iteration, so each lookup uses the previously fetched
    // master record as the search key.
    while (p != NULL && info->state == CRS_STATE_RECOVERY) {
        // read master copy
        err = CrspNextLogRecord(minfo, p, &mlrec, FALSE);
        if (err != ERROR_SUCCESS) {
            if (err == ERROR_HANDLE_EOF) {
                CrsLog(("CrsReplay%d: last record %I64d, %I64d in disk %d\n",
                        info->lid, q->hdr.epoch, q->hdr.seq, minfo->lid));

                // the last record is where we are at
                info->seq = info->buf[info->last_record].hdr.seq;
                info->epoch = info->buf[info->last_record].hdr.epoch;

                // This would be performed later in CrsStart().
#if 0
                // we reached the end, signal end of recovery
                err = info->callback(info->callback_arg,
                               rr->nid, p,
                               CRS_ACTION_DONE, rr->mid);

#else
                err = STATUS_SUCCESS;
#endif

                goto exit;
            }
            // any other lookup failure falls through to the full copy below
            break;
        }

        p = &mlrec;
        if ((p->hdr.state & CRS_EPOCH) || (p->hdr.state & CRS_ABORT)) {
            CrsLog(("CrsReplay%d: skip record %I64d, %I64d %x\n",
                    info->lid, p->hdr.epoch, p->hdr.seq, p->hdr.state));
            // non-zero marker: nothing to replay, but the record still has
            // to be appended to our log below
            err = !STATUS_SUCCESS;
        } else if (p->hdr.state & CRS_COMMIT) {
            err = info->callback(info->callback_arg,
                                 rr->nid, p,
                                 CRS_ACTION_REPLAY, rr->mid);
            if (err == STATUS_TRANSACTION_ABORTED) {
                CrsLog(("CrsReplay: failed nid %d seq %I64d err %x\n",
                        rr->nid, p->hdr.seq, err));
                break;
            }
        } else {
            ASSERT(p->hdr.state & CRS_PREPARE);
            // what if the record is prepared but not yet committed or
            // aborted; in transit record. 
            // stop now
            CrsLog(("CrsReplay%d: bad record seq %I64d state %x\n",
                    rr->nid, p->hdr.seq, p->hdr.state));
            break;
        }
        if (err != STATUS_SUCCESS) {
            // add record
            err = CrspAppendRecord(info, p, NULL);
            if (err != ERROR_SUCCESS) {
                CrsLog(("CrsReplay%d: failed append seq %I64d err %x\n",
                        rr->nid, p->hdr.seq, err));
                break;
            }
            if (p->hdr.state & CRS_EPOCH) {
                ; //ASSERT(info->epoch+1 == p->hdr.epoch);
            } else {
                ASSERT(info->epoch == p->hdr.epoch);
                ASSERT(info->seq+1 == p->hdr.seq);
            }
            info->seq = p->hdr.seq;
            info->epoch = p->hdr.epoch;
        } else if (info->seq == p->hdr.seq) {
            // NOTE(review): presumably a successful REPLAY callback logged
            // the record itself (info->seq already matches) -- confirm
            // against the callback implementation.
            // make sure we have added it
            ASSERT(info->seq == p->hdr.seq);
            ASSERT(info->epoch == p->hdr.epoch);
            ASSERT(info->buf[info->last_record].hdr.seq == p->hdr.seq);
            ASSERT(info->buf[info->last_record].hdr.epoch == p->hdr.epoch);

            // Propagate dubious bit
            if (p->hdr.state & CRS_DUBIOUS) {
                info->buf[info->last_record].hdr.state |= CRS_DUBIOUS;
            }
            ASSERT(info->buf[info->last_record].hdr.state == p->hdr.state);
        } else {
            // force a full copy
            err = !STATUS_SUCCESS;
            break;
        }
    }
#else
    p = NULL;
#endif
    // Stage 3: full copy fallback, taken when stage 1 gave up (p == NULL) or
    // either stage left a failure status behind.
    if (p == NULL || err != STATUS_SUCCESS) {
        CrsLog(("CrsReplay%d: Full copy from disk %d\n",
                info->lid, minfo->lid));
        // we are out of date or need full recovery, do a full copy
        err = info->callback(info->callback_arg,
                             rr->nid, NULL,
                             CRS_ACTION_COPY, rr->mid);

        if (err == STATUS_SUCCESS) {
            DWORD len;

            // we now copy our master log and flush it
            ASSERT(minfo->max_records == info->max_records);

            len = info->max_records * CRS_RECORD_SZ;
            memcpy(info->buf, minfo->buf, len);
            // offset 0 with the full length rewrites the whole log file
            err = CrspWrite(info, 0, len);
            if (err == ERROR_SUCCESS) {
                // adjust our state
                info->last_record = minfo->last_record;
                info->seq = info->buf[info->last_record].hdr.seq;
                info->epoch = info->buf[info->last_record].hdr.epoch;

                // The action below would be performed later in CrsStart().
#if 0
                // we reached the end, signal end of recovery
                err = info->callback(info->callback_arg,
                               rr->nid, p,
                               CRS_ACTION_DONE, rr->mid);
#endif

            }
        }
    }

 exit:

    CrsLog(("CrsReplay%d mid %d status 0x%x\n", rr->nid, rr->mid, err));

    return err;
}


/////////////////////// Public Functions //////////////////////
//
// CrsOpen:
//      Open (or create) the replica log file 'log_name', take an exclusive
//      lock on it, load or initialize the in-memory copy of the log and
//      return a new replica handle through *outhdl.
//
//      callback / callback_arg : stored in the handle for later use
//      lid                     : local id stored in the handle
//      max_logsectors          : log size in sectors, used when the file is
//                                newly created
//      outhdl                  : receives the CrsInfo_t on success and is
//                                set to NULL otherwise
//
//      Returns ERROR_SUCCESS, ERROR_INVALID_PARAMETER when outhdl is NULL,
//      ERROR_NOT_ENOUGH_MEMORY, the status of xFsCreate, the LockFile error,
//      or the error of the initial log write.
//
DWORD
WINAPI
CrsOpen(crs_callback_t callback, PVOID callback_arg, USHORT lid,
        WCHAR *log_name, int max_logsectors, HANDLE *outhdl)
{

    // Open the log file
    // If the file is newly created, set the proper size
    // If the file size is not the same size, we need to either
    // expand or truncate the file. (truncate needs copy)
    // Scan file to locate last sector and record
    // If last record hasn't been committed, issue a query.
    // If query succeeded then, mark it as committed.
    // Set epoch,seq
    // NOTE(review): the query/commit steps listed above are not performed in
    // this function; CrsStart issues CRS_ACTION_QUERY.
    DWORD status;
    // NOTE(review): maph is never referenced in this function.
    HANDLE maph;
    CrsInfo_t   *p;
    int logsz;
    ULONG disp=FILE_OPEN_IF;

    if (outhdl == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    *outhdl = NULL;

    p = (CrsInfo_t *) malloc(sizeof(*p));
    if (p == NULL) {
        return ERROR_NOT_ENOUGH_MEMORY;
    }
    memset((PVOID) p, 0, sizeof(*p));

    // CrsLog(("Crs%d file '%S'\n", lid, log_name));
    p->lid = lid;
    p->callback = callback;
    p->callback_arg = callback_arg;
    p->pending = FALSE;

#if 0
    // Create log file, and set size of newly created
    p->fh = CreateFileW(log_name,
                     GENERIC_READ | GENERIC_WRITE,
                     FILE_SHARE_READ|FILE_SHARE_WRITE,
                     NULL,
                     OPEN_ALWAYS,
                     FILE_FLAG_WRITE_THROUGH,
                     NULL);
#else
    p->fh = INVALID_HANDLE_VALUE;
    status = xFsCreate(&p->fh, 
                    NULL, 
                    log_name, 
                    wcslen(log_name), 
                    FILE_WRITE_THROUGH|FILE_SYNCHRONOUS_IO_ALERT,
                    0,
                    FILE_SHARE_READ|FILE_SHARE_WRITE,
                    &disp,
                    GENERIC_READ | GENERIC_WRITE | FILE_WRITE_EA,
                    NULL,
                    0
                    );

    // Translate "opened an existing file" into ERROR_ALREADY_EXISTS; the
    // code below keys off that value to decide between load and initialize.
    if ((status == STATUS_SUCCESS)&&(disp == FILE_OPENED)) {
        status = ERROR_ALREADY_EXISTS;
    }

#endif

    // status = GetLastError();
    if(p->fh == INVALID_HANDLE_VALUE){
        free((char *) p);
        return status;
    }

    // acquire an exclusive lock on the whole file
    if (!LockFile(p->fh, 0, 0, (DWORD)-1, (DWORD)-1)) {
        FILE_FULL_EA_INFORMATION ea[2] = {0};
        IO_STATUS_BLOCK ios;
        NTSTATUS err;

        // get status
        status = GetLastError();

        // change the ea to cause a notification to happen
        // NOTE(review): presumably this notifies the current lock holder --
        // confirm who listens for the EA change.
        ea[0].NextEntryOffset = 0;
        ea[0].Flags = 0;
        ea[0].EaNameLength = 1;
        ea[0].EaValueLength = 1;
        ea[0].EaName[0] = 'X';
        // Increment size by 1, due to value.
        err = NtSetEaFile(p->fh, &ios, (PVOID) ea, sizeof(ea));
        CrsLog(("Crs%d Setting EA err=0x%x status=0x%x\n", lid, err, status));

        goto error;
    }

    if (status == ERROR_ALREADY_EXISTS) {
        // todo: compare current file size to new size and adjust file
        // size accordingly. For now, just use old size
        logsz = GetFileSize(p->fh, NULL);
        CrsLog(("Crs%d: (Open) Filesz %d max_sec %d\n", lid, logsz, max_logsectors));
        ASSERT(logsz == max_logsectors * CRS_SECTOR_SZ);
    } else {
        //extend the file pointer to max size 
        // NOTE(review): the results of SetFilePointer/SetEndOfFile are not
        // checked here.
        logsz = max_logsectors * CRS_SECTOR_SZ;
        SetFilePointer(p->fh, logsz, NULL, FILE_BEGIN);
        SetEndOfFile(p->fh);
        CrsLog(("Crs%d: (Create) Set Filesz %d max_sec %d\n", lid, logsz, max_logsectors));
    }

    // allocate file copy in memory
    p->buf = xmalloc(logsz);
    if (p->buf == NULL) {
        status = ERROR_NOT_ENOUGH_MEMORY;
        goto error;
    }
    
    // set max record
    p->max_records = logsz / CRS_RECORD_SZ;

    if (status == ERROR_ALREADY_EXISTS) {
        // load file and compute last epoch/seq
        status = CrspFindLast(p, logsz);
    } else {
        // any non-success value forces the initialization path below
        status = !ERROR_SUCCESS;
    }
    // init the file, when we detect a read failure or first time
    // (an existing log that fails validation is overwritten as well)
    if (status != ERROR_SUCCESS) {
        CrsRecord_t *r;
        int i;

        // initialize file
        p->seq = 0;
        p->epoch = 0;
        p->last_record = 0;

        // every slot starts as a committed epoch record with epoch 0, seq 0
        r = p->buf;
        for (i = 0; i < logsz; i+= CRS_RECORD_SZ, r++) {
            r->hdr.epoch = p->epoch;
            r->hdr.seq = p->seq;
            r->hdr.tag = CRS_TAG;
            r->hdr.state = CRS_COMMIT | CRS_PREPARE | CRS_EPOCH;
        }
        status = CrspWrite(p, 0, logsz);
    }

    if (status != ERROR_SUCCESS) {
        goto error;
    }

    CrsLog(("Crs%d: %x Last record %d max %d epoch %I64d seq %I64d\n", p->lid,
            p->fh,
            p->last_record, p->max_records, p->epoch, p->seq));

    // initialize rest of state
    p->state = CRS_STATE_INIT;
    p->refcnt = 1;
    p->leader_id = 0;
    InitializeCriticalSection(&p->lock);

    *outhdl = p;

    return ERROR_SUCCESS;

 error:
    // common failure exit: release the file, the log buffer and the handle
    CloseHandle(p->fh);
    if (p->buf) {
        xfree(p->buf);
    }
    free((PVOID) p);
    return status;
}

//
//
// CrsStart:
//      Bring the alive replicas to a consistent state, elect a master and
//      compute the resulting read, write and evict sets.
//
//      hdls       : array of replica handles (CrsInfo_t *), indexed by
//                   replica id; NULL entries are skipped
//      alive_set  : bitmask, bit i set means hdls[i] takes part
//      cluster_sz : total number of replicas, used for the quorum test
//      write_set  : optional; replicas that wrote the new epoch record
//      read_set   : optional; replicas consistent with the master
//      evict_set  : optional; replicas that failed recovery, the DONE
//                   callback or the epoch write
//
//      Steps: lock every alive handle, pick the replica with the newest
//      last record as master, resolve an undecided master record through
//      CRS_ACTION_QUERY, recover replicas that differ from the master,
//      issue CRS_ACTION_DONE on the consistent ones and, when they form a
//      quorum, append a new epoch record to each of them.
//
//      Returns ERROR_SUCCESS when a write quorum was established and
//      ERROR_WRITE_PROTECT otherwise (no alive replica, no usable handle,
//      or no quorum).
//
DWORD
WINAPI
CrsStart(PVOID *hdls, ULONG alive_set, int cluster_sz,
         ULONG *write_set, ULONG *read_set, ULONG *evict_set)

{
    DWORD status;
    CrsInfo_t **info = (CrsInfo_t **) hdls;
    int i, active_sz, mid;
    ULONG mask, active_set, fail_set;
    CrsInfo_t *p;
    CrsRecord_t *q, *mlrec;

    if (write_set) *write_set = 0;
    if (read_set) *read_set = 0;
    if (evict_set) *evict_set = 0;

    // no alive node
    if (cluster_sz == 0 || alive_set == 0) {
        // nothing to do
        return ERROR_WRITE_PROTECT;
    }


    // scan each hdl and make sure it is initialized and lock all hdls
    mask = alive_set;
    for (i = 0; mask != 0; i++, mask = mask >> 1) {
        if (!(mask & 0x1)) {
            continue;
        }

        p = info[i];
        if (p == NULL) {
            continue;
        }

        EnterCriticalSection(&p->lock);

        // check the state of the last record
        q = &p->buf[p->last_record];
        CrsLog(("Crs%d last record %d epoch %I64d seq %I64d state %x\n",
                p->lid, p->last_record,
                q->hdr.epoch, q->hdr.seq, q->hdr.state));
    }

    mid = 0;
    mlrec = NULL;
    // select master replica: highest <epoch, seq>; on a tie with differing
    // state a committed record wins
    for (i = 0, mask = alive_set; mask != 0; i++, mask = mask >> 1) {
        if (!(mask & 0x1)) {
            continue;
        }
        p = info[i];
        if (p == NULL)
            continue;

        q = &p->buf[p->last_record];
        if (!mlrec || 
            mlrec->hdr.epoch < q->hdr.epoch || 
            (mlrec->hdr.epoch == q->hdr.epoch && mlrec->hdr.seq < q->hdr.seq) ||
            (mlrec->hdr.epoch == q->hdr.epoch && mlrec->hdr.seq == q->hdr.seq &&
             mlrec->hdr.state != q->hdr.state && (q->hdr.state & CRS_COMMIT))) {

            mid = i;
            mlrec = q;
        }
    }

    // Every alive slot held a NULL handle: no master can be elected. No
    // lock was taken above (only non-NULL handles are locked), so it is
    // safe to return right away instead of dereferencing info[0] and mlrec.
    if (mlrec == NULL) {
        CrsLog(("CrsStart: no usable replica in alive set 0x%x\n", alive_set));
        return ERROR_WRITE_PROTECT;
    }

    ASSERT(mid != 0);

    // if master last record is in doubt, query filesystem. If the filesystem
    // is certain that the operation has occured, it returns STATUS_SUCCESS for
    //  COMMIT, STATUS_CANCELLED for ABORT, and STATUS_NOT_FOUND for can't tell.
    // All undetermined IO must be undone and redone in all non-master replicas
    // to ensure all replicas reach consistency. This statement is true even
    // for replicas that are currently absent from our set. We tag such records
    // we both COMMIT and ABORT, so that the replay thread issues replay for
    // new records and undo,replay for last records
    p = info[mid];
    p->leader_id = (USHORT) mid;
    ASSERT(mlrec != NULL);
    if (!(mlrec->hdr.state & (CRS_COMMIT | CRS_ABORT))) {
        ASSERT(mlrec->hdr.state & CRS_PREPARE);
        status = p->callback(p->callback_arg, p->lid,
                             mlrec, CRS_ACTION_QUERY,
                             p->lid);

        if (status == STATUS_SUCCESS) {
            mlrec->hdr.state |= CRS_COMMIT;
        } else if (status == STATUS_CANCELLED) {
            mlrec->hdr.state |= CRS_ABORT;
        } else if (status == STATUS_NOT_FOUND) {
            // assume it is committed, but mark it for undo during recovery
            mlrec->hdr.state |= (CRS_COMMIT | CRS_DUBIOUS);
        }

        // todo: if status == TRANSACTION_ABORTED, we need to bail out since
        // must master is dead
        // no need to flush, I think!
//      CrspFlush(p, p->last_record);

        // todo: what if the flush fails here, I am assuming that
        // an append will equally fail.
    }


    ASSERT(mlrec->hdr.state & (CRS_COMMIT | CRS_ABORT));

    // compute sync and recovery masks
    fail_set = 0;
    active_set = 0;
    active_sz = 0;
    for (i = 0, mask = alive_set; mask != 0; i++, mask = mask >> 1) {
        if (!(mask & 0x1)) {
            continue;
        }

        p = info[i];
        if (p == NULL) {
            continue;
        }

        // set leader id
        p->leader_id = (USHORT) mid;
        q = &p->buf[p->last_record];
            
        if (CrspEqual(mlrec, q)) {
            ASSERT(q->hdr.state & (CRS_COMMIT | CRS_ABORT));
            p->state = CRS_STATE_READ;
            active_set |= (1 << i);
            active_sz++;
        } else if (p->state != CRS_STATE_RECOVERY) {
            CrsRecoveryBlk_t rrbuf;
            CrsRecoveryBlk_t *rr = &rrbuf;

            // recover replica
            rr->nid = i;
            rr->mid = mid;
            rr->info = p;
            rr->minfo = info[mid];

            // set recovery state
            p->state = CRS_STATE_RECOVERY;

            status = CrspReplay((LPVOID) rr);

            // if we fail, evict this replica
            if (status != ERROR_SUCCESS) {
                fail_set |= (1 << i);
            } else {
                // repeat this replica again: undo the loop step so the same
                // index is compared against the master once more. Its state
                // is RECOVERY now, so it cannot be replayed a second time.
                i--;
                mask = mask << 1;
            }
        }
    }

    // Now recreate the open file state. This needs to be done for all replicas.
    // Removed this operation from CrspReplay() since now it needs to be performed on
    // all replicas, even master.
    //
    for (i=0, mask=active_set; mask != 0;i++, mask = mask >>1) {
        if (!(mask & 0x1)) {
            continue;
        }

        status = info[i]->callback(info[i]->callback_arg, i, NULL, CRS_ACTION_DONE, mid);

        if (status != STATUS_SUCCESS) {
            active_set &= (~(1<<i));
            active_sz--;
            fail_set |= (1<<i);
        }
    }

    // assume success
    status = ERROR_SUCCESS;

    // set read sets
    if (read_set) *read_set = active_set;

    if (!CRS_QUORUM(active_sz, cluster_sz)) {
        CrsLog(("No quorum active %d cluster %d\n", active_sz, cluster_sz));
        mid = 0;
        status = ERROR_WRITE_PROTECT;
    } else {
        int pass_cnt = 0;
        ULONG pass_set = 0;

        // Enable writes on all active replicas
        for (i = 0, mask = active_set; mask != 0; i++, mask = mask >> 1) {
            CrsRecord_t rec;
            if (!(mask & 0x1)) {
                continue;
            }
            p = info[i];
            if (p == NULL)
                continue;

            p->state = CRS_STATE_WRITE;

            // we now generate a new epoch and flush it to the disk
            p->epoch++;
            if (p->epoch == 0)
                p->epoch = 1;
            // reset seq to zero
            p->seq = 0;

            // write new epoch now, if not a majority replicas succeeded in writing
            // the new <epoch, seq> we fail
            rec.hdr.epoch = p->epoch;
            rec.hdr.seq = p->seq;
            rec.hdr.state = CRS_PREPARE | CRS_COMMIT | CRS_EPOCH;
            memset(rec.data, 0, sizeof(rec.data));
            if (CrspAppendRecord(p, &rec, NULL) == ERROR_SUCCESS) {
                pass_cnt++;
                pass_set |= (1 << i);
            } else {
                fail_set |= (1 << i);
            }
        }

        // Recheck to make sure all replicas have advanced epoch
        if (!CRS_QUORUM(pass_cnt, cluster_sz)) {
            CrsLog(("No quorum due to error pass %d cluster %d\n", pass_cnt, cluster_sz));
            mid = 0;
            pass_set = 0;
            pass_cnt = 0;
            status = ERROR_WRITE_PROTECT;
        }

        if (pass_cnt != active_sz) {
            // some replicas have died: every replica that was switched to
            // WRITE above (the members of active_set) but is not part of
            // the final write set goes back to READ. Walking active_set
            // rather than pass_set matters because pass_set is empty when
            // the write quorum was lost, and in that case all of them must
            // be demoted.
            for (i = 0, mask = active_set; mask != 0; i++, mask = mask >> 1) {
                if ((mask & 0x1) && !(pass_set & (1 << i))) {
                    p = info[i];
                    ASSERT(p != NULL);
                    p->state = CRS_STATE_READ;
                }
            }
        }
        // set write set
        if (write_set) *write_set = pass_set;
    }

    if (evict_set) *evict_set = fail_set;

    // unlock all hdls and set new master if any
    for (i = 0, mask = alive_set; mask != 0; i++, mask = mask >> 1) {
        if (!(mask & 0x1)) {
            continue;
        }
        p = info[i];
        if (p == NULL)
            continue;

        p->leader_id = (USHORT) mid;
        LeaveCriticalSection(&p->lock);
    }

    return status;
}


//
// CrsClose:
//      Flush the sector holding the last record, release the file lock,
//      close the log file and free the replica handle. A NULL handle is
//      logged and ignored. The handle must not be used after this call.
//
void
WINAPI
CrsClose(PVOID hd)
{
    DWORD err=ERROR_SUCCESS;
    CrsInfo_t *info = (CrsInfo_t *) hd;

    // If we any recovery threads running, make sure we terminate them first
    // before close and free all of this stuff
    if (info == NULL) {
        CrsLog(("CrsClose: try to close a null handle!\n"));
        return;
    }

    // Flush everything out and close the file
    EnterCriticalSection(&info->lock);
    // flush; a failure here is not reported to the caller (void function)
    CrspFlush(info, info->last_record);
    LeaveCriticalSection(&info->lock);

    DeleteCriticalSection(&info->lock);

    if (info->fh != INVALID_HANDLE_VALUE) {
        UnlockFile(info->fh, 0, 0, (DWORD)-1, (DWORD)-1);
        // CloseHandle returns a BOOL; turn it into a Win32 error code so the
        // trace below shows 0 on success and the failure reason otherwise
        if (!CloseHandle(info->fh)) {
            err = GetLastError();
        }
    }

    // log before the handle value is invalidated, with the arguments in the
    // order the format string expects: lid, handle, status
    CrsLog(("Crs%d: %x Closed %d\n", info->lid, info->fh, err));

    info->fh = INVALID_HANDLE_VALUE;

    xfree(info->buf);
    free((char *) info);
}

//
// CrsFlush:
//      Under the replica lock, write out the sector holding the last record
//      when a commit or abort is still marked pending. The result of the
//      write is not reported.
//
void
WINAPI
CrsFlush(PVOID hd)
{
    CrsInfo_t *replica = (CrsInfo_t *) hd;

    EnterCriticalSection(&replica->lock);

    // only touch the disk when an unflushed commit/abort is outstanding
    if (replica->pending == TRUE) {
        (void) CrspFlush(replica, replica->last_record);
    }

    LeaveCriticalSection(&replica->lock);
}

//
// CrsPrepareRecord:
//      Stamp 'lrec' with the next <epoch, seq>, mark it CRS_PREPARE and
//      append it to the replica log.
//
//      hd     : replica handle (CrsInfo_t *)
//      lrec   : caller's record; its header is filled in here
//      id     : optional explicit <seq, epoch>; used when non-NULL and
//               id[0] != 0, and then also accepted in RECOVERY state
//      retVal : receives STATUS_MEDIA_WRITE_PROTECTED when the replica is
//               not writable, otherwise the result of the append
//
//      Returns the in-memory log record on success. In that case the
//      replica lock is STILL HELD on return and is released by
//      CrsCommitOrAbort. Returns NULL, with the lock released, on failure.
//
PVOID
WINAPI
CrsPrepareRecord(PVOID hd, PVOID lrec, crs_id_t id, ULONG *retVal)
{
    CrsRecord_t *p = (CrsRecord_t *)lrec;
    CrsInfo_t *info = (CrsInfo_t *) hd;
    DWORD err;

    // move to correct slot in this sector. If we need a new sector,
    // read it from the file. Make sure we flush any pending commits on
    // current sector before we over write our in memory sector buffer.

    // prepare record, if seq none 0 then we are skipping the next sequence

    *retVal = STATUS_MEDIA_WRITE_PROTECTED;
    EnterCriticalSection(&info->lock);

    if (info->state == CRS_STATE_WRITE ||
        (info->state == CRS_STATE_RECOVERY && id != NULL && id[0] != 0)) {

        if (id != NULL && id[0] != 0) {
            // caller dictates the id; it must be the next sequence number
            CrsHdr_t *tmp = (CrsHdr_t *) id;
            assert(id[0] == info->seq+1);
            p->hdr.seq = tmp->seq;
            p->hdr.epoch = tmp->epoch;
        } else {
            p->hdr.seq = info->seq+1;
            p->hdr.epoch = info->epoch;
        }
        p->hdr.state = CRS_PREPARE;
        // on success p is redirected to the copy inside the log buffer; on
        // failure it still points at the caller's record
        err = CrspAppendRecord(info, p, &p);
        *retVal = err;
        if (err == ERROR_SUCCESS) {
            // we return with the lock held, gets release on commitorabort
            CrsLog(("Crs%d prepare %x seq %I64d\n",info->lid, p, p->hdr.seq));
            return p;
        }
        // "%I64d" is the 64-bit integer specifier used throughout this file
        CrsLog(("Crs%d: Append failed seq %I64d\n", info->lid, p->hdr.seq));
    } else {
        CrsLog(("Crs%d: Prepare bad state %d id %x\n", info->lid, info->state, id));
    }

    LeaveCriticalSection(&info->lock);
    return NULL;
}

//
// CrsCommitOrAbort:
//      Finish a record handed out by CrsPrepareRecord: mark it committed
//      (and advance info->seq) when 'commit' is TRUE, otherwise mark it
//      aborted and step info->last_record back by one slot. The record is
//      flagged as pending for a later flush and the replica lock, held
//      since CrsPrepareRecord, is released.
//
//      Returns ERROR_SUCCESS, or ERROR_INVALID_PARAMETER for a NULL
//      argument or a record whose seq is not info->seq+1.
//      NOTE(review): the sequence-mismatch return does not release
//      info->lock -- confirm that callers cope with that.
//
int
WINAPI
CrsCommitOrAbort(PVOID hd, PVOID lrec, int commit)
{
    CrsInfo_t *info = (CrsInfo_t *) hd;
    CrsRecord_t *p = (CrsRecord_t *) lrec;

    if (info == NULL || p == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    // the record being finished must be the one right after the last
    // committed sequence number
    if (p->hdr.seq != info->seq+1) {
        CrsLog(("Crs: sequence mis-match on commit|abort %I64d %I64d\n",
                p->hdr.seq, info->seq));
        assert(0);
        return ERROR_INVALID_PARAMETER;
    }

    assert(!(p->hdr.state & (CRS_COMMIT | CRS_ABORT)));

    // todo: this is wrong, what if one replica succeeds
    // and others abort. Now, the others will reuse the
    // same seq for a different update and when the
    // succeeded replica rejoins it can't tell that the
    // sequence got reused.
    if (commit != TRUE) {
        p->hdr.state |= CRS_ABORT;
        // give the slot back: the aborted record no longer counts as last
        if (info->last_record == 0) {
            info->last_record = info->max_records;
        }
        info->last_record--;
        CrsLog(("Crs%d: abort last %d leader %d seq %I64d\n", info->lid, 
                info->last_record,
                info->leader_id, p->hdr.seq));
    } else {
        p->hdr.state |= CRS_COMMIT;
        // the sequence number is consumed only by a commit
        info->seq++;
        CrsLog(("Crs%d: commit last %d leader %d seq %I64d\n", info->lid, 
                info->last_record,
                info->leader_id, p->hdr.seq));
    }

    info->pending = TRUE;
    LeaveCriticalSection(&info->lock);

    return ERROR_SUCCESS;
}


//
// CrsCanWrite:
//      Report, under the replica lock, whether the replica is currently in
//      CRS_STATE_WRITE. Returns 1 when it is and 0 otherwise.
//
int
WINAPI
CrsCanWrite(PVOID hd)
{
    CrsInfo_t *replica = (CrsInfo_t *) hd;
    int writable;

    // sample the state while holding the lock so a concurrent state change
    // is not observed half way
    EnterCriticalSection(&replica->lock);
    writable = (replica->state == CRS_STATE_WRITE) ? 1 : 0;
    LeaveCriticalSection(&replica->lock);

    return writable;
}

//
// CrsGetEpoch:
//      Return the replica's current epoch, read under the replica lock.
//
crs_epoch_t
CrsGetEpoch(PVOID hd)
{
    CrsInfo_t *replica = (CrsInfo_t *) hd;
    crs_epoch_t current;

    EnterCriticalSection(&replica->lock);
    current = replica->epoch;
    LeaveCriticalSection(&replica->lock);

    return current;
}