|
|
/*++
Copyright (c) 2000 Microsoft Corporation
Module Name:
fs.c
Abstract:
Implements filesystem operations
Author:
Ahmed Mohamed (ahmedm) 1-Feb-2000
Revision History:
--*/ #include <nt.h>
#include <ntdef.h>
#include <ntrtl.h>
#include <nturtl.h>
#include <windows.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "fs.h"
#include "crs.h"
#include "fsp.h"
#include "fsutil.h"
// Locking order: ulock followed by qlock
////////////////////////////////////////////////////////////////////////////
// Translate a Win32 FILE_ATTRIBUTE_* bit mask into this file system's ATTR_*
// bit mask.  Only the seven attributes tested below are carried over; every
// other bit in 'a' is dropped.
UINT32
get_attributes(DWORD a)
{
    UINT32 result = 0;

    if (a & FILE_ATTRIBUTE_READONLY) {
        result |= ATTR_READONLY;
    }
    if (a & FILE_ATTRIBUTE_HIDDEN) {
        result |= ATTR_HIDDEN;
    }
    if (a & FILE_ATTRIBUTE_SYSTEM) {
        result |= ATTR_SYSTEM;
    }
    if (a & FILE_ATTRIBUTE_ARCHIVE) {
        result |= ATTR_ARCHIVE;
    }
    if (a & FILE_ATTRIBUTE_DIRECTORY) {
        result |= ATTR_DIRECTORY;
    }
    if (a & FILE_ATTRIBUTE_COMPRESSED) {
        result |= ATTR_COMPRESSED;
    }
    if (a & FILE_ATTRIBUTE_OFFLINE) {
        result |= ATTR_OFFLINE;
    }

    return result;
}
// Inverse of get_attributes: translate an ATTR_* bit mask into the matching
// Win32 FILE_ATTRIBUTE_* bit mask.  Bits other than the seven tested below
// are dropped.
DWORD
unget_attributes(UINT32 attr)
{
    DWORD win32_attr = 0;

    if (attr & ATTR_READONLY) {
        win32_attr |= FILE_ATTRIBUTE_READONLY;
    }
    if (attr & ATTR_HIDDEN) {
        win32_attr |= FILE_ATTRIBUTE_HIDDEN;
    }
    if (attr & ATTR_SYSTEM) {
        win32_attr |= FILE_ATTRIBUTE_SYSTEM;
    }
    if (attr & ATTR_ARCHIVE) {
        win32_attr |= FILE_ATTRIBUTE_ARCHIVE;
    }
    if (attr & ATTR_DIRECTORY) {
        win32_attr |= FILE_ATTRIBUTE_DIRECTORY;
    }
    if (attr & ATTR_COMPRESSED) {
        win32_attr |= FILE_ATTRIBUTE_COMPRESSED;
    }
    if (attr & ATTR_OFFLINE) {
        win32_attr |= FILE_ATTRIBUTE_OFFLINE;
    }

    return win32_attr;
}
// Map the disposition field of the create flags (flags & FS_DISP_MASK) to an
// NT create disposition.  Unknown dispositions map to 0.
//
// NOTE(review): DISP_CREATE_ALWAYS maps to FILE_OPEN_IF (same as
// DISP_OPEN_ALWAYS), not to an overwriting disposition -- confirm intended.
DWORD
unget_disp(UINT32 flags)
{
    DWORD nt_disp;

    switch (flags & FS_DISP_MASK) {
    case DISP_DIRECTORY:
    case DISP_CREATE_NEW:
        nt_disp = FILE_CREATE;
        break;

    case DISP_CREATE_ALWAYS:
    case DISP_OPEN_ALWAYS:
        nt_disp = FILE_OPEN_IF;
        break;

    case DISP_OPEN_EXISTING:
        nt_disp = FILE_OPEN;
        break;

    case DISP_TRUNCATE_EXISTING:
        nt_disp = FILE_OVERWRITE;
        break;

    default:
        nt_disp = 0;
        break;
    }

    return nt_disp;
}
// Build the NT desired-access mask for a create request.
// Directories start with generic read+write; everything else starts with
// attribute read+write.  ACCESS_READ / ACCESS_WRITE add the generic rights,
// and EA read/write access is always requested.
DWORD
unget_access(UINT32 flags)
{
    DWORD rights;

    if ((flags & FS_DISP_MASK) == DISP_DIRECTORY) {
        rights = FILE_GENERIC_READ | FILE_GENERIC_WRITE;
    } else {
        rights = FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES;
    }

    if (flags & ACCESS_READ) {
        rights |= FILE_GENERIC_READ;
    }
    if (flags & ACCESS_WRITE) {
        rights |= FILE_GENERIC_WRITE;
    }

    rights |= FILE_READ_EA | FILE_WRITE_EA;

    return rights;
}
// Build the NT share mode for a create request.
// Read sharing is always granted because this simplifies recovery, so the
// SHARE_READ bit of 'flags' has no effect on the result; SHARE_WRITE adds
// write sharing.
DWORD
unget_share(UINT32 flags)
{
    DWORD mode = FILE_SHARE_READ;

    if (flags & SHARE_WRITE) {
        mode |= FILE_SHARE_WRITE;
    }

    return mode;
}
// Build the NT create options for a create request.
// Directory requests get FILE_DIRECTORY_FILE | FILE_SYNCHRONOUS_IO_ALERT.
// For anything else no FILE_NON_DIRECTORY_FILE is forced, because the object
// type cannot be known without querying it first.  The cache mode selects
// write-through or unbuffered I/O.
DWORD
unget_flags(UINT32 flags)
{
    DWORD options = 0;

    if ((flags & FS_DISP_MASK) == DISP_DIRECTORY) {
        options = FILE_DIRECTORY_FILE | FILE_SYNCHRONOUS_IO_ALERT;
    }

    if ((flags & FS_CACHE_MASK) == CACHE_WRITE_THROUGH) {
        options |= FILE_WRITE_THROUGH;
    }
    if ((flags & FS_CACHE_MASK) == CACHE_NO_BUFFERING) {
        options |= FILE_NO_INTERMEDIATE_BUFFERING;
    }

    return options;
}
// Split the packed create flags 'uflags' into the four values needed for an
// NT create call.
//
// flags  - receives the create options      (unget_flags)
// disp   - receives the create disposition  (unget_disp)
// share  - receives the share mode          (unget_share)
// access - receives the desired access mask (unget_access)
void
DecodeCreateParam(UINT32 uflags, UINT32 *flags, UINT32 *disp, UINT32 *share, UINT32 *access)
{
    *flags  = unget_flags(uflags);
    *disp   = unget_disp(uflags);
    *share  = unget_share(uflags);
    *access = unget_access(uflags);
}
/********************************************************************/
// Reserve a free slot in the user's handle table.
//
// p   - user whose table is searched; the search runs under p->Lock.
// fid - receives the index of the reserved slot, or INVALID_FHANDLE_T when
//       the table is full (previously it received the out-of-range index
//       FsTableSize in that case).
//
// A slot is free when its Flags field is 0.  The reserved slot is marked
// with ATTR_SYMLINK as a placeholder until the caller stores the real flags.
//
// Returns STATUS_SUCCESS, or STATUS_NO_MORE_FILES when no slot is free.
NTSTATUS
FspAllocatePrivateHandle(UserInfo_t *p, fhandle_t *fid)
{
    int i;
    NTSTATUS err = STATUS_NO_MORE_FILES;

    *fid = INVALID_FHANDLE_T;

    LockEnter(p->Lock);
    for (i = 0; i < FsTableSize; i++) {
        if (p->Table[i].Flags == 0) {
            p->Table[i].Flags = ATTR_SYMLINK;   // place marker
            *fid = (fhandle_t) i;
            err = STATUS_SUCCESS;
            break;
        }
    }
    LockExit(p->Lock);

    return err;
}
// Release a handle-table slot by clearing its Flags field under p->Lock.
// 'fnum' must not be INVALID_FHANDLE_T (asserted).
void
FspFreeHandle(UserInfo_t *p, fhandle_t fnum)
{
    FsLog(("FreeHandle %d\n", fnum));

    ASSERT(fnum != INVALID_FHANDLE_T);

    LockEnter(p->Lock);
    p->Table[fnum].Flags = 0;
    LockExit(p->Lock);
}
/*********************************************************** */
// Evict the replicas named in 'mask' from volume 'p'.
//
// mask - bit mask of the replicas to evict.
// flag - FALSE: the replicas were never added to the alive set and crs does
//        not know about them, so the volume handles are closed and nothing
//        else happens.
//        TRUE: the replicas are also removed from AliveSet and CrsStart is
//        re-run with the remaining set; whatever CrsStart hands back through
//        its last argument is evicted on the next pass of the loop.
//
// Must be called with the update lock held.  When CrsStart returns
// ERROR_WRITE_PROTECT (no quorum) with nothing further to evict, the volume
// event is signalled if one exists.
void
FspEvict(VolInfo_t *p, ULONG mask, BOOLEAN flag)
{
    void FspCloseVolume(VolInfo_t *vol, ULONG AliveSet);
    ULONG alive;
    DWORD rc;

    while (mask != 0) {
        FsArbLog(("FspEvict Entry: WSet %x Rset %x ASet %x set %x\n", p->WriteSet, p->ReadSet, p->AliveSet, mask));

        if (flag == FALSE) {
            FspCloseVolume(p, mask);
            break;
        }

        // clear nid
        LockEnter(p->qLock);
        p->AliveSet &= ~mask;
        alive = p->AliveSet;
        LockExit(p->qLock);

        // close nid handles <crs, vol, open files>
        FspCloseVolume(p, mask);

        mask = 0;
        rc = CrsStart(p->CrsHdl, alive, p->DiskListSz, &p->WriteSet, &p->ReadSet, &mask);

        // we have no quorum
        if (mask == 0 && rc == ERROR_WRITE_PROTECT && p->Event) {
            SetEvent(p->Event);
        }
    }

    FsArbLog(("FspEvict Exit: vol %S WSet %x RSet %x ASet %x\n", p->Root, p->WriteSet, p->ReadSet, p->AliveSet));
}
// Add the replicas named in 'mask' to the volume's alive set and re-run
// CrsStart over the enlarged set.
//
// Replicas that CrsStart reports back through its last argument are treated
// as dead and evicted.  When CrsStart returns ERROR_WRITE_PROTECT (no quorum)
// the volume event is signalled if one exists.  A zero 'mask' only produces
// the exit log line.
//
// Must be called with the update lock held.
void
FspJoin(VolInfo_t *p, ULONG mask)
{
    ULONG alive = 0;

    if (mask != 0) {
        ULONG dead = 0;
        DWORD rc;

        FsArbLog(("FspJoin Entry: WSet %x Rset %x ASet %x set %x\n", p->WriteSet, p->ReadSet, p->AliveSet, mask));

        // grab lock now
        LockEnter(p->qLock);
        p->AliveSet |= mask;
        alive = p->AliveSet;
        LockExit(p->qLock);

        rc = CrsStart(p->CrsHdl, alive, p->DiskListSz, &p->WriteSet, &p->ReadSet, &dead);

        // we need to evict dead members
        if (dead != 0) {
            FspEvict(p, dead, TRUE);
        }

        // we have no quorum
        if (rc == ERROR_WRITE_PROTECT && p->Event) {
            SetEvent(p->Event);
        }
    }

    FsArbLog(("FspJoin Exit: WSet %x Rset %x ASet %x\n", p->WriteSet, p->ReadSet, alive));
}
// Prepare the per-node answer arrays before a request is fanned out.
//
// ios  - FsMaxNodes status blocks; each Status is preset to
//        STATUS_HOST_UNREACHABLE so that nodes which are never called can be
//        told apart from nodes that answered.
// rbuf - optional array of FsMaxNodes reply pointers; when non-NULL, entry i
//        is pointed at the i-th 'sz'-byte slice of 'r'.
// r    - start of the reply storage (only used when rbuf is non-NULL).
// sz   - size in bytes of one reply slice.
void
FspInitAnswers(IO_STATUS_BLOCK *ios, PVOID *rbuf, char *r, int sz)
{
    int node;

    for (node = 0; node < FsMaxNodes; node++) {
        ios[node].Status = STATUS_HOST_UNREACHABLE;
    }

    if (rbuf) {
        for (node = 0; node < FsMaxNodes; node++) {
            rbuf[node] = r;
            r += sz;
        }
    }
}
// Reconcile the per-node results of a fanned-out request and pick the node
// whose answer the caller should use.
//
// vol  - volume the request was issued on (used for evictions).
// ios  - FsMaxNodes status blocks; entries still holding
//        STATUS_HOST_UNREACHABLE are treated as "node was not asked".
// rbuf - per-node reply buffers (not examined here).
// sz   - reply size (not examined here).
//
// Behaviour:
//   * Nodes that failed with a connectivity-type error are evicted and their
//     status is rewritten to STATUS_MEDIA_WRITE_PROTECTED.
//   * If the remaining answers are all successes or all failures, the index
//     of the first remaining answering node is returned.
//   * If successes and failures are mixed, the smaller group is evicted
//     (the failures on a tie) and the index of the lowest node in the
//     surviving group is returned.
//   * If no usable answer remains, 0 is returned.
//
// The reference node is tracked with -1 meaning "none yet".  Using 0 for
// that purpose (as before) made node 0 indistinguishable from "unset", so it
// was replaced as the reference by the next answering node and was never
// compared against the others.
int
FspCheckAnswers(VolInfo_t *vol, IO_STATUS_BLOCK *ios, PVOID *rbuf, UINT32 sz)
{
    int i;
    int nums = 0;       // number of successful nodes
    int numf = 0;       // number of nodes that failed without a network error
    int lasts = -1;     // reference node, -1 while none has been chosen
    ULONG masks = 0;    // bit mask of successful nodes
    ULONG maskf = 0;    // bit mask of failed nodes

    for (i = 0; i < FsMaxNodes; i++) {
        if (ios[i].Status == STATUS_HOST_UNREACHABLE) {
            continue;
        }

        if (lasts < 0) {
            lasts = i;
        }

        if (ios[i].Status == STATUS_SUCCESS) {
            nums++;
            masks |= ((ULONG) 1 << i);
            if (ios[lasts].Information != ios[i].Information) {
                FsLog(("Success node %d inconsistent with node %d!!!\n", lasts, i));
            }
        } else if (ios[i].Status == STATUS_CONNECTION_DISCONNECTED ||
                   ios[i].Status == STATUS_BAD_NETWORK_PATH ||
                   // this maps to many network errors
                   RtlNtStatusToDosError(ios[i].Status) == ERROR_UNEXP_NET_ERR ||
                   ios[i].Status == STATUS_VOLUME_DISMOUNTED) {
            ios[i].Status = STATUS_MEDIA_WRITE_PROTECTED;
            // evict any replica that lost connectivity
            FspEvict(vol, (ULONG) 1 << i, TRUE);
            if (lasts == i) {
                // the reference node is gone; let the next answer take over
                lasts = -1;
            }
        } else {
            numf++;
            maskf |= ((ULONG) 1 << i);
        }
    }

    if (lasts < 0) {
        // nothing usable answered; callers index their arrays with the
        // result, so fall back to slot 0 as before
        lasts = 0;
    }

    if (numf == 0 || nums == 0) {
        return lasts;
    }

    FsLog(("Nodes inconsistency success %x,%d failure %x,%d!!!\n", masks, nums, maskf, numf));

    // We need to evict whomever is smaller
    if (numf > nums) {
        FspEvict(vol, masks, TRUE);
        for (i = 0; i < FsMaxNodes; i++) {
            if (maskf & ((ULONG) 1 << i)) {
                lasts = i;
                break;
            }
        }
    } else {
        FspEvict(vol, maskf, TRUE);
        for (i = 0; i < FsMaxNodes; i++) {
            if (masks & ((ULONG) 1 << i)) {
                lasts = i;
                break;
            }
        }
    }

    FsLog(("Take result of node %d\n", lasts));

    return lasts;
}
//////////////////////////////////////////////////////////////////////////////////////
// Replica-side handler for a logged create.
//
// vinfo/nid - volume and replica index the operation runs against.
// uinfo     - user whose handle table receives the new handle; may be NULL.
// args      - fs_create_msg_t describing the request.
// rbuf      - fs_create_reply_t that receives the file id, the resulting
//             disposition and the access mask; must not be NULL.
// rlen      - receives sizeof(fs_create_reply_t).
//
// A log record is prepared before the create and committed only when the
// create succeeded and actually created or overwritten the file; otherwise
// it is aborted.  Returns STATUS_MEDIA_WRITE_PROTECTED when the record
// cannot be prepared, else the status of the create itself (a failure to
// query the object id afterwards is logged and ignored).
//
// 'fd' starts out as INVALID_HANDLE_VALUE so that a failed create never
// publishes or closes an uninitialized handle value.
NTSTATUS
FspCreate(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    // each file has a name stream that contains its crs log. We first
    // must open the parent crs log, issue a prepare on it. Create the new file
    // and then issuing a commit or abort on parent crs log. We also, have
    // to issue joins for each new crs handle that we get for the new file or
    // opened file. Note, this open may cause the file to enter recovery
    fs_create_msg_t *msg = (fs_create_msg_t *)args;
    NTSTATUS err;
    UINT32 disp, share, access, flags;
    fs_log_rec_t lrec;
    PVOID seq;
    fs_ea_t x;
    HANDLE fd = INVALID_HANDLE_VALUE;
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    fs_create_reply_t *rmsg = (fs_create_reply_t *)rbuf;
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    fs_id_t *fid;

    DecodeCreateParam(msg->flags, &flags, &disp, &share, &access);

    FsInitEa(&x);

    memset(&lrec.fs_id, 0, sizeof(lrec.fs_id));
    lrec.command = FS_CREATE;
    lrec.flags = msg->flags;
    lrec.attrib = msg->attr;
    seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid);
    if (seq == 0) {
        FsLog(("create: Unable to prepare log record!, open readonly\n"));
        return STATUS_MEDIA_WRITE_PROTECTED;
    }

    // set fid
    {
        fs_log_rec_t *p = (PVOID) seq;

        memcpy(p->fs_id, p->id, sizeof(fs_id_t));

        FsInitEaFid(&x, fid);
        memcpy(fid, p->id, sizeof(fs_id_t));
    }

    err = xFsCreate(&fd, vfd, msg->name, msg->name_len, flags, msg->attr, share, &disp, access, (PVOID) &x, sizeof(x));

    xFsLog(("create: %S err %x access %x disp %x\n", msg->name, err, access, disp));

    CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS && (disp == FILE_CREATED || disp == FILE_OVERWRITTEN));

    if (err == STATUS_SUCCESS) {
        // we need to get the file id, no need to do this, for debug only
        err = xFsQueryObjectId(fd, (PVOID) fid);
        if (err != STATUS_SUCCESS) {
            FsLog(("Failed to get fileid %x\n", err));
            err = STATUS_SUCCESS;
        }
    }

#ifdef FS_ASYNC
    BindNotificationPort(comport, fd, (PVOID) fdnum);
#endif

    if (uinfo != NULL && msg->fnum != INVALID_FHANDLE_T) {
        FS_SET_USER_HANDLE(uinfo, nid, msg->fnum, fd);
    } else if (fd != INVALID_HANDLE_VALUE) {
        xFsClose(fd);
    }

    ASSERT(rmsg != NULL);

    memcpy(&rmsg->fid, fid, sizeof(fs_id_t));
    rmsg->action = (USHORT)disp;
    rmsg->access = (USHORT)access;
    *rlen = sizeof(*rmsg);

    FsLog(("Create '%S' nid %d fid %d handle %x oid %I64x:%I64x\n", msg->name, nid, msg->fnum, fd, rmsg->fid[0], rmsg->fid[1]));

    return err;
}
// Replica-side handler for opening an existing object.  Same as FspCreate
// except that the disposition is forced to FILE_OPEN and nothing is written
// to the crs log.
//
// vinfo/nid - volume and replica index the operation runs against.
// uinfo     - user whose handle table receives the new handle; may be NULL.
// args      - fs_create_msg_t describing the request.
// rbuf      - fs_create_reply_t that receives the file id, the resulting
//             disposition and the access mask; must not be NULL.
// rlen      - receives sizeof(fs_create_reply_t).
//
// Returns the status of the open (a failure to query the object id
// afterwards is logged and ignored).
//
// 'fd' starts out as INVALID_HANDLE_VALUE so that a failed open never
// publishes or closes an uninitialized handle value.
NTSTATUS
FspOpen(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_create_msg_t *msg = (fs_create_msg_t *)args;
    NTSTATUS err;
    UINT32 disp, share, access, flags;
    HANDLE fd = INVALID_HANDLE_VALUE;
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    fs_create_reply_t *rmsg = (fs_create_reply_t *)rbuf;

    ASSERT(rmsg != NULL);

    DecodeCreateParam(msg->flags, &flags, &disp, &share, &access);

    // whatever disposition was requested, only open what already exists
    disp = FILE_OPEN;
    err = xFsCreate(&fd, vfd, msg->name, msg->name_len, flags, msg->attr, share, &disp, access, NULL, 0);

    xFsLog(("open: %S err %x access %x disp %x\n", msg->name, err, access, disp));

    if (err == STATUS_SUCCESS) {
        ASSERT(disp != FILE_CREATED && disp != FILE_OVERWRITTEN);
        // we need to get the file id, no need to do this, for debug only
        err = xFsQueryObjectId(fd, (PVOID) &rmsg->fid);
        if (err != STATUS_SUCCESS) {
            FsLog(("Open '%S' failed to get fileid %x\n", msg->name, err));
            err = STATUS_SUCCESS;
        }
    }

#ifdef FS_ASYNC
    BindNotificationPort(comport, fd, (PVOID) fdnum);
#endif

    if (uinfo != NULL && msg->fnum != INVALID_FHANDLE_T) {
        FS_SET_USER_HANDLE(uinfo, nid, msg->fnum, fd);
    } else if (fd != INVALID_HANDLE_VALUE) {
        xFsClose(fd);
    }

    rmsg->action = (USHORT)disp;
    rmsg->access = (USHORT)access;
    *rlen = sizeof(*rmsg);

    FsLog(("Open '%S' nid %d fid %d handle %x oid %I64x:%I64x\n", msg->name, nid, msg->fnum, fd, rmsg->fid[0], rmsg->fid[1]));

    return err;
}
// Replica-side handler that sets attributes on an already open file.
//
// args is an fs_setattr_msg_t; the target is the user handle msg->fnum on
// replica 'nid'.  The change is bracketed by a crs log record: prepared
// first, then committed if xFsSetAttr succeeds and aborted otherwise.
//
// Returns STATUS_MEDIA_WRITE_PROTECTED when the record cannot be prepared,
// else the status of xFsSetAttr.
NTSTATUS
FspSetAttr(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_setattr_msg_t *msg = (fs_setattr_msg_t *)args;
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    HANDLE fd = FS_GET_USER_HANDLE(uinfo, nid, msg->fnum);
    fs_log_rec_t lrec;
    PVOID seq;
    NTSTATUS err;

    lrec.command = FS_SETATTR;
    memcpy((PVOID) lrec.fs_id, (PVOID) msg->fs_id, sizeof(fs_id_t));
    lrec.attrib = msg->attr.FileAttributes;

    seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid);
    if (seq == 0) {
        return STATUS_MEDIA_WRITE_PROTECTED;
    }

    // can be async ?
    err = xFsSetAttr(fd, &msg->attr);

    CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS);

    return err;
}
// Replica-side handler that sets attributes on a file identified by name.
//
// args is an fs_setattr_msg_t carrying the name.  The file is opened, its
// object id is read into the log record, and the attribute change is
// bracketed by a crs log record (committed on success, aborted otherwise).
// The file is closed again before returning.
//
// Returns the first failing status from the open, the object-id query or
// xFsSetAttr, or STATUS_MEDIA_WRITE_PROTECTED when the log record cannot be
// prepared.  That last path previously returned without closing the file
// and so leaked the handle; it now falls through to the common close.
NTSTATUS
FspSetAttr2(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_setattr_msg_t *msg = (fs_setattr_msg_t *)args;
    HANDLE fd = INVALID_HANDLE_VALUE;
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    NTSTATUS err;
    fs_log_rec_t lrec;
    PVOID seq;

    assert(len == sizeof(*msg));

    // must be sync in order to close file
    err = xFsOpenWA(&fd, vfd, msg->name, msg->name_len);
    if (err == STATUS_SUCCESS) {
        err = xFsQueryObjectId(fd, (PVOID) &lrec.fs_id);
    }

    if (err == STATUS_SUCCESS) {
        lrec.command = FS_SETATTR;
        lrec.attrib = msg->attr.FileAttributes;

        seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid);
        if (seq != 0) {
            err = xFsSetAttr(fd, &msg->attr);
            CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS);
        } else {
            // do not return here: the file opened above must still be closed
            err = STATUS_MEDIA_WRITE_PROTECTED;
        }
    }

    if (fd != INVALID_HANDLE_VALUE) {
        xFsClose(fd);
    }

    xFsLog(("setattr2 nid %d '%S' err %x\n", nid, msg->name, err));

    return err;
}
// Replica-side handler that queries attributes by name.
//
// args is an fs_lookup_msg_t; the name is resolved relative to the volume
// handle of replica 'nid'.  rbuf receives a FILE_NETWORK_OPEN_INFORMATION,
// and *rlen is asserted to equal its size on entry.
//
// Returns the status of xFsQueryAttrName.
NTSTATUS
FspLookup(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_lookup_msg_t *msg = (fs_lookup_msg_t *) args;
    FILE_NETWORK_OPEN_INFORMATION *attr = (FILE_NETWORK_OPEN_INFORMATION *)rbuf;
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);

    ASSERT(*rlen == sizeof(*attr));

    return xFsQueryAttrName(vfd, msg->name, msg->name_len, attr);
}
// Replica-side handler that queries attributes of an open file.
//
// args points at the fhandle_t of the file; the matching handle of replica
// 'nid' is looked up in the user's table.  rbuf receives a
// FILE_NETWORK_OPEN_INFORMATION, and *rlen is asserted to equal its size on
// entry.
//
// Returns the status of xFsQueryAttr.
NTSTATUS
FspGetAttr(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fhandle_t handle = *(fhandle_t *) args;
    FILE_NETWORK_OPEN_INFORMATION *attr = (FILE_NETWORK_OPEN_INFORMATION *)rbuf;
    HANDLE fd = FS_GET_USER_HANDLE(uinfo, nid, handle);

    ASSERT(*rlen == sizeof(*attr));

    return xFsQueryAttr(fd, attr);
}
// Replica-side handler that closes a handle on replica 'nid'.
//
// args points at an fhandle_t.  With a user and a valid handle number the
// user's file handle is closed; otherwise the replica's volume handle is
// closed.  The closed slot is then set to INVALID_HANDLE_VALUE.
//
// Always returns STATUS_SUCCESS: a failing close is deliberately not
// reported so that it cannot cause the node to be evicted.
NTSTATUS
FspClose(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fhandle_t handle = *(fhandle_t *) args;
    HANDLE fd;

    if (uinfo != NULL && handle != INVALID_FHANDLE_T) {
        fd = FS_GET_USER_HANDLE(uinfo, nid, handle);
    } else {
        fd = FS_GET_VOL_HANDLE(vinfo, nid);
    }

    FsLog(("Closing nid %d fid %d handle %x\n", nid, handle, fd));

    // don't evict a node due to this: the close status is dropped
    (void) xFsClose(fd);

    if (uinfo != NULL && handle != INVALID_FHANDLE_T) {
        FS_SET_USER_HANDLE(uinfo, nid, handle, INVALID_HANDLE_VALUE);
    } else {
        FS_SET_VOL_HANDLE(vinfo, nid, INVALID_HANDLE_VALUE);
    }

    return STATUS_SUCCESS;
}
// Replica-side handler that reads directory entries into the caller's
// dirinfo_t array.
//
// args          - fs_io_msg_t: msg->buf is the output array, msg->size its
//                 size in bytes, msg->cookie the resume cookie, msg->fnum
//                 the directory handle number.
// entries_found - receives the number of dirinfo_t entries written.
//
// With a user and a valid handle number the user's handle is read, otherwise
// the replica's volume handle.  The scan is restarted while the cookie is 0;
// the cookie is incremented for every entry returned.  Reading continues a
// page at a time until the output array cannot hold another entry or
// xFsReadDir fails.
//
// Returns STATUS_SUCCESS or the failing status of xFsReadDir.
//
// Fixes relative to the earlier version:
//   * The name is copied by its recorded length and terminated in the
//     destination.  Previously a terminator was written into the source
//     buffer at FileName[length], which lies outside the name and can be the
//     first bytes of the next entry (corrupting its NextEntryOffset) or past
//     the end of the page buffer.
//   * A negative 'size' no longer passes the size test through the implicit
//     conversion to an unsigned type.
//
// NOTE(review): the copy into buffer->name is still not bounded by the size
// of that field -- confirm it can hold the longest name the volume returns.
NTSTATUS
FspReadDir(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *entries_found)
{
    fs_io_msg_t *msg = (fs_io_msg_t *)args;
    NTSTATUS e = STATUS_SUCCESS;
    int size = (int) msg->size;
    int cookie = (int) msg->cookie;
    HANDLE dir;
    dirinfo_t *buffer = (dirinfo_t *)msg->buf;

    xFsLog(("DirLoad: size %d\n", size));

    if (uinfo != NULL && msg->fnum != INVALID_FHANDLE_T) {
        dir = FS_GET_USER_HANDLE(uinfo, nid, msg->fnum);
    } else {
        dir = FS_GET_VOL_HANDLE(vinfo, nid);
    }

    *entries_found = 0;
    while (size > 0 && (size_t) size >= sizeof(dirinfo_t)) {
        // this must come from the source if we are to do async readdir
        char buf[PAGESIZE];
        int sz;
        PFILE_DIRECTORY_INFORMATION p;

        sz = min(PAGESIZE, size);
        e = xFsReadDir(dir, buf, &sz, (cookie == 0) ? TRUE : FALSE);
        if (e != STATUS_SUCCESS) {
            break;
        }

        p = (PFILE_DIRECTORY_INFORMATION) buf;
        while (size > 0 && (size_t) size >= sizeof(dirinfo_t)) {
            // FileNameLength is in bytes; the name is not terminated
            size_t chars = p->FileNameLength / sizeof(p->FileName[0]);

            memcpy(buffer->name, p->FileName, chars * sizeof(p->FileName[0]));
            buffer->name[chars] = L'\0';

            buffer->attribs.file_size = p->EndOfFile.QuadPart;
            buffer->attribs.alloc_size = p->AllocationSize.QuadPart;
            buffer->attribs.create_time = p->CreationTime.QuadPart;
            buffer->attribs.access_time = p->LastAccessTime.QuadPart;
            buffer->attribs.mod_time = p->LastWriteTime.QuadPart;
            buffer->attribs.attributes = p->FileAttributes;
            buffer->cookie = ++cookie;
            buffer++;
            size -= (int) sizeof(dirinfo_t);
            (*entries_found)++;

            if (p->NextEntryOffset == 0) {
                break;
            }

            p = (PFILE_DIRECTORY_INFORMATION) ((char *) p + p->NextEntryOffset);
        }
    }

    return e;
}
// Replica-side handler for a logged directory create.
//
// args is an fs_create_msg_t.  A crs log record is prepared first; the
// directory is then created and the record is committed only when the create
// succeeded and actually created or overwritten the object, otherwise it is
// aborted.  On success the new handle is closed again and, when rbuf is
// non-NULL, the file id is copied into it (*rlen is asserted to be
// sizeof(fs_id_t)).
//
// Returns STATUS_MEDIA_WRITE_PROTECTED when the record cannot be prepared,
// else the status of xFsCreate.
NTSTATUS
FspMkDir(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_create_msg_t *msg = (fs_create_msg_t *)args;
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    UINT32 disp, share, access, flags;
    fs_log_rec_t lrec;
    fs_id_t *fid;
    fs_ea_t x;
    PVOID seq;
    HANDLE fd;
    NTSTATUS err;

    FsInitEa(&x);

    memset(&lrec.fs_id, 0, sizeof(lrec.fs_id));
    lrec.command = FS_MKDIR;
    lrec.attrib = msg->attr;
    lrec.flags = msg->flags;

    seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid);
    if (seq == 0) {
        return STATUS_MEDIA_WRITE_PROTECTED;
    }

    // set fid
    {
        fs_log_rec_t *logged = (PVOID) seq;

        memcpy(logged->fs_id, logged->id, sizeof(fs_id_t));

        FsInitEaFid(&x, fid);
        // set fs_id of the file
        memcpy(fid, logged->id, sizeof(fs_id_t));
    }

    // decode attributes
    DecodeCreateParam(msg->flags, &flags, &disp, &share, &access);

    // always sync call
    err = xFsCreate(&fd, vfd, msg->name, msg->name_len, flags, msg->attr, share, &disp, access, (PVOID) &x, sizeof(x));

    FsLog(("Mkdir '%S' %x: cflags %x flags:%x attr:%x share:%x disp:%x access:%x\n", msg->name, err, msg->flags, flags, msg->attr, share, disp, access));

    CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS && (disp == FILE_CREATED || disp == FILE_OVERWRITTEN));

    if (err != STATUS_SUCCESS) {
        return err;
    }

    // return fid
    if (rbuf != NULL) {
        ASSERT(*rlen == sizeof(fs_id_t));
        memcpy(rbuf, fid, sizeof(fs_id_t));
    }
    xFsClose(fd);

    return err;
}
// Replica-side handler for a logged remove by name.
//
// args is an fs_remove_msg_t.  The file is opened briefly to translate its
// name into an object id for the log record, then the delete is bracketed by
// a crs log record (committed on success, aborted otherwise).  *rlen is set
// to 0.
//
// Returns the failing status of the open or the object-id query,
// STATUS_MEDIA_WRITE_PROTECTED when the record cannot be prepared, else the
// status of xFsDelete.
NTSTATUS
FspRemove(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_remove_msg_t *msg = (fs_remove_msg_t *)args;
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    fs_log_rec_t lrec;
    PVOID seq;
    HANDLE fd;
    NTSTATUS err;

    *rlen = 0;

    // open, query, close: obtain name -> fs_id
    err = xFsOpenRA(&fd, vfd, msg->name, msg->name_len);
    if (err != STATUS_SUCCESS) {
        return err;
    }

    // get object id
    err = xFsQueryObjectId(fd, (PVOID) &lrec.fs_id);
    xFsClose(fd);
    if (err != STATUS_SUCCESS) {
        return err;
    }

    lrec.command = FS_REMOVE;

    seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid);
    if (seq == 0) {
        return STATUS_MEDIA_WRITE_PROTECTED;
    }

    err = xFsDelete(vfd, msg->name, msg->name_len);

    CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS);

    xFsLog(("Rm nid %d '%S' %x\n", nid, msg->name, err));

    return err;
}
// Replica-side handler for a logged rename.
//
// args is an fs_rename_msg_t with source (sname) and destination (dname).
// The source is opened, its object id is read into the log record, and the
// rename is bracketed by a crs log record (committed on success, aborted
// otherwise).  The source handle is closed on every path after a successful
// open.
//
// Returns the failing status of the open or the object-id query,
// STATUS_MEDIA_WRITE_PROTECTED when the record cannot be prepared, else the
// status of xFsRename.
NTSTATUS
FspRename(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_rename_msg_t *msg = (fs_rename_msg_t *)args;
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    fs_log_rec_t lrec;
    PVOID seq;
    HANDLE fd;
    NTSTATUS err;

    lrec.command = FS_RENAME;

    err = xFsOpen(&fd, vfd, msg->sname, msg->sname_len,
                  STANDARD_RIGHTS_REQUIRED| SYNCHRONIZE | FILE_READ_EA | FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES,
                  FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
                  0);
    if (err != STATUS_SUCCESS) {
        return err;
    }

    // get file id
    err = xFsQueryObjectId(fd, (PVOID) &lrec.fs_id);
    if (err != STATUS_SUCCESS) {
        xFsLog(("Failed to obtain fsid %x\n", err));
    } else {
        seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid);
        if (seq == 0) {
            err = STATUS_MEDIA_WRITE_PROTECTED;
        } else {
            err = xFsRename(fd, vfd, msg->dname, msg->dname_len);
            CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS);
        }
    }

    xFsClose(fd);

    xFsLog(("Mv nid %d %S -> %S err %x\n", nid, msg->sname, msg->dname, err));

    return err;
}
// Replica-side handler for a logged write.
//
// args is an fs_io_msg_t.  With a user and a valid handle number the user's
// handle on replica 'nid' is written, otherwise the handle passed in
// msg->context.  A size of 0 sets the end of file to msg->offset instead of
// writing data.  The operation is bracketed by a crs log record (committed
// on success, aborted otherwise).
//
// rlen receives the Information field of the I/O status block.  That field
// is zeroed up front (as FspRead does) so that *rlen is well defined when
// the call fails without filling in the status block.
//
// Returns STATUS_MEDIA_WRITE_PROTECTED when the record cannot be prepared,
// else the final status of the write / set-information call.
NTSTATUS
FspWrite(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG len, PVOID rbuf, ULONG_PTR *rlen)
{
    NTSTATUS err;
    IO_STATUS_BLOCK ios;
    LARGE_INTEGER off;
    ULONG key;
    fs_io_msg_t *msg = (fs_io_msg_t *)args;
    fs_log_rec_t lrec;
    PVOID seq;
    PVOID crs_hd = FS_GET_CRS_HANDLE(vinfo, nid);
    HANDLE fd;

    if (uinfo != NULL && msg->fnum != INVALID_FHANDLE_T) {
        fd = FS_GET_USER_HANDLE(uinfo, nid, msg->fnum);
    } else {
        fd = (HANDLE) msg->context;
    }

    lrec.command = FS_WRITE;
    memcpy(lrec.fs_id, (PVOID) msg->fs_id, sizeof(fs_id_t));
    lrec.offset = msg->offset;
    lrec.length = msg->size;

    if ((seq = CrsPrepareRecord(crs_hd, (PVOID) &lrec, msg->xid)) == 0) {
        return STATUS_MEDIA_WRITE_PROTECTED;
    }

    // Write ops
    xFsLog(("Write %d len %d off %d\n", nid, msg->size, msg->offset));

    off.LowPart = msg->offset;
    off.HighPart = 0;
    key = FS_BUILD_LOCK_KEY((uinfo ? uinfo->Uid : 0), nid, msg->fnum);

    ios.Information = 0;

    if (msg->size > 0) {
        err = NtWriteFile(fd, NULL, NULL, (PVOID) NULL, &ios, msg->buf, msg->size, &off, &key);
    } else {
        FILE_END_OF_FILE_INFORMATION x;

        x.EndOfFile = off;

        err = NtSetInformationFile(fd, &ios, (char *) &x, sizeof(x), FileEndOfFileInformation);
    }

    if (err == STATUS_PENDING) {
        EventWait(fd);
        err = ios.Status;
    }

    *rlen = ios.Information;

    CrsCommitOrAbort(crs_hd, seq, err == STATUS_SUCCESS);

    return err;
}
// Replica-side handler that reads from an open file.
//
// args is an fs_io_msg_t (sz is asserted to match its size): msg->fnum names
// the user's handle on replica 'nid', msg->offset the 32-bit start offset,
// msg->buf / msg->size the destination.  If the read is pending, the handler
// waits on the handle and takes the final status from the status block.
//
// rlen receives the Information field of the status block (0 if the call
// never filled it in).  Returns the final status of the read.
NTSTATUS
FspRead(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_io_msg_t *msg = (fs_io_msg_t *)args;
    HANDLE fd = FS_GET_USER_HANDLE(uinfo, nid, msg->fnum);
    IO_STATUS_BLOCK ios;
    LARGE_INTEGER off;
    NTSTATUS err;
    ULONG key;

    assert(sz == sizeof(*msg));

    // Read ops
    off.LowPart = msg->offset;
    off.HighPart = 0;
    key = FS_BUILD_LOCK_KEY(uinfo->Uid, nid, msg->fnum);

    ios.Information = 0;
    err = NtReadFile(fd, NULL, NULL, NULL, &ios, msg->buf, msg->size, &off, &key);
    if (err == STATUS_PENDING) {
        EventWait(fd);
        err = ios.Status;
    }

    *rlen = ios.Information;

    xFsLog(("fs_read err %x sz %d\n", err, *rlen));

    return err;
}
// Replica-side handler that flushes buffers on replica 'nid'.
//
// args points at an fhandle_t (sz is asserted to match its size).  With a
// user and a valid handle number the user's file handle is flushed,
// otherwise the replica's volume handle.  *rlen is set to 0.
//
// Returns the status of NtFlushBuffersFile.
NTSTATUS
FspFlush(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    fhandle_t fnum = *(fhandle_t *)args;
    IO_STATUS_BLOCK ios;
    HANDLE fd;

    ASSERT(sz == sizeof(fhandle_t));

    *rlen = 0;

    if (uinfo == NULL || fnum == INVALID_FHANDLE_T) {
        fd = FS_GET_VOL_HANDLE(vinfo, nid);
    } else {
        fd = FS_GET_USER_HANDLE(uinfo, nid, fnum);
    }

    return NtFlushBuffersFile(fd, &ios);
}
// Replica-side handler that takes a byte-range lock on an open file.
//
// args is an fs_lock_msg_t (sz is asserted to match its size): msg->fnum
// names the file, msg->offset / msg->length the 32-bit range, msg->flags the
// lock mode.  The lock key is built from the user id, replica and handle
// number.  *rlen is set to 0.  Returns the status of NtLockFile.
//
// The FS_LOCK_WAIT flag is ignored: FALSE is always passed in the "wait"
// position because waiting can cause lots of headache.
// NOTE(review): the NtLockFile documentation names that argument
// FailImmediately; if so, FALSE allows the request to wait, which is the
// opposite of the stated intent -- confirm.
//
// The last argument is TRUE unless FS_LOCK_SHARED is set.
// NOTE(review): presumably this is NtLockFile's ExclusiveLock argument --
// confirm.
NTSTATUS
FspLock(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_lock_msg_t *msg = (fs_lock_msg_t *)args;
    ULONG key = FS_BUILD_LOCK_KEY(uinfo->Uid, nid, msg->fnum);
    IO_STATUS_BLOCK ios;
    LARGE_INTEGER offset, len;
    BOOLEAN wait_arg;
    BOOLEAN not_shared;
    NTSTATUS err;

    assert(sz == sizeof(*msg));

    // xxx: need to log

    FsLog(("Lock %d off %d len %d flags %x\n", msg->fnum, msg->offset, msg->length, msg->flags));

    offset.LowPart = msg->offset;
    offset.HighPart = 0;
    len.LowPart = msg->length;
    len.HighPart = 0;

    // todo: need to be async, if we are the owner node and failnow is false, then
    // we should pass in the context and the completion port responses back
    // to the user
    // todo: this can cause lots of headache, never wait.
    wait_arg = FALSE;
    not_shared = (BOOLEAN) ((msg->flags & FS_LOCK_SHARED) ? FALSE : TRUE);

    err = NtLockFile(uinfo->Table[msg->fnum].Fd[nid], NULL, NULL, (PVOID) NULL, &ios, &offset, &len, key, wait_arg, not_shared);

    // xxx: Need to log in software only

    *rlen = 0;
    FsLog(("Lock err %x\n", err));

    return err;
}
// Replica-side handler that releases a byte-range lock on an open file.
//
// args is an fs_lock_msg_t (sz is asserted to match its size): msg->fnum
// names the file, msg->offset / msg->length the 32-bit range.  The lock key
// is built the same way as in FspLock.  *rlen is set to 0.
//
// Returns the status of NtUnlockFile.
NTSTATUS
FspUnlock(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_lock_msg_t *msg = (fs_lock_msg_t *)args;
    ULONG key = FS_BUILD_LOCK_KEY(uinfo->Uid, nid, msg->fnum);
    IO_STATUS_BLOCK ios;
    LARGE_INTEGER offset, len;
    NTSTATUS err;

    assert(sz == sizeof(*msg));

    // xxx: need to log

    xFsLog(("Unlock %d off %d len %d\n", msg->fnum, msg->offset, msg->length));

    offset.HighPart = 0;
    offset.LowPart = msg->offset;
    len.HighPart = 0;
    len.LowPart = msg->length;

    // always sync I think
    err = NtUnlockFile(uinfo->Table[msg->fnum].Fd[nid], &ios, &offset, &len, key);

    // xxx: need to log in software only

    FsLog(("Unlock err %x\n", err));

    *rlen = 0;

    return err;
}
// Replica-side handler that reports file system statistics.
//
// args is an fs_attr_t (sz is asserted to match its size) and is filled in
// place: fs_name is set to "FsCrs", and on a successful size query of the
// replica's volume handle the unit counts and unit geometry are copied in.
// *rlen is set to 0.
//
// Returns the status of NtQueryVolumeInformationFile.
NTSTATUS
FspStatFs(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    fs_attr_t *msg = (fs_attr_t *)args;
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    FILE_FS_SIZE_INFORMATION fsinfo;
    IO_STATUS_BLOCK ios;
    NTSTATUS err;

    assert(sz == sizeof(*msg));

    // xxx: need to log

    lstrcpyn(msg->fs_name, "FsCrs", MAX_FS_NAME_LEN);

    err = NtQueryVolumeInformationFile(vfd, &ios, (PVOID) &fsinfo, sizeof(fsinfo), FileFsSizeInformation);
    if (err == STATUS_SUCCESS) {
        msg->total_units = fsinfo.TotalAllocationUnits.QuadPart;
        msg->free_units = fsinfo.AvailableAllocationUnits.QuadPart;
        msg->sectors_per_unit = fsinfo.SectorsPerAllocationUnit;
        msg->bytes_per_sector = fsinfo.BytesPerSector;
    }

    *rlen = 0;

    return err;
}
// Replica-side health check.
//
// Queries the size information of the replica's volume handle, then flushes
// the replica's crs handle so that the last write is flushed.  On a
// successful query, a signalled change-notification handle is re-armed with
// FindNextChangeNotification; on failure the error is logged.  *rlen is set
// to 0.
//
// Returns the status of NtQueryVolumeInformationFile.
NTSTATUS
FspCheckFs(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    HANDLE vfd = FS_GET_VOL_HANDLE(vinfo, nid);
    PVOID crshdl = FS_GET_CRS_HANDLE(vinfo, nid);
    FILE_FS_SIZE_INFORMATION fsinfo;
    IO_STATUS_BLOCK ios;
    NTSTATUS err;

    err = NtQueryVolumeInformationFile(vfd, &ios, (PVOID) &fsinfo, sizeof(fsinfo), FileFsSizeInformation);

    // We need to issue crsflush to flush last write
    CrsFlush(crshdl);

    if (err != STATUS_SUCCESS) {
        FsLog(("FsReserve failed nid %d err %x\n", nid, err));
    } else {
        HANDLE notifyfd = FS_GET_VOL_NOTIFY_HANDLE(vinfo, nid);

        if (WaitForSingleObject(notifyfd, 0) == WAIT_OBJECT_0) {
            // reload notification again
            FindNextChangeNotification(notifyfd);
        }
    }

    *rlen = 0;

    return err;
}
// Replica-side handler that formats the root path of replica 'nid' into
// rbuf as a wide string of the form \\?\<volume name>\<root>.
//
// Always returns STATUS_SUCCESS.
// NOTE(review): the formatted length is not bounded by any buffer size here;
// rbuf is presumably sized by the caller -- confirm.
NTSTATUS
FspGetRoot(VolInfo_t *vinfo, UserInfo_t *uinfo, int nid, PVOID args, ULONG sz, PVOID rbuf, ULONG_PTR *rlen)
{
    LPWSTR volume_name = FS_GET_VOL_NAME(vinfo, nid);

    swprintf(rbuf, L"\\\\?\\%s\\%s",volume_name,vinfo->Root);

    FsLog(("FspGetRoot '%S'\n", rbuf));

    return STATUS_SUCCESS;
}
/////////////////////////////////////////////////////////////////////////////////////
// When TRUE, SendAvailRequest accepts the answers whenever the volume's read
// set is non-empty instead of requiring a CRS quorum of called replicas.
BOOLEAN FsReadOnly = FALSE;
// Run 'callback' on every replica in the volume's read set and reconcile the
// answers.
//
// callback - per-replica handler to invoke.
// vol      - volume to operate on; NULL yields ERROR_INVALID_HANDLE.
// uinfo    - user context handed through to the callback.
// msg/len  - request handed through to the callback.
// rbuf     - optional per-node reply buffers (rbuf[i] goes to node i).
// rsz      - reply size; stored in ios[i].Information before each call.
// ios      - per-node status blocks; ios[i].Status receives the callback's
//            return value.
//
// The whole operation runs under the volume's update lock.  The answers are
// reconciled by FspCheckAnswers when the number of called replicas forms a
// quorum, or, in FsReadOnly mode, when the read set is non-empty.  Otherwise
// ios[0] is set to STATUS_MEDIA_WRITE_PROTECTED with Information holding the
// number of replicas called.
//
// Returns the index of the node whose answer should be used.
// NOTE(review): the NULL-volume return value is an error code, not a node
// index, yet callers index their arrays with the result -- confirm callers
// never pass a NULL volume.
int
SendAvailRequest(fs_handler_t callback, VolInfo_t *vol, UserInfo_t *uinfo, PVOID msg, ULONG len, PVOID *rbuf, ULONG rsz, IO_STATUS_BLOCK *ios)
{
    ULONG mask;
    int i;
    DWORD count = 0;

    if (vol == NULL) {
        return ERROR_INVALID_HANDLE;
    }

    // lock volume for update
    LockEnter(vol->uLock);

    // issue update for each replica
    for (i = 0, mask = vol->ReadSet; mask != 0; mask = mask >> 1, i++) {
        if (!(mask & 0x1)) {
            continue;
        }
        count++;
        ios[i].Information = rsz;
        ios[i].Status = callback(vol, uinfo, i, msg, len, rbuf ? rbuf[i] : NULL, &ios[i].Information);
    }

    // process ios and evict replicas that don't agree with majority
    if (FsReadOnly ? (vol->ReadSet != 0) : CRS_QUORUM(count, vol->DiskListSz)) {
        i = FspCheckAnswers(vol, ios, rbuf, rsz);
    } else {
        i = 0;
        ios[0].Status = STATUS_MEDIA_WRITE_PROTECTED;
        ios[0].Information = count;     // return number in current read set
    }

    // unlock volume
    LockExit(vol->uLock);

    return i;
}
// Run 'callback' on every replica in the write set of the user's volume and
// reconcile the answers.
//
// callback - per-replica handler to invoke.
// uinfo    - user context; its VolInfo selects the volume.  A NULL volume
//            yields ERROR_INVALID_HANDLE.
// msg/len  - request handed through to the callback.
// rbuf     - optional per-node reply buffers (rbuf[i] goes to node i).
// rsz      - reply size; stored in ios[i].Information before each call.
// ios      - per-node status blocks; ios[i].Status receives the callback's
//            return value.
//
// The whole operation runs under the volume's update lock.  If the write set
// is non-empty afterwards the answers are reconciled by FspCheckAnswers;
// otherwise ios[0] is set to STATUS_MEDIA_WRITE_PROTECTED with Information 0.
//
// Returns the index of the node whose answer should be used.
// NOTE(review): the NULL-volume return value is an error code, not a node
// index, yet callers index their arrays with the result -- confirm callers
// never reach this with a NULL volume.
int
SendRequest(fs_handler_t callback, UserInfo_t *uinfo, PVOID msg, ULONG len, PVOID *rbuf, ULONG rsz, IO_STATUS_BLOCK *ios)
{
    VolInfo_t *vol = uinfo->VolInfo;
    ULONG mask;
    int i;

    if (vol == NULL) {
        return ERROR_INVALID_HANDLE;
    }

    // lock volume for update
    LockEnter(vol->uLock);

    // issue update for each replica
    for (i = 0, mask = vol->WriteSet; mask != 0; mask = mask >> 1, i++) {
        if (!(mask & 0x1)) {
            continue;
        }
        ios[i].Information = rsz;
        ios[i].Status = callback(vol, uinfo, i, msg, len, rbuf ? rbuf[i] : NULL, &ios[i].Information);
    }

    // process ios and evict replicas that don't agree with majority
    if (vol->WriteSet == 0) {
        i = 0;
        ios[0].Status = STATUS_MEDIA_WRITE_PROTECTED;
        ios[0].Information = 0;
    } else {
        i = FspCheckAnswers(vol, ios, rbuf, rsz);
    }

    // unlock volume
    LockExit(vol->uLock);

    return i;
}
// Run 'callback' on the first replica of the read set that answers without a
// connectivity failure.
//
// callback - per-replica handler to invoke.
// uinfo    - user context; its VolInfo selects the volume.
// msg/len  - request handed through to the callback.
// rbuf/rsz - single reply buffer and its size; rsz is stored in
//            ios->Information before each call.
// ios      - single status block; ios->Status receives the callback's return
//            value from the replica that was used.
//
// Replicas are tried in ascending index order under the volume's update
// lock.  A replica that answers STATUS_CONNECTION_DISCONNECTED or
// STATUS_VOLUME_DISMOUNTED is evicted and the next replica is tried.  If the
// read set is empty at the end, ios is set to STATUS_MEDIA_WRITE_PROTECTED
// with Information 0.
//
// Returns STATUS_SUCCESS (the operation's outcome is in ios->Status), or
// ERROR_INVALID_HANDLE for a NULL volume, in which case ios is now also
// filled in (STATUS_MEDIA_WRITE_PROTECTED, Information 0) because callers
// read ios->Status regardless of the return value.
//
// The read set is tested bit by bit against the live value on every pass.
// Previously a shifted copy was reloaded after an eviction without resetting
// the node index, so bit positions and node numbers went out of step and the
// callback could be invoked for the wrong replica.
NTSTATUS
SendReadRequest(fs_handler_t callback, UserInfo_t *uinfo, PVOID msg, ULONG len, PVOID rbuf, ULONG rsz, IO_STATUS_BLOCK *ios)
{
    int i;
    VolInfo_t *vol = uinfo->VolInfo;

    if (vol == NULL) {
        ios->Status = STATUS_MEDIA_WRITE_PROTECTED;
        ios->Information = 0;
        return ERROR_INVALID_HANDLE;
    }

    // lock volume for update
    LockEnter(vol->uLock);

    // try each replica in turn until one gives a usable answer
    for (i = 0; i < (int) (sizeof(ULONG) * 8) && (vol->ReadSet >> i) != 0; i++) {
        if (!(vol->ReadSet & ((ULONG) 1 << i))) {
            continue;
        }

        ios->Information = rsz;
        ios->Status = callback(vol, uinfo, i, msg, len, rbuf, &ios->Information);

        if (ios->Status != STATUS_CONNECTION_DISCONNECTED &&
            ios->Status != STATUS_VOLUME_DISMOUNTED) {
            break;
        }

        // mark replica as invalid; the next pass re-reads the read set
        FspEvict(vol, (ULONG) 1 << i, TRUE);
    }

    // no replica left to read from
    if (vol->ReadSet == 0) {
        ios->Status = STATUS_MEDIA_WRITE_PROTECTED;
        ios->Information = 0;
    }

    // unlock volume
    LockExit(vol->uLock);

    return STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////
// Create or open a file or directory on the replicated volume.
//
// fshdl   - user context (UserInfo_t *); must not be NULL (asserted).
// name    - path; one leading and one trailing backslash are stripped (the
//           trailing one is overwritten in place with a terminator).
// namelen - length of 'name' in characters.
// flags   - packed create flags; bits outside FLAGS_MASK are rejected.
// fattr   - optional initial attributes.
// phandle - receives the new handle number on success and
//           INVALID_FHANDLE_T on any failure.
// action  - optional; receives the access bits of 'flags' up front, then
//           (once a handle was allocated) the flags combined with the action
//           reported by the chosen replica, with ACCESS_WRITE removed if
//           write access was not granted.
//
// Root/short names, directory requests and open-existing requests go to the
// read set through FspOpen; everything else goes to the write set through
// FspCreate.
//
// Returns a Win32 error code (ERROR_INVALID_PARAMETER for bad arguments,
// otherwise the translated NT status).
//
// Fixes relative to the earlier version:
//   * A name consisting only of "\" left namelen at 0 after the leading
//     backslash was stripped; the trailing-backslash test then read
//     name[-1], wrapped namelen around and wrote far outside the buffer.
//     The trailing test is now guarded by namelen > 0.
//   * *phandle was overwritten with the slot number even when allocation
//     failed or the slot had just been freed; it is now set only on success.
DWORD
FsCreate(
    PVOID fshdl,
    LPWSTR name,
    USHORT namelen,
    UINT32 flags,
    fattr_t* fattr,
    fhandle_t* phandle,
    UINT32 *action
    )
{
    UserInfo_t *uinfo = (UserInfo_t *) fshdl;
    NTSTATUS err;
    fs_create_reply_t nfd[FsMaxNodes];
    IO_STATUS_BLOCK status[FsMaxNodes];
    PVOID rbuf[FsMaxNodes];
    fs_create_msg_t msg;
    fhandle_t fdnum;

    ASSERT(uinfo != NULL);

    xFsLog(("FsDT::create(%S, 0x%08X, 0x%08X, 0x%08d)\n", name, flags, fattr, namelen));

    if (!phandle) return ERROR_INVALID_PARAMETER;
    *phandle = INVALID_FHANDLE_T;

    if (!name) return ERROR_INVALID_PARAMETER;

    if (flags != (FLAGS_MASK & flags)) {
        return ERROR_INVALID_PARAMETER;
    }

    if (action != NULL)
        *action = flags & FS_ACCESS_MASK;

    // if we are doing a directory, open locally
    // todo: this should be merged with other case, if
    // we are doing an existing open, then no need to
    // issue update and log it, but we have to do
    // mcast in order for the close to work.
    if (namelen > 0 && *name == L'\\') {
        name++;
        namelen--;
    }
    // re-test the length: stripping the leading backslash may have emptied
    // the name
    if (namelen > 0 && name[namelen-1] == L'\\') {
        namelen--;
        name[namelen] = L'\0';
    }

    memset(&msg.xid, 0, sizeof(msg.xid));
    msg.name = name;
    msg.name_len = namelen;
    msg.flags = flags;
    msg.attr = 0;
    if (fattr) {
        msg.attr = unget_attributes(fattr->attributes);
    }

    FspInitAnswers(status, rbuf, (char *) nfd, sizeof(nfd[0]));

    // allocate a new handle
    err = FspAllocatePrivateHandle(uinfo, &fdnum);
    if (err == STATUS_SUCCESS) {
        int sid;

        msg.fnum = fdnum;
        // Set flags in advance to sync with replay
        uinfo->Table[fdnum].Flags = flags;

        if (namelen < 2 ||
            ((flags & FS_DISP_MASK) == DISP_DIRECTORY) ||
            (unget_disp(flags) == FILE_OPEN)) {
            sid = SendAvailRequest(FspOpen, uinfo->VolInfo, uinfo, (PVOID) &msg, sizeof(msg), rbuf, sizeof(nfd[0]), status);
        } else {
            sid = SendRequest(FspCreate, uinfo, (PVOID) &msg, sizeof(msg), rbuf, sizeof(nfd[0]), status);
        }

        if (action != NULL) {
            if (!(nfd[sid].access & FILE_GENERIC_WRITE))
                flags &= ~ACCESS_WRITE;
            *action = flags | nfd[sid].action;
        }

        err = status[sid].Status;
        if (err == STATUS_SUCCESS) {
            fs_id_t *fid = FS_GET_FID_HANDLE(uinfo, fdnum);

            // set file id
            memcpy((PVOID) fid, (PVOID) nfd[sid].fid, sizeof(fs_id_t));
            FsLog(("File id %I64x:%I64x\n", (*fid)[0], (*fid)[1]));
            // todo: bind handles to completion port if we do async

            // only a handle that is really open is handed back
            *phandle = fdnum;
        } else {
            // free handle
            FspFreeHandle(uinfo, fdnum);
        }
    }

    // todo: need to set fid

    FsLog(("create: return fd %d err %x\n", *phandle, err));

    return RtlNtStatusToDosError(err);
}
// Fill a FILE_BASIC_INFORMATION from a fattr_t.
//
// The output is zeroed first.  Each of the three times is copied only when
// it is not INVALID_UINT64, and the attributes are converted with
// unget_attributes only when they are not INVALID_UINT32; fields that are
// skipped therefore stay zero.
void
BuildFileAttr(FILE_BASIC_INFORMATION *attr, fattr_t *fattr)
{
    memset(attr, 0, sizeof(*attr));

    if (fattr->create_time != INVALID_UINT64) {
        attr->CreationTime.QuadPart = fattr->create_time;
    }

    if (fattr->access_time != INVALID_UINT64) {
        attr->LastAccessTime.QuadPart = fattr->access_time;
    }

    if (fattr->mod_time != INVALID_UINT64) {
        attr->LastWriteTime.QuadPart = fattr->mod_time;
    }

    if (fattr->attributes != INVALID_UINT32) {
        attr->FileAttributes = unget_attributes(fattr->attributes);
    }
}
// Set attributes on an open file across the write set.
//
// fshdl  - user context (UserInfo_t *).
// handle - handle number of the open file; INVALID_FHANDLE_T is rejected.
// attr   - attributes to apply; NULL is rejected.
//
// Builds an fs_setattr_msg_t (zero xid, the handle's file id, the converted
// attributes, the handle number) and sends it with FspSetAttr.
//
// Returns ERROR_INVALID_PARAMETER for bad arguments, otherwise the Win32
// translation of the chosen replica's status.
DWORD
FsSetAttr(
    PVOID fshdl,
    fhandle_t handle,
    fattr_t* attr
    )
{
    UserInfo_t *uinfo = (UserInfo_t *)fshdl;
    IO_STATUS_BLOCK status[FsMaxNodes];
    fs_setattr_msg_t msg;
    int sid;

    if (attr == NULL || handle == INVALID_FHANDLE_T) {
        return ERROR_INVALID_PARAMETER;
    }

    // todo: get file id
    memset(&msg.xid, 0, sizeof(msg.xid));
    msg.fnum = handle;
    msg.fs_id = FS_GET_FID_HANDLE(uinfo, handle);
    BuildFileAttr(&msg.attr, attr);

    FspInitAnswers(status, NULL, NULL, 0);

    sid = SendRequest(FspSetAttr, uinfo, (char *)&msg, sizeof(msg), NULL, 0, status);

    return RtlNtStatusToDosError(status[sid].Status);
}
// Set attributes on a file identified by name across the write set.
//
// fshdl    - user context (UserInfo_t *).
// name     - path; one leading backslash is stripped.  NULL is rejected.
// name_len - length of 'name' in characters.
// attr     - attributes to apply; NULL is rejected.
//
// Builds an fs_setattr_msg_t (zero xid, name, converted attributes) and
// sends it with FspSetAttr2.
//
// Returns ERROR_INVALID_PARAMETER for bad arguments, otherwise the Win32
// translation of the chosen replica's status.
DWORD
FsSetAttr2(
    PVOID fshdl,
    LPWSTR name,
    USHORT name_len,
    fattr_t* attr
    )
{
    UserInfo_t *uinfo = (UserInfo_t *) fshdl;
    IO_STATUS_BLOCK status[FsMaxNodes];
    fs_setattr_msg_t msg;
    int sid;

    if (attr == NULL || name == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    if (*name == '\\') {
        name++;
        name_len--;
    }

    // todo: locate file id
    memset(&msg.xid, 0, sizeof(msg.xid));
    msg.name = name;
    msg.name_len = name_len;

    BuildFileAttr(&msg.attr, attr);

    FspInitAnswers(status, NULL, NULL, 0);

    sid = SendRequest(FspSetAttr2, uinfo, (char *)&msg, sizeof(msg), NULL, 0, status);

    return RtlNtStatusToDosError(status[sid].Status);
}
// FsLookup
//
// Looks up a file by name and returns its attributes.
//   fshdl    - per-user tree context (cast to UserInfo_t *)
//   name     - file name; one leading backslash is stripped. Must not be NULL.
//   name_len - length that accompanies name in the request
//   fattr    - receives size, allocation size, times and attributes on
//              success. Must not be NULL.
// Returns ERROR_INVALID_PARAMETER on NULL arguments, otherwise the Win32
// translation of the status reported in the I/O status block.
DWORD FsLookup(PVOID fshdl, LPWSTR name, USHORT name_len, fattr_t* fattr)
{
    fs_lookup_msg_t msg;
    int err;
    IO_STATUS_BLOCK ios;
    FILE_NETWORK_OPEN_INFORMATION attr;

    // Validate before touching name: it is dereferenced just below and is
    // also formatted with %S by the log line.
    if (!fattr || !name)
        return ERROR_INVALID_PARAMETER;

    FsLog(("Lookup name '%S' %x\n", name, fattr));

    if (*name == '\\') {
        name++;
        name_len--;
    }

    msg.name = name;
    msg.name_len = name_len;

    err = SendReadRequest(FspLookup, (UserInfo_t *)fshdl, (PVOID) &msg, sizeof(msg), (PVOID) &attr, sizeof(attr), &ios);

    // the reported result is the status block, not the call's return value
    err = ios.Status;
    if (ios.Status == STATUS_SUCCESS) {
        fattr->file_size = attr.EndOfFile.QuadPart;
        fattr->alloc_size = attr.AllocationSize.QuadPart;
        fattr->create_time = *(TIME64 *)&attr.CreationTime;
        fattr->access_time = *(TIME64 *)&attr.LastAccessTime;
        fattr->mod_time = *(TIME64 *)&attr.LastWriteTime;
        fattr->attributes = get_attributes(attr.FileAttributes);
    }

    FsLog(("Lookup: return %x\n", err));

    return RtlNtStatusToDosError(err);
}
// FsGetAttr
//
// Fetches the attributes of an open file identified by handle.
// On success fattr receives size, allocation size, the three timestamps and
// the attribute bits. Returns ERROR_INVALID_PARAMETER when fattr is NULL,
// otherwise the Win32 translation of the status in the I/O status block.
//
// NOTE(review): the attribute bits are stored exactly as returned in
// FILE_NETWORK_OPEN_INFORMATION, whereas FsLookup passes them through
// get_attributes(). Confirm whether callers expect the raw value here.
DWORD FsGetAttr(PVOID fshdl, fhandle_t handle, fattr_t* fattr)
{
    FILE_NETWORK_OPEN_INFORMATION info;
    IO_STATUS_BLOCK ios;
    int rc;

    xFsLog(("Getattr fid '%d' %x\n", handle, fattr));

    if (fattr == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    rc = SendReadRequest(FspGetAttr, (UserInfo_t *)fshdl, (PVOID) &handle, sizeof(handle), (PVOID) &info, sizeof(info), &ios);

    // the reported result is the status block, not the call's return value
    rc = ios.Status;
    if (rc == STATUS_SUCCESS) {
        fattr->attributes = info.FileAttributes;
        fattr->mod_time = *(TIME64 *)&info.LastWriteTime;
        fattr->access_time = *(TIME64 *)&info.LastAccessTime;
        fattr->create_time = *(TIME64 *)&info.CreationTime;
        fattr->alloc_size = info.AllocationSize.QuadPart;
        fattr->file_size = info.EndOfFile.QuadPart;
    }

    FsLog(("Getattr: return %d\n", rc));

    return RtlNtStatusToDosError(rc);
}
// FsClose
//
// Closes an open file handle. The handle is rejected with
// ERROR_INVALID_PARAMETER when it is INVALID_FHANDLE_T or not below
// FsTableSize. The handle-table slot is released only when the close
// request reports STATUS_SUCCESS.
DWORD FsClose(PVOID fshdl, fhandle_t handle)
{
    UserInfo_t *user = (UserInfo_t *) fshdl;
    IO_STATUS_BLOCK answers[FsMaxNodes];
    int idx, rc;

    if (handle == INVALID_FHANDLE_T || handle >= FsTableSize) {
        return ERROR_INVALID_PARAMETER;
    }

    FsLog(("Close: fid %d\n", handle));

    FspInitAnswers(answers, NULL, NULL, 0);

    idx = SendAvailRequest(FspClose, user->VolInfo, user, (PVOID) &handle, sizeof(handle), NULL, 0, answers);

    rc = answers[idx].Status;
    if (rc == STATUS_SUCCESS) {
        // need to free this handle slot
        FspFreeHandle(user, handle);
    }

    FsLog(("Close: fid %d err %x\n", handle, rc));

    return RtlNtStatusToDosError(rc);
}
// FsWrite
//
// Writes *pcount bytes from buffer to an open file at offset.
//   pcount - in: byte count to write; out: low 16 bits of the Information
//            field of the selected status block
// When offset, viewed as a signed int, is negative the offset is reset to
// zero and the count is reduced by one before the request is built.
// Returns ERROR_INVALID_PARAMETER when pcount is NULL or handle is
// INVALID_FHANDLE_T, otherwise the Win32 translation of the reply status.
DWORD FsWrite(PVOID fshdl, fhandle_t handle, UINT32 offset, UINT16 *pcount, void* buffer, PVOID context)
{
    UserInfo_t *user = (UserInfo_t *) fshdl;
    IO_STATUS_BLOCK answers[FsMaxNodes];
    fs_io_msg_t req;
    DWORD rc;
    int idx;

    if (pcount == NULL || handle == INVALID_FHANDLE_T) {
        return ERROR_INVALID_PARAMETER;
    }

    FsLog(("Write %d offset %d count %d\n", handle, offset, *pcount));

    if ((int) offset < 0) {
        offset = 0;
        (*pcount)--;
    }

    // todo: locate file id
    memset(&req.xid, 0, sizeof(req.xid));
    req.fnum = handle;
    req.fs_id = FS_GET_FID_HANDLE(user, handle);
    req.context = context;
    req.buf = buffer;
    req.size = (UINT32) *pcount;
    req.offset = offset;

    FspInitAnswers(answers, NULL, NULL, 0);

    idx = SendRequest(FspWrite, (UserInfo_t *)fshdl, (PVOID) &req, sizeof(req), NULL, 0, answers);

    rc = answers[idx].Status;
    *pcount = (USHORT) answers[idx].Information;

    FsLog(("write: return %x\n", rc));

    return RtlNtStatusToDosError(rc);
}
// FsRead
//
// Reads up to *pcount bytes from an open file at offset into buffer.
//   pcount - in: requested byte count; out: low 16 bits of the Information
//            field of the status block (0 at end of file)
// Returns ERROR_INVALID_PARAMETER when pcount is NULL or handle is
// INVALID_FHANDLE_T (the same validation FsWrite performs), ERROR_SUCCESS
// with *pcount == 0 on STATUS_END_OF_FILE, otherwise the Win32 translation
// of the status in the I/O status block.
DWORD FsRead(PVOID fshdl, fhandle_t handle, UINT32 offset, UINT16* pcount, void* buffer, PVOID context)
{
    NTSTATUS err;
    IO_STATUS_BLOCK ios;
    fs_io_msg_t msg;

    // pcount is dereferenced while building the request, so check it first
    if (!pcount || handle == INVALID_FHANDLE_T)
        return ERROR_INVALID_PARAMETER;

    memset(&msg.xid, 0, sizeof(msg.xid));
    msg.offset = offset;
    msg.buf = buffer;
    msg.size = (UINT32) *pcount;
    msg.context = context;
    msg.fnum = handle;

    FsLog(("read: %x fd %d sz %d\n", context, handle, msg.size));

    err = SendReadRequest(FspRead, (UserInfo_t *)fshdl, (PVOID) &msg, sizeof(msg), NULL, 0, &ios);

    // the reported result is the status block, not the call's return value
    err = ios.Status;
    if (err == STATUS_END_OF_FILE) {
        // end of file is reported to the caller as a successful zero-byte read
        *pcount = 0;
        return ERROR_SUCCESS;
    }
    err = RtlNtStatusToDosError(err);

    *pcount = (USHORT) ios.Information;

    FsLog(("read: %x return %x sz %d\n", context, err, *pcount));

    return err;
}
// FsReadDir
//
// Reads directory entries from the open directory handle dir.
//   cookie        - forwarded in the request
//   buffer, size  - destination buffer and its size, forwarded in the request
//   entries_found - receives the Information field of the status block
// Returns ERROR_INVALID_PARAMETER when buffer or entries_found is NULL,
// otherwise the Win32 translation of the status in the I/O status block.
DWORD FsReadDir(PVOID fshdl, fhandle_t dir, UINT32 cookie, dirinfo_t* buffer, UINT32 size, UINT32 *entries_found)
{
    IO_STATUS_BLOCK ios;
    fs_io_msg_t req;
    int rc;

    FsLog(("read_dir: cookie %d buf %x entries %x\n", cookie, buffer, entries_found));

    if (buffer == NULL || entries_found == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    req.fnum = dir;
    req.size = size;
    req.buf = (PVOID) buffer;
    req.cookie = cookie;

    rc = SendReadRequest(FspReadDir, (UserInfo_t *)fshdl, (PVOID) &req, sizeof(req), NULL, 0, &ios);

    // the reported result is the status block, not the call's return value
    rc = ios.Status;
    *entries_found = (UINT32) ios.Information;

    xFsLog(("read_dir: err %d entries %d\n", rc, *entries_found));

    return RtlNtStatusToDosError(rc);
}
// FsRemove
//
// Removes the file named by name. One leading backslash is stripped from
// the name before the request is sent. The dispatch table in this file
// registers this function in two slots.
// Returns ERROR_INVALID_PARAMETER when name is NULL, otherwise the Win32
// translation of the status slot selected by SendRequest.
DWORD FsRemove(PVOID fshdl, LPWSTR name, USHORT name_len)
{
    fs_remove_msg_t msg;
    int err, sid;
    IO_STATUS_BLOCK status[FsMaxNodes];

    // name is dereferenced just below; reject NULL like FsRename does
    if (!name)
        return ERROR_INVALID_PARAMETER;

    if (*name == L'\\') {
        name++;
        name_len--;
    }

    memset(&msg.xid, 0, sizeof(msg.xid));
    msg.name = name;
    msg.name_len = name_len;

    FspInitAnswers(status, NULL, NULL, 0);

    sid = SendRequest(FspRemove, (UserInfo_t *) fshdl, (PVOID *)&msg, sizeof(msg), NULL, 0, status);

    err = status[sid].Status;

    return RtlNtStatusToDosError(err);
}
// FsRename
//
// Renames from_name to to_name. One leading backslash is stripped from each
// name first. Returns ERROR_INVALID_PARAMETER when either name is NULL or
// is empty after stripping, otherwise the Win32 translation of the status
// slot selected by SendRequest.
DWORD FsRename(PVOID fshdl, LPWSTR from_name, USHORT from_name_len, LPWSTR to_name, USHORT to_name_len)
{
    IO_STATUS_BLOCK answers[FsMaxNodes];
    fs_rename_msg_t req;
    int rc, idx;

    if (from_name == NULL || to_name == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    if (*from_name == L'\\') {
        from_name++;
        from_name_len--;
    }

    if (*to_name == L'\\') {
        to_name++;
        to_name_len--;
    }

    if (*from_name == L'\0' || *to_name == L'\0') {
        return ERROR_INVALID_PARAMETER;
    }

    FsLog(("rename %S -> %S,%d\n", from_name, to_name,to_name_len));

    memset(&req.xid, 0, sizeof(req.xid));
    req.dname = to_name;
    req.dname_len = to_name_len;
    req.sname = from_name;
    req.sname_len = from_name_len;

    FspInitAnswers(answers, NULL, NULL, 0);

    idx = SendRequest(FspRename, (UserInfo_t *) fshdl, (PVOID) &req, sizeof(req), NULL, 0, answers);

    rc = answers[idx].Status;

    return RtlNtStatusToDosError(rc);
}
// FsMkDir
//
// Creates a directory named name. One leading backslash is stripped first.
// The request carries unget_attributes(attr->attributes) when attr is
// supplied and FILE_ATTRIBUTE_DIRECTORY otherwise, with flags
// DISP_DIRECTORY | SHARE_READ | SHARE_WRITE.
// Returns ERROR_INVALID_PARAMETER when name is NULL, otherwise the Win32
// translation of the status slot selected by SendRequest.
DWORD FsMkDir(PVOID fshdl, LPWSTR name, USHORT name_len, fattr_t* attr)
{
    IO_STATUS_BLOCK answers[FsMaxNodes];
    fs_id_t ids[FsMaxNodes];
    PVOID *rbuf[FsMaxNodes];
    fs_create_msg_t req;
    int rc, idx;

    if (name == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    if (*name == L'\\') {
        name++;
        name_len--;
    }

    memset(&req.xid, 0, sizeof(req.xid));
    if (attr != NULL) {
        req.attr = unget_attributes(attr->attributes);
    } else {
        req.attr = FILE_ATTRIBUTE_DIRECTORY;
    }
    req.flags = DISP_DIRECTORY | SHARE_READ | SHARE_WRITE;
    req.name = name;
    req.name_len = name_len;

    FspInitAnswers(answers, (PVOID *)rbuf, (PVOID) ids, sizeof(ids[0]));

    idx = SendRequest(FspMkDir, (UserInfo_t *) fshdl, (PVOID) &req, sizeof(req), (PVOID *)rbuf, sizeof(ids[0]), answers);

    rc = answers[idx].Status;
    // todo: insert pathname and file id into hash table

    return RtlNtStatusToDosError(rc);
}
// FsFlush
//
// Sends a flush request for an open file handle. A STATUS_PENDING reply is
// reported to the caller as success; any other status is translated to its
// Win32 error code.
DWORD FsFlush(PVOID fshdl, fhandle_t handle)
{
    IO_STATUS_BLOCK answers[FsMaxNodes];
    NTSTATUS rc;
    int idx;

    FspInitAnswers(answers, NULL, NULL, 0);

    idx = SendRequest(FspFlush, (UserInfo_t *) fshdl, (PVOID) &handle, sizeof(handle), NULL, 0, answers);
    rc = answers[idx].Status;

    FsLog(("Flush %d err %x\n", handle, rc));

    if (rc == STATUS_PENDING) {
        rc = STATUS_SUCCESS;
    }

    return RtlNtStatusToDosError(rc);
}
// FsLock
//
// Requests a byte-range lock of length bytes at offset on an open file.
// flags is forwarded in the request; context is accepted but not used by
// this function. Returns ERROR_INVALID_PARAMETER for INVALID_FHANDLE_T,
// otherwise the Win32 translation of the status slot selected by SendRequest.
DWORD FsLock(PVOID fshdl, fhandle_t handle, ULONG offset, ULONG length, ULONG flags, PVOID context)
{
    IO_STATUS_BLOCK answers[FsMaxNodes];
    fs_lock_msg_t req;
    int rc, idx;

    if (handle == INVALID_FHANDLE_T) {
        return ERROR_INVALID_PARAMETER;
    }

    memset(&req.xid, 0, sizeof(req.xid));
    req.fnum = handle;
    req.flags = flags;
    req.length = length;
    req.offset = offset;

    FsLog(("Lock fid %d off %d len %d\n", req.fnum, offset, length));

    FspInitAnswers(answers, NULL, NULL, 0);

    idx = SendRequest(FspLock, (UserInfo_t *) fshdl, (PVOID)&req, sizeof(req), NULL, 0, answers);

    rc = answers[idx].Status;

    FsLog(("Lock fid %d err %x\n", req.fnum, rc));

    return RtlNtStatusToDosError(rc);
}
// FsUnlock
//
// Releases a byte-range lock of length bytes at offset on an open file.
// Returns ERROR_INVALID_PARAMETER for INVALID_FHANDLE_T, otherwise the
// Win32 translation of the status slot selected by SendRequest.
DWORD FsUnlock(PVOID fshdl, fhandle_t handle, ULONG offset, ULONG length)
{
    fs_lock_msg_t msg;
    int err, sid;
    IO_STATUS_BLOCK status[FsMaxNodes];

    if (handle == INVALID_FHANDLE_T)
        return ERROR_INVALID_PARAMETER;

    memset(&msg.xid, 0, sizeof(msg.xid));
    msg.offset = offset;
    msg.length = length;
    // unlock has no flags argument; send a defined value instead of
    // whatever happened to be on the stack
    msg.flags = 0;
    msg.fnum = handle;

    FsLog(("Unlock fid %d off %d len %d\n", handle, offset, length));

    FspInitAnswers(status, NULL, NULL, 0);

    sid = SendRequest(FspUnlock, (UserInfo_t *) fshdl, (PVOID)&msg, sizeof(msg), NULL, 0, status);

    err = status[sid].Status;

    return RtlNtStatusToDosError(err);
}
// FsStatFs
//
// Issues an FspStatFs read request, passing attr as the request message.
// Returns ERROR_INVALID_PARAMETER when attr is NULL, otherwise the Win32
// translation of the status in the I/O status block.
DWORD FsStatFs(PVOID fshdl, fs_attr_t* attr)
{
    IO_STATUS_BLOCK ios;
    DWORD rc;

    if (attr == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    rc = SendReadRequest(FspStatFs, (UserInfo_t *) fshdl, (PVOID) attr, sizeof(*attr), NULL, 0, &ios);

    // the reported result is the status block, not the call's return value
    rc = ios.Status;

    return RtlNtStatusToDosError(rc);
}
// FsGetRoot
//
// Writes the root path for this tree into fullpath.
// When the owning context has a local Root set, the path is formatted
// locally as "\\?\<context root>\<volume root>"; otherwise an FspGetRoot
// read request is issued with fullpath as the reply buffer.
// The size of fullpath is not passed in, so the caller's buffer must be
// large enough for the formatted path.
// Returns ERROR_INVALID_PARAMETER when either argument is NULL.
DWORD FsGetRoot(PVOID fshdl, LPWSTR fullpath)
{
    UserInfo_t *user = (UserInfo_t *) fshdl;
    IO_STATUS_BLOCK ios;
    DWORD rc;

    if (fshdl == NULL || fullpath == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    if (user->VolInfo->FsCtx->Root) {
        // use local replica instead
        swprintf(fullpath, L"\\\\?\\%s\\%s", user->VolInfo->FsCtx->Root, user->VolInfo->Root);
        FsLog(("FspGetRoot '%S'\n", fullpath));
        rc = STATUS_SUCCESS;
    } else {
        rc = SendReadRequest(FspGetRoot, user, NULL, 0, (PVOID)fullpath, 0, &ios);
        rc = ios.Status;
    }

    return RtlNtStatusToDosError(rc);
}
static FsDispatchTable gDisp = { 0x100, FsCreate, FsLookup, FsSetAttr, FsSetAttr2, FsGetAttr, FsClose, FsWrite, FsRead, FsReadDir, FsStatFs, FsRemove, FsRename, FsMkDir, FsRemove, FsFlush, FsLock, FsUnlock, FsGetRoot }; //////////////////////////////////////////////////////////////
// FsInit
//
// Allocates and initialises a filesystem context and registers the IPC
// share ("IPC$") on it.
//   resHdl - stored in the context as reshdl
//   Hdl    - receives the new context on success, NULL if registration fails
// Returns ERROR_INVALID_PARAMETER when Hdl is NULL,
// ERROR_NOT_ENOUGH_MEMORY when the context cannot be allocated, otherwise
// the result of FsRegister.
DWORD FsInit(PVOID resHdl, PVOID *Hdl)
{
    VolInfo_t *ipc;
    FsCtx_t *ctx;
    DWORD rc;

    // This should be a compile check instead of runtime check
    ASSERT(sizeof(fs_log_rec_t) == CRS_RECORD_SZ);
    ASSERT(sizeof(fs_log_rec_t) == sizeof(CrsRecord_t));

    if (Hdl == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    FsLog(("FsInit:\n"));

    // allocate a context
    ctx = (FsCtx_t *) MemAlloc(sizeof(*ctx));
    if (ctx == NULL) {
        return ERROR_NOT_ENOUGH_MEMORY;
    }

    // start from an all-zero context, with no local path yet
    memset(ctx, 0, sizeof(*ctx));
    ctx->Root = NULL;

    LockInit(ctx->Lock);

    ctx->reshdl = resHdl;
    *Hdl = (PVOID) ctx;

    // we need to mount the IPC share now
    rc = FsRegister((PVOID)ctx, L"IPC$", L"dummy", NULL, 0, &ctx->ipcHdl);
    if (rc != ERROR_SUCCESS) {
        FsLog(("FsInit: failed to register ipc share %d\n", rc));
        // free memory
        MemFree(ctx);
        *Hdl = NULL;
        return rc;
    }

    // Init. volume: node zero starts with no handle and empty sets
    ipc = (VolInfo_t *)ctx->ipcHdl;
    ASSERT(ipc != NULL);
    ipc->Fd[0] = INVALID_HANDLE_VALUE;
    ipc->ReadSet = 0;
    ipc->AliveSet = 0;

    return rc;
}
// FspFreeSession
//
// Tears down a session: unlinks its embedded UserInfo_t (s->TreeCtx) from
// the owning volume's user list, closes every handle-table slot whose Flags
// is non-zero, deletes the user lock and frees the session memory.
// The caller must already have removed s from the context's session list;
// this function does not touch that list.
void FspFreeSession(SessionInfo_t *s) { UserInfo_t *u; int i, j;
u = &s->TreeCtx; FsLog(("Session free uid %d tid %d ref %d\n", u->Uid, u->Tid, u->RefCnt));
LockEnter(u->Lock); if (u->VolInfo != NULL) { UserInfo_t **p; VolInfo_t *v = u->VolInfo;
// The user lock is released before the volume uLock is taken and only
// re-acquired afterwards, so the two are never held in user -> volume order.
// NOTE(review): presumably this avoids inverting the order used by
// FsRelease/FspCloseVolume (volume uLock, then user lock) -- confirm.
LockExit(u->Lock);
// remove from the volume's user list now
LockEnter(v->uLock); p = &v->UserList; while (*p != NULL) { if (*p == u) { // found it
*p = u->Next; FsLog(("Remove uinfo %x,%x from vol %x %S\n", u, u->Next, v->UserList, v->Root)); break; } p = &(*p)->Next; } LockExit(v->uLock);
// take the user lock again
LockEnter(u->Lock); }
// Close all user handles that are still in use (non-zero Flags)
for (i = 0; i < FsTableSize; i++) { if (u->Table[i].Flags) { FsLog(("Close slot %d %x\n", i, u->Table[i].Flags)); FsClose((PVOID) u, (fhandle_t)i); } }
// clear the volume pointer
u->VolInfo = NULL;
LockExit(u->Lock);
DeleteCriticalSection(&u->Lock);
// free memory now, don't free u since it's part of s
MemFree(s); }
// FspCloseVolume
//
// Resets the arbitration state to idle and, for every node whose bit is set
// in AliveSet, closes that node's CRS handle (if any), change-notification
// handle and root handle, then closes that node's file handle in every
// handle-table slot of every user on the volume's user list. Each user's
// lock is held while that user's table is swept.
void FspCloseVolume(VolInfo_t *vol, ULONG AliveSet)
{
    DWORD node;

    // clear arbitrate state now
    vol->Arbitrate.State = ARB_STATE_IDLE;

    // close per-node handles <crs, vol, open files> for our alive set
    for (node = 0; node < FsMaxNodes; node++) {
        UserInfo_t *user;

        if (!(AliveSet & (1 << node))) {
            continue;
        }

        if (vol->CrsHdl[node]) {
            CrsClose(vol->CrsHdl[node]);
            vol->CrsHdl[node] = NULL;
        }

        FindCloseChangeNotification(vol->NotifyFd[node]);
        vol->NotifyFd[node] = INVALID_HANDLE_VALUE;

        xFsClose(vol->Fd[node]);
        vol->Fd[node] = INVALID_HANDLE_VALUE;

        // need to close all user handles now
        for (user = vol->UserList; user; user = user->Next) {
            DWORD slot;

            FsLog(("Lock user %x root %S\n", user, vol->Root));
            LockEnter(user->Lock);

            // close all handles for this node
            for (slot = 0; slot < FsTableSize; slot++) {
                if (user->Table[slot].Fd[node] == INVALID_HANDLE_VALUE) {
                    continue;
                }
                FsLog(("Close fid %d\n", slot));
                xFsClose(user->Table[slot].Fd[node]);
                user->Table[slot].Fd[node] = INVALID_HANDLE_VALUE;
            }

            LockExit(user->Lock);
            FsLog(("Unlock user %x\n", user));
        }
    }
}
// FsEnd
//
// Call this when the resource is being deleted and we need to get rid of
// our IPC reference to the directory. Closes the node-zero handle of the
// IPC volume and clears its read and alive sets, under the context lock.
// A NULL Hdl is ignored.
void FsEnd(PVOID Hdl)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    VolInfo_t *ipc;

    if (ctx == NULL) {
        return;
    }

    LockEnter(ctx->Lock);

    ipc = (VolInfo_t *)ctx->ipcHdl;
    if (ipc != NULL) {
        xFsClose(ipc->Fd[0]);
        ipc->Fd[0] = INVALID_HANDLE_VALUE;
        ipc->ReadSet = 0;
        ipc->AliveSet = 0;
    }

    LockExit(ctx->Lock);
}
// FsExit
//
// Flushes all state owned by a filesystem context and frees it: every
// session on the session list is freed, every volume is closed for its
// current alive set and freed, every logon record has its token closed and
// is freed, and finally the context itself is freed.
// A NULL Hdl is ignored, matching FsEnd.
//
// NOTE(review): the locks set up with LockInit on the context and on each
// volume are not torn down here -- confirm whether they need explicit
// deletion before the memory is freed.
void FsExit(PVOID Hdl)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    VolInfo_t *p;
    SessionInfo_t *s;
    LogonInfo_t *log;

    if (!ctx)
        return;

    LockEnter(ctx->Lock);

    while ((s = ctx->SessionList) != NULL) {
        ctx->SessionList = s->Next;
        // free this session now
        FspFreeSession(s);
    }

    while ((p = ctx->VolList) != NULL) {
        ctx->VolList = p->Next;
        ctx->VolListSz--;
        // free this volume now
        FspCloseVolume(p, p->AliveSet);
        MemFree(p);
    }

    while ((log = ctx->LogonList) != NULL) {
        ctx->LogonList = log->Next;
        // free token
        CloseHandle(log->Token);
        MemFree(log);
    }

    // now we free our structure
    LockExit(ctx->Lock);
    MemFree(ctx);
}
// FsRegister
//
// Adds a new share to the list of trees available, or updates the entry
// that already exists for root.
//   Hdl      - filesystem context (FsCtx_t *)
//   root     - volume root name; stored by pointer, not copied
//   share    - local share path; stored by pointer in ctx->Root when the
//              IPC volume is already registered
//   disklist - optional array of replica paths; entries 1..FsMaxNodes-1 are
//              copied by pointer, so the array must have FsMaxNodes entries
//   len      - number of disks; must be below FsMaxNodes
//   vHdl     - receives the volume handle
// When the context already has an IPC volume, its node-zero handle is
// closed and reopened on "\??\<share>\" first; failure to open it fails
// the whole call.
// Returns ERROR_TOO_MANY_NAMES, ERROR_INVALID_PARAMETER,
// ERROR_NOT_ENOUGH_MEMORY, the translated open status, or ERROR_SUCCESS.
DWORD FsRegister(PVOID Hdl, LPWSTR root, LPWSTR share, LPWSTR disklist[], DWORD len, PVOID *vHdl)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    VolInfo_t *p;

    // check limit
    if (len >= FsMaxNodes) {
        return ERROR_TOO_MANY_NAMES;
    }

    // The IPC path built below is "\??\" (4) + share + "\" (1) + NUL (1) and
    // must fit in WCHAR path[MAX_PATH], so share may be at most
    // MAX_PATH - 6 characters long.
    if (root == NULL || share == NULL || (wcslen(share) > (MAX_PATH - 6))) {
        return ERROR_INVALID_PARAMETER;
    }

    // add a new volume to the list of volume. path is an array
    // of directories. Note: The order of this list MUST be the
    // same in all nodes since it also determines the disk id

    // this is a simple check and assume one thread is calling this function
    LockEnter(ctx->Lock);

    // update our ipc context
    if (ctx->ipcHdl) {
        NTSTATUS status;
        UINT32 disp = FILE_OPEN;
        HANDLE vfd;
        WCHAR path[MAX_PATH];

        p = (VolInfo_t *)ctx->ipcHdl;
        if (p->Fd[0] != INVALID_HANDLE_VALUE)
            xFsClose(p->Fd[0]);
        p->Fd[0] = INVALID_HANDLE_VALUE;
        p->ReadSet = 0;
        p->AliveSet = 0;

        // set local path
        ctx->Root = share;

        // update our ipc handle now
        FsLog(("FsRegister: ipc share '%S'\n", share));

        // open our local ipc path
        wcscpy(path, L"\\??\\");
        wcscat(path, share);
        wcscat(path, L"\\");

        status = xFsCreate(&vfd, NULL, path, wcslen(path), FILE_DIRECTORY_FILE|FILE_SYNCHRONOUS_IO_ALERT, 0, FILE_SHARE_READ|FILE_SHARE_WRITE, &disp, FILE_GENERIC_READ|FILE_GENERIC_WRITE|FILE_GENERIC_EXECUTE, NULL, 0);

        if (status == STATUS_SUCCESS) {
            // our root must have already been created and secured.
            ASSERT(disp != FILE_CREATED);
            // use node zero
            p->Fd[0] = vfd;
            p->ReadSet = 0x1;
            p->AliveSet = 0x1;
        } else {
            FsLog(("Fsregister: '%S' failed to open %x\n", share, status));
            LockExit(ctx->Lock);
            return RtlNtStatusToDosError(status);
        }
    }

    // find the volume share; if found, leave the loop holding its uLock
    for (p = ctx->VolList; p != NULL; p = p->Next) {
        if (!wcscmp(root, p->Root)) {
            LockEnter(p->uLock);
            break;
        }
    }
    LockExit(ctx->Lock);

    if (p == NULL) {
        p = (VolInfo_t *)MemAlloc(sizeof(*p));
        if (p == NULL) {
            return ERROR_NOT_ENOUGH_MEMORY;
        }

        memset(p, 0, sizeof(*p));

        LockInit(p->uLock);
        LockInit(p->qLock);

        // We don't need to walk the list again to check if a register has happened because
        // this is serialized in nodequorum.c
        LockEnter(ctx->Lock);
        p->Tid = (USHORT)++ctx->VolListSz;
        p->Next = ctx->VolList;
        ctx->VolList = p;
        p->FsCtx = ctx;

        // lock the volume
        LockEnter(p->uLock);

        LockExit(ctx->Lock);

        p->Label = L"Cluster Quorum";
    }

    // both paths arrive here holding p->uLock
    p->Root = root;
    if (disklist) {
        DWORD i;

        // slot 0 is not copied from the caller's list
        for (i = 1; i < FsMaxNodes; i++)
            p->DiskList[i] = disklist[i];
    }
    p->DiskListSz = len;

    FsLog(("FsRegister Tid %d Share '%S' %d disks\n", p->Tid, root, len));

    // drop the volume lock
    LockExit(p->uLock);

    *vHdl = (PVOID) p;

    return ERROR_SUCCESS;
}
// FspAllocateSession
//
// Allocates a zeroed session, initialises the lock of its embedded tree
// context and sets every per-node handle of every handle-table slot to
// INVALID_HANDLE_VALUE. Returns NULL when allocation fails.
SessionInfo_t * FspAllocateSession()
{
    SessionInfo_t *session;
    UserInfo_t *tree;
    int slot, node;

    // add user to our tree and initialize handle tables
    session = (SessionInfo_t *)MemAlloc(sizeof(*session));
    if (session == NULL) {
        return session;
    }

    memset(session, 0, sizeof(*session));

    tree = &session->TreeCtx;
    LockInit(tree->Lock);

    // init handle table
    for (slot = 0; slot < FsTableSize; slot++) {
        for (node = 0; node < FsMaxNodes; node++) {
            FS_SET_USER_HANDLE(tree, node, slot, INVALID_HANDLE_VALUE);
        }
    }

    return session;
}
// FsMount
//
// Binds a session to a specific tree/share.
//   Hdl       - filesystem context (FsCtx_t *)
//   root_name - share name, matched case-insensitively against volume roots
//   uid       - user id for the session
//   tid       - receives the tree id of the share (0 when not found)
// A session object is allocated up front. It is linked in when no session
// exists yet for (uid, tid); otherwise the existing session's reference
// count is bumped and the spare object is released.
// Returns ERROR_NOT_ENOUGH_MEMORY, ERROR_BAD_NET_NAME when the share is
// unknown, or ERROR_SUCCESS.
//
// NOTE(review): FspAllocateSession also runs LockInit on the session's
// tree lock, and FspFreeSession pairs that with DeleteCriticalSection. The
// discard paths here only MemFree the object -- confirm whether the lock
// needs deleting first.
DWORD FsMount(PVOID Hdl, LPWSTR root_name, USHORT uid, USHORT *tid)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    SessionInfo_t *s = NULL, *ns;
    VolInfo_t *p;
    DWORD err = ERROR_SUCCESS;

    *tid = 0;

    // allocate new ns
    ns = FspAllocateSession();
    if (ns == NULL) {
        return ERROR_NOT_ENOUGH_MEMORY;
    }

    LockEnter(ctx->Lock);

    // locate share
    for (p = ctx->VolList; p != NULL; p = p->Next) {
        if (!_wcsicmp(root_name, p->Root)) {
            FsLog(("Mount share '%S' tid %d\n", p->Root, p->Tid));
            break;
        }
    }

    if (p != NULL) {
        *tid = p->Tid;

        for (s = ctx->SessionList; s != NULL; s = s->Next) {
            if (s->TreeCtx.Uid == uid && s->TreeCtx.Tid == p->Tid) {
                break;
            }
        }

        if (s == NULL) {
            UserInfo_t *u = &ns->TreeCtx;

            // insert into session list
            ns->Next = ctx->SessionList;
            ctx->SessionList = ns;
            FsLog(("Bind uid %d -> tid %d <%x,%x>\n", uid, p->Tid, u, p->UserList));

            u->RefCnt++;
            u->Uid = uid;
            u->Tid = p->Tid;
            u->VolInfo = p;

            // insert user_info into volume list
            LockEnter(p->uLock);
            FsLog(("Add <%x,%x>\n", u, p->UserList));
            u->Next = p->UserList;
            p->UserList = u;
            LockExit(p->uLock);
        } else {
            // we already have this session opened, increment refcnt
            s->TreeCtx.RefCnt++;
            // free ns
            MemFree(ns);
        }
    } else {
        // unknown share: the pre-allocated session is not linked anywhere,
        // so release it here instead of leaking it
        MemFree(ns);
        err = ERROR_BAD_NET_NAME;
    }

    LockExit(ctx->Lock);

    return (err);
}
// FsDisMount
//
// This function is also a CloseSession. Drops one reference on the session
// matching (uid, tid). When the count reaches zero the session is unlinked
// from the context's session list under the context lock and then freed
// (closing all of its handles) after the lock has been released.
// Nothing happens when no matching session exists.
void FsDisMount(PVOID Hdl, USHORT uid, USHORT tid)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    SessionInfo_t *victim = NULL;
    SessionInfo_t **link;

    LockEnter(ctx->Lock);

    for (link = &ctx->SessionList; *link != NULL; link = &(*link)->Next) {
        UserInfo_t *u = &(*link)->TreeCtx;

        if (u->Uid != uid || u->Tid != tid) {
            continue;
        }

        ASSERT(u->RefCnt > 0);
        u->RefCnt--;
        if (u->RefCnt == 0) {
            FsLog(("Dismount uid %d tid %d <%x,%x>\n", uid, tid, u, *link));
            victim = *link;
            *link = victim->Next;
        }
        break;
    }

    LockExit(ctx->Lock);

    if (victim != NULL) {
        FspFreeSession(victim);
    }
}
// FsLogonUser
//
// Records a logon (token plus logon id) at the head of the context's logon
// list and returns the user id through uid, which is the low part of the
// logon id truncated to USHORT.
// todo: the token is not used for now, but needs to be used for all
// io operations
// Returns ERROR_NOT_ENOUGH_MEMORY when the record cannot be allocated.
DWORD FsLogonUser(PVOID Hdl, HANDLE token, LUID logonid, USHORT *uid)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    LogonInfo_t *entry;

    entry = (LogonInfo_t *)MemAlloc(sizeof(*entry));
    if (entry == NULL) {
        return ERROR_NOT_ENOUGH_MEMORY;
    }

    memset(entry, 0, sizeof(*entry));
    entry->LogOnId = logonid;
    entry->Token = token;

    LockEnter(ctx->Lock);
    entry->Next = ctx->LogonList;
    ctx->LogonList = entry;
    LockExit(ctx->Lock);

    *uid = (USHORT) logonid.LowPart;
    FsLog(("Logon %d,%d, uid %d\n", logonid.HighPart, logonid.LowPart, *uid));

    return (ERROR_SUCCESS);
}
// FsLogoffUser
//
// Looks up the logon record whose id matches logonid and, if one exists,
// unlinks and frees every session whose user id equals the low part of the
// logon id (truncated to USHORT). All of this runs under the context lock.
//
// NOTE(review): the logon record itself stays on the logon list here; it
// is only released in FsExit. Confirm that this is intended.
void FsLogoffUser(PVOID Hdl, LUID logonid)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    LogonInfo_t *entry;

    LockEnter(ctx->Lock);

    entry = ctx->LogonList;
    while (entry != NULL &&
           !(entry->LogOnId.LowPart == logonid.LowPart &&
             entry->LogOnId.HighPart == logonid.HighPart)) {
        entry = entry->Next;
    }

    if (entry != NULL) {
        USHORT uid = (USHORT) logonid.LowPart;
        SessionInfo_t **link = &ctx->SessionList;

        FsLog(("Logoff user %d\n", uid));

        // Flush all user trees
        while (*link != NULL) {
            SessionInfo_t *cur = *link;

            if (cur->TreeCtx.Uid != uid) {
                link = &cur->Next;
                continue;
            }

            // remove session and free it now
            *link = cur->Next;
            FspFreeSession(cur);
        }
    }

    LockExit(ctx->Lock);
}
// FsGetHandle
//
// Finds the session bound to (uid, tid). On a match *fshdl receives the
// session's tree context and the dispatch table is returned; otherwise
// *fshdl is set to NULL and NULL is returned.
FsDispatchTable* FsGetHandle(PVOID Hdl, USHORT tid, USHORT uid, PVOID *fshdl)
{
    FsCtx_t *ctx = (FsCtx_t *) Hdl;
    FsDispatchTable *table = NULL;
    SessionInfo_t *s;

    // locate tid,uid in session list
    LockEnter(ctx->Lock);
    for (s = ctx->SessionList; s != NULL; s = s->Next) {
        if (s->TreeCtx.Uid == uid && s->TreeCtx.Tid == tid) {
            *fshdl = (PVOID *) &s->TreeCtx;
            table = &gDisp;
            break;
        }
    }
    LockExit(ctx->Lock);

    if (table == NULL) {
        *fshdl = NULL;
    }

    return table;
}
//////////////////////////////////// Arb/Release ///////////////////////////////
// FspOpenReplica
//
// Opens replica number id of volume p: first the CRS log "crs.log" on
// p->DiskList[id], then the volume root directory on the same disk, then a
// change notification on the disk root.
//   CrsHdl, Fd, notifyFd - receive the CRS handle, the root directory
//                          handle and the change-notification handle
//   arb - when non-NULL the open is part of an arbitration: under p->qLock
//         the replica is counted in arb, the arbitration event is signalled
//         once when CRS_QUORUM(arb->Count, p->DiskListSz) first holds, and
//         the handles are also stored in p->Fd / p->CrsHdl / p->NotifyFd.
// Returns ERROR_SUCCESS on success; ERROR_SEM_TIMEOUT when arb is no longer
// in ARB_STATE_BUSY (both handles are closed again); otherwise the failing
// open's error. A failure to register the change notification is only
// logged and does not change the return value.
DWORD FspOpenReplica(VolInfo_t *p, DWORD id, HANDLE *CrsHdl, HANDLE *Fd, HANDLE *notifyFd, FspArbitrate_t *arb) { WCHAR path[MAXPATH]; UINT32 disp = FILE_OPEN_IF; NTSTATUS err;
swprintf(path, L"\\\\?\\%s\\crs.log", p->DiskList[id]); err = CrsOpen(FsCrsCallback, (PVOID) p, (USHORT)id, path, FsCrsNumSectors, CrsHdl);
// NOTE(review): CrsHdl here is the out-parameter pointer itself, not the
// handle it points to -- confirm whether *CrsHdl was intended.
if (err == ERROR_SUCCESS && CrsHdl != NULL) { // got it
// open root volume directory
swprintf(path, L"\\??\\%s\\%s\\", p->DiskList[id], p->Root); err = xFsCreate(Fd, NULL, path, wcslen(path), FILE_DIRECTORY_FILE|FILE_SYNCHRONOUS_IO_ALERT, 0, FILE_SHARE_READ|FILE_SHARE_WRITE, &disp, FILE_GENERIC_READ|FILE_GENERIC_WRITE|FILE_GENERIC_EXECUTE, NULL, 0);
if (err == STATUS_SUCCESS) { // check if we are part of arb.
if (arb != NULL) { // get quorum lock
LockEnter(p->qLock); if (arb->State == ARB_STATE_BUSY) { arb->Count++; arb->Set |= (1 << id); if (arb->Event && CRS_QUORUM(arb->Count, p->DiskListSz)) { // first time only
SetEvent(arb->Event); arb->Event = NULL; } // note it is safe to touch this because our parent thread already
// locked the updates out and is waiting for us to finish
// Not busy any more: this open is stale, so both handles are closed again
// and ERROR_SEM_TIMEOUT is returned.
p->Fd[id] = *Fd; ASSERT(p->CrsHdl[id] == NULL); p->CrsHdl[id] = *CrsHdl; LockExit(p->qLock); FsLog(("Add Replica %d\n", id)); } else { LockExit(p->qLock); FsLog(("Stale open %d\n", id)); CrsClose(*CrsHdl); xFsClose(*Fd); err = ERROR_SEM_TIMEOUT; } } if (err == ERROR_SUCCESS) { FsArbLog(("Mounted %S\n", path)); swprintf(path, L"\\\\?\\%s\\", p->DiskList[id]);
// scan the tree to break any current oplocks on dead nodes
xFsTouchTree(*Fd);
// we now queue notification changes to force srv to contact client
*notifyFd = FindFirstChangeNotificationW(path, FALSE, FILE_NOTIFY_CHANGE_EA); // if part of arb, set it now
if (arb != NULL) { p->NotifyFd[id] = *notifyFd; } if (*notifyFd != INVALID_HANDLE_VALUE) { int i;
// FindNextChangeNotification is called FsMaxNodes times on the new handle
for (i = 0; i < FsMaxNodes; i++) { FindNextChangeNotification(*notifyFd); } } else { FsArbLog(("Failed to register notification %d\n", GetLastError())); } } } else { FsArbLog(("Failed to mount root '%S' %x\n", path, err)); // close CrsHandle
CrsClose(*CrsHdl); } } else if (err == ERROR_LOCK_VIOLATION || err == ERROR_SHARING_VIOLATION) { FsArbLog(("Replica '%S' already locked\n", path)); } else { FsArbLog(("Replica '%S' probe failed %d\n", path, err)); }
return err; }
// Argument block handed to ProbeThread: the volume being arbitrated and
// the index of the replica that thread should probe.
typedef struct {
    VolInfo_t *vol;
    DWORD id;
} FspProbeReplicaId_t;
// ProbeThread
//
// Thread routine that tries to open one replica as part of an arbitration.
//   arg - FspProbeReplicaId_t naming the volume and the replica index
// FspOpenReplica is attempted up to 8 times. The loop stops on success, on
// ERROR_BAD_NETPATH / ERROR_REM_NOT_LIST / ERROR_SEM_TIMEOUT, or when the
// arbitration is no longer busy or has already reached quorum; otherwise
// it sleeps 5 seconds before the next attempt. Always returns 0; the
// outcome is recorded in the volume's arbitration state by FspOpenReplica.
DWORD WINAPI ProbeThread(LPVOID arg)
{
    FspProbeReplicaId_t *probe = (FspProbeReplicaId_t *) arg;
    DWORD i = probe->id;
    VolInfo_t *p = probe->vol;
    FspArbitrate_t *arb = &p->Arbitrate;
    NTSTATUS err;
    HANDLE crshdl, fshdl, notifyhdl;
    DWORD retry_cnt;

    // set our priority
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);

    for (retry_cnt = 0; retry_cnt < 8; retry_cnt++) {
        err = FspOpenReplica(p, i, &crshdl, &fshdl, &notifyhdl, arb);

        if (err == ERROR_SUCCESS) {
            // got it, we are done
            break;
        }

        // handle error
        if (err == ERROR_BAD_NETPATH || err == ERROR_REM_NOT_LIST || err == ERROR_SEM_TIMEOUT) {
            // don't retry just bail out now
            break;
        } else {
            BOOLEAN flag = FALSE;

            // we try again as long as we are not cancelled and no quorum is reached
            LockEnter(p->qLock);
            if (arb->State == ARB_STATE_BUSY && !CRS_QUORUM(arb->Count, p->DiskListSz)) {
                flag = TRUE;
            }
            // drop lock
            LockExit(p->qLock);

            // if cancelled we are out of here
            if (flag == FALSE)
                break;

            // retry in 5 seconds again
            Sleep(5 * 1000);
        }
    }

    return 0;
}
// FspFindMissingReplicas
//
// Probes every configured replica (slots 1..FsMaxNodes-1 with a non-NULL
// DiskList entry) whose bit is clear in set and tries to open it.
//   p   - volume; the caller must hold p->uLock, which is released around
//         each FspOpenReplica call and re-acquired afterwards
//   set - bitmask of replicas already known; 0 returns immediately
// A replica that opens successfully is recorded in the volume unless a CRS
// handle for that slot appeared while the lock was dropped, in which case
// the freshly opened handles are closed again.
// Returns the bitmask of replicas newly added by this call.
ULONG FspFindMissingReplicas(VolInfo_t *p, ULONG set)
{
    ULONG FoundSet = 0;
    DWORD i, err;
    HANDLE crshdl, fshdl, notifyfd;

    if (set == 0)
        return 0;

    for (i = 1; i < FsMaxNodes; i++) {
        if (p->DiskList[i] == NULL)
            continue;

        if (!(set & (1 << i))) {
            // drop the lock
            LockExit(p->uLock);

            err = FspOpenReplica(p, i, &crshdl, &fshdl, &notifyfd, NULL);

            // get the lock
            LockEnter(p->uLock);

            if (err == STATUS_SUCCESS) {
                if (p->CrsHdl[i] == NULL) {
                    p->NotifyFd[i] = notifyfd;
                    p->Fd[i] = fshdl;
                    p->CrsHdl[i] = crshdl;
                    FoundSet |= (1 << i);
                } else {
                    // someone beat us to it, close ours
                    CrsClose(crshdl);
                    xFsClose(fshdl);
                    FindCloseChangeNotification(notifyfd);
                }
            }
        }
    }

    if (FoundSet != 0)
        FsArbLog(("New replica set after probe %x\n", FoundSet));

    return FoundSet;
}
// FspArbitrateThread
//
// Thread routine started by FsArbitrate to arbitrate for volume arg.
// Holds p->uLock for its whole run. It starts one ProbeThread per
// configured replica that is not already in the set stored in arb->Set,
// waits for all of them, and then:
//   - if the arbitration is no longer ARB_STATE_BUSY, evicts whatever the
//     probes opened and returns ERROR_CANCELLED;
//   - if CRS_QUORUM(arb->Count, p->DiskListSz) holds, joins the replicas and
//     returns ERROR_SUCCESS when a read or write set results, else
//     ERROR_WRITE_PROTECT;
//   - otherwise evicts the opened replicas and returns ERROR_PATH_NOT_FOUND.
// The arbitration state is reset to ARB_STATE_IDLE on every exit path.
DWORD WINAPI FspArbitrateThread(LPVOID arg) { VolInfo_t *p = (VolInfo_t *) arg; FspArbitrate_t *arb = &p->Arbitrate; HANDLE hdl[FsMaxNodes]; DWORD i, count = 0, err; ULONG ReplicaSet; DWORD Sequence; FspProbeReplicaId_t Ids[FsMaxNodes]; FspProbeReplicaId_t *r; BOOLEAN flag;
// set our priority
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
// if we arb then no update can be going on now
LockEnter(p->uLock);
// our parent already stored the current alive set for us here; take it and
// clear it so the probes can accumulate the newly opened replicas in it
ReplicaSet = arb->Set; arb->Set = 0;
FsArbLog(("ArbitrateThread begin %x\n", ReplicaSet));
// we now start a thread for each replica and do the probe in parallel
for (i = 1; i < FsMaxNodes; i++) { if (p->DiskList[i] == NULL) continue;
if (ReplicaSet & (1 << i)) continue;
r = &Ids[i];
r->vol = p; r->id = i;
// if the thread cannot be created the probe runs inline on this thread
hdl[count] = CreateThread(NULL, 0, &ProbeThread, (LPVOID) r, 0, NULL); if (hdl[count] != NULL) { count++; } else { FsArbLog(("Unable to create thread to probe replica %d\n", i)); ProbeThread((LPVOID) r); } }
// we now wait for every probe thread to finish
WaitForMultipleObjects(count, hdl, TRUE, INFINITE);
// Close the handles
for (i = 0; i < count; i++) CloseHandle(hdl[i]);
flag = FALSE; // grab lock
LockEnter(p->qLock); if (arb->State != ARB_STATE_BUSY) { flag = TRUE; } LockExit(p->qLock);
if (flag == TRUE) { // we got cancelled, we undo what we just did and get out
if (arb->Set) { // tell evict this is not part of the alive set
FspEvict(p, arb->Set, FALSE); } err = ERROR_CANCELLED; goto exit; }
// from here on count is the number of replicas the probes opened
count = arb->Count; ReplicaSet = arb->Set;
FsArbLog(("ArbitrateThread working %x\n", ReplicaSet));
p->WriteSet = p->ReadSet = 0; // check if we have a majority
if (CRS_QUORUM(count, p->DiskListSz)) {
FsArbLog(("I own quorum %d,%d set %x\n",count, p->DiskListSz, ReplicaSet));
// we need to join crs replicas
FspJoin(p, ReplicaSet); if (p->WriteSet != 0 || p->ReadSet != 0) { // remember event to signal if we lose quorum again
p->Event = arb->Event; err = ERROR_SUCCESS; } else { // we lost the quorum
err = ERROR_WRITE_PROTECT; }
} else { FspEvict(p, ReplicaSet, FALSE); err = ERROR_PATH_NOT_FOUND; }
exit: // clear the arb state
arb->State = ARB_STATE_IDLE;
// unlock volume
LockExit(p->uLock);
return err; }
// FsIsQuorum
//
// Reports whether the volume currently has a quorum of replicas.
// While an arbitration is busy the replica count is the arbitration's
// running count; otherwise it is the number of bits set in the alive set.
// Returns ERROR_SUCCESS when CRS_QUORUM(count, DiskListSz) holds,
// ERROR_BUSY when it does not, and ERROR_INVALID_PARAMETER for a NULL handle.
DWORD FsIsQuorum(PVOID vHdl)
{
    VolInfo_t *vol = (VolInfo_t *)vHdl;
    DWORD rc, count;

    if (vol == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    LockEnter(vol->qLock);

    if (vol->Arbitrate.State == ARB_STATE_BUSY) {
        count = vol->Arbitrate.Count;
    } else {
        ULONG bits = vol->AliveSet;

        // count the members of the alive set
        count = 0;
        while (bits) {
            if (bits & 0x1) {
                count++;
            }
            bits = bits >> 1;
        }
    }

    if (CRS_QUORUM(count, vol->DiskListSz)) {
        rc = ERROR_SUCCESS;
    } else {
        rc = ERROR_BUSY;
    }

    LockExit(vol->qLock);

    return rc;
}
// FsArbitrate
//
// Starts an asynchronous arbitration for the volume, or reports the state
// of one that is already known.
//   vHdl       - volume handle (VolInfo_t *)
//   event      - event stored in the arbitration state; it is reset here
//   wait_event - in: a previous thread handle or NULL (a non-NULL handle is
//                closed); out: handle of the new arbitration thread
// Returns:
//   ERROR_SUCCESS           - the alive set is already non-empty, or a busy
//                             arbitration has already reached quorum
//   ERROR_CANCELLED         - the arbitration state is ARB_STATE_CANCEL
//   ERROR_PATH_BUSY         - an arbitration is busy and has no quorum yet
//   ERROR_IO_PENDING        - a new arbitration thread was started
//   ERROR_INVALID_PARAMETER - NULL handle, or the thread could not be created
DWORD FsArbitrate(PVOID vHdl, HANDLE event, HANDLE *wait_event) { VolInfo_t *p = (VolInfo_t *)vHdl; NTSTATUS err; HANDLE hdl;
if (p) { FspArbitrate_t *arb;
// lock volume
LockEnter(p->qLock);
arb = &p->Arbitrate;
if (p->AliveSet != 0) { // we must have already arb. before, just bail out
LockExit(p->qLock); return ERROR_SUCCESS; }
if (arb->State == ARB_STATE_CANCEL) { // a cancel of the previous arbitration is still pending, report cancelled
LockExit(p->qLock); return ERROR_CANCELLED; }
if (arb->State == ARB_STATE_BUSY) { // report current status
if (CRS_QUORUM(p->Arbitrate.Count, p->DiskListSz)) err = ERROR_SUCCESS; else err = ERROR_PATH_BUSY; LockExit(p->qLock); return err; }
ASSERT(arb->State == ARB_STATE_IDLE);
arb->State = ARB_STATE_BUSY; arb->Event = event; arb->Set = p->AliveSet; // store alive set here
arb->Count = 0;
FsArbLog(("FsArb: queueing thread\n"));
// clear event
ResetEvent(event);
// drop lock
LockExit(p->qLock);
// we start a thread to do the arbitrate and return pending
hdl = CreateThread(NULL, 0, &FspArbitrateThread, (LPVOID) p, 0, NULL); if (hdl != NULL) { if (*wait_event != NULL) { CloseHandle(*wait_event); } *wait_event = hdl; err = ERROR_IO_PENDING; } else { // clear the state, no need for a lock here
arb->State = ARB_STATE_IDLE; FsLogError(("FsArb: failed %d queueing thread\n", GetLastError())); err = ERROR_INVALID_PARAMETER; } } else { err = ERROR_INVALID_PARAMETER; }
return err; }
// FsCancelArbitration
//
// Requests cancellation of an arbitration in progress.
//   busy, quorum already reached - the event is cleared so it will not be
//                                  signalled; returns ERROR_SUCCESS
//   busy, no quorum yet          - state becomes ARB_STATE_CANCEL; returns
//                                  ERROR_CANCELLED
//   idle                         - ERROR_SUCCESS when the alive set is
//                                  non-empty, otherwise ERROR_CANCELLED
//   any other state              - ERROR_SUCCESS
// Returns ERROR_INVALID_PARAMETER for a NULL handle.
DWORD FsCancelArbitration(PVOID vHdl)
{
    VolInfo_t *vol = (VolInfo_t *)vHdl;
    FspArbitrate_t *arb;
    DWORD rc;

    if (vol == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    LockEnter(vol->qLock);
    arb = &vol->Arbitrate;

    if (arb->State == ARB_STATE_BUSY) {
        // check if we already got quorum
        if (CRS_QUORUM(arb->Count, vol->DiskListSz)) {
            // no need to signal it
            arb->Event = NULL;
            rc = ERROR_SUCCESS;
        } else {
            FsArbLog(("FsCancelArbitration\n"));
            arb->State = ARB_STATE_CANCEL;
            rc = ERROR_CANCELLED;
        }
    } else if (arb->State == ARB_STATE_IDLE) {
        // we might already have quorum
        if (vol->AliveSet) {
            rc = ERROR_SUCCESS;
        } else {
            rc = ERROR_CANCELLED;
        }
    } else {
        rc = ERROR_SUCCESS;
    }

    LockExit(vol->qLock);

    return rc;
}
// FsRelease
//
// Releases the volume: under uLock, the alive set is captured and cleared
// (together with the volume event) under qLock, every replica in the
// captured set is closed via FspCloseVolume, and the read and write sets
// are cleared.
// Returns ERROR_SUCCESS, or ERROR_INVALID_PARAMETER for a NULL handle.
DWORD FsRelease(PVOID vHdl)
{
    VolInfo_t *vol = (VolInfo_t *)vHdl;
    ULONG alive;

    if (vol == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    // lock volume
    LockEnter(vol->uLock);

    LockEnter(vol->qLock);
    alive = vol->AliveSet;
    vol->AliveSet = 0;
    vol->Event = 0;
    LockExit(vol->qLock);

    FsArbLog(("FsRelease %S AliveSet %x\n", vol->Root, alive));

    FspCloseVolume(vol, alive);
    vol->WriteSet = 0;
    vol->ReadSet = 0;

    FsArbLog(("FsRelease %S done\n", vol->Root));

    // unlock volume
    LockExit(vol->uLock);

    return ERROR_SUCCESS;
}
// FsReserve
//
// Periodic check on a volume. First looks for replicas that have come
// online since the last call and joins any it finds, then sends an
// FspCheckFs request to validate the volume.
// Returns ERROR_SUCCESS immediately, without running the check, when an
// arbitration is in progress or when the update lock cannot be taken
// without waiting. STATUS_MEDIA_WRITE_PROTECTED with a non-zero
// Information field is also reported as success. Any other failure is
// logged and returned as a Win32 error; a NULL handle yields
// ERROR_INVALID_PARAMETER.
DWORD FsReserve(PVOID vhdl)
{
    VolInfo_t *vol = (VolInfo_t *)vhdl;
    NTSTATUS rc;

    if (vol == NULL) {
        rc = ERROR_INVALID_PARAMETER;
    } else {
        IO_STATUS_BLOCK answers[FsMaxNodes];
        ULONG replicas;
        DWORD idx;

        // check if there is a new replica online
        LockEnter(vol->qLock);
        if (vol->Arbitrate.State != ARB_STATE_IDLE) {
            // we are busy, just return success
            LockExit(vol->qLock);
            return ERROR_SUCCESS;
        }
        replicas = vol->AliveSet;
        // drop lock now
        LockExit(vol->qLock);

        // try for the update lock; if it is taken, don't wait -- skip this
        // round and try again later
        if (!LockTryEnter(vol->uLock)) {
            return ERROR_SUCCESS;
        }

        replicas = FspFindMissingReplicas(vol, replicas);

        // we found new disks
        if (replicas > 0) {
            // Add new finds
            FspJoin(vol, replicas);
        }
        LockExit(vol->uLock);

        // check each crs handle to be valid
        FspInitAnswers(answers, NULL, NULL, 0);

        idx = SendAvailRequest(FspCheckFs, vol, NULL, NULL, 0, NULL, 0, answers);

        if (answers[idx].Status == STATUS_MEDIA_WRITE_PROTECTED && answers[idx].Information > 0) {
            rc = ERROR_SUCCESS;
        } else {
            rc = RtlNtStatusToDosError(answers[idx].Status);
        }
    }

    if (rc != ERROR_SUCCESS) {
        FsLogError(("FsReserve vol '%x' failed 0x%x\n", vol, rc));
    }

    return rc;
}
// FsIsOnline
//
// Reports the online state of a volume.
//   ERROR_SUCCESS    - the write set or the read set is non-empty
//   ERROR_IO_PENDING - an arbitration is busy, or the alive set has quorum
//                      or at least one member
//   ERROR_BUSY       - none of the above
//   ERROR_INVALID_PARAMETER - NULL handle
// uLock is held for the whole check; qLock is taken inside it only when
// the arbitration state and alive set have to be inspected.
DWORD FsIsOnline(PVOID vHdl)
{
    VolInfo_t *vol = (VolInfo_t *)vHdl;
    DWORD rc, count;

    if (vol == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    LockEnter(vol->uLock);
    ASSERT(vol->DiskListSz != (DWORD)-1);

    if (vol->WriteSet > 0 || vol->ReadSet > 0) {
        rc = ERROR_SUCCESS;
    } else {
        LockEnter(vol->qLock);

        if (vol->Arbitrate.State == ARB_STATE_BUSY) {
            rc = ERROR_IO_PENDING;
        } else {
            ULONG bits = vol->AliveSet;

            // count the members of the alive set
            count = 0;
            while (bits) {
                if (bits & 0x1) {
                    count++;
                }
                bits = bits >> 1;
            }

            if (CRS_QUORUM(count, vol->DiskListSz) || count > 0) {
                rc = ERROR_IO_PENDING;
            } else {
                rc = ERROR_BUSY;
            }
        }

        LockExit(vol->qLock);
    }

    LockExit(vol->uLock);

    return rc;
}
// FsUpdateReplicaSet
//
// Replaces the volume's disk list with new_path and, if the volume is
// alive (non-empty read or write set), evicts replicas that dropped out of
// the list and tries to bring newly listed replicas online.
//   vhdl     - volume handle (VolInfo_t *)
//   new_path - array of replica paths; entries 1..FsMaxNodes-1 are read and
//              stored by pointer, so it must have FsMaxNodes entries.
//              Slot 0 (the local IPC share) is skipped.
//   new_len  - new disk count; must be below FsMaxNodes
// Replicas are matched between the old and new lists by path string.
// Returns ERROR_INVALID_PARAMETER for a NULL handle or NULL new_path,
// ERROR_TOO_MANY_NAMES when new_len is too large, otherwise ERROR_SUCCESS.
DWORD FsUpdateReplicaSet(PVOID vhdl, LPWSTR new_path[], DWORD new_len)
{
    VolInfo_t *p = (VolInfo_t *)vhdl;
    DWORD i, j;
    ULONG evict_mask, add_mask;

    // new_path is indexed unconditionally below, so reject NULL up front
    if (p == NULL || new_path == NULL) {
        return ERROR_INVALID_PARAMETER;
    }

    if (new_len >= FsMaxNodes) {
        return ERROR_TOO_MANY_NAMES;
    }

    LockEnter(p->uLock);

    // Find which current replicas are in the new set, and keep them
    // We skip the IPC share, since it's local
    evict_mask = 0;
    for (j = 1; j < FsMaxNodes; j++) {
        BOOLEAN found;

        if (p->DiskList[j] == NULL)
            continue;

        found = FALSE;
        for (i = 1; i < FsMaxNodes; i++) {
            if (new_path[i] != NULL && wcscmp(new_path[i], p->DiskList[j]) == 0) {
                // keep this replica
                found = TRUE;
                break;
            }
        }

        if (found == FALSE) {
            // This replica is evicted from the new set, add to evict set mask
            evict_mask |= (1 << j);
            FsArbLog(("FsUpdateReplicaSet evict replica # %d '%S' set 0x%x\n", j, p->DiskList[j], evict_mask));
        }
    }

    // At this point we have all the replicas in the current and new sets. We now need
    // to find replicas that are in the new set but missing from current set.
    add_mask = 0;
    for (i = 1; i < FsMaxNodes; i++) {
        BOOLEAN found;

        if (new_path[i] == NULL)
            continue;

        found = FALSE;
        for (j = 1; j < FsMaxNodes; j++) {
            if (p->DiskList[j] != NULL && wcscmp(new_path[i], p->DiskList[j]) == 0) {
                // already present in the current set
                found = TRUE;
                break;
            }
        }

        if (found == FALSE) {
            add_mask |= (1 << i);
            FsArbLog(("FsUpdateReplicaSet adding replica # %d '%S' set 0x%x\n", i, new_path[i], add_mask));
        }
    }

    // we now update our disklist with new disklist; only changed slots are
    // logged, but every slot is overwritten
    for (i = 1; i < FsMaxNodes; i++) {
        if ((evict_mask & (1 << i)) || (add_mask & (1 << i)))
            FsArbLog(("FsUpdateReplicat %d: %S -> %S\n", i, p->DiskList[i], new_path[i]));
        p->DiskList[i] = new_path[i];
    }
    p->DiskListSz = new_len;

    // If we are alive, apply changes
    if (p->WriteSet != 0 || p->ReadSet != 0) {
        // At this point we evict old replicas
        if (evict_mask != 0)
            FspEvict(p, evict_mask, TRUE);

        // check if there is a new replica online
        if (add_mask > 0) {
            ULONG ReplicaSet = 0;

            // try to get the lock; if it is busy the probe is skipped,
            // because FspFindMissingReplicas returns at once for an empty set
            if (LockTryEnter(p->qLock)) {
                ReplicaSet = p->AliveSet;
                LockExit(p->qLock);
            }

            ReplicaSet = FspFindMissingReplicas(p, ReplicaSet);

            // we found new disks
            if (ReplicaSet > 0) {
                FspJoin(p, ReplicaSet);
            }
        }
    }

    LockExit(p->uLock);

    return ERROR_SUCCESS;
}
|