shithub: gefs

ref: 093c8160b677469b68ba957a2c1f6c9aa093e9d2
dir: /dat.h/

View raw version
typedef struct Blk	Blk;
typedef struct Amsg	Amsg;
typedef struct Gefs	Gefs;
typedef struct Fmsg	Fmsg;
typedef struct Fid	Fid;
typedef struct Msg	Msg;
typedef struct Key	Key;
typedef struct Val	Val;
typedef struct Kvp	Kvp;
typedef struct Xdir	Xdir;
typedef struct Bptr	Bptr;
typedef struct Bfree	Bfree;
typedef struct Scan	Scan;
typedef struct Dent	Dent;
typedef struct Scanp	Scanp;
typedef struct Fshdr	Fshdr;
typedef struct Arena	Arena;
typedef struct Arange	Arange;
typedef struct Bucket	Bucket;
typedef struct Chan	Chan;
typedef struct Syncq	Syncq;
typedef struct Qent	Qent;
typedef struct Tree	Tree;
typedef struct Dlist	Dlist;
typedef struct Mount	Mount;
typedef struct User	User;
typedef struct Stats	Stats;
typedef struct Conn	Conn;

enum {
	KiB	= 1024ULL,
	MiB	= 1024ULL*KiB,
	GiB	= 1024ULL*MiB,
	TiB	= 1024ULL*GiB,

	Lgblk	= 14,
	Blksz	= (1ULL<<Lgblk),

	Nrefbuf	= 1024,			/* number of ref incs before syncing */
	Nfidtab	= 1024,			/* number of fit hash entries */
	Nflushtab = 1024,		/* flush table size */
	Ndtab	= 1024,			/* number of dir tab entries */
	Max9p	= 32*KiB,		/* biggest message size we're willing to negotiate */
	Nsec	= 1000LL*1000*1000,	/* nanoseconds to the second */
	Maxname	= 256,			/* maximum size of a name element */
	Maxent	= 9+Maxname+1,		/* maximum size of ent key, with terminator */
	Maxtag	= 1<<16,		/* maximum tag in 9p */

	/*
	 * Kpmax must be no more than 1/4 of pivspc, or
	 * there is no way to get a valid split of a
	 * maximally filled tree.
	 */
	Keymax	= 128,			/* key data limit */
	Inlmax	= 512,			/* inline data limit */
	Ptrsz	= 24,			/* off, hash, gen */
	Pptrsz	= 26,			/* off, hash, gen, fill */
	Fillsz	= 2,			/* block fill count */
	Offksz	= 17,			/* type, qid, off */
	Snapsz	= 9,			/* tag, snapid */
	Dpfxsz	= 9,			/* directory prefix */
	Upksz	= 9,			/* directory prefix */
	Dlksz	= 1+8+8,		/* tag, death, birth */
	Dlvsz	= Ptrsz+Ptrsz,		/* hd,tl of deadlist */
	Dlkvpsz	= Dlksz+Dlvsz,		/* full size of dlist kvp */
	Linksz	= 1+8+8,		/* gen, prev of snap */
	Treesz	= 4+4+4+8+8+Ptrsz,	/* ref, height, gen, prev, root */
	Kvmax	= Keymax + Inlmax,	/* Key and value */
	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
	Wstatmax = 4+8+8+8,		/* mode, size, atime, mtime */
	Arenasz	= 8+8+8+8,		/* loghd, loghash, size, used */
	
	Pivhdsz		= 10,
	Leafhdsz	= 6,
	Loghdsz		= 12,				/* type, len, hash */
	Loghashsz	= 8,
	Dlhdsz		= 2+2+Ptrsz,			/* type, size, chain */
	Dlspc		= Blksz - Dlhdsz,
	Rootsz		= 4+Ptrsz,			/* root pointer */
	Pivsz		= Blksz - Pivhdsz,
	Bufspc		= (Blksz - Pivhdsz)/2,		/* pivot room */
	Pivspc		= Blksz - Pivhdsz - Bufspc,
	Logspc		= Blksz - Loghdsz,
	Logslop		= 16+16+8,			/* val, nextb, chain */
	Leafspc 	= Blksz - Leafhdsz,
	Msgmax  	= 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
};

enum {
	Eactive	= 1UL<<30,	/* epoch active flag */
};

enum {
	/*
	 * dent: pqid[8] qid[8] -- a directory entry key.
	 * ptr:  off[8] hash[8] gen[8] -- a key for an Dir block.
	 * dir:  serialized Xdir
	 */

	/* fs keys */
	Kdat,	/* qid[8] off[8] => ptr:		pointer to data page */
	Kent,	/* pqid[8] name[n] => dir[n]:		serialized Dir */
	Kup,	/* qid[8] => Kent:			parent dir */

	/* snapshot keys */
	Klabel,	/* name[] => snapid[]:			snapshot label */
	Ktref,	/* tag[8] = snapid[]			scratch snapshot label */
	Ksnap,	/* sid[8] => ref[8], tree[52]:		snapshot root */
	Kdlist,	/* snap[8] gen[8] => hd[ptr],tl[ptr]	deadlist  */
	Kslink,	/* snap[8] next[8] => []		snap successor list */
};

enum {
	Bdirty	= 1 << 0,
	Bfinal	= 1 << 1,
	Bfreed	= 1 << 2,
	Bcached	= 1 << 3,
};

enum {
	Qdump = 1ULL << 63,
};

#define Zb (Bptr){-1, -1, -1}

/* internal errors */
#define Efs	(abort(), "fs broke")
extern char Eimpl[];
extern char Ebotch[];
extern char Eio[];
extern char Enofid[];
extern char Efid[];
extern char Etype[];
extern char Edscan[];
extern char Esrch[];
extern char Eexist[];
extern char Emode[];
extern char Efull[];
extern char Eauth[];
extern char Elength[];
extern char Eperm[];
extern char Einuse[];
extern char Ebadf[];
extern char Ename[];
extern char Enomem[];
extern char Eattach[];
extern char Enosnap[];
extern char Esnap[];
extern char Edir[];
extern char Esyntax[];
extern char Enouser[];
extern char Efsize[];
extern char Ebadu[];
extern char Erdonly[];
extern char Elocked[];
extern char Eauthp[];
extern char Eauthd[];
extern char Eauthph[];
extern char Ephase[];
extern char Enone[];
extern char Enoauth[];

extern char Ewstatb[];
extern char Ewstatd[];
extern char Ewstatg[];
extern char Ewstatl[];
extern char Ewstatm[];
extern char Ewstato[];
extern char Ewstatp[];
extern char Ewstatq[];
extern char Ewstatu[];
extern char Ewstatv[];
extern char Enempty[];

/*
 * All metadata blocks share a common header:
 * 
 *	type[2]
 *
 * The None type is reserved for file data blocks
 * and refcount blocks.
 *
 * The superblock has this layout:
 *	version[8]	always "gefsNNNNN"
 *	blksz[4]	block size in bytes
 *	bufsz[4]	portion of leaf nodes
 *			allocated to buffers,
 *			in bytes
 *	height[4]	tree height of root node
 *	rootb[8]	address of root in last
 *			snapshot.
 *	rooth[8]	hash of root node
 *	narena[4]	number of arenas in tree
 *	arenasz[8]	maximum size of arenas;
 *			they may be smaller.
 *	gen[8]		The flush generation
 *
 * The arena zone blocks have this layout, and
 * are overwritten in place:
 *
 *	log[8]		The head of the alloc log
 *	logh[8]		The hash of the alloc log
 *
 * The log blocks have this layout, and are one of
 * two types of blocks that get overwritten in place:
 *
 *	hash[8]		The hash of the previous log block
 *
 *	The remainder of the block is filled with log
 *	entries. Each log entry has at least 8 bytes
 *	of entry. Some are longer. The opcode is or'ed
 *	into the low order bits of the first vlong.
 *	These ops take the following form:
 *
 *	Alloc, Free:
 *		off[8] len[8]
 *	Alloc1, Free1:
 *		off[8]
 *	Ref:
 *		off[8]
 *	Flush:	
 *		gen[8]
 *
 * Pivots have the following layout:
 *
 *	nval[2]
 *	valsz[2]
 *	nbuf[2]
 *	bufsz[2]
 *
 * Leaves have the following layout:
 *
 *	nval[2]
 *	valsz[2]
 *	pad[4]sure, 
 *
 * Within these nodes, pointers have the following
 * layout:
 *
 *	off[8] hash[8] fill[2]
 */
enum {
	Tdat,
	Tpivot,
	Tleaf,
	Tlog,
	Tdlist,
	Tmagic,
	Tarena,
	Tsuper = 0x6765,	/* 'ge' bigendian */
};

enum {
	Vinl,	/* Inline value */
	Vref,	/* Block pointer */
};

enum {
	GBraw	= 1<<0,
	GBwrite	= 1<<1,
	GBnochk	= 1<<2,
};

enum {
	Onop,		/* nothing */
	Oinsert,	/* new kvp */
	Odelete,	/* delete kvp */
	Oclearb,	/* free block ptr if exists */
	Oclobber,	/* remove file if it exists */
	Owstat,		/* update kvp dirent */
	Orefsnap,	/* increment snap refcount by delta */
	Nmsgtype,	/* maximum message type */
};

enum {
	Magic = 0x979b929e98969c8c,
};

/*
 * Wstat ops come with associated data, in the order
 * of the bit flags.
 */
enum{
	/* wstat flags */
	Owsize	= 1<<0,	/* [8]fsize: update file size */
	Owmode	= 1<<1,	/* [4]mode: update file mode */
	Owmtime	= 1<<2, /* [8]mtime: update mtime, in nsec */
	Owatime	= 1<<3, /* [8]atime: update atime, in nsec */
	Owuid	= 1<<4,	/* [4]uid: set uid */
	Owgid	= 1<<5,	/* [4]uid: set gid */
	Owmuid	= 1<<6,	/* [4]uid: set muid */
};

/*
 * Operations for the allocation log.
 */
enum {
	LogNop,		/* unused */
	/* 1-wide entries */
	LogAlloc1,	/* alloc a block */
	LogFree1,	/* free a block */
	LogChain,	/* point to next log block */
	LogSync,	/* sync barrier for replay */
	LogEnd,		/* end of log */

	/* 2-wide entries */
#define	Log2wide	LogAlloc
	LogAlloc,	/* alloc a range */
	LogFree,	/* free a range */
};

enum {
	AOnone,
	AOsnap,
	AOsync,
	AOclear,
};

struct Bptr {
	vlong	addr;
	uvlong	hash;
	vlong	gen;
};

struct Key{
	char	*k;
	int	nk;
};

struct Val {
	short	nv;
	char	*v;
};

struct Kvp {
	Key;
	Val;
};

struct Msg {
	char	op;
	Kvp;
};

struct Dlist {
	Dlist	*cnext;	/* cache next entry */
	Dlist	*cprev;	/* cache prev entry */
	Dlist	*chain;	/* hash table chain */
	Blk	*ins;	/* loaded head */

	vlong	gen;	/* deadlist gen */
	vlong	bgen;	/* birth gen */
	Bptr	hd;	/* deadlist head */
	Bptr	tl;	/* deadlist tail */
};

struct Arange {
	Avl;
	vlong	off;
	vlong	len;
};

struct Bucket {
	Lock;
	Blk	*b;
};

struct Amsg {
	int	op;
	int	fd;
	union {
		struct {	/* AOsnap */
			char	old[128];
			char	new[128];
		};
		struct {	/* AOsync */
			int	halt;
		};
		struct {	/* AOclear */
			Mount	*mnt;
			Dent	*dent;
			vlong	qpath;
			vlong	off;
			vlong	length;
		};
	};
};

struct Fmsg {
	Fcall;
	Conn	*conn;
	int	sz;	/* the size of the message buf */
	uchar	buf[];
};

struct Tree {
	/* in-memory */
	Lock	lk;
	long	memref;	/* number of in-memory references to this */
	int	dirty;

	/* on-disk */
	int	nsucc;	/* number snapshots after us */
	int	nlbl;	/* number of labels referring to us */
	int	ht;	/* height of the tree */
	Bptr	bp;	/* block pointer of root */
	vlong	memgen;	/* wip next generation */
	vlong	gen;	/* generation */
	vlong	prev;	/* previous snapshot */

	Msg	mq[64];
	int	qsz;
	int	nq;
	char	buf[Bufspc];
};

enum {
	DFblk,
	DFtree,
};

struct Bfree {
	Bfree	*next;
	int	op;
	Tree	*t;
	Blk	*b;
	Bptr	bp;
};

struct User {
	int	id;
	int	lead;
	int	*memb;
	int	nmemb;
	char	name[128];
};

enum {
	Qflush,
	Qwrite,
	Qfree,
};

struct Qent {
	long	qgen;
	Bptr	bp;
	Blk	*b;
	int	op;
};

struct Syncq {
	QLock	lk;
	Rendez	fullrz;
	Rendez	emptyrz;
	Qent	*heap;
	int	nheap;
	int	heapsz;
};

struct Stats {
	vlong	cachehit;
	vlong	cachelook;
};

struct Fshdr {
	int	blksz;
	int	bufspc;
	Tree	snap;
	int	narena;
	vlong	arenasz;
	vlong	nextqid;
	vlong	nextgen;
	vlong	qgen;
	Bptr	*arenabp;
};

/*
 * Overall state of the file sytem.
 * Shadows the superblock contents.
 */
struct Gefs {
	Fshdr;
	/* superblocks */
	Blk	*sb0;	/* primary */
	Blk	*sb1;	/* backup */

	/* arena allocation */
	Arena	*arenas;
	long	roundrobin;
	long	syncing;
	long	nsyncers;
	long	nreaders;

	int	gotinfo;
	QLock	synclk;
	Rendez	syncrz;

	Lock	mountlk;
	Mount	*mounts;
	Mount	*snapmnt;
	Lock	connlk;
	Conn	*conns;

	Chan	*wrchan;
	Chan	*admchan;
	Chan	**rdchan;

	QLock	mutlk;
	int	nworker;
	long	epoch;
	long	lepoch[32];
	Bfree	*limbo[3];

	Syncq	syncq[32];


	int	fd;
	long	broken;
	long	rdonly;
	int	noauth;

	/* user list */
	RWLock	userlk;
	User	*users;
	int	nusers;

	/* open directory entries */
	Lock	dtablk;
	Dent	*dtab[Ndtab];

	/* slow block io */
	QLock	blklk[32];
	
	/* deadlist cache */
	Dlist	**dlcache;
	Dlist	*dlhead;
	Dlist	*dltail;
	int	dlcount;
	int	dlcmax;

	/* block lru */
	QLock	lrulk;
	Rendez	lrurz;
	Bucket	*bcache;
	Blk	*chead;
	Blk	*ctail;
	usize	ccount;
	usize	cmax;

	RWLock	flushq[Nflushtab];

	Stats	stats;
};

struct Arena {
	Lock;
	Avltree *free;
	Blk	**queue;
	int	nqueue;
	Blk	*hd;			/* arena header */
	Blk	*tl;			/* arena footer */
	Blk	**q;			/* write queue */
	vlong	nq;
	vlong	size;
	vlong	used;
	vlong	reserve;
	/* allocation log */
	vlong	nlog;		/* logged since last copression */
	Bptr	loghd;		/* allocation log */
	Blk	*logtl;		/* end of the log, open for writing */
	Syncq	*sync;
};

struct Xdir {
	/* file data */
	uvlong	flag;	/* storage flags */
	Qid	qid;	/* unique id from server */
	ulong	mode;	/* permissions */
	vlong	atime;	/* last read time: nsec */
	vlong	mtime;	/* last write time: nsec */
	uvlong	length;	/* file length */
	int	uid;	/* owner name */
	int	gid;	/* group name */
	int	muid;	/* last modifier name */
	char	*name;	/* last element of path */
};

struct Dent {
	RWLock;
	Key;
	Xdir;
	Dent	*next;
	vlong	up;
	long	ref;
	char	gone;

	char	buf[Maxent];
};

struct Mount {
	Lock;
	Mount	*next;
	long	ref;
	vlong	gen;
	char	*name;
	Tree	*root;	/* EBR protected */
};

struct Conn {
	Conn	*next;
	QLock	wrlk;
	int	rfd;
	int	wfd;
	int	iounit;
	int	versioned;

	/* fid hash table */
	Lock	fidtablk[Nfidtab];
	Fid	*fidtab[Nfidtab];
};

struct Fid {
	Lock;
	Fid	*next;
	/*
	 * if opened with OEXEC, we want to use a snapshot,
	 * instead of the most recent root, to prevent
	 * paging in the wrong executable.
	 */
	Mount	*mnt;
	Scan	*scan;	/* in progres scan */
	Dent	*dent;	/* (pqid, name) ref, modified on rename */	
	void	*auth;

	u32int	fid;
	vlong	qpath;
	vlong	pqpath;
	long	ref;
	int	mode;
	int	iounit;

	int	uid;
	int	duid;
	int	dgid;
	int	dmode;
};

enum {
	POmod,
	POrot,
	POsplit,
	POmerge,
};

struct Scanp {
	int	bi;
	int	vi;
	Blk	*b;
};

struct Scan {
	vlong	offset;	/* last read offset */

	char	first;
	char	done;
	char	overflow;
	char	present;
	Kvp	kv;
	Key	pfx;
	char	kvbuf[Kvmax];
	char	pfxbuf[Keymax];
	Scanp	*path;
};

struct Blk {
	/* cache entry */
	Blk	*cnext;
	Blk	*cprev;
	Blk	*hnext;

	/* Freelist entry */
	Blk	*fnext;

	long	flag;

	/* serialized to disk in header */
	short	type;	/* @0, for all */
	union {
		struct {
			short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
			short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
			short   nbuf;	/* @6, for Pivot */
			short   bufsz;	/* @8, for Pivot */
		};
		struct {
			int	logsz;	/* @2 for allocation log */
			vlong	loghash; /* @6 for log */
		};
		struct {
			int	deadsz;	/* @2 size of deadlist */
			Bptr	deadp;	/* @4 next deadlist chain */
		};
	};

	/* debug */
	uintptr lasthold;
	uintptr lastdrop;
	uintptr	enqueued;
	uintptr cached;
	uintptr uncached;
	uintptr	alloced;
	uintptr	freed;

	Bptr	bp;
	long	ref;
	char	*data;
	char	buf[Blksz];
	vlong	magic;
};

struct Chan {
	int	size;	/* size of queue */
	long	count;	/* how many in queue (semaphore) */
	long	avail;	/* how many available to send (semaphore) */
	Lock	rl, wl;	/* circular pointers */
	void	**rp;
	void	**wp;
	void*	args[];	/* list of saved pointers, [->size] */
};