shithub: riscv

Download patch

ref: f3f93925173d15ca48e90ce1624452d7e3b7726f
parent: 93117262c2e377d9d4f1588924032d1b69e7e2f9
author: cinap_lenrek <[email protected]>
date: Sun Oct 29 19:09:54 EDT 2017

kernel: introduce devswap #¶ to serve /dev/swap and handle swapfile encryption

--- a/lib/namespace
+++ b/lib/namespace
@@ -10,6 +10,7 @@
 bind -c #s /srv
 bind -q #σ /shr
 bind -a #¤ /dev
+bind -qa #¶ /dev
 
 # authentication
 mount -b /srv/factotum /mnt
--- a/sys/man/3/cons
+++ b/sys/man/3/cons
@@ -22,7 +22,6 @@
 .B /dev/ppid
 .B /dev/random
 .B /dev/reboot
-.B /dev/swap
 .B /dev/sysname
 .B /dev/sysstat
 .B /dev/time
@@ -239,41 +238,6 @@
 Writing anything to
 .B sysstat
 resets all of the counts on all processors.
-.PP
-The
-.B swap
-device holds a text block giving memory usage statistics:
-.IP
-.EX
-\fIn\fP memory
-\fIn\fP pagesize
-\fIn\fP kernel
-\fIn\fP/\fIm\fP user
-\fIn\fP/\fIm\fP swap
-\fIa\fP/\fIn\fP/\fIm\fP kernel malloc
-\fIa\fP/\fIn\fP/\fIm\fP kernel draw
-.EE
-.PP
-These are total memory (bytes), system page size (bytes),
-kernel memory (pages), user memory (pages), swap space (pages),
-kernel malloced data (bytes), and kernel graphics data (bytes).
-The expression
-.IR n / m
-indicates
-.I n
-used out of
-.I m
-available.
-For kernel malloc and kernel draw,
-.IR a
-indicates the current allocation in bytes.
-These numbers are not blank padded.
-.PP
-To turn on swapping, write to
-.B swap
-the textual file descriptor number of a file or device on which to swap.
-See
-.IR swap (8).
 .PP
 Reads and writes to
 .IR mordor
--- /dev/null
+++ b/sys/man/3/swap
@@ -1,0 +1,48 @@
+.TH SWAP 3
+.SH NAME
+swap \- memory usage statistics and pagefile control
+.SH SYNOPSIS
+.nf
+.B bind -a #¶ /dev
+
+.B /dev/swap
+.fi
+.SH DESCRIPTION
+The
+.B swap
+device holds a text block giving memory usage statistics:
+.IP
+.EX
+\fIn\fP memory
+\fIn\fP pagesize
+\fIn\fP kernel
+\fIn\fP/\fIm\fP user
+\fIn\fP/\fIm\fP swap
+\fIa\fP/\fIn\fP/\fIm\fP kernel malloc
+\fIa\fP/\fIn\fP/\fIm\fP kernel draw
+\fIa\fP/\fIn\fP/\fIm\fP kernel secret
+.EE
+.PP
+These are total memory (bytes), system page size (bytes),
+kernel memory (pages), user memory (pages), swap space (pages),
+kernel malloced data (bytes), kernel graphics data (bytes),
+and kernel secret data (bytes).
+The expression
+.IR n / m
+indicates
+.I n
+used out of
+.I m
+available.
+For kernel malloc, kernel draw and kernel secret,
+.IR a
+indicates the current allocation in bytes.
+These numbers are not blank padded.
+.PP
+To turn on swapping, write to
+.B swap
+the textual file descriptor number of a file or device on which to swap.
+.SH SEE ALSO
+.IR swap (8)
+.SH SOURCE
+.B /sys/src/9/port/devswap.c
--- a/sys/man/8/swap
+++ b/sys/man/8/swap
@@ -35,4 +35,5 @@
 ctl-message in
 .IR proc (3)).
 .SH "SEE ALSO"
+.IR swap (3),
 .IR proc (3)
--- a/sys/src/9/bcm/main.c
+++ b/sys/src/9/bcm/main.c
@@ -270,7 +270,6 @@
 	links();
 	chandevreset();			/* most devices are discovered here */
 	pageinit();
-	swapinit();
 	userinit();
 	gpiomeminit();
 	schedinit();
--- a/sys/src/9/bcm/mkfile
+++ b/sys/src/9/bcm/mkfile
@@ -33,7 +33,6 @@
 	rdb.$O\
 	rebootcmd.$O\
 	segment.$O\
-	swap.$O\
 	syscallfmt.$O\
 	sysfile.$O\
 	sysproc.$O\
--- a/sys/src/9/bcm/picpuf
+++ b/sys/src/9/bcm/picpuf
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	env
 	pipe
 	proc
--- a/sys/src/9/bcm/pif
+++ b/sys/src/9/bcm/pif
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	env
 	pipe
 	proc
--- a/sys/src/9/boot/bootrc
+++ b/sys/src/9/boot/bootrc
@@ -10,7 +10,7 @@
 
 bind -q '#d' /fd
 bind -q '#p' /proc
-for(i in S f k æ t b m)
+for(i in ¶ P S f k æ t b m)
 	bind -qa '#'^$i /dev
 
 # bind in an ip interface
--- a/sys/src/9/kw/main.c
+++ b/sys/src/9/kw/main.c
@@ -322,7 +322,6 @@
 	chandevreset();			/* most devices are discovered here */
 
 	pageinit();
-	swapinit();
 	userinit();
 	schedinit();
 	panic("schedinit returned");
--- a/sys/src/9/kw/mkfile
+++ b/sys/src/9/kw/mkfile
@@ -32,7 +32,6 @@
 	qio.$O\
 	qlock.$O\
 	segment.$O\
-	swap.$O\
 	syscallfmt.$O\
 	sysfile.$O\
 	sysproc.$O\
--- a/sys/src/9/kw/plug
+++ b/sys/src/9/kw/plug
@@ -3,6 +3,7 @@
 dev
 	root
 	cons
+	swap
 	env
 	pipe
 	proc
--- a/sys/src/9/mtx/main.c
+++ b/sys/src/9/mtx/main.c
@@ -35,7 +35,6 @@
 	links();
 	chandevreset();
 	pageinit();
-	swapinit();
 	fpsave(&initfp);
 	initfp.fpscr = 0;
 	userinit();
--- a/sys/src/9/mtx/mkfile
+++ b/sys/src/9/mtx/mkfile
@@ -30,7 +30,6 @@
 	qlock.$O\
 	rdb.$O\
 	segment.$O\
-	swap.$O\
 	sysfile.$O\
 	sysproc.$O\
 	taslock.$O\
--- a/sys/src/9/mtx/mtx
+++ b/sys/src/9/mtx/mtx
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	arch
 	pnp		pci
 	env
--- a/sys/src/9/mtx/mtxcpu
+++ b/sys/src/9/mtx/mtxcpu
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	arch
 	pnp		pci
 	env
--- a/sys/src/9/omap/beagle
+++ b/sys/src/9/omap/beagle
@@ -2,6 +2,7 @@
 dev
 	root
 	cons
+	swap
 	env
 	pipe
 	proc
--- a/sys/src/9/omap/main.c
+++ b/sys/src/9/omap/main.c
@@ -276,7 +276,6 @@
 //	i8250console();			/* too early; see init0 */
 
 	pageinit();
-	swapinit();
 	userinit();
 	schedinit();
 }
--- a/sys/src/9/omap/mkfile
+++ b/sys/src/9/omap/mkfile
@@ -33,7 +33,6 @@
 	qio.$O\
 	qlock.$O\
 	segment.$O\
-	swap.$O\
 	sysfile.$O\
 	sysproc.$O\
 	taslock.$O\
--- a/sys/src/9/pc/main.c
+++ b/sys/src/9/pc/main.c
@@ -62,7 +62,6 @@
 	chandevreset();
 	netconsole();
 	pageinit();
-	swapinit();
 	userinit();
 	schedinit();
 }
--- a/sys/src/9/pc/mkfile
+++ b/sys/src/9/pc/mkfile
@@ -35,7 +35,6 @@
 	rdb.$O\
 	rebootcmd.$O\
 	segment.$O\
-	swap.$O\
 	syscallfmt.$O\
 	sysfile.$O\
 	sysproc.$O\
--- a/sys/src/9/pc/pc
+++ b/sys/src/9/pc/pc
@@ -2,6 +2,7 @@
 dev
 	root
 	cons
+	swap
 	arch
 	pnp		pci
 	env
--- a/sys/src/9/pc64/main.c
+++ b/sys/src/9/pc64/main.c
@@ -332,7 +332,6 @@
 	netconsole();
 	preallocpages();
 	pageinit();
-	swapinit();
 	userinit();
 	schedinit();
 }
--- a/sys/src/9/pc64/mkfile
+++ b/sys/src/9/pc64/mkfile
@@ -33,7 +33,6 @@
 	rdb.$O\
 	rebootcmd.$O\
 	segment.$O\
-	swap.$O\
 	syscallfmt.$O\
 	sysfile.$O\
 	sysproc.$O\
--- a/sys/src/9/pc64/pc64
+++ b/sys/src/9/pc64/pc64
@@ -2,6 +2,7 @@
 dev
 	root
 	cons
+	swap
 	arch
 	pnp		pci
 	env
--- a/sys/src/9/port/devcons.c
+++ b/sys/src/9/port/devcons.c
@@ -5,7 +5,6 @@
 #include	"fns.h"
 #include	"../port/error.h"
 
-#include	<pool.h>
 #include	<authsrv.h>
 
 void	(*consdebug)(void) = nil;
@@ -324,7 +323,6 @@
 	Qppid,
 	Qrandom,
 	Qreboot,
-	Qswap,
 	Qsysname,
 	Qsysstat,
 	Qtime,
@@ -357,7 +355,6 @@
 	"ppid",		{Qppid},	NUMSIZE,	0444,
 	"random",	{Qrandom},	0,		0444,
 	"reboot",	{Qreboot},	0,		0664,
-	"swap",		{Qswap},	0,		0664,
 	"sysname",	{Qsysname},	0,		0664,
 	"sysstat",	{Qsysstat},	0,		0666,
 	"time",		{Qtime},	NUMSIZE+3*VLNUMSIZE,	0664,
@@ -471,8 +468,6 @@
 	int i, k, id;
 	vlong offset = off;
 	extern char configfile[];
-	extern Image fscache;
-	extern Image swapimage;
 
 	if(n <= 0)
 		return n;
@@ -592,33 +587,6 @@
 		poperror();
 		return n;
 
-	case Qswap:
-		snprint(tmp, sizeof tmp,
-			"%llud memory\n"
-			"%llud pagesize\n"
-			"%lud kernel\n"
-			"%lud/%lud user\n"
-			"%lud/%lud swap\n"
-			"%llud/%llud/%llud kernel malloc\n"
-			"%llud/%llud/%llud kernel draw\n"
-			"%llud/%llud/%llud kernel secret\n",
-			(uvlong)conf.npage*BY2PG,
-			(uvlong)BY2PG,
-			conf.npage-conf.upages,
-			palloc.user-palloc.freecount-fscache.pgref-swapimage.pgref, palloc.user,
-			conf.nswap-swapalloc.free, conf.nswap,
-			(uvlong)mainmem->curalloc,
-			(uvlong)mainmem->cursize,
-			(uvlong)mainmem->maxsize,
-			(uvlong)imagmem->curalloc,
-			(uvlong)imagmem->cursize,
-			(uvlong)imagmem->maxsize,
-			(uvlong)secrmem->curalloc,
-			(uvlong)secrmem->cursize,
-			(uvlong)secrmem->maxsize);
-
-		return readstr((ulong)offset, buf, n, tmp);
-
 	case Qsysname:
 		if(sysname == nil)
 			return 0;
@@ -669,8 +637,7 @@
 	long l, bp;
 	char *a;
 	Mach *mp;
-	int id, fd;
-	Chan *swc;
+	int id;
 	ulong offset;
 	Cmdbuf *cb;
 	Cmdtab *ct;
@@ -763,25 +730,6 @@
 				mp->tlbpurge = 0;
 			}
 		}
-		break;
-
-	case Qswap:
-		if(n >= sizeof buf)
-			error(Egreg);
-		memmove(buf, va, n);	/* so we can NUL-terminate */
-		buf[n] = 0;
-		/* start a pager if not already started */
-		if(strncmp(buf, "start", 5) == 0){
-			kickpager();
-			break;
-		}
-		if(!iseve())
-			error(Eperm);
-		if(buf[0]<'0' || '9'<buf[0])
-			error(Ebadarg);
-		fd = strtoul(buf, 0, 0);
-		swc = fdtochan(fd, ORDWR, 1, 1);
-		setswapchan(swc);
 		break;
 
 	case Qsysname:
--- /dev/null
+++ b/sys/src/9/port/devswap.c
@@ -1,0 +1,612 @@
+#include	"u.h"
+#include	"../port/lib.h"
+#include	"mem.h"
+#include	"dat.h"
+#include	"fns.h"
+#include	"../port/error.h"
+
+#include	<libsec.h>
+#include	<pool.h>
+
+static int	canflush(Proc*, Segment*);
+static void	executeio(void);
+static void	pageout(Proc*, Segment*);
+static void	pagepte(int, Page**);
+static void	pager(void*);
+
+Image 	swapimage = {	/* image that swapped-out pages are cached under (see pagepte) */
+	.notext = 1,
+};
+
+static Chan	*swapchan;	/* backing file/device installed by setswapchan() */
+static uchar	*swapbuf;	/* BY2PG buffer receiving ciphertext in swapwrite() */
+static AESstate *swapkey;	/* two AES states, set up in swapopen() */
+
+static Page	**iolist;	/* pages queued by pagepte() for executeio() */
+static int	ioptr;	/* number of valid entries in iolist */
+
+static ushort	ageclock;	/* bumped by pager() each time it wraps around the process table */
+
+static void
+swapinit(void)	/* listed in swapdevtab; allocates the swap map and the pageout list */
+{
+	swapalloc.swmap = xalloc(conf.nswap);	/* one byte per swap page, indexed by daddr/BY2PG */
+	swapalloc.top = &swapalloc.swmap[conf.nswap];
+	swapalloc.alloc = swapalloc.swmap;
+	swapalloc.last = swapalloc.swmap;	/* newswap() starts searching here */
+	swapalloc.free = conf.nswap;
+	swapalloc.xref = 0;
+
+	iolist = xalloc(conf.nswppo*sizeof(Page*));	/* room for conf.nswppo queued pages */
+	if(swapalloc.swmap == nil || iolist == nil)
+		panic("swapinit: not enough memory");
+}
+
+static uintptr
+newswap(void)	/* allocate a swap page; returns its byte offset, or ~0 when none is free */
+{
+	uchar *look;
+
+	lock(&swapalloc);
+	if(swapalloc.free == 0) {
+		unlock(&swapalloc);
+		return ~0;
+	}
+	look = memchr(swapalloc.last, 0, swapalloc.top-swapalloc.last);	/* first unused entry from the previous allocation up to top */
+	if(look == nil)	/* none there: search again from the start of the map */
+		look = memchr(swapalloc.swmap, 0, swapalloc.last-swapalloc.swmap);	/* NOTE(review): result used unchecked; relies on free != 0 implying a zero entry */
+	*look = 2;	/* ref for pte + io transaction */
+	swapalloc.last = look;
+	swapalloc.free--;
+	unlock(&swapalloc);
+	return (look-swapalloc.swmap) * BY2PG;	/* map index converted to a byte offset */
+}
+
+void
+putswap(Page *p)	/* drop one reference on the swap page addressed by p */
+{
+	uchar *idx;
+
+	lock(&swapalloc);
+	idx = &swapalloc.swmap[((uintptr)p)/BY2PG];	/* p carries a swap address (see executeio); used only as a map index */
+	if(*idx == 0)
+		panic("putswap %#p ref == 0", p);
+
+	if(*idx == 255) {	/* saturated entry: its references are counted collectively in xref */
+		if(swapalloc.xref == 0)
+			panic("putswap %#p xref == 0", p);
+
+		if(--swapalloc.xref == 0) {	/* last such reference gone: free every entry stuck at 255 */
+			for(idx = swapalloc.swmap; idx < swapalloc.top; idx++) {
+				if(*idx == 255) {
+					*idx = 0;
+					swapalloc.free++;
+				}
+			}
+		}
+	} else {
+		if(--(*idx) == 0)	/* ordinary entry: free it when the count reaches zero */
+			swapalloc.free++;
+	}
+	unlock(&swapalloc);
+}
+
+void
+dupswap(Page *p)	/* add one reference to the swap page addressed by p */
+{
+	uchar *idx;
+
+	lock(&swapalloc);
+	idx = &swapalloc.swmap[((uintptr)p)/BY2PG];	/* same address-to-index mapping as putswap() */
+	if(*idx == 255)	/* already saturated: count the new reference in xref */
+		swapalloc.xref++;
+	else {
+		if(++(*idx) == 255)	/* just saturated: account its 255 references in xref */
+			swapalloc.xref += 255;
+	}
+	unlock(&swapalloc);
+}
+
+int
+swapcount(uintptr daddr)	/* raw map counter for swap offset daddr; read without taking the swapalloc lock */
+{
+	return swapalloc.swmap[daddr/BY2PG];
+}
+
+void
+kickpager(void)	/* start the pager kproc on the first call, wake it on later calls */
+{
+	static Ref started;
+
+	if(started.ref || incref(&started) != 1)	/* only the caller that sees incref return 1 creates the kproc */
+		wakeup(&swapalloc.r);
+	else
+		kproc("pager", pager, 0);
+}
+
+static int
+reclaim(void)	/* returns 1 once needpages() is false, 0 when a pass reclaimed nothing */
+{
+	ulong np;
+
+	for(;;){	/* each pass tries fscache, then swapimage, then imagereclaim(); the first to yield pages ends the pass */
+		if((np = pagereclaim(&fscache, 1000)) > 0) {
+			if(0) print("reclaim: %lud fscache\n", np);	/* debug output, disabled by if(0) */
+		} else if((np = pagereclaim(&swapimage, 1000)) > 0) {
+			if(0) print("reclaim: %lud swap\n", np);
+		} else if((np = imagereclaim(1000)) > 0) {
+			if(0) print("reclaim: %lud image\n", np);
+		}
+		if(!needpages(nil))
+			return 1;	/* have pages, done */
+		if(np == 0)
+			return 0;	/* didnt reclaim, need to swap */
+		sched();	/* yield before the next pass */
+	}
+}
+
+static void
+pager(void*)	/* kproc body started by kickpager(); loops forever */
+{
+	int i;
+	Segment *s;
+	Proc *p, *ep;
+
+	p = proctab(0);
+	ep = &p[conf.nproc];
+
+	while(waserror())	/* an error raised below comes back here and re-enters the loop */
+		;
+
+	for(;;){
+		up->psstate = "Reclaim";
+		if(reclaim()){	/* enough free pages: wake both palloc.pwait queues, sleep until needpages() */
+			up->psstate = "Idle";
+			wakeup(&palloc.pwait[0]);
+			wakeup(&palloc.pwait[1]);
+			sleep(&swapalloc.r, needpages, nil);
+			continue;
+		}
+
+		if(swapimage.c == nil || swapalloc.free == 0){	/* no swap file open, or swap map full */
+		Killbig:
+			if(!freebroken())	/* killbig() only when freebroken() returns zero */
+				killbig("out of memory");
+			sched();
+			continue;
+		}
+
+		i = ageclock;
+		do {	/* advance to the next process that is not Dead, not noswap, and whose seglock we get */
+			if(++p >= ep){
+				if(++ageclock == i)	/* ageclock came back to its starting value: give up */
+					goto Killbig;
+				p = proctab(0);
+			}
+		} while(p->state == Dead || p->noswap || !canqlock(&p->seglock));
+		up->psstate = "Pageout";
+		for(i = 0; i < NSEG; i++) {
+			if((s = p->seg[i]) != nil) {
+				switch(s->type&SG_TYPE) {	/* other segment types are left alone */
+				default:
+					break;
+				case SG_TEXT:
+					pageout(p, s);
+					break;
+				case SG_DATA:
+				case SG_BSS:
+				case SG_STACK:
+				case SG_SHARED:
+					pageout(p, s);
+					break;
+				}
+			}
+		}
+		qunlock(&p->seglock);
+
+		if(ioptr > 0) {	/* pageout queued pages on iolist: write them now */
+			up->psstate = "I/O";
+			executeio();
+		}
+	}
+}
+
+static void
+pageout(Proc *p, Segment *s)	/* scan segment s of process p and hand aged pages to pagepte() */
+{
+	int type, i, size;
+	short age;
+	Pte *l;
+	Page **pg, *entry;
+
+	if(!canqlock(s))	/* We cannot afford to wait, we will surely deadlock */
+		return;
+
+	if(!canflush(p, s)	/* Able to invalidate all tlbs with references */
+	|| waserror()) {	/* also the landing point for errors raised during the scan */
+		qunlock(s);
+		putseg(s);	/* drops the reference canflush() took with incref */
+		return;
+	}
+
+	/* Pass through the pte tables looking for memory pages to swap out */
+	type = s->type&SG_TYPE;
+	size = s->mapsize;
+	for(i = 0; i < size; i++) {
+		l = s->map[i];
+		if(l == nil)
+			continue;
+		for(pg = l->first; pg <= l->last; pg++) {	/* l->last is inclusive */
+			entry = *pg;
+			if(pagedout(entry))
+				continue;
+			if(entry->modref & PG_REF) {	/* referenced since the last scan: clear the bit, restart its age */
+				entry->modref &= ~PG_REF;
+				entry->refage = ageclock;
+				continue;
+			}
+			age = (short)(ageclock - entry->refage);
+			if(age < 16)	/* pages younger than 16 ageclock ticks are skipped */
+				continue;
+			pagepte(type, pg);
+		}
+	}
+	poperror();
+	qunlock(s);
+	putseg(s);
+}
+
+static int
+canflush(Proc *p, Segment *s)	/* takes a reference on s; nonzero when canpage() holds for every user of s */
+{
+	int i;
+	Proc *ep;
+
+	if(incref(s) == 2)		/* Easy if we are the only user */
+		return canpage(p);
+
+	/* Now we must do hardwork to ensure all processes which have tlb
+	 * entries for this segment will be flushed if we succeed in paging it out
+	 */
+	p = proctab(0);
+	ep = &p[conf.nproc];
+	while(p < ep) {	/* walk the whole process table */
+		if(p->state != Dead) {
+			for(i = 0; i < NSEG; i++)
+				if(p->seg[i] == s)
+					if(!canpage(p))	/* one process that cannot be paged vetoes the segment */
+						return 0;
+		}
+		p++;
+	}
+	return 1;
+}
+
+static void
+pagepte(int type, Page **pg)	/* evict the page at *pg in the way its segment type requires */
+{
+	uintptr daddr;
+	Page *outp;
+
+	outp = *pg;
+	switch(type) {
+	case SG_TEXT:				/* Revert to demand load */
+		putpage(outp);
+		*pg = nil;
+		break;
+
+	case SG_DATA:
+	case SG_BSS:
+	case SG_STACK:
+	case SG_SHARED:
+		if(ioptr >= conf.nswppo)	/* iolist is full: leave the page resident */
+			break;
+
+		/*
+		 *  get a new swap address with swapcount 2, one for the pte
+		 *  and one extra ref for us while we write the page to disk
+		 */
+		daddr = newswap();
+		if(daddr == ~0)	/* no free swap page */
+			break;
+
+		/* clear any pages referring to it from the cache */
+		cachedel(&swapimage, daddr);
+
+		/* forget anything that it used to cache */
+		uncachepage(outp);
+
+		/*
+		 *  enter it into the cache so that a fault happening
+		 *  during the write will grab the page from the cache
+		 *  rather than one partially written to the disk
+		 */
+		outp->daddr = daddr;
+		cachepage(outp, &swapimage);
+		*pg = (Page*)(daddr|PG_ONSWAP);	/* the pte now holds the swap address tagged with PG_ONSWAP */
+
+		/* Add page to IO transaction list */
+		iolist[ioptr++] = outp;
+		break;
+	}
+}
+
+void
+pagersummary(void)	/* print used/total user pages, used/total swap pages and the iolist length */
+{
+	print("%lud/%lud memory %lud/%lud swap %d iolist\n",
+		palloc.user-palloc.freecount,
+		palloc.user, conf.nswap-swapalloc.free, conf.nswap,
+		ioptr);
+}
+
+static void
+executeio(void)	/* write every page queued on iolist through swapimage.c, then release it */
+{
+	Page *outp;
+	int i, n;
+	Chan *c;
+	char *kaddr;
+	KMap *k;
+
+	c = swapimage.c;
+	for(i = 0; i < ioptr; i++) {
+		if(ioptr > conf.nswppo)
+			panic("executeio: ioptr %d > %d", ioptr, conf.nswppo);
+		outp = iolist[i];
+
+		assert(outp->ref > 0);
+		assert(outp->image == &swapimage);
+		assert(outp->daddr != ~0);
+
+		/* only write when swap address still in use */
+		if(swapcount(outp->daddr) > 1){	/* newswap() set 2; a count of 1 is only our own i/o reference */
+			k = kmap(outp);
+			kaddr = (char*)VA(k);
+
+			if(waserror())	/* a failed swap write is fatal */
+				panic("executeio: page outp I/O error");
+
+			n = devtab[c->type]->write(c, kaddr, BY2PG, outp->daddr);	/* daddr is also the file offset */
+			if(n != BY2PG)	/* short write: raise, which lands in the panic above */
+				nexterror();
+
+			kunmap(k);
+			poperror();
+		}
+
+		/* drop our extra swap reference */
+		putswap((Page*)outp->daddr);
+
+		/* Free up the page after I/O */
+		putpage(outp);
+	}
+	ioptr = 0;	/* list consumed */
+}
+
+int
+needpages(void*)	/* true when free pages are below swapalloc.headroom; used as the pager's sleep condition */
+{
+	return palloc.freecount < swapalloc.headroom;
+}
+
+static void
+setswapchan(Chan *c)	/* install c as swap backing store; c is closed if an error is raised before that */
+{
+	uchar buf[sizeof(Dir)+100];
+	Dir d;
+	int n;
+
+	if(waserror()){
+		cclose(c);
+		nexterror();
+	}
+	if(swapimage.c != nil) {
+		if(swapalloc.free != conf.nswap)	/* refuse to switch while swap pages are allocated */
+			error(Einuse);
+		cclose(swapimage.c);
+		swapimage.c = nil;
+	}
+
+	/*
+	 *  if this isn't a file, set the swap space
+	 *  to be at most the size of the partition
+	 */
+	if(devtab[c->type]->dc != L'M'){
+		n = devtab[c->type]->stat(c, buf, sizeof buf);
+		if(n <= 0 || convM2D(buf, n, &d, nil) == 0)
+			error("stat failed in setswapchan");
+		if(d.length < (vlong)conf.nswppo*BY2PG)	/* vlong math: count*BY2PG can wrap in 32 bits */
+			error("swap device too small");
+		if(d.length < (vlong)conf.nswap*BY2PG){	/* device smaller than configured swap: shrink the map */
+			conf.nswap = d.length/BY2PG;
+			swapalloc.top = &swapalloc.swmap[conf.nswap];
+			swapalloc.free = conf.nswap;
+		}
+	}
+	c->flag &= ~CCACHE;
+	cclunk(c);
+	poperror();
+
+	swapchan = c;	/* swapopen() requires swapchan to be set */
+	swapimage.c = namec("#¶/swapfile", Aopen, ORDWR, 0);	/* pager i/o goes through the encrypting swapfile */
+}
+
+enum {
+	Qdir,
+	Qswap,	/* statistics and control file, see swapread()/swapwrite() */
+	Qswapfile,	/* page i/o passed to swapchan with encryption */
+};
+
+static Dirtab swapdir[]={
+	".",		{Qdir, 0, QTDIR},	0,		DMDIR|0555,
+	"swap",		{Qswap},		0,		0664,
+	"swapfile",	{Qswapfile},		0,		0600,
+};
+
+static Chan*
+swapattach(char *spec)	/* attach to device '¶' via the generic devattach() */
+{
+	return devattach(L'¶', spec);
+}
+
+static Walkqid*
+swapwalk(Chan *c, Chan *nc, char **name, int nname)	/* generic walk over the static swapdir table */
+{
+	return devwalk(c, nc, name, nname, swapdir, nelem(swapdir), devgen);
+}
+
+static int
+swapstat(Chan *c, uchar *dp, int n)	/* generic stat over the static swapdir table */
+{
+	return devstat(c, dp, n, swapdir, nelem(swapdir), devgen);
+}
+
+static Chan*
+swapopen(Chan *c, int omode)	/* Qswapfile gets key and buffer setup; everything else uses devopen() */
+{
+	uchar key[128/8];	/* scratch for one 128-bit key */
+
+	switch((ulong)c->qid.path){
+	case Qswapfile:
+		if(!iseve() || omode != ORDWR)
+			error(Eperm);
+		if(swapimage.c != nil)	/* a swapfile channel is already installed */
+			error(Einuse);
+		if(swapchan == nil)	/* no backing channel; setswapchan() sets it before opening this file */
+			error(Egreg);
+
+		c->mode = openmode(omode);
+		c->flag |= COPEN;	/* swapclose() ignores channels without COPEN */
+		c->offset = 0;
+
+		swapbuf = mallocalign(BY2PG, BY2PG, 0, 0);	/* one page, page-aligned */
+		swapkey = secalloc(sizeof(AESstate)*2);
+		if(swapbuf == nil || swapkey == nil)
+			error(Enomem);
+
+		genrandom(key, sizeof(key));	/* fresh random key pair on every open */
+		setupAESstate(&swapkey[0], key, sizeof(key), nil);
+		genrandom(key, sizeof(key));
+		setupAESstate(&swapkey[1], key, sizeof(key), nil);
+		memset(key, 0, sizeof(key));	/* scrub the key bytes from the stack */
+
+		return c;
+	}
+	return devopen(c, omode, swapdir, nelem(swapdir), devgen);
+}
+
+static void
+swapclose(Chan *c)	/* undo swapopen() for Qswapfile; nothing to do for other files */
+{
+	if((c->flag & COPEN) == 0)	/* channel was never opened */
+		return;
+	switch((ulong)c->qid.path){
+	case Qswapfile:
+		cclose(swapchan);	/* release the backing channel, then the keys and the buffer */
+		swapchan = nil;
+		secfree(swapkey);
+		swapkey = nil;
+		free(swapbuf);
+		swapbuf = nil;
+		break;
+	}
+}
+
+static long
+swapread(Chan *c, void *va, long n, vlong off)	/* directory listing, statistics text, or one decrypted swap page */
+{
+	char tmp[256];		/* must be >= 18*NUMSIZE (Qswap) */
+
+	switch((ulong)c->qid.path){
+	case Qdir:
+		return devdirread(c, va, n, swapdir, nelem(swapdir), devgen);
+	case Qswap:
+		snprint(tmp, sizeof tmp,
+			"%llud memory\n"
+			"%llud pagesize\n"
+			"%lud kernel\n"
+			"%lud/%lud user\n"
+			"%lud/%lud swap\n"
+			"%llud/%llud/%llud kernel malloc\n"
+			"%llud/%llud/%llud kernel draw\n"
+			"%llud/%llud/%llud kernel secret\n",
+			(uvlong)conf.npage*BY2PG,
+			(uvlong)BY2PG,
+			conf.npage-conf.upages,
+			palloc.user-palloc.freecount-fscache.pgref-swapimage.pgref, palloc.user,	/* cached fscache/swapimage pages are not counted as used */
+			conf.nswap-swapalloc.free, conf.nswap,
+			(uvlong)mainmem->curalloc,
+			(uvlong)mainmem->cursize,
+			(uvlong)mainmem->maxsize,
+			(uvlong)imagmem->curalloc,
+			(uvlong)imagmem->cursize,
+			(uvlong)imagmem->maxsize,
+			(uvlong)secrmem->curalloc,
+			(uvlong)secrmem->cursize,
+			(uvlong)secrmem->maxsize);
+		return readstr((ulong)off, va, n, tmp);
+	case Qswapfile:
+		if(n != BY2PG)	/* whole pages only */
+			error(Ebadarg);
+		if(devtab[swapchan->type]->read(swapchan, va, n, off) != n)	/* short read is an i/o error */
+			error(Eio);
+		aes_xts_decrypt(&swapkey[0], &swapkey[1], off, va, va, n);	/* in place; off presumably acts as the XTS sector number - confirm against libsec */
+		return n;
+	}
+	error(Egreg);
+	return 0;
+}
+
+static long
+swapwrite(Chan *c, void *va, long n, vlong off)	/* control message on swap, or one page encrypted to swapfile */
+{
+	char buf[256];
+	
+	switch((ulong)c->qid.path){
+	case Qswap:
+		if(!iseve())
+			error(Eperm);
+		if(n >= sizeof buf)
+			error(Egreg);
+		memmove(buf, va, n);	/* so we can NUL-terminate */
+		buf[n] = 0;
+		/* start a pager if not already started */
+		if(strncmp(buf, "start", 5) == 0)
+			kickpager();
+		else if(buf[0]>='0' && buf[0]<='9')	/* textual fd number; must start with a decimal digit */
+			setswapchan(fdtochan(strtoul(buf, nil, 0), ORDWR, 1, 1));
+		else
+			error(Ebadctl);
+		return n;
+	case Qswapfile:
+		if(n != BY2PG)	/* whole pages only */
+			error(Ebadarg);
+		aes_xts_encrypt(&swapkey[0], &swapkey[1], off, va, swapbuf, n);	/* ciphertext goes to swapbuf, va is left untouched */
+		if(devtab[swapchan->type]->write(swapchan, swapbuf, n, off) != n)	/* short write is an i/o error */
+			error(Eio);
+		return n;
+	}
+	error(Egreg);
+	return 0;
+}
+
+Dev swapdevtab = {	/* device table entry for '¶'; entries named dev* are the generic defaults */
+	L'¶',
+	"swap",
+	devreset,
+	swapinit,	/* replaces the swapinit() call this patch removes from each port's main.c */
+	devshutdown,
+	swapattach,
+	swapwalk,
+	swapstat,
+	swapopen,
+	devcreate,
+	swapclose,
+	swapread,
+	devbread,
+	swapwrite,
+	devbwrite,
+	devremove,
+	devwstat,
+};
--- a/sys/src/9/port/portfns.h
+++ b/sys/src/9/port/portfns.h
@@ -318,7 +318,6 @@
 void		setmalloctag(void*, uintptr);
 void		setrealloctag(void*, uintptr);
 void		setregisters(Ureg*, char*, char*, int);
-void		setswapchan(Chan*);
 void		setupwatchpts(Proc*, Watchpt*, int);
 char*		skipslash(char*);
 void		sleep(Rendez*, int(*)(void*), void*);
@@ -332,7 +331,6 @@
 void		shrrenameuser(char*, char*);
 int		swapcount(uintptr);
 int		swapfull(void);
-void		swapinit(void);
 void		syscallfmt(ulong syscallno, uintptr pc, va_list list);
 void		sysretfmt(ulong syscallno, va_list list, uintptr ret, uvlong start, uvlong stop);
 void		timeradd(Timer*);
--- a/sys/src/9/port/portmkfile
+++ b/sys/src/9/port/portmkfile
@@ -62,15 +62,15 @@
 %.db:		main.$O
 	$CC -s$stem main.c | dbfmt > $stem.db
 
-alloc.$O:	/sys/include/pool.h
+alloc.$O devswap.$O:	/sys/include/pool.h
 devmnt.$O:	/sys/include/fcall.h
 proc.$O proc.acid:	errstr.h
 devroot.$O:	errstr.h
 devaudio.$O:	../port/audioif.h
-devaoe.$O:	/$objtype/include/ureg.h
-devfs.$O:	/$objtype/include/ureg.h
-devsd.$O:	/$objtype/include/ureg.h
-sdscsi.$O:	/$objtype/include/ureg.h
+devaoe.$O:	../port/sd.h /$objtype/include/ureg.h
+devfs.$O:	../port/sd.h /$objtype/include/ureg.h
+devsd.$O:	../port/sd.h /$objtype/include/ureg.h
+sdscsi.$O:	../port/sd.h /$objtype/include/ureg.h
 trap.$O:	/$objtype/include/ureg.h
 devproc.$O:	/$objtype/include/ureg.h
 main.$O:	init.h
@@ -87,3 +87,5 @@
 devsdp.$O:	../port/thwack.h
 devproc.$O sysproc.$O:	/sys/include/tos.h
 devproc.$O edf.$O proc.$O: /sys/include/trace.h
+devcons.$O:	/sys/include/authsrv.h
+devcap.$O devfs.$O devsdp.$O devssl.$O devtls.$O devswap.$O random.$O: /sys/include/libsec.h
--- a/sys/src/9/port/swap.c
+++ /dev/null
@@ -1,430 +1,0 @@
-#include	"u.h"
-#include	"../port/lib.h"
-#include	"mem.h"
-#include	"dat.h"
-#include	"fns.h"
-#include	"../port/error.h"
-
-static int	canflush(Proc*, Segment*);
-static void	executeio(void);
-static void	pageout(Proc*, Segment*);
-static void	pagepte(int, Page**);
-static void	pager(void*);
-
-Image 	swapimage;
-
-static 	int	swopen;
-static	Page	**iolist;
-static	int	ioptr;
-
-static	ushort	ageclock;
-
-void
-swapinit(void)
-{
-	swapalloc.swmap = xalloc(conf.nswap);
-	swapalloc.top = &swapalloc.swmap[conf.nswap];
-	swapalloc.alloc = swapalloc.swmap;
-	swapalloc.last = swapalloc.swmap;
-	swapalloc.free = conf.nswap;
-	swapalloc.xref = 0;
-
-	iolist = xalloc(conf.nswppo*sizeof(Page*));
-	if(swapalloc.swmap == 0 || iolist == 0)
-		panic("swapinit: not enough memory");
-
-	swapimage.notext = 1;
-}
-
-static uintptr
-newswap(void)
-{
-	uchar *look;
-
-	lock(&swapalloc);
-	if(swapalloc.free == 0) {
-		unlock(&swapalloc);
-		return ~0;
-	}
-	look = memchr(swapalloc.last, 0, swapalloc.top-swapalloc.last);
-	if(look == nil)
-		look = memchr(swapalloc.swmap, 0, swapalloc.last-swapalloc.swmap);
-	*look = 2;	/* ref for pte + io transaction */
-	swapalloc.last = look;
-	swapalloc.free--;
-	unlock(&swapalloc);
-	return (look-swapalloc.swmap) * BY2PG;
-}
-
-void
-putswap(Page *p)
-{
-	uchar *idx;
-
-	lock(&swapalloc);
-	idx = &swapalloc.swmap[((uintptr)p)/BY2PG];
-	if(*idx == 0)
-		panic("putswap %#p ref == 0", p);
-
-	if(*idx == 255) {
-		if(swapalloc.xref == 0)
-			panic("putswap %#p xref == 0", p);
-
-		if(--swapalloc.xref == 0) {
-			for(idx = swapalloc.swmap; idx < swapalloc.top; idx++) {
-				if(*idx == 255) {
-					*idx = 0;
-					swapalloc.free++;
-				}
-			}
-		}
-	} else {
-		if(--(*idx) == 0)
-			swapalloc.free++;
-	}
-	unlock(&swapalloc);
-}
-
-void
-dupswap(Page *p)
-{
-	uchar *idx;
-
-	lock(&swapalloc);
-	idx = &swapalloc.swmap[((uintptr)p)/BY2PG];
-	if(*idx == 255)
-		swapalloc.xref++;
-	else {
-		if(++(*idx) == 255)
-			swapalloc.xref += 255;
-	}
-	unlock(&swapalloc);
-}
-
-int
-swapcount(uintptr daddr)
-{
-	return swapalloc.swmap[daddr/BY2PG];
-}
-
-void
-kickpager(void)
-{
-	static Ref started;
-
-	if(started.ref || incref(&started) != 1)
-		wakeup(&swapalloc.r);
-	else
-		kproc("pager", pager, 0);
-}
-
-static int
-reclaim(void)
-{
-	ulong np;
-
-	for(;;){
-		if((np = pagereclaim(&fscache, 1000)) > 0) {
-			if(0) print("reclaim: %lud fscache\n", np);
-		} else if((np = pagereclaim(&swapimage, 1000)) > 0) {
-			if(0) print("reclaim: %lud swap\n", np);
-		} else if((np = imagereclaim(1000)) > 0) {
-			if(0) print("reclaim: %lud image\n", np);
-		}
-		if(!needpages(nil))
-			return 1;	/* have pages, done */
-		if(np == 0)
-			return 0;	/* didnt reclaim, need to swap */
-		sched();
-	}
-}
-
-static void
-pager(void*)
-{
-	int i;
-	Segment *s;
-	Proc *p, *ep;
-
-	p = proctab(0);
-	ep = &p[conf.nproc];
-
-	while(waserror())
-		;
-
-	for(;;){
-		up->psstate = "Reclaim";
-		if(reclaim()){
-			up->psstate = "Idle";
-			wakeup(&palloc.pwait[0]);
-			wakeup(&palloc.pwait[1]);
-			sleep(&swapalloc.r, needpages, nil);
-			continue;
-		}
-
-		if(swapimage.c == nil || swapalloc.free == 0){
-		Killbig:
-			if(!freebroken())
-				killbig("out of memory");
-			sched();
-			continue;
-		}
-
-		i = ageclock;
-		do {
-			if(++p >= ep){
-				if(++ageclock == i)
-					goto Killbig;
-				p = proctab(0);
-			}
-		} while(p->state == Dead || p->noswap || !canqlock(&p->seglock));
-		up->psstate = "Pageout";
-		for(i = 0; i < NSEG; i++) {
-			if((s = p->seg[i]) != nil) {
-				switch(s->type&SG_TYPE) {
-				default:
-					break;
-				case SG_TEXT:
-					pageout(p, s);
-					break;
-				case SG_DATA:
-				case SG_BSS:
-				case SG_STACK:
-				case SG_SHARED:
-					pageout(p, s);
-					break;
-				}
-			}
-		}
-		qunlock(&p->seglock);
-
-		if(ioptr > 0) {
-			up->psstate = "I/O";
-			executeio();
-		}
-	}
-}
-
-static void
-pageout(Proc *p, Segment *s)
-{
-	int type, i, size;
-	short age;
-	Pte *l;
-	Page **pg, *entry;
-
-	if(!canqlock(s))	/* We cannot afford to wait, we will surely deadlock */
-		return;
-
-	if(!canflush(p, s)) {	/* Able to invalidate all tlbs with references */
-		qunlock(s);
-		putseg(s);
-		return;
-	}
-
-	if(waserror()) {
-		qunlock(s);
-		putseg(s);
-		return;
-	}
-
-	/* Pass through the pte tables looking for memory pages to swap out */
-	type = s->type&SG_TYPE;
-	size = s->mapsize;
-	for(i = 0; i < size; i++) {
-		l = s->map[i];
-		if(l == nil)
-			continue;
-		for(pg = l->first; pg <= l->last; pg++) {
-			entry = *pg;
-			if(pagedout(entry))
-				continue;
-			if(entry->modref & PG_REF) {
-				entry->modref &= ~PG_REF;
-				entry->refage = ageclock;
-				continue;
-			}
-			age = (short)(ageclock - entry->refage);
-			if(age < 16)
-				continue;
-			pagepte(type, pg);
-		}
-	}
-	poperror();
-	qunlock(s);
-	putseg(s);
-}
-
-static int
-canflush(Proc *p, Segment *s)
-{
-	int i;
-	Proc *ep;
-
-	if(incref(s) == 2)		/* Easy if we are the only user */
-		return canpage(p);
-
-	/* Now we must do hardwork to ensure all processes which have tlb
-	 * entries for this segment will be flushed if we succeed in paging it out
-	 */
-	p = proctab(0);
-	ep = &p[conf.nproc];
-	while(p < ep) {
-		if(p->state != Dead) {
-			for(i = 0; i < NSEG; i++)
-				if(p->seg[i] == s)
-					if(!canpage(p))
-						return 0;
-		}
-		p++;
-	}
-	return 1;
-}
-
-static void
-pagepte(int type, Page **pg)
-{
-	uintptr daddr;
-	Page *outp;
-
-	outp = *pg;
-	switch(type) {
-	case SG_TEXT:				/* Revert to demand load */
-		putpage(outp);
-		*pg = nil;
-		break;
-
-	case SG_DATA:
-	case SG_BSS:
-	case SG_STACK:
-	case SG_SHARED:
-		if(ioptr >= conf.nswppo)
-			break;
-
-		/*
-		 *  get a new swap address with swapcount 2, one for the pte
-		 *  and one extra ref for us while we write the page to disk
-		 */
-		daddr = newswap();
-		if(daddr == ~0)
-			break;
-
-		/* clear any pages referring to it from the cache */
-		cachedel(&swapimage, daddr);
-
-		/* forget anything that it used to cache */
-		uncachepage(outp);
-
-		/*
-		 *  enter it into the cache so that a fault happening
-		 *  during the write will grab the page from the cache
-		 *  rather than one partially written to the disk
-		 */
-		outp->daddr = daddr;
-		cachepage(outp, &swapimage);
-		*pg = (Page*)(daddr|PG_ONSWAP);
-
-		/* Add page to IO transaction list */
-		iolist[ioptr++] = outp;
-		break;
-	}
-}
-
-void
-pagersummary(void)
-{
-	print("%lud/%lud memory %lud/%lud swap %d iolist\n",
-		palloc.user-palloc.freecount,
-		palloc.user, conf.nswap-swapalloc.free, conf.nswap,
-		ioptr);
-}
-
-static void
-executeio(void)
-{
-	Page *outp;
-	int i, n;
-	Chan *c;
-	char *kaddr;
-	KMap *k;
-
-	c = swapimage.c;
-	for(i = 0; i < ioptr; i++) {
-		if(ioptr > conf.nswppo)
-			panic("executeio: ioptr %d > %d", ioptr, conf.nswppo);
-		outp = iolist[i];
-
-		assert(outp->ref > 0);
-		assert(outp->image == &swapimage);
-		assert(outp->daddr != ~0);
-
-		/* only write when swap address still in use */
-		if(swapcount(outp->daddr) > 1){
-			k = kmap(outp);
-			kaddr = (char*)VA(k);
-
-			if(waserror())
-				panic("executeio: page outp I/O error");
-
-			n = devtab[c->type]->write(c, kaddr, BY2PG, outp->daddr);
-			if(n != BY2PG)
-				nexterror();
-
-			kunmap(k);
-			poperror();
-		}
-
-		/* drop our extra swap reference */
-		putswap((Page*)outp->daddr);
-
-		/* Free up the page after I/O */
-		putpage(outp);
-	}
-	ioptr = 0;
-}
-
-int
-needpages(void*)
-{
-	return palloc.freecount < swapalloc.headroom;
-}
-
-void
-setswapchan(Chan *c)
-{
-	uchar dirbuf[sizeof(Dir)+100];
-	Dir d;
-	int n;
-
-	if(waserror()){
-		cclose(c);
-		nexterror();
-	}
-	if(swapimage.c != nil) {
-		if(swapalloc.free != conf.nswap)
-			error(Einuse);
-		cclose(swapimage.c);
-		swapimage.c = nil;
-	}
-
-	/*
-	 *  if this isn't a file, set the swap space
-	 *  to be at most the size of the partition
-	 */
-	if(devtab[c->type]->dc != L'M'){
-		n = devtab[c->type]->stat(c, dirbuf, sizeof dirbuf);
-		if(n <= 0 || convM2D(dirbuf, n, &d, nil) == 0)
-			error("stat failed in setswapchan");
-		if(d.length < conf.nswppo*BY2PG)
-			error("swap device too small");
-		if(d.length < conf.nswap*BY2PG){
-			conf.nswap = d.length/BY2PG;
-			swapalloc.top = &swapalloc.swmap[conf.nswap];
-			swapalloc.free = conf.nswap;
-		}
-	}
-	c->flag &= ~CCACHE;
-	cclunk(c);
-	swapimage.c = c;
-	poperror();
-}
--- a/sys/src/9/ppc/blast
+++ b/sys/src/9/ppc/blast
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	env
 	flash
 	pipe
--- a/sys/src/9/ppc/main.c
+++ b/sys/src/9/ppc/main.c
@@ -84,7 +84,6 @@
 	links();
 	chandevreset();
 	pageinit();
-	swapinit();
 	sharedseginit();
 	fpsave(&initfp);
 	initfp.fpscr = 0;
--- a/sys/src/9/ppc/mkfile
+++ b/sys/src/9/ppc/mkfile
@@ -31,7 +31,6 @@
 	qlock.$O\
 	rdb.$O\
 	segment.$O\
-	swap.$O\
 	sysfile.$O\
 	sysproc.$O\
 	taslock.$O\
--- a/sys/src/9/sgi/indy
+++ b/sys/src/9/sgi/indy
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	uart
 	mnt
 	srv
--- a/sys/src/9/sgi/main.c
+++ b/sys/src/9/sgi/main.c
@@ -192,8 +192,6 @@
 	initseg();
 	links();
 	chandevreset();
-
-	swapinit();
 	userinit();
 	schedinit();
 	panic("schedinit returned");
--- a/sys/src/9/sgi/mkfile
+++ b/sys/src/9/sgi/mkfile
@@ -38,7 +38,6 @@
 	rdb.$O\
 	rebootcmd.$O\
 	segment.$O\
-	swap.$O\
 	syscallfmt.$O\
 	sysfile.$O\
 	sysproc.$O\
--- a/sys/src/9/teg2/main.c
+++ b/sys/src/9/teg2/main.c
@@ -455,7 +455,6 @@
 //	i8250console();			/* too early; see init0 */
 
 	pageinit();			/* prints "1020M memory: ⋯ */
-	swapinit();
 	userinit();
 
 	/*
--- a/sys/src/9/teg2/mkfile
+++ b/sys/src/9/teg2/mkfile
@@ -34,7 +34,6 @@
 	qio.$O\
 	qlock.$O\
 	segment.$O\
-	swap.$O\
 	syscallfmt.$O\
 	sysfile.$O\
 	sysproc.$O\
--- a/sys/src/9/teg2/ts
+++ b/sys/src/9/teg2/ts
@@ -2,6 +2,7 @@
 dev
 	root
 	cons
+	swap
 	env
 	pipe
 	proc
--- a/sys/src/9/xen/main.c
+++ b/sys/src/9/xen/main.c
@@ -103,8 +103,6 @@
 //	conf.monitor = 1;
 	chandevreset();
 	pageinit();
-
-	swapinit();
 	userinit();
 	schedinit();
 }
--- a/sys/src/9/xen/mkfile
+++ b/sys/src/9/xen/mkfile
@@ -32,7 +32,6 @@
 	qlock.$O\
 	rebootcmd.$O\
 	segment.$O\
-	swap.$O\
 	sysfile.$O\
 	sysproc.$O\
 	taslock.$O\
--- a/sys/src/9/xen/xenpcf
+++ b/sys/src/9/xen/xenpcf
@@ -1,6 +1,7 @@
 dev
 	root		netif
 	cons
+	swap
 	uart
 	arch
 	env
--- a/sys/src/9/zynq/main.c
+++ b/sys/src/9/zynq/main.c
@@ -393,7 +393,6 @@
 	archinit();
 	chandevreset();
 	pageinit();
-	swapinit();
 	screeninit();
 	userinit();
 	schedinit();
--- a/sys/src/9/zynq/mkfile
+++ b/sys/src/9/zynq/mkfile
@@ -31,7 +31,6 @@
 	qio.$O\
 	qlock.$O\
 	segment.$O\
-	swap.$O\
 	sysfile.$O\
 	sysproc.$O\
 	taslock.$O\
--- a/sys/src/9/zynq/zynq
+++ b/sys/src/9/zynq/zynq
@@ -1,6 +1,7 @@
 dev
 	root
 	cons
+	swap
 	arch
 	uart
 	mnt