shithub: riscv

Download patch

ref: cb35d1a132f881aac7151174fdb82ec9ff84682e
parent: 8ac28ac11cb9e96329e4d083208b3f13ee8d2b8a
author: cinap_lenrek <[email protected]>
date: Sat Dec 20 23:46:22 EST 2014

kernel: avoid inconsistent reads in /proc/#/fd and /proc/#/ns

to allow bytewise access to /proc/#/fd, the contents of the file where
recreated on each call. if fd's had been closed or reassigned between
the reads, the offset would be inconsistent and a read could start off
in the middle of a line. this happens when you cat /proc/#/fd file of
a busy process that mutates its filedescriptor table.

to fix this, we now return one line record at a time. if the line
fits in the read size, then this means the next read will always start
at the beginning of the next line record. we remember the consumed
byte count in Chan.mrock and the current record in Chan.nrock. (these
fields are free to usefor non-directory files)

if a read comes in and the offset is the same as c->mrock, we do not
need to regenerate the file and just render the next c->nrock's record.

for reads smaller than the line count, we have to regenerate the content
up to the offset and the race is still possible, but this should not
be the common case.

the same algorithm is now used for /proc/#/ns file, allowing a simpler
reimplementation and getting rid of Mntwalk state strcture.

--- a/sys/src/9/port/devdup.c
+++ b/sys/src/9/port/devdup.c
@@ -109,7 +109,7 @@
 	fd = twicefd/2;
 	if(twicefd & 1){
 		c = fdtochan(fd, -1, 0, 1);
-		procfdprint(c, fd, 0, buf, sizeof buf);
+		procfdprint(c, fd, buf, sizeof buf);
 		cclose(c);
 		return readstr((ulong)offset, va, n, buf);
 	}
--- a/sys/src/9/port/devproc.c
+++ b/sys/src/9/port/devproc.c
@@ -155,7 +155,6 @@
 int	procctlmemio(Proc*, uintptr, int, void*, int);
 Chan*	proctext(Chan*, Proc*);
 int	procstopped(void*);
-void	mntscan(Mntwalk*, Proc*);
 ulong	procpagecount(Proc *);
 
 static Traceevent *tevents;
@@ -400,6 +399,7 @@
 	case Qkregs:
 	case Qsegment:
 	case Qprofile:
+	case Qns:
 	case Qfd:
 		if(omode != OREAD)
 			error(Eperm);
@@ -428,12 +428,6 @@
 		nonone(p);
 		break;
 
-	case Qns:
-		if(omode != OREAD)
-			error(Eperm);
-		c->aux = smalloc(sizeof(Mntwalk));
-		break;
-
 	case Qnotepg:
 		nonone(p);
 		pg = p->pgrp;
@@ -514,103 +508,6 @@
 	return n;
 }
 
-
-static long
-procoffset(long offset, char *va, int *np)
-{
-	if(offset > 0) {
-		offset -= *np;
-		if(offset < 0) {
-			memmove(va, va+*np+offset, -offset);
-			*np = -offset;
-		}
-		else
-			*np = 0;
-	}
-	return offset;
-}
-
-static int
-procqidwidth(Chan *c)
-{
-	char buf[32];
-
-	return sprint(buf, "%lud", c->qid.vers);
-}
-
-int
-procfdprint(Chan *c, int fd, int w, char *s, int ns)
-{
-	int n;
-
-	if(w == 0)
-		w = procqidwidth(c);
-	n = snprint(s, ns, "%3d %.2s %C %4ld (%.16llux %*lud %.2ux) %5ld %8lld %s\n",
-		fd,
-		&"r w rw"[(c->mode&3)<<1],
-		devtab[c->type]->dc, c->dev,
-		c->qid.path, w, c->qid.vers, c->qid.type,
-		c->iounit, c->offset, c->path->s);
-	return n;
-}
-
-static int
-procfds(Proc *p, char *va, int count, long offset)
-{
-	Fgrp *f;
-	Chan *c;
-	char buf[256];
-	int n, i, w, ww;
-	char *a;
-
-	/* print to buf to avoid holding fgrp lock while writing to user space */
-	if(count > sizeof buf)
-		count = sizeof buf;
-	a = buf;
-
-	eqlock(&p->debug);
-	f = p->fgrp;
-	if(f == nil || p->dot == nil){
-		qunlock(&p->debug);
-		return 0;
-	}
-	lock(f);
-	if(waserror()){
-		unlock(f);
-		qunlock(&p->debug);
-		nexterror();
-	}
-
-	n = readstr(0, a, count, p->dot->path->s);
-	n += snprint(a+n, count-n, "\n");
-	offset = procoffset(offset, a, &n);
-	/* compute width of qid.path */
-	w = 0;
-	for(i = 0; i <= f->maxfd; i++) {
-		c = f->fd[i];
-		if(c == nil)
-			continue;
-		ww = procqidwidth(c);
-		if(ww > w)
-			w = ww;
-	}
-	for(i = 0; i <= f->maxfd; i++) {
-		c = f->fd[i];
-		if(c == nil)
-			continue;
-		n += procfdprint(c, i, w, a+n, count-n);
-		offset = procoffset(offset, a, &n);
-	}
-	unlock(f);
-	qunlock(&p->debug);
-	poperror();
-
-	/* copy result to user space, now that locks are released */
-	memmove(va, buf, n);
-
-	return n;
-}
-
 static void
 procclose(Chan *c)
 {
@@ -622,31 +519,8 @@
 			proctrace = nil;
 		unlock(&tlock);
 	}
-	if(QID(c->qid) == Qns && c->aux != nil){
-		free(c->aux);
-		c->aux = nil;
-	}
 }
 
-static void
-int2flag(int flag, char *s)
-{
-	if(flag == 0){
-		*s = '\0';
-		return;
-	}
-	*s++ = '-';
-	if(flag & MAFTER)
-		*s++ = 'a';
-	if(flag & MBEFORE)
-		*s++ = 'b';
-	if(flag & MCREATE)
-		*s++ = 'c';
-	if(flag & MCACHE)
-		*s++ = 'C';
-	*s = '\0';
-}
-
 static int
 procargs(Proc *p, char *buf, int nbuf)
 {
@@ -690,6 +564,127 @@
 	return p->pid != PID(c->qid) || p->waitq != nil;
 }
 
+static void
+int2flag(int flag, char *s)
+{
+	if(flag == 0){
+		*s = '\0';
+		return;
+	}
+	*s++ = '-';
+	if(flag & MAFTER)
+		*s++ = 'a';
+	if(flag & MBEFORE)
+		*s++ = 'b';
+	if(flag & MCREATE)
+		*s++ = 'c';
+	if(flag & MCACHE)
+		*s++ = 'C';
+	*s = '\0';
+}
+
+static int
+readns1(Chan *c, Proc *p, char *buf, int nbuf)
+{
+	Pgrp *pg;
+	Mount *t, *cm;
+	Mhead *f, *mh;
+	ulong minid, bestmid;
+	char flag[10], *srv;
+	int i;
+
+	pg = p->pgrp;
+	if(pg == nil || p->dot == nil || p->pid != PID(c->qid))
+		error(Eprocdied);
+
+	bestmid = ~0;
+	minid = c->nrock;
+	if(minid == bestmid)
+		return 0;
+
+	rlock(&pg->ns);
+
+	mh = nil;
+	cm = nil;
+	for(i = 0; i < MNTHASH; i++) {
+		for(f = pg->mnthash[i]; f != nil; f = f->hash) {
+			for(t = f->mount; t != nil; t = t->next) {
+				if(t->mountid >= minid && t->mountid < bestmid) {
+					bestmid = t->mountid;
+					cm = t;
+					mh = f;
+				}
+			}
+		}
+	}
+
+	if(bestmid == ~0) {
+		c->nrock = bestmid;
+		i = snprint(buf, nbuf, "cd %s\n", p->dot->path->s);
+	} else {
+		c->nrock = bestmid+1;
+
+		int2flag(cm->mflag, flag);
+		if(strcmp(cm->to->path->s, "#M") == 0){
+			srv = srvname(cm->to->mchan);
+			i = snprint(buf, nbuf, "mount %s %s %s %s\n", flag,
+				srv==nil? cm->to->mchan->path->s : srv,
+				mh->from->path->s, cm->spec? cm->spec : "");
+			free(srv);
+		}else{
+			i = snprint(buf, nbuf, "bind %s %s %s\n", flag,
+				cm->to->path->s, mh->from->path->s);
+		}
+	}
+
+	runlock(&pg->ns);
+
+	return i;
+}
+
+int
+procfdprint(Chan *c, int fd, char *s, int ns)
+{
+	return snprint(s, ns, "%3d %.2s %C %4ld (%.16llux %lud %.2ux) %5ld %8lld %s\n",
+		fd,
+		&"r w rw"[(c->mode&3)<<1],
+		devtab[c->type]->dc, c->dev,
+		c->qid.path, c->qid.vers, c->qid.type,
+		c->iounit, c->offset, c->path->s);
+}
+
+static int
+readfd1(Chan *c, Proc *p, char *buf, int nbuf)
+{
+	Fgrp *fg;
+	int n, i;
+
+	fg = p->fgrp;
+	if(fg == nil || p->dot == nil || p->pid != PID(c->qid))
+		return 0;
+
+	if(c->nrock == 0){
+		c->nrock = 1;
+		return snprint(buf, nbuf, "%s\n", p->dot->path->s);
+	}
+
+	lock(fg);
+	n = 0;
+	for(;;){
+		i = c->nrock-1;
+		if(i < 0 || i > fg->maxfd)
+			break;
+		c->nrock++;
+		if(fg->fd[i] != nil){
+			n = procfdprint(fg->fd[i], i, buf, nbuf);
+			break;
+		}
+	}
+	unlock(fg);
+
+	return n;
+}
+
 /*
  * userspace can't pass negative file offset for a
  * 64 bit kernel address, so we use 63 bit and sign
@@ -706,15 +701,13 @@
 static long
 procread(Chan *c, void *va, long n, vlong off)
 {
-	/* NSEG*32 was too small for worst cases */
-	char *a, flag[10], *sps, *srv, statbuf[NSEG*64];
-	int i, j, m, navail, ne, rsize;
+	char *a, *sps, statbuf[1024];
+	int i, j, navail, ne, rsize;
 	long l;
 	uchar *rptr;
 	uintptr addr;
 	ulong offset;
 	Confmem *cm;
-	Mntwalk *mw;
 	Proc *p;
 	Segment *sg, *s;
 	Ureg kur;
@@ -753,14 +746,16 @@
 	switch(QID(c->qid)){
 	case Qargs:
 		eqlock(&p->debug);
-		j = procargs(p, up->genbuf, sizeof up->genbuf);
+		j = procargs(p, statbuf, sizeof(statbuf));
 		qunlock(&p->debug);
 		if(offset >= j)
 			return 0;
 		if(offset+n > j)
 			n = j-offset;
-		memmove(a, &up->genbuf[offset], n);
+	statbufread:
+		memmove(a, statbuf+offset, n);
 		return n;
+
 	case Qsyscall:
 		eqlock(&p->debug);
 		if(waserror()){
@@ -769,13 +764,12 @@
 		}
 		if(p->pid != PID(c->qid))
 			error(Eprocdied);
+		j = 0;
 		if(p->syscalltrace != nil)
-			n = readstr(offset, a, n, p->syscalltrace);
-		else
-			n = 0;
+			j = readstr(offset, a, n, p->syscalltrace);
 		qunlock(&p->debug);
 		poperror();
-		return n;
+		return j;
 
 	case Qmem:
 		addr = off2addr(off);
@@ -830,17 +824,15 @@
 		if(p->nnote == 0)
 			n = 0;
 		else {
-			m = strlen(p->note[0].msg) + 1;
-			if(m > n)
-				m = n;
-			memmove(va, p->note[0].msg, m-1);
-			((char*)va)[m-1] = '\0';
-			p->nnote--;
+			i = strlen(p->note[0].msg) + 1;
+			if(i < n)
+				n = i;
+			memmove(a, p->note[0].msg, n-1);
+			a[n-1] = '\0';
+			if(--p->nnote == 0)
+				p->notepending = 0;
 			memmove(p->note, p->note+1, p->nnote*sizeof(Note));
-			n = m;
 		}
-		if(p->nnote == 0)
-			p->notepending = 0;
 		poperror();
 		qunlock(&p->debug);
 		return n;
@@ -905,8 +897,7 @@
 		readnum(0, statbuf+j+NUMSIZE*6, NUMSIZE, procpagecount(p)*BY2PG/1024, NUMSIZE);
 		readnum(0, statbuf+j+NUMSIZE*7, NUMSIZE, p->basepri, NUMSIZE);
 		readnum(0, statbuf+j+NUMSIZE*8, NUMSIZE, p->priority, NUMSIZE);
-		memmove(a, statbuf+offset, n);
-		return n;
+		goto statbufread;
 
 	case Qsegment:
 		j = 0;
@@ -924,10 +915,7 @@
 			return 0;
 		if(offset+n > j)
 			n = j-offset;
-		if(n == 0 && offset == 0)
-			exhausted("segments");
-		memmove(a, &statbuf[offset], n);
-		return n;
+		goto statbufread;
 
 	case Qwait:
 		if(!canqlock(&p->qwaitr))
@@ -959,96 +947,55 @@
 
 		qunlock(&p->qwaitr);
 		poperror();
-		n = snprint(a, n, "%d %lud %lud %lud %q",
+
+		j = snprint(statbuf, sizeof(statbuf), "%d %lud %lud %lud %q",
 			wq->w.pid,
 			wq->w.time[TUser], wq->w.time[TSys], wq->w.time[TReal],
 			wq->w.msg);
 		free(wq);
-		return n;
+		if(j < n)
+			n = j;
+		offset = 0;
+		goto statbufread;
 
 	case Qns:
+	case Qfd:
 		eqlock(&p->debug);
 		if(waserror()){
 			qunlock(&p->debug);
 			nexterror();
 		}
-		if(p->pgrp == nil || p->dot == nil || p->pid != PID(c->qid))
-			error(Eprocdied);
-		mw = c->aux;
-		if(mw->cddone){
-			qunlock(&p->debug);
-			poperror();
-			return 0;
-		}
-		mntscan(mw, p);
-		if(mw->mh == nil){
-			mw->cddone = 1;
-			i = snprint(a, n, "cd %s\n", p->dot->path->s);
-			qunlock(&p->debug);
-			poperror();
-			return i;
-		}
-		int2flag(mw->cm->mflag, flag);
-		if(strcmp(mw->cm->to->path->s, "#M") == 0){
-			srv = srvname(mw->cm->to->mchan);
-			i = snprint(a, n, "mount %s %s %s %s\n", flag,
-				srv==nil? mw->cm->to->mchan->path->s : srv,
-				mw->mh->from->path->s, mw->cm->spec? mw->cm->spec : "");
-			free(srv);
-		}else
-			i = snprint(a, n, "bind %s %s %s\n", flag,
-				mw->cm->to->path->s, mw->mh->from->path->s);
+		if(offset == 0 || offset != c->mrock)
+			c->nrock = c->mrock = 0;
+		do {
+			if(QID(c->qid) == Qns)
+				j = readns1(c, p, statbuf, sizeof(statbuf));
+			else
+				j = readfd1(c, p, statbuf, sizeof(statbuf));
+			if(j == 0)
+				break;
+			c->mrock += j;
+		} while(c->mrock <= offset);
+		i = c->mrock - offset;
 		qunlock(&p->debug);
 		poperror();
-		return i;
 
+		if(i <= 0)
+			return 0;
+		if(i < n)
+			n = i;
+		offset = j - i;
+		goto statbufread;
+
 	case Qnoteid:
 		return readnum(offset, va, n, p->noteid, NUMSIZE);
+
 	case Qppid:
 		return readnum(offset, va, n, p->parentpid, NUMSIZE);
-	case Qfd:
-		return procfds(p, va, n, offset);
+
 	}
 	error(Egreg);
 	return 0;		/* not reached */
-}
-
-void
-mntscan(Mntwalk *mw, Proc *p)
-{
-	Pgrp *pg;
-	Mount *t;
-	Mhead *f;
-	int nxt, i;
-	ulong last, bestmid;
-
-	pg = p->pgrp;
-	rlock(&pg->ns);
-
-	nxt = 0;
-	bestmid = ~0;
-
-	last = 0;
-	if(mw->mh != nil)
-		last = mw->cm->mountid;
-
-	for(i = 0; i < MNTHASH; i++) {
-		for(f = pg->mnthash[i]; f != nil; f = f->hash) {
-			for(t = f->mount; t != nil; t = t->next) {
-				if(mw->mh == nil ||
-				  (t->mountid > last && t->mountid < bestmid)) {
-					mw->cm = t;
-					mw->mh = f;
-					bestmid = mw->cm->mountid;
-					nxt = 1;
-				}
-			}
-		}
-	}
-	if(nxt == 0)
-		mw->mh = nil;
-
-	runlock(&pg->ns);
 }
 
 static long
--- a/sys/src/9/port/portdat.h
+++ b/sys/src/9/port/portdat.h
@@ -17,7 +17,6 @@
 typedef struct Mntcache Mntcache;
 typedef struct Mount	Mount;
 typedef struct Mntrpc	Mntrpc;
-typedef struct Mntwalk	Mntwalk;
 typedef struct Mnt	Mnt;
 typedef struct Mhead	Mhead;
 typedef struct Note	Note;
@@ -240,13 +239,6 @@
 	NSMAX	=	1000,
 	NSLOG	=	7,
 	NSCACHE	=	(1<<NSLOG),
-};
-
-struct Mntwalk				/* state for /proc/#/ns */
-{
-	int	cddone;
-	Mhead*	mh;
-	Mount*	cm;
 };
 
 struct Mount
--- a/sys/src/9/port/portfns.h
+++ b/sys/src/9/port/portfns.h
@@ -224,7 +224,7 @@
 ulong		procalarm(ulong);
 void		procctl(void);
 void		procdump(void);
-int		procfdprint(Chan*, int, int, char*, int);
+int		procfdprint(Chan*, int, char*, int);
 void		procflushseg(Segment*);
 int		procindex(ulong);
 void		procinit0(void);