shithub: riscv

Download patch

ref: 4a80d9d029891e056a7badeeea8e3b588efd694b
parent: f7c60230669e8e00bc794f07726070d577d5aa3f
author: cinap_lenrek <[email protected]>
date: Sun Feb 23 13:00:21 EST 2020

kernel: fix multiple devproc bugs and pid reuse issues

devproc assumes that when we hold the Proc.debug qlock,
the process will be prevented from exiting. but there is
another race where the process has already exited and
the Proc* slot gets reused. to solve this, on process
creation we also have to acquire the debug qlock while
initializing the fields of the process. this also means
newproc() should only initialize fields *not* protected
by the debug qlock.

always acquire the Proc.debug qlock when changing strings
in the proc structure to avoid doublefree on concurrent
update. for changing the user string, we add a procsetuser()
function that does this for auth.c and devcap.

remove pgrpnote() from pgrp.c and replace by static
postnotepg() in devproc.

avoid the assumption that the Proc* entries returned by
proctab() are continuous.

fixed devproc permission issues:
	- make sure only eve can access /proc/trace
	- none should only be allowed to read its own /proc/n/text
	- move Proc.kp checks into procopen()

pid reuse was not handled correctly, as we where only
checking if a pid had a living process, but there still
could be processes expecting a particular parentpid or
noteid.

this is now addressed with reference counted Pid
structures which are organized in a hash table.
read access to the hash table does not require locks
which will be usefull for dtracy later.

--- a/sys/src/9/port/auth.c
+++ b/sys/src/9/port/auth.c
@@ -117,8 +117,7 @@
 {
 	if(n!=4 || strncmp(a, "none", 4)!=0)
 		error(Eperm);
-	kstrdup(&up->user, "none");
-	up->basepri = PriNormal;
+	procsetuser(up, "none");
 	return n;
 }
 
@@ -130,12 +129,14 @@
 long
 hostownerwrite(char *a, int n)
 {
-	char buf[128];
+	char buf[KNAMELEN];
 
 	if(!iseve())
 		error(Eperm);
-	if(n <= 0 || n >= sizeof buf)
+	if(n <= 0)
 		error(Ebadarg);
+	if(n >= sizeof buf)
+		error(Etoolong);
 	memmove(buf, a, n);
 	buf[n] = 0;
 
@@ -143,8 +144,7 @@
 	srvrenameuser(eve, buf);
 	shrrenameuser(eve, buf);
 	kstrdup(&eve, buf);
-	kstrdup(&up->user, buf);
-	up->basepri = PriNormal;
+	procsetuser(up, buf);
 	return n;
 }
 
--- a/sys/src/9/port/devcap.c
+++ b/sys/src/9/port/devcap.c
@@ -246,9 +246,7 @@
 		}
 		secfree(p);
 
-		/* set user id */
-		kstrdup(&up->user, to);
-		up->basepri = PriNormal;
+		procsetuser(up, to);
 
 		secfree(cp);
 		poperror();
--- a/sys/src/9/port/devproc.c
+++ b/sys/src/9/port/devproc.c
@@ -74,7 +74,7 @@
 	Emask = Nevents - 1,
 };
 
-#define	STATSIZE	(2*KNAMELEN+12+9*12)
+#define	STATSIZE	(2*28+12+9*12)
 /*
  * Status, fd, and ns are left fully readable (0444) because of their use in debugging,
  * particularly on shared servers.
@@ -145,7 +145,6 @@
  *	23 bits of process slot number + 1
  *	     in vers,
  *	32 bits of pid, for consistency checking
- * If notepg, c->pgrpid.path is pgrp slot, .vers is noteid.
  */
 #define	QSHIFT	5	/* location in qid of proc slot # */
 
@@ -186,7 +185,7 @@
 		if(s == 0){
 			strcpy(up->genbuf, "trace");
 			mkqid(&qid, Qtrace, -1, QTFILE);
-			devdir(c, qid, up->genbuf, 0, eve, 0444, dp);
+			devdir(c, qid, up->genbuf, 0, eve, 0400, dp);
 			return 1;
 		}
 
@@ -206,10 +205,10 @@
 		pid = p->pid;
 		if(pid == 0)
 			return 0;
-		sprint(up->genbuf, "%lud", pid);
 		/*
 		 * String comparison is done in devwalk so name must match its formatted pid
 		*/
+		snprint(up->genbuf, sizeof(up->genbuf), "%lud", pid);
 		if(name != nil && strcmp(name, up->genbuf) != 0)
 			return -1;
 		mkqid(&qid, (s+1)<<QSHIFT, pid, QTDIR);
@@ -219,7 +218,7 @@
 	if(c->qid.path == Qtrace){
 		strcpy(up->genbuf, "trace");
 		mkqid(&qid, Qtrace, -1, QTFILE);
-		devdir(c, qid, up->genbuf, 0, eve, 0444, dp);
+		devdir(c, qid, up->genbuf, 0, eve, 0400, dp);
 		return 1;
 	}
 	if(s >= nelem(procdir))
@@ -250,8 +249,6 @@
 			len *= sizeof(*q->profile);
 		}
 		break;
-	}
-	switch(QID(tab->qid)){
 	case Qwatchpt:
 		len = lenwatchpt(p);
 		break;
@@ -324,6 +321,52 @@
 	error(Eperm);
 }
 
+static void
+changenoteid(Proc *p, ulong noteid)
+{
+	Proc *pp;
+	int i;
+
+	if(noteid <= 0)
+		error(Ebadarg);
+	if(noteid == p->noteid)
+		return;
+	if(noteid == p->pid){
+		setnoteid(p, noteid);
+		return;
+	}
+	for(i = 0; i < conf.nproc; i++){
+		pp = proctab(i);
+		if(pp->noteid != noteid || pp->kp)
+			continue;
+		if(strcmp(pp->user, p->user) == 0){
+			nonone(pp);
+			setnoteid(p, noteid);
+			return;
+		}
+	}
+	error(Eperm);
+}
+
+static void
+postnotepg(ulong noteid, char *n, int flag)
+{
+	Proc *p;
+	int i;
+
+	for(i = 0; i < conf.nproc; i++){
+		p = proctab(i);
+		if(p == up)
+			continue;
+		if(p->noteid != noteid || p->kp)
+			continue;
+		qlock(&p->debug);
+		if(p->noteid == noteid)
+			postnote(p, 0, n, flag);
+		qunlock(&p->debug);
+	}
+}
+
 static void clearwatchpt(Proc *p);
 
 static Chan*
@@ -330,7 +373,6 @@
 procopen(Chan *c, int omode0)
 {
 	Proc *p;
-	Pgrp *pg;
 	Chan *tc;
 	int pid;
 	int omode;
@@ -339,7 +381,7 @@
 		return devopen(c, omode0, 0, 0, procgen);
 
 	if(QID(c->qid) == Qtrace){
-		if (omode0 != OREAD) 
+		if (omode0 != OREAD || !iseve()) 
 			error(Eperm);
 		lock(&tlock);
 		if (waserror()){
@@ -381,6 +423,7 @@
 	case Qtext:
 		if(omode != OREAD)
 			error(Eperm);
+		nonone(p);
 		qunlock(&p->debug);
 		poperror();
 		tc = proctext(c, p);
@@ -388,10 +431,11 @@
 		cclose(c);
 		return tc;
 
+	case Qstatus:
+	case Qppid:
 	case Qproc:
 	case Qkregs:
 	case Qsegment:
-	case Qprofile:
 	case Qns:
 	case Qfd:
 		if(omode != OREAD)
@@ -398,39 +442,31 @@
 			error(Eperm);
 		break;
 
+	case Qargs:
+	case Qwait:
+	case Qnoteid:
+		if(omode == OREAD)
+			break;
+	case Qctl:
 	case Qnote:
-		if(p->privatemem)
+		if(p->kp)
 			error(Eperm);
 		break;
 
-	case Qmem:
-	case Qctl:
-		if(p->privatemem)
+	case Qnotepg:
+		if(p->kp || omode != OWRITE)
 			error(Eperm);
-		nonone(p);
+		pid = p->noteid;
 		break;
 
-	case Qargs:
-	case Qnoteid:
-	case Qstatus:
-	case Qwait:
+	case Qmem:
 	case Qregs:
 	case Qfpregs:
+	case Qprofile:
 	case Qsyscall:	
-	case Qppid:
 	case Qwatchpt:
-		nonone(p);
-		break;
-
-	case Qnotepg:
-		nonone(p);
-		pg = p->pgrp;
-		if(pg == nil)
-			error(Eprocdied);
-		if(omode!=OWRITE)
+		if(p->kp || p->privatemem)
 			error(Eperm);
-		c->pgrpid.path = pg->pgrpid+1;
-		c->pgrpid.vers = p->noteid;
 		break;
 
 	default:
@@ -437,15 +473,12 @@
 		print("procopen %#lux\n", QID(c->qid));
 		error(Egreg);
 	}
+	nonone(p);
 
 	/* Affix pid to qid */
-	if(p->state != Dead)
-		c->qid.vers = p->pid;
-
-	/* make sure the process slot didn't get reallocated while we were playing */
-	coherence();
-	if(p->pid != pid)
+	if(pid == 0)
 		error(Eprocdied);
+	c->qid.vers = pid;
 
 	tc = devopen(c, omode, 0, 0, procgen);
 	if(waserror()){
@@ -470,23 +503,31 @@
 static int
 procwstat(Chan *c, uchar *db, int n)
 {
-	Proc *p;
 	Dir *d;
+	Proc *p;
 
 	if(c->qid.type&QTDIR)
 		error(Eperm);
 
-	if(QID(c->qid) == Qtrace)
+	switch(QID(c->qid)){
+	case Qnotepg:
+	case Qtrace:
 		return devwstat(c, db, n);
+	}
 		
-	p = proctab(SLOT(c->qid));
-	nonone(p);
-	d = nil;
+	d = smalloc(sizeof(Dir)+n);
+	if(waserror()){
+		free(d);
+		nexterror();
+	}
+	n = convM2D(db, n, &d[0], (char*)&d[1]);
+	if(n == 0)
+		error(Eshortstat);
 
+	p = proctab(SLOT(c->qid));
 	eqlock(&p->debug);
 	if(waserror()){
 		qunlock(&p->debug);
-		free(d);
 		nexterror();
 	}
 
@@ -493,13 +534,10 @@
 	if(p->pid != PID(c->qid))
 		error(Eprocdied);
 
+	nonone(p);
 	if(strcmp(up->user, p->user) != 0 && !iseve())
 		error(Eperm);
 
-	d = smalloc(sizeof(Dir)+n);
-	n = convM2D(db, n, &d[0], (char*)&d[1]);
-	if(n == 0)
-		error(Eshortstat);
 	if(!emptystr(d->uid) && strcmp(d->uid, p->user) != 0){
 		if(!iseve())
 			error(Eperm);
@@ -511,6 +549,7 @@
 
 	qunlock(&p->debug);
 	poperror();
+	poperror();
 	free(d);
 	return n;
 }
@@ -551,10 +590,8 @@
 	int n;
 
 	a = p->args;
-	if(p->setargs){
-		snprint(buf, nbuf, "%s [%s]", p->text, p->args);
-		return strlen(buf);
-	}
+	if(p->setargs)
+		return snprint(buf, nbuf, "%s [%s]", p->text, p->args);
 	n = p->nargs;
 	for(j = 0; j < nbuf - 1; j += m){
 		if(n <= 0)
@@ -836,24 +873,23 @@
 static long
 procread(Chan *c, void *va, long n, vlong off)
 {
-	char *a, *sps, statbuf[1024];
-	int i, j, navail, ne, rsize;
-	ulong l;
+	char statbuf[1024], *sps;
+	ulong offset;
+	int i, j, rsize;
 	uchar *rptr;
 	uintptr addr;
-	ulong offset;
-	Confmem *cm;
-	Proc *p;
-	Segment *sg, *s;
+	Segment *s;
 	Ureg kur;
 	Waitq *wq;
+	Proc *p;
 	
-	a = va;
 	offset = off;
 	if(c->qid.type & QTDIR)
-		return devdirread(c, a, n, 0, 0, procgen);
+		return devdirread(c, va, n, 0, 0, procgen);
 
 	if(QID(c->qid) == Qtrace){
+		int navail, ne;
+
 		if(!eventsavailable(nil))
 			return 0;
 
@@ -879,33 +915,6 @@
 		error(Eprocdied);
 
 	switch(QID(c->qid)){
-	case Qargs:
-		eqlock(&p->debug);
-		j = procargs(p, statbuf, sizeof(statbuf));
-		qunlock(&p->debug);
-		if(offset >= j)
-			return 0;
-		if(offset+n > j)
-			n = j-offset;
-	statbufread:
-		memmove(a, statbuf+offset, n);
-		return n;
-
-	case Qsyscall:
-		eqlock(&p->debug);
-		if(waserror()){
-			qunlock(&p->debug);
-			nexterror();
-		}
-		if(p->pid != PID(c->qid))
-			error(Eprocdied);
-		j = 0;
-		if(p->syscalltrace != nil)
-			j = readstr(offset, a, n, p->syscalltrace);
-		qunlock(&p->debug);
-		poperror();
-		return j;
-
 	case Qmem:
 		addr = off2addr(off);
 		if(addr < KZERO)
@@ -918,21 +927,27 @@
 		if(addr < (uintptr)end) {
 			if(addr+n > (uintptr)end)
 				n = (uintptr)end - addr;
-			memmove(a, (char*)addr, n);
+			memmove(va, (char*)addr, n);
 			return n;
 		}
 		for(i=0; i<nelem(conf.mem); i++){
-			cm = &conf.mem[i];
+			Confmem *cm = &conf.mem[i];
 			/* klimit-1 because klimit might be zero! */
 			if(cm->kbase <= addr && addr <= cm->klimit-1){
 				if(addr+n >= cm->klimit-1)
 					n = cm->klimit - addr;
-				memmove(a, (char*)addr, n);
+				memmove(va, (char*)addr, n);
 				return n;
 			}
 		}
 		error(Ebadarg);
 
+	case Qnoteid:
+		return readnum(offset, va, n, p->noteid, NUMSIZE);
+
+	case Qppid:
+		return readnum(offset, va, n, p->parentpid, NUMSIZE);
+
 	case Qprofile:
 		s = p->seg[TSEG];
 		if(s == nil || s->profile == nil)
@@ -943,42 +958,13 @@
 			return 0;
 		if(offset+n > i)
 			n = i - offset;
-		memmove(a, ((char*)s->profile)+offset, n);
+		memmove(va, ((char*)s->profile)+offset, n);
 		return n;
 
-	case Qnote:
-		eqlock(&p->debug);
-		if(waserror()){
-			qunlock(&p->debug);
-			nexterror();
-		}
-		if(p->pid != PID(c->qid))
-			error(Eprocdied);
-		if(n < 1)	/* must accept at least the '\0' */
-			error(Etoosmall);
-		if(p->nnote == 0)
-			n = 0;
-		else {
-			i = strlen(p->note[0].msg) + 1;
-			if(i < n)
-				n = i;
-			memmove(a, p->note[0].msg, n-1);
-			a[n-1] = '\0';
-			if(--p->nnote == 0)
-				p->notepending = 0;
-			memmove(p->note, p->note+1, p->nnote*sizeof(Note));
-		}
-		poperror();
-		qunlock(&p->debug);
-		return n;
-
 	case Qproc:
-		if(offset >= sizeof(Proc))
-			return 0;
-		if(offset+n > sizeof(Proc))
-			n = sizeof(Proc) - offset;
-		memmove(a, ((char*)p)+offset, n);
-		return n;
+		rptr = (uchar*)p;
+		rsize = sizeof(Proc);
+		goto regread;
 
 	case Qregs:
 		rptr = (uchar*)p->dbgreg;
@@ -1004,53 +990,48 @@
 			return 0;
 		if(offset+n > rsize)
 			n = rsize - offset;
-		memmove(a, rptr+offset, n);
+		memmove(va, rptr+offset, n);
 		return n;
 
 	case Qstatus:
-		if(offset >= STATSIZE)
-			return 0;
-		if(offset+n > STATSIZE)
-			n = STATSIZE - offset;
-
 		sps = p->psstate;
 		if(sps == nil)
 			sps = statename[p->state];
+		j = snprint(statbuf, sizeof(statbuf),
+			"%-27s %-27s %-11s "
+			"%11lud %11lud %11lud "
+			"%11lud %11lud %11lud "
+			"%11lud %11lud %11lud\n",
+			p->text, p->user, sps,
+			tk2ms(p->time[TUser]),
+			tk2ms(p->time[TSys]),
+			tk2ms(MACHP(0)->ticks - p->time[TReal]),
+			tk2ms(p->time[TCUser]),
+			tk2ms(p->time[TCSys]),
+			tk2ms(p->time[TCReal]),
+			(ulong)(procpagecount(p)*BY2PG/1024),
+			p->basepri, p->priority);
+	statbufread:
+		if(offset >= j)
+			return 0;
+		if(offset+n > j)
+			n = j - offset;
+		memmove(va, statbuf+offset, n);
+		return n;
 
-		memset(statbuf, ' ', sizeof statbuf);
-		readstr(0, statbuf+0*KNAMELEN, KNAMELEN-1, p->text);
-		readstr(0, statbuf+1*KNAMELEN, KNAMELEN-1, p->user);
-		readstr(0, statbuf+2*KNAMELEN, 11, sps);
-
-		j = 2*KNAMELEN + 12;
-		for(i = 0; i < 6; i++) {
-			l = p->time[i];
-			if(i == TReal)
-				l = MACHP(0)->ticks - l;
-			readnum(0, statbuf+j+NUMSIZE*i, NUMSIZE, tk2ms(l), NUMSIZE);
-		}
-
-		readnum(0, statbuf+j+NUMSIZE*6, NUMSIZE, procpagecount(p)*BY2PG/1024, NUMSIZE);
-		readnum(0, statbuf+j+NUMSIZE*7, NUMSIZE, p->basepri, NUMSIZE);
-		readnum(0, statbuf+j+NUMSIZE*8, NUMSIZE, p->priority, NUMSIZE);
-		goto statbufread;
-
 	case Qsegment:
 		j = 0;
 		for(i = 0; i < NSEG; i++) {
-			sg = p->seg[i];
-			if(sg == nil)
+			s = p->seg[i];
+			if(s == nil)
 				continue;
-			j += sprint(statbuf+j, "%-6s %c%c %8p %8p %4ld\n",
-				sname[sg->type&SG_TYPE],
-				sg->type&SG_FAULT ? 'F' : (sg->type&SG_RONLY ? 'R' : ' '),
-				sg->profile ? 'P' : ' ',
-				sg->base, sg->top, sg->ref);
+			j += snprint(statbuf+j, sizeof(statbuf)-j,
+				"%-6s %c%c %8p %8p %4ld\n",
+				sname[s->type&SG_TYPE],
+				s->type&SG_FAULT ? 'F' : (s->type&SG_RONLY ? 'R' : ' '),
+				s->profile ? 'P' : ' ',
+				s->base, s->top, s->ref);
 		}
-		if(offset >= j)
-			return 0;
-		if(offset+n > j)
-			n = j-offset;
 		goto statbufread;
 
 	case Qwait:
@@ -1089,18 +1070,21 @@
 			wq->w.time[TUser], wq->w.time[TSys], wq->w.time[TReal],
 			wq->w.msg);
 		free(wq);
-		if(j < n)
-			n = j;
 		offset = 0;
 		goto statbufread;
+	}
 
+	eqlock(&p->debug);
+	if(waserror()){
+		qunlock(&p->debug);
+		nexterror();
+	}
+	if(p->pid != PID(c->qid))
+		error(Eprocdied);
+
+	switch(QID(c->qid)){
 	case Qns:
 	case Qfd:
-		eqlock(&p->debug);
-		if(waserror()){
-			qunlock(&p->debug);
-			nexterror();
-		}
 		if(offset == 0 || offset != c->mrock)
 			c->nrock = c->mrock = 0;
 		do {
@@ -1115,7 +1099,6 @@
 		i = c->mrock - offset;
 		qunlock(&p->debug);
 		poperror();
-
 		if(i <= 0 || i > j)
 			return 0;
 		if(i < n)
@@ -1122,55 +1105,75 @@
 			n = i;
 		offset = j - i;
 		goto statbufread;
-
-	case Qnoteid:
-		return readnum(offset, va, n, p->noteid, NUMSIZE);
-
-	case Qppid:
-		return readnum(offset, va, n, p->parentpid, NUMSIZE);
 	
+	case Qargs:
+		j = procargs(p, statbuf, sizeof(statbuf));
+		qunlock(&p->debug);
+		poperror();
+		goto statbufread;
+
 	case Qwatchpt:
-		eqlock(&p->debug);
 		j = readwatchpt(p, statbuf, sizeof(statbuf));
 		qunlock(&p->debug);
-		if(offset >= j)
-			return 0;
-		if(offset+n > j)
-			n = j - offset;
+		poperror();
 		goto statbufread;
 
+	case Qsyscall:
+		n = 0;
+		if(p->syscalltrace != nil)
+			n = readstr(offset, va, n, p->syscalltrace);
+		break;
+
+	case Qnote:
+		if(n < 1)	/* must accept at least the '\0' */
+			error(Etoosmall);
+		if(p->nnote == 0)
+			n = 0;
+		else {
+			i = strlen(p->note[0].msg) + 1;
+			if(i < n)
+				n = i;
+			memmove(va, p->note[0].msg, n-1);
+			((char*)va)[n-1] = '\0';
+			if(--p->nnote == 0)
+				p->notepending = 0;
+			memmove(p->note, p->note+1, p->nnote*sizeof(Note));
+		}
+		break;
+
+	default:
+		print("unknown qid in procwread\n");
+		error(Egreg);
 	}
-	error(Egreg);
-	return 0;		/* not reached */
+
+	qunlock(&p->debug);
+	poperror();
+	return n;
 }
 
 static long
 procwrite(Chan *c, void *va, long n, vlong off)
 {
-	int id, m;
-	Proc *p, *t, *et;
-	char *a, *arg, buf[ERRMAX];
+	char buf[ERRMAX], *arg;
 	ulong offset;
+	Proc *p;
+	int m;
 
-	a = va;
 	offset = off;
 	if(c->qid.type & QTDIR)
 		error(Eisdir);
 
-	/* Use the remembered noteid in the channel rather
-	 * than the process pgrpid
-	 */
+	/* use the remembered noteid in the channel qid */
 	if(QID(c->qid) == Qnotepg) {
 		if(n >= ERRMAX-1)
 			error(Etoobig);
 		memmove(buf, va, n);
 		buf[n] = 0;
-		pgrpnote(NOTEID(c->pgrpid), buf, NUser);
+		postnotepg(NOTEID(c->qid), buf, NUser);
 		return n;
 	}
 
 	p = proctab(SLOT(c->qid));
-
 	eqlock(&p->debug);
 	if(waserror()){
 		qunlock(&p->debug);
@@ -1193,8 +1196,8 @@
 		if(arg[m-1] != 0)
 			arg[m++] = 0;
 		free(p->args);
-		p->nargs = m;
 		p->args = arg;
+		p->nargs = m;
 		p->setargs = 1;
 		break;
 
@@ -1229,9 +1232,7 @@
 		break;
 
 	case Qnote:
-		if(p->kp)
-			error(Eperm);
-		if(n >= ERRMAX-1)
+		if(n >= sizeof(buf))
 			error(Etoobig);
 		memmove(buf, va, n);
 		buf[n] = 0;
@@ -1238,34 +1239,19 @@
 		if(!postnote(p, 0, buf, NUser))
 			error("note not posted");
 		break;
+
 	case Qnoteid:
-		if(p->kp)
-			error(Eperm);
-		id = atoi(a);
-		if(id <= 0)
-			error(Ebadarg);
-		if(id == p->pid) {
-			p->noteid = id;
-			break;
-		}
-		t = proctab(0);
-		for(et = t+conf.nproc; t < et; t++) {
-			if(t->state == Dead || t->kp)
-				continue;
-			if(id == t->noteid) {
-				nonone(t);
-				if(strcmp(p->user, t->user) != 0)
-					error(Eperm);
-				p->noteid = id;
-				break;
-			}
-		}
-		if(p->noteid != id)
-			error(Ebadarg);
+		if(n >= sizeof(buf))
+			error(Etoobig);
+		memmove(buf, va, n);
+		buf[n] = 0;
+		changenoteid(p, atoi(buf));
 		break;
+
 	case Qwatchpt:
 		writewatchpt(p, va, n, off);
 		break;
+
 	default:
 		print("unknown qid in procwrite\n");
 		error(Egreg);
@@ -1352,7 +1338,8 @@
 	if(waserror()) {
 		up->psstate = state;
 		qlock(&p->debug);
-		p->pdbg = nil;
+		if(p->pdbg == up)
+			p->pdbg = nil;
 		nexterror();
 	}
 	sleep(&up->sleep, procstopped, p);
@@ -1439,9 +1426,6 @@
 	vlong time;
 	char *e;
 	void (*pt)(Proc*, int, vlong);
-
-	if(p->kp)	/* no ctl requests to kprocs */
-		error(Eperm);
 
 	cb = parsecmd(va, n);
 	if(waserror()){
--- a/sys/src/9/port/pgrp.c
+++ b/sys/src/9/port/pgrp.c
@@ -12,24 +12,6 @@
 static Ref pgrpid;
 static Ref mountid;
 
-void
-pgrpnote(ulong noteid, char *n, int flag)
-{
-	Proc *p, *ep;
-
-	p = proctab(0);
-	for(ep = p+conf.nproc; p < ep; p++) {
-		if(p->state == Dead)
-			continue;
-		if(up != p && p->noteid == noteid && p->kp == 0) {
-			qlock(&p->debug);
-			if(p->noteid == noteid)
-				postnote(p, 0, n, flag);
-			qunlock(&p->debug);
-		}
-	}
-}
-
 Pgrp*
 newpgrp(void)
 {
--- a/sys/src/9/port/portdat.h
+++ b/sys/src/9/port/portdat.h
@@ -187,7 +187,6 @@
 	Mnt*	mux;			/* Mnt for clients using me for messages */
 	union {
 		void*	aux;
-		Qid	pgrpid;		/* for #p/notepg */
 		ulong	mid;		/* for ns in devproc */
 	};
 	Chan*	mchan;			/* channel to mounted server */
@@ -649,21 +648,22 @@
 	Mach	*mach;		/* machine running this proc */
 	char	*text;
 	char	*user;
+
 	char	*args;
 	int	nargs;		/* number of bytes of args */
 	int	setargs;	/* process changed its args */
+
 	Proc	*rnext;		/* next process in run queue */
 	Proc	*qnext;		/* next process on queue for a QLock */
-	QLock	*qlock;		/* addr of qlock being queued for DEBUG */
-	int	state;
+
 	char	*psstate;	/* What /proc/#/status reports */
-	Segment	*seg[NSEG];
-	QLock	seglock;	/* locked whenever seg[] changes */
+	int	state;
+
 	ulong	pid;
 	ulong	noteid;		/* Equivalent of note group */
 	ulong	parentpid;
-	Proc	*pidhash;	/* next proc in pid hash */
 
+	Proc	*parent;	/* Process to send wait record on exit */
 	Lock	exl;		/* Lock count and waitq */
 	Waitq	*waitq;		/* Exited processes wait children */
 	int	nchild;		/* Number of living children */
@@ -670,7 +670,9 @@
 	int	nwait;		/* Number of uncollected wait records */
 	QLock	qwaitr;
 	Rendez	waitr;		/* Place to hang out in wait */
-	Proc	*parent;
+
+	QLock	seglock;	/* locked whenever seg[] changes */
+	Segment	*seg[NSEG];
 
 	Pgrp	*pgrp;		/* Process group for namespace */
 	Egrp 	*egrp;		/* Environment group */
--- a/sys/src/9/port/portfns.h
+++ b/sys/src/9/port/portfns.h
@@ -215,8 +215,7 @@
 ulong		perfticks(void);
 void		pexit(char*, int);
 void		pgrpcpy(Pgrp*, Pgrp*);
-void		pgrpnote(ulong, char*, int);
-int		pidalloc(Proc*);
+ulong		pidalloc(Proc*);
 #define		poperror()		up->nerrlab--
 void		portcountpagerefs(ulong*, int);
 int		postnote(Proc*, int, char*, int);
@@ -226,7 +225,6 @@
 void		printinit(void);
 ulong		procalarm(ulong);
 void		procctl(void);
-void		procdump(void);
 int		procfdprint(Chan*, int, char*, int);
 void		procflushseg(Segment*);
 void		procflushpseg(Physseg*);
@@ -235,6 +233,7 @@
 void		procinit0(void);
 ulong		procpagecount(Proc*);
 void		procpriority(Proc*, int, int);
+void		procsetuser(Proc*, char*);
 Proc*		proctab(int);
 extern void	(*proctrace)(Proc*, int, vlong); 
 void		procwired(Proc*, int);
@@ -304,7 +303,6 @@
 Proc*		runproc(void);
 void		savefpregs(FPsave*);
 void		sched(void);
-void		scheddump(void);
 void		schedinit(void);
 void		(*screenputs)(char*, int);
 void*		secalloc(ulong);
@@ -318,6 +316,7 @@
 void		setkernur(Ureg*, Proc*);
 int		setlabel(Label*);
 void		setmalloctag(void*, uintptr);
+ulong		setnoteid(Proc*, ulong);
 void		setrealloctag(void*, uintptr);
 void		setregisters(Ureg*, char*, char*, int);
 void		setupwatchpts(Proc*, Watchpt*, int);
--- a/sys/src/9/port/proc.c
+++ b/sys/src/9/port/proc.c
@@ -21,7 +21,6 @@
 static struct Procalloc
 {
 	Lock;
-	Proc*	ht[128];
 	Proc*	arena;
 	Proc*	free;
 } procalloc;
@@ -53,8 +52,9 @@
 	"Waitrelease",
 };
 
-static void pidfree(Proc*);
 static void rebalance(void);
+static void pidinit(void);
+static void pidfree(Proc*);
 
 /*
  * Always splhi()'ed.
@@ -606,31 +606,14 @@
 		lock(&procalloc);
 	}
 	procalloc.free = p->qnext;
+	p->qnext = nil;
 	unlock(&procalloc);
 
-	assert(p->state == Dead);
 	p->psstate = "New";
-	p->mach = nil;
-	p->eql = nil;
-	p->qnext = nil;
-	p->nchild = 0;
-	p->nwait = 0;
-	p->waitq = nil;
-	p->parent = nil;
-	p->pgrp = nil;
-	p->egrp = nil;
-	p->fgrp = nil;
-	p->rgrp = nil;
-	p->pdbg = nil;
 	p->fpstate = FPinit;
-	p->kp = 0;
 	p->procctl = 0;
-	p->syscalltrace = nil;	
-	p->notepending = 0;
 	p->ureg = nil;
 	p->dbgreg = nil;
-	p->privatemem = 0;
-	p->noswap = 0;
 	p->nerrlab = 0;
 	p->errstr = p->errbuf0;
 	p->syserrstr = p->errbuf1;
@@ -639,11 +622,6 @@
 	p->nlocks = 0;
 	p->delaysched = 0;
 	p->trace = 0;
-	p->nargs = 0;
-	p->setargs = 0;
-	memset(p->seg, 0, sizeof p->seg);
-	p->parentpid = 0;
-	p->noteid = 0;
 	if(p->kstack == nil)
 		p->kstack = smalloc(KSTACK);
 
@@ -673,8 +651,8 @@
 		/* pick a machine to wire to */
 		memset(nwired, 0, sizeof(nwired));
 		p->wired = nil;
-		pp = proctab(0);
-		for(i=0; i<conf.nproc; i++, pp++){
+		for(i=0; i<conf.nproc; i++){
+			pp = proctab(i);
 			wm = pp->wired;
 			if(wm != nil && pp->pid)
 				nwired[wm->machno]++;
@@ -724,6 +702,8 @@
 	for(i=0; i<conf.nproc-1; i++, p++)
 		p->qnext = p+1;
 	p->qnext = nil;
+
+	pidinit();
 }
 
 /*
@@ -1082,7 +1062,7 @@
 pexit(char *exitstr, int freemem)
 {
 	Proc *p;
-	Segment **s, **es;
+	Segment **s;
 	ulong utime, stime;
 	Waitq *wq;
 	Fgrp *fgrp;
@@ -1124,18 +1104,14 @@
 	if(pgrp != nil)
 		closepgrp(pgrp);
 
-	/*
-	 * if not a kernel process and have a parent,
-	 * do some housekeeping.
-	 */
-	if(up->kp)
-		goto Nowait;
+	if(up->parentpid == 0){
+		if(exitstr == nil)
+			exitstr = "unknown";
+		panic("boot process died: %s", exitstr);
+	}
 
 	p = up->parent;
-	if(p != nil){
-		if(p->pid != up->parentpid || p->state == Broken)
-			goto Nowait;
-
+	if(p != nil && p->pid == up->parentpid && p->state != Broken){
 		wq = smalloc(sizeof(Waitq));
 		wq->w.pid = up->pid;
 		utime = up->time[TUser] + up->time[TCUser];
@@ -1175,29 +1151,16 @@
 		if(wq != nil)
 			free(wq);
 	}
-	else if(up->parentpid == 0){
-		if(exitstr == nil)
-			exitstr = "unknown";
-		panic("boot process died: %s", exitstr);
-	}
 
-Nowait:
 	if(!freemem)
 		addbroken(up);
 
-	qlock(&up->seglock);
-	es = &up->seg[NSEG];
-	for(s = up->seg; s < es; s++) {
-		if(*s != nil) {
-			putseg(*s);
-			*s = nil;
-		}
-	}
-	qunlock(&up->seglock);
+	qlock(&up->debug);
 
 	lock(&up->exl);		/* Prevent my children from leaving waits */
 	pidfree(up);
-	up->pid = 0;
+	up->parent = nil;
+	up->nchild = up->nwait = 0;
 	wakeup(&up->waitr);
 	unlock(&up->exl);
 
@@ -1207,7 +1170,6 @@
 	}
 
 	/* release debuggers */
-	qlock(&up->debug);
 	if(up->pdbg != nil) {
 		wakeup(&up->pdbg->sleep);
 		up->pdbg = nil;
@@ -1223,6 +1185,15 @@
 	up->nwatchpt = 0;
 	qunlock(&up->debug);
 
+	qlock(&up->seglock);
+	for(s = up->seg; s < &up->seg[NSEG]; s++) {
+		if(*s != nil) {
+			putseg(*s);
+			*s = nil;
+		}
+	}
+	qunlock(&up->seglock);
+
 	/* Sched must not loop for these locks */
 	lock(&procalloc);
 	lock(&palloc);
@@ -1281,7 +1252,8 @@
 Proc*
 proctab(int i)
 {
-	return &procalloc.arena[i];
+#define proctab(x) (&procalloc.arena[(x)])
+	return proctab(i);
 }
 
 void
@@ -1306,23 +1278,6 @@
 		p->lastlock ? p->lastlock->pc : 0, p->priority);
 }
 
-void
-procdump(void)
-{
-	int i;
-	Proc *p;
-
-	if(up != nil)
-		print("up %lud\n", up->pid);
-	else
-		print("no current process\n");
-	for(i=0; i<conf.nproc; i++) {
-		p = &procalloc.arena[i];
-		if(p->state != Dead)
-			dumpaproc(p);
-	}
-}
-
 /*
  *  wait till all matching processes have flushed their mmu
  */
@@ -1339,7 +1294,7 @@
 	memset(await, 0, conf.nmach*sizeof(await[0]));
 	nwait = 0;
 	for(i = 0; i < conf.nproc; i++){
-		p = &procalloc.arena[i];
+		p = proctab(i);
 		if(p->state != Dead && (*match)(p, a)){
 			p->newtlb = 1;
 			for(nm = 0; nm < conf.nmach; nm++){
@@ -1422,24 +1377,6 @@
 }
 
 void
-scheddump(void)
-{
-	Proc *p;
-	Schedq *rq;
-
-	for(rq = &runq[Nrq-1]; rq >= runq; rq--){
-		if(rq->head == nil)
-			continue;
-		print("rq%zd:", rq-runq);
-		for(p = rq->head; p != nil; p = p->rnext)
-			print(" %lud(%lud)", p->pid, m->ticks - p->readytime);
-		print("\n");
-		delay(150);
-	}
-	print("nrdy %d\n", nrdy);
-}
-
-void
 kproc(char *name, void (*func)(void *), void *arg)
 {
 	static Pgrp *kpgrp;
@@ -1447,6 +1384,7 @@
 
 	p = newproc();
 
+	qlock(&p->debug);
 	if(up != nil){
 		p->slash = up->slash;
 		p->dot = up->slash;	/* unlike fork, do not inherit the dot for kprocs */
@@ -1460,9 +1398,12 @@
 	p->nnote = 0;
 	p->notify = nil;
 	p->notified = 0;
+	p->notepending = 0;
 
 	p->procmode = 0640;
+	p->privatemem = 1;
 	p->noswap = 1;
+	p->hang = 0;
 	p->kp = 1;
 
 	kprocchild(p, func, arg);
@@ -1470,6 +1411,8 @@
 	kstrdup(&p->text, name);
 	kstrdup(&p->user, eve);
 	kstrdup(&p->args, "");
+	p->nargs = 0;
+	p->setargs = 0;
 
 	if(kpgrp == nil)
 		kpgrp = newpgrp();
@@ -1480,8 +1423,10 @@
 	memset(p->time, 0, sizeof(p->time));
 	p->time[TReal] = MACHP(0)->ticks;
 
-	p->noteid = pidalloc(p);
+	pidalloc(p);
 
+	qunlock(&p->debug);
+
 	procpriority(p, PriKproc, 0);
 
 	p->psstate = nil;
@@ -1597,12 +1542,12 @@
 	int i;
 	Segment *s;
 	ulong l, max;
-	Proc *p, *ep, *kp;
+	Proc *p, *kp;
 
 	max = 0;
 	kp = nil;
-	ep = procalloc.arena+conf.nproc;
-	for(p = procalloc.arena; p < ep; p++) {
+	for(i = 0; i < conf.nproc; i++) {
+		p = proctab(i);
 		if(p->state == Dead || p->kp)
 			continue;
 		if((p->noswap || (p->procmode & 0222) == 0) && strcmp(eve, p->user) == 0)
@@ -1617,7 +1562,8 @@
 		return;
 	print("%lud: %s killed: %s\n", kp->pid, kp->text, why);
 	qlock(&kp->seglock);
-	for(p = procalloc.arena; p < ep; p++) {
+	for(i = 0; i < conf.nproc; i++) {
+		p = proctab(i);
 		if(p->state == Dead || p->kp)
 			continue;
 		if(p != kp && p->seg[BSEG] != nil && p->seg[BSEG] == kp->seg[BSEG])
@@ -1649,14 +1595,27 @@
 void
 renameuser(char *old, char *new)
 {
-	Proc *p, *ep;
+	Proc *p;
+	int i;
 
-	ep = procalloc.arena+conf.nproc;
-	for(p = procalloc.arena; p < ep; p++)
+	for(i = 0; i < conf.nproc; i++){
+		p = proctab(i);
+		qlock(&p->debug);
 		if(p->user != nil && strcmp(old, p->user) == 0)
 			kstrdup(&p->user, new);
+		qunlock(&p->debug);
+	}
 }
 
+void
+procsetuser(Proc *p, char *new)
+{
+	qlock(&p->debug);
+	kstrdup(&p->user, new);
+	p->basepri = PriNormal;
+	qunlock(&p->debug);
+}
+
 /*
  *  time accounting called by clock() splhi'd
  */
@@ -1707,65 +1666,204 @@
 	m->load = load/100;
 }
 
-int
-pidalloc(Proc *p)
+/*
+ *  A Pid structure is a reference counted hashtable entry
+ *  with "pid" being the key and "procindex" being the value.
+ *  A entry is allocated atomically by changing the key from
+ *  negative or zero to the positive process id number.
+ *  Pid's outlive ther Proc's as long as other processes hold
+ *  a reference to them such as noteid or parentpid.
+ *  This prevents pid reuse when the pid generator wraps.
+ */
+typedef struct Pid Pid;
+struct Pid
 {
-	static int gen, wrapped;
-	int pid, h;
-	Proc *x;
+	Ref;
+	long	pid;
+	int	procindex;
+};
 
-	lock(&procalloc);
-Retry:
-	pid = ++gen & 0x7FFFFFFF;
-	if(pid == 0){
-		wrapped = 1;
-		goto Retry;
+enum {
+	PIDMASK = 0x7FFFFFFF,
+	PIDSHIFT = 4,	/* log2 bucket size of the hash table */
+};
+
+static Pid *pidhashtab;
+static ulong pidhashmask;
+
+static void
+pidinit(void)
+{
+	/*
+	 * allocate 3 times conf.nproc Pid structures for the hash table
+	 * and round up to a power of two as each process can reference
+	 * up to 3 unique Pid structures:
+	 *	- pid
+	 *	- noteid
+	 *	- parentpid
+	 */
+	pidhashmask = 1<<PIDSHIFT;
+	while(pidhashmask < conf.nproc*3)
+		pidhashmask <<= 1;
+
+	pidhashtab = xalloc(pidhashmask * sizeof(pidhashtab[0]));
+	if(pidhashtab == nil){
+		xsummary();
+		panic("cannot allocate pid hashtable of size %lud", pidhashmask);
 	}
-	h = pid % nelem(procalloc.ht);
-	if(wrapped)
-		for(x = procalloc.ht[h]; x != nil; x = x->pidhash)
-			if(x->pid == pid)
-				goto Retry;
-	if(p != nil){
-		p->pid = pid;
-		p->pidhash = procalloc.ht[h];
-		procalloc.ht[h] = p;
+
+	/* make it a mask */
+	pidhashmask--;
+}
+
+static Pid*
+pidlookup(long pid)
+{
+	Pid *i, *e;
+	long o;
+
+	i = &pidhashtab[(pid<<PIDSHIFT) & pidhashmask];
+	for(e = &i[1<<PIDSHIFT]; i < e; i++){
+		o = i->pid;
+		if(o == pid)
+			return i;
+		if(o == 0)
+			break;
 	}
-	unlock(&procalloc);
-	return pid;
+	return nil;
 }
 
-static void
-pidfree(Proc *p)
+/*
+ *  increment the reference count of a pid (pid>0)
+ *  or allocate a new one (pid<=0)
+ */
+static Pid*
+pidadd(long pid)
 {
-	int h;
-	Proc **l;
+	Pid *i, *e;
+	long o;
 
-	h = p->pid % nelem(procalloc.ht);
-	lock(&procalloc);
-	for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash)
-		if(*l == p){
-			*l = p->pidhash;
-			break;
+	if(pid > 0){
+		i = pidlookup(pid);
+		if(i != nil)
+			incref(i);
+		return i;
+	}
+Again:
+	do {
+		static Ref gen;
+		pid = incref(&gen) & PIDMASK;
+	} while(pid == 0 || pidlookup(pid) != nil);
+
+	i = &pidhashtab[(pid<<PIDSHIFT) & pidhashmask];
+	for(e = &i[1<<PIDSHIFT]; i < e; i++){
+		while((o = i->pid) <= 0){
+			if(cmpswap(&i->pid, o, pid)){
+				incref(i);
+				return i;
+			}
 		}
-	unlock(&procalloc);
+	}
+	/* bucket full, try a different pid */
+	goto Again;
 }
 
+/*
+ *  decrement reference count of a pid and free it
+ *  when no references are remaining
+ */
+static void
+piddel(Pid *i)
+{
+	if(decref(i))
+		return;
+	i->pid = -1;	/* freed */
+}
+
 int
 procindex(ulong pid)
 {
-	Proc *p;
-	int h;
-	int s;
+	Pid *i;
 
-	s = -1;
-	h = pid % nelem(procalloc.ht);
-	lock(&procalloc);
-	for(p = procalloc.ht[h]; p != nil; p = p->pidhash)
-		if(p->pid == pid){
-			s = p - procalloc.arena;
-			break;
-		}
-	unlock(&procalloc);
-	return s;
+	i = pidlookup(pid);
+	if(i != nil){
+		int x = i->procindex;
+		if(proctab(x)->pid == pid)
+			return x;
+	}
+	return -1;
+}
+
+ulong
+setnoteid(Proc *p, ulong noteid)
+{
+	Pid *i, *o;
+
+	/*
+	 * avoid allocating a new pid when we are the only
+	 * user of the noteid
+	 */
+	o = pidlookup(p->noteid);
+	if(noteid == 0 && o->ref == 1)
+		return o->pid;
+
+	i = pidadd(noteid);
+	if(i == nil)
+		error(Ebadarg);
+	piddel(o);
+	return p->noteid = i->pid;
+}
+
+static ulong
+setparentpid(Proc *p, Proc *pp)
+{
+	Pid *i;
+
+	i = pidadd(pp->pid);
+	return p->parentpid = i->pid;
+}
+
+/*
+ *  allocate pid, noteid and parentpid to a process
+ */
+ulong
+pidalloc(Proc *p)
+{
+	Pid *i;
+
+	/* skip for the boot process */
+	if(up != nil)
+		setparentpid(p, up);
+
+	i = pidadd(0);
+	i->procindex = (int)(p - procalloc.arena);
+
+	if(p->noteid == 0){
+		incref(i);
+		p->noteid = i->pid;
+	} else
+		pidadd(p->noteid);
+
+	return p->pid = i->pid;
+}
+
+/*
+ *  release pid, noteid and parentpid from a process
+ */
+static void
+pidfree(Proc *p)
+{
+	Pid *i;
+
+	i = pidlookup(p->pid);
+	piddel(i);
+
+	if(p->noteid != p->pid)
+		i = pidlookup(p->noteid);
+	piddel(i);
+
+	if(p->parentpid != 0)
+		piddel(pidlookup(p->parentpid));
+
+	p->pid = p->noteid = p->parentpid = 0;
 }
--- a/sys/src/9/port/sysproc.c
+++ b/sys/src/9/port/sysproc.c
@@ -80,12 +80,14 @@
 			closeegrp(oeg);
 		}
 		if(flag & RFNOTEG)
-			up->noteid = pidalloc(nil);
+			setnoteid(up, 0);
 		return 0;
 	}
 
 	p = newproc();
 
+	qlock(&p->debug);
+
 	p->scallnr = up->scallnr;
 	p->s = up->s;
 	p->slash = up->slash;
@@ -96,10 +98,11 @@
 	p->nnote = up->nnote;
 	p->notify = up->notify;
 	p->notified = 0;
+	p->notepending = 0;
 	p->lastnote = up->lastnote;
+	if((flag & RFNOTEG) == 0)
+		p->noteid = up->noteid;
 
-	p->noteid = up->noteid;
-	p->parentpid = up->pid;
 	p->procmode = up->procmode;
 	p->privatemem = up->privatemem;
 	p->noswap = up->noswap;
@@ -106,6 +109,7 @@
 	p->hang = up->hang;
 	if(up->procctl == Proc_tracesyscall)
 		p->procctl = Proc_tracesyscall;
+	p->kp = 0;
 
 	/* Craft a return frame which will cause the child to pop out of
 	 * the scheduler in user mode with the return register zero
@@ -115,6 +119,8 @@
 	kstrdup(&p->text, up->text);
 	kstrdup(&p->user, up->user);
 	kstrdup(&p->args, "");
+	p->nargs = 0;
+	p->setargs = 0;
 
 	p->insyscall = 0;
 	memset(p->time, 0, sizeof(p->time));
@@ -122,6 +128,8 @@
 
 	pid = pidalloc(p);
 
+	qunlock(&p->debug);
+
 	/* Abort the child process on error */
 	if(waserror()){
 		p->kp = 1;
@@ -188,9 +196,6 @@
 		p->egrp = up->egrp;
 		incref(p->egrp);
 	}
-
-	if(flag & RFNOTEG)
-		p->noteid = pid;
 
 	procfork(p);
 
--- a/sys/src/9/port/userinit.c
+++ b/sys/src/9/port/userinit.c
@@ -61,6 +61,7 @@
 	 */
 	up->kp = 0;
 	up->noswap = 0;
+	up->privatemem = 0;
 	procpriority(up, PriNormal, 0);
 	procsetup(up);