shithub: riscv

Download patch

ref: 6cadd03bbeace1c256ba875c2e6a877f924877cd
parent: 6d99096136278f06f6333f927da34105a8dfe0bf
author: cinap_lenrek <[email protected]>
date: Mon Dec 31 16:09:46 EST 2012

fix utf and rune handling in preparation for 32bit runes

--- a/sys/include/ape/utf.h
+++ b/sys/include/ape/utf.h
@@ -14,7 +14,8 @@
 	UTFmax		= 3,		/* maximum bytes per rune */
 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
-	Runeerror	= 0x80,		/* decoding error in UTF */
+	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax		= 0xFFFF,	/* 16 bit rune */
 };
 
 /*
--- a/sys/include/libc.h
+++ b/sys/include/libc.h
@@ -45,6 +45,7 @@
 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
 	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax		= 0xFFFF,	/* 16 bit rune */
 };
 
 /*
--- a/sys/src/9/pc/cga.c
+++ b/sys/src/9/pc/cga.c
@@ -99,7 +99,9 @@
 	int i;
 	uchar *p;
 
-	if(c == '\n'){
+	if(c == '\0')
+		return;
+	else if(c == '\n'){
 		cgapos = cgapos/Width;
 		cgapos = (cgapos+1)*Width;
 	}
@@ -138,8 +140,10 @@
 static void
 cgascreenputs(char* s, int n)
 {
+	static char rb[UTFmax];
+	static int nrb;
+	char *e;
 	Rune r;
-	int i;
 
 	if(!islo()){
 		/*
@@ -152,11 +156,14 @@
 	else
 		lock(&cgascreenlock);
 
-	while(n > 0){
-		i = chartorune(&r, s);
-		cgascreenputc(r);
-		s += i;
-		n -= i;
+	e = s + n;
+	while(s < e){
+		rb[nrb++] = *s++;
+		if(nrb >= UTFmax || fullrune(rb, nrb)){
+			chartorune(&r, rb);
+			cgascreenputc(r);
+			nrb = 0;
+		}
 	}
 
 	unlock(&cgascreenlock);
--- a/sys/src/9/pc/vga.c
+++ b/sys/src/9/pc/vga.c
@@ -119,9 +119,10 @@
 static void
 vgascreenputs(char* s, int n)
 {
-	int i, gotdraw;
-	Rune r;
-	char buf[4];
+	static char rb[UTFmax+1];
+	static int nrb;
+	char *e;
+	int gotdraw;
 	VGAscr *scr;
 	Rectangle flushr;
 
@@ -146,13 +147,14 @@
 
 	flushr = Rect(10000, 10000, -10000, -10000);
 
-	while(n > 0){
-		i = chartorune(&r, s);
-		memmove(buf, s, i);
-		buf[i] = 0;
-		n -= i;
-		s += i;
-		vgascreenputc(scr, buf, &flushr);
+	e = s + n;
+	while(s < e){
+		rb[nrb++] = *s++;
+		if(nrb >= UTFmax || fullrune(rb, nrb)){
+			rb[nrb] = 0;
+			vgascreenputc(scr, rb, &flushr);
+			nrb = 0;
+		}
 	}
 	flushmemscreen(flushr);
 
--- a/sys/src/9/port/lib.h
+++ b/sys/src/9/port/lib.h
@@ -38,7 +38,8 @@
 	UTFmax		= 3,	/* maximum bytes per rune */
 	Runesync	= 0x80,	/* cannot represent part of a UTF sequence */
 	Runeself	= 0x80,	/* rune and UTF sequences are the same (<) */
-	Runeerror	= 0x80,	/* decoding error in UTF */
+	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax		= 0xFFFF,	/* 16 bit rune */
 };
 
 /*
--- a/sys/src/ape/lib/ap/gen/mbwc.c
+++ b/sys/src/ape/lib/ap/gen/mbwc.c
@@ -1,4 +1,5 @@
 #include <stdlib.h>
+#include <utf.h>
 
 /*
  * Use the FSS-UTF transformation proposed by posix.
@@ -7,6 +8,7 @@
  *	Tx	10xxxxxx	6 free bits
  *	T1	110xxxxx	5 free bits
  *	T2	1110xxxx	4 free bits
+ *	T3	11110xxx	3 free bits
  *
  *	Encoding is as follows.
  *	From hex	Thru hex	Sequence		Bits
@@ -13,6 +15,7 @@
  *	00000000	0000007F	T0			7
  *	00000080	000007FF	T1 Tx			11
  *	00000800	0000FFFF	T2 Tx Tx		16
+ *	00010000	0010FFFF	T3 Tx Tx Tx		20 (and change)
  */
 
 int
@@ -25,7 +28,7 @@
 int
 mbtowc(wchar_t *pwc, const char *s, size_t n)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	if(!s)
@@ -70,7 +73,25 @@
 		return 3;
 	}
 
-	/*
+	if(n < 4)
+		goto bad;
+	if(UTFmax >= 4) {	/* never taken while UTFmax is 3 */
+		c3 = (s[3] ^ 0x80) & 0xff;
+		if(c3 & 0xC0)	/* fourth byte is not a 10xxxxxx continuation */
+			goto bad;
+		if(c < 0xf8) {
+			l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff;
+			if(l < 0x10000)	/* overlong form; 0x10000 itself is the first valid value */
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			if(pwc)
+				*pwc = l;
+			return 4;
+		}
+	}
+
+ 	/*
 	 * bad decoding
 	 */
 bad:
@@ -86,7 +107,10 @@
 	if(!s)
 		return 0;
 
-	c = wchar & 0xFFFF;
+	c = wchar;
+	if(c > Runemax)
+		c = Runeerror;
+
 	if(c < 0x80) {
 		s[0] = c;
 		return 1;
@@ -98,10 +122,18 @@
 		return 2;
 	}
 
-	s[0] = 0xE0 |  (c >> 12);
-	s[1] = 0x80 | ((c >> 6) & 0x3F);
-	s[2] = 0x80 |  (c & 0x3F);
-	return 3;
+	if(c < 0x10000) {
+		s[0] = 0xE0 |  (c >> 12);
+		s[1] = 0x80 | ((c >> 6) & 0x3F);
+		s[2] = 0x80 |  (c & 0x3F);
+		return 3;
+	}
+
+	s[0] = 0xf0 | c >> 18;
+	s[1] = 0x80 | (c >> 12) & 0x3F;
+	s[2] = 0x80 | (c >> 6) & 0x3F;
+	s[3] = 0x80 | (c & 0x3F);
+	return 4;
 }
 
 size_t
@@ -117,7 +149,7 @@
 				break;
 			s++;
 		} else {
-			d = mbtowc(pwcs, s, 3);
+			d = mbtowc(pwcs, s, UTFmax);
 			if(d <= 0)
 				return (size_t)((d<0) ? -1 : i);
 			s += d;
@@ -133,10 +165,10 @@
 	int i, d;
 	long c;
 	char *p, *pe;
-	char buf[3];
+	char buf[UTFmax];
 
 	p = s;
-	pe = p+n-3;
+	pe = p+n-UTFmax;
 	while(p < pe) {
 		c = *pwcs++;
 		if(c < 0x80)
@@ -146,17 +178,14 @@
 		if(c == 0)
 			return p-s;
 	}
-	while(p < pe+3) {
+	while(p < pe+UTFmax) {
 		c = *pwcs++;
 		d = wctomb(buf, c);
-		if(p+d <= pe+3) {
-			*p++ = buf[0];
-			if(d > 1) {
-				*p++ = buf[2];
-				if(d > 2)
-					*p++ = buf[3];
-			}
-		}
+		if(p+d <= pe+UTFmax) {
+			for(i = 0; i < d; i++)
+				p[i] = buf[i];
+			p += d;
+ 		}
 		if(c == 0)
 			break;
 	}
--- a/sys/src/ape/lib/fmt/dofmt.c
+++ b/sys/src/ape/lib/fmt/dofmt.c
@@ -546,12 +546,15 @@
 int
 __badfmt(Fmt *f)
 {
-	char x[3];
+	char x[2+UTFmax];
+	Rune r;
+	int n;
 
+	r = f->r;
 	x[0] = '%';
-	x[1] = f->r;
-	x[2] = '%';
-	f->prec = 3;
-	__fmtcpy(f, (const void*)x, 3, 3);
+	n = 1+runetochar(x+1, &r);
+	x[n++] = '%';
+	f->prec = n;
+	__fmtcpy(f, x, n, n);
 	return 0;
 }
--- a/sys/src/ape/lib/utf/rune.c
+++ b/sys/src/ape/lib/utf/rune.c
@@ -23,6 +23,7 @@
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
@@ -29,10 +30,12 @@
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
@@ -43,7 +46,7 @@
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -88,6 +91,25 @@
 		return 3;
 	}
 
+ 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -101,11 +123,14 @@
 {
 	long c;
 
+	c = *rune;
+	if(c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * one character sequence
 	 *	00000-0007F => 00-7F
 	 */
-	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		return 1;
@@ -125,10 +150,22 @@
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if(c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
 }
 
 int
@@ -135,7 +172,7 @@
 runelen(long c)
 {
 	Rune rune;
-	char str[10];
+	char str[UTFmax];
 
 	rune = c;
 	return runetochar(str, &rune);
@@ -155,7 +192,10 @@
 		if(c <= Rune2)
 			nb += 2;
 		else
+		if(c <= Rune3 || c > Runemax)
 			nb += 3;
+		else
+			nb += 4;
 	}
 	return nb;
 }
@@ -165,13 +205,15 @@
 {
 	int c;
 
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(UTFmax == 3 || c < T4)
+		return n >= 3;
+	return n >= 4;
 }
+
--- a/sys/src/cmd/1c/swt.c
+++ b/sys/src/cmd/1c/swt.c
@@ -244,26 +244,26 @@
 }
 
 long
-outlstring(ushort *s, long n)
+outlstring(Rune *s, long n)
 {
-	char buf[2];
-	int c;
+	char buf[sizeof(Rune)];
+	int c, i;
 	long r;
 
-	while(nstring & 1)
+	while(nstring % sizeof buf)
 		outstring("", 1);
 	r = nstring;
 	while(n > 0) {
 		c = *s++;
 		if(align(0, types[TCHAR], Aarg1)) {
-			buf[0] = c>>8;
-			buf[1] = c;
+			for(i = sizeof buf; i > 0; c >>= 8)
+				buf[--i] = c;
 		} else {
-			buf[0] = c;
-			buf[1] = c>>8;
+			for(i = 0; i < sizeof buf; c >>= 8)
+				buf[i++] = c;
 		}
-		outstring(buf, 2);
-		n -= sizeof(ushort);
+		outstring(buf, sizeof buf);
+		n -= sizeof buf;
 	}
 	return r;
 }
--- a/sys/src/cmd/2c/swt.c
+++ b/sys/src/cmd/2c/swt.c
@@ -324,26 +324,26 @@
 }
 
 long
-outlstring(ushort *s, long n)
+outlstring(Rune *s, long n)
 {
-	char buf[2];
-	int c;
+	char buf[sizeof(Rune)];
+	int c, i;
 	long r;
 
-	while(nstring & 1)
+	while(nstring % sizeof buf)
 		outstring("", 1);
 	r = nstring;
 	while(n > 0) {
 		c = *s++;
 		if(align(0, types[TCHAR], Aarg1)) {
-			buf[0] = c>>8;
-			buf[1] = c;
+			for(i = sizeof buf; i > 0; c >>= 8)
+				buf[--i] = c;
 		} else {
-			buf[0] = c;
-			buf[1] = c>>8;
+			for(i = 0; i < sizeof buf; c >>= 8)
+				buf[i++] = c;
 		}
-		outstring(buf, 2);
-		n -= sizeof(ushort);
+		outstring(buf, sizeof buf);
+		n -= sizeof buf;
 	}
 	return r;
 }
--- a/sys/src/cmd/acme/regx.c
+++ b/sys/src/cmd/acme/regx.c
@@ -487,7 +487,7 @@
 			exprp++;	/* eat '-' */
 			if((c2 = nextrec()) == ']')
 				goto Error;
-			classp[n+0] = 0xFFFF;
+			classp[n+0] = Runemax;
 			classp[n+1] = c1;
 			classp[n+2] = c2;
 			n += 3;
@@ -509,7 +509,7 @@
 
 	p = class[classno];
 	while(*p){
-		if(*p == 0xFFFF){
+		if(*p == Runemax){
 			if(p[1]<=c && c<=p[2])
 				return !negate;
 			p += 3;
--- a/sys/src/cmd/auth/convkeys.c
+++ b/sys/src/cmd/auth/convkeys.c
@@ -121,7 +121,7 @@
 
 	for (; *s != '\0'; s += n) {
 		n = chartorune(&r, s);
-		if (n == 1 && r == Runeerror)
+		if (r == Runeerror)
 			return 1;
 	}
 	return 0;
--- a/sys/src/cmd/bitsy/keyboard.c
+++ b/sys/src/cmd/bitsy/keyboard.c
@@ -395,7 +395,7 @@
 			if(strcmp(args[0], "keyboard:")==0 || strcmp(args[0], "scribble:")==0)
 			if(strcmp(args[1], "value") == 0){
 				n = atoi(args[2]);
-				if(n <= 0xFFFF){
+				if(n <= Runemax){
 					r = n;
 					i = runetochar(str, &r);
 					write(kbdfd, str, i);
--- a/sys/src/cmd/bitsy/prompter.c
+++ b/sys/src/cmd/bitsy/prompter.c
@@ -282,7 +282,7 @@
 			n = atoi(args[2]);
 			if(n == '\033')	/* Escape exits */
 				break;
-			if(n <= 0xFFFF){
+			if(n <= Runemax){
 				r = n;
 				send(kbdctl->c, &r);
 			}
--- a/sys/src/cmd/cc/cc.h
+++ b/sys/src/cmd/cc/cc.h
@@ -51,7 +51,7 @@
 	double	fconst;		/* fp constant */
 	vlong	vconst;		/* non fp const */
 	char*	cstring;	/* character string */
-	ushort*	rstring;	/* rune string */
+	Rune*	rstring;	/* rune string */
 
 	Sym*	sym;
 	Type*	type;
@@ -336,6 +336,8 @@
 	TFILE,
 	TOLD,
 	NALLTYPES,
+
+	TRUNE	= sizeof(Rune)==4? TUINT: TUSHORT,
 };
 enum
 {
@@ -740,7 +742,7 @@
 void	gextern(Sym*, Node*, long, long);
 void	ginit(void);
 long	outstring(char*, long);
-long	outlstring(ushort*, long);
+long	outlstring(Rune*, long);
 void	sextern(Sym*, Node*, long, long);
 void	xcom(Node*);
 long	exreg(Type*);
--- a/sys/src/cmd/cc/cc.y
+++ b/sys/src/cmd/cc/cc.y
@@ -855,9 +855,9 @@
 	LLSTRING
 	{
 		$$ = new(OLSTRING, Z, Z);
-		$$->type = typ(TARRAY, types[TUSHORT]);
-		$$->type->width = $1.l + sizeof(ushort);
-		$$->rstring = (ushort*)$1.s;
+		$$->type = typ(TARRAY, types[TRUNE]);
+		$$->type->width = $1.l + sizeof(Rune);
+		$$->rstring = (Rune*)$1.s;
 		$$->sym = symstring;
 		$$->etype = TARRAY;
 		$$->class = CSTATIC;
@@ -867,16 +867,16 @@
 		char *s;
 		int n;
 
-		n = $1->type->width - sizeof(ushort);
+		n = $1->type->width - sizeof(Rune);
 		s = alloc(n+$2.l+MAXALIGN);
 
 		memcpy(s, $1->rstring, n);
 		memcpy(s+n, $2.s, $2.l);
-		*(ushort*)(s+n+$2.l) = 0;
+		*(Rune*)(s+n+$2.l) = 0;
 
 		$$ = $1;
 		$$->type->width += $2.l;
-		$$->rstring = (ushort*)s;
+		$$->rstring = (Rune*)s;
 	}
 
 zelist:
--- a/sys/src/cmd/cc/com.c
+++ b/sys/src/cmd/cc/com.c
@@ -633,10 +633,11 @@
 		break;
 
 	case OLSTRING:
-		if(n->type->link != types[TUSHORT]) {
+		if(n->type->link != types[TRUNE]) {
 			o = outstring(0, 0);
 			while(o & 3) {
-				outlstring(L"", sizeof(ushort));
+				Rune str[1] = {0};
+				outlstring(str, sizeof(Rune));
 				o = outlstring(0, 0);
 			}
 		}
--- a/sys/src/cmd/cc/dpchk.c
+++ b/sys/src/cmd/cc/dpchk.c
@@ -67,13 +67,14 @@
 {
 	Bits flag;
 	int f;
-	char *fmt;
+	char *fmt, *e;
 	Rune c;
 
 	fmt = fmtbuf;
+	e = fmtbuf + sizeof(fmtbuf)-1;
 	flag = zbits;
 	nstar = 0;
-	for(;;) {
+	while(fmt < e){
 		s += chartorune(&c, s);
 		fmt += runetochar(fmt, &c);
 		if(c == 0 || c >= nelem(flagbits))
@@ -175,7 +176,7 @@
 {
 	Sym *s;
 	int n, c;
-	char *t;
+	char *t, *e;
 	Rune r;
 	Type *ty;
 
@@ -225,6 +226,7 @@
 	if(c != '"')
 		goto bad;
 	t = fmtbuf;
+	e = t + sizeof(fmtbuf)-1;
 	for(;;) {
 		r = getr();
 		if(r == ' ' || r == '\n')
@@ -231,6 +233,8 @@
 			goto bad;
 		if(r == '"')
 			break;
+		if(t >= e)
+			goto bad;
 		t += runetochar(t, &r);
 	}
 	*t = 0;
--- a/sys/src/cmd/cc/lex.c
+++ b/sys/src/cmd/cc/lex.c
@@ -467,7 +467,7 @@
 				yyerror("missing '");
 				peekc = c1;
 			}
-			yylval.vval = convvtox(c, TUSHORT);
+			yylval.vval = convvtox(c, TRUNE);
 			return LUCONST;
 		}
 		if(c == '"') {
@@ -541,15 +541,15 @@
 			c = escchar('"', 1, 0);
 			if(c == EOF)
 				break;
-			cp = allocn(cp, c1, sizeof(ushort));
-			*(ushort*)(cp + c1) = c;
-			c1 += sizeof(ushort);
+			cp = allocn(cp, c1, sizeof(Rune));
+			*(Rune*)(cp + c1) = c;
+			c1 += sizeof(Rune);
 		}
 		yylval.sval.l = c1;
 		do {
-			cp = allocn(cp, c1, sizeof(ushort));
-			*(ushort*)(cp + c1) = 0;
-			c1 += sizeof(ushort);
+			cp = allocn(cp, c1, sizeof(Rune));
+			*(Rune*)(cp + c1) = 0;
+			c1 += sizeof(Rune);
 		} while(c1 & MAXALIGN);
 		yylval.sval.s = cp;
 		return LLSTRING;
@@ -1027,7 +1027,7 @@
 	} else
 		c = GETC();
 	for(;;) {
-		if(!isspace(c))
+		if(c >= Runeself || !isspace(c))
 			return c;
 		if(c == '\n') {
 			lineno++;
--- a/sys/src/cmd/cc/pswt.c
+++ b/sys/src/cmd/cc/pswt.c
@@ -132,28 +132,28 @@
 }
 
 long
-outlstring(ushort *s, long n)
+outlstring(Rune *s, long n)
 {
-	char buf[2];
-	int c;
+	char buf[sizeof(Rune)];
+	int c, i;
 	long r;
 
 	if(suppress)
 		return nstring;
-	while(nstring & 1)
+	while(nstring % sizeof buf)
 		outstring("", 1);
 	r = nstring;
 	while(n > 0) {
 		c = *s++;
 		if(align(0, types[TCHAR], Aarg1)) {
-			buf[0] = c>>8;
-			buf[1] = c;
+			for(i = sizeof buf; i > 0; c >>= 8)
+				buf[--i] = c;
 		} else {
-			buf[0] = c;
-			buf[1] = c>>8;
+			for(i = 0; i < sizeof buf; c >>= 8)
+				buf[i++] = c;
 		}
-		outstring(buf, 2);
-		n -= sizeof(ushort);
+		outstring(buf, sizeof buf);
+		n -= sizeof buf;
 	}
 	return r;
 }
--- a/sys/src/cmd/disk/9660/cdrdwr.c
+++ b/sys/src/cmd/disk/9660/cdrdwr.c
@@ -503,7 +503,6 @@
 {
 	Rune r[256];
 
-	strtorune(r, s);
 	Cputrs(cd, strtorune(r, s), size);
 }
 
--- a/sys/src/cmd/disk/9660/jchar.c
+++ b/sys/src/cmd/disk/9660/jchar.c
@@ -45,8 +45,7 @@
 
 	if(utflen(s) > 64)
 		return 1;
-	strtorune(r, s);
-	for(p=r; *p; p++)
+	for(p=strtorune(r, s); *p; p++)
 		if(isjolietfrog(*p))
 			return 1;
 	return 0;
--- a/sys/src/cmd/ed.c
+++ b/sys/src/cmd/ed.c
@@ -54,7 +54,7 @@
 int	peekc;
 int	pflag;
 int	rescuing;
-Rune	rhsbuf[LBSIZE/2];
+Rune	rhsbuf[LBSIZE/sizeof(Rune)];
 char	savedfile[FNSIZE];
 jmp_buf	savej;
 int	subnewa;
@@ -990,11 +990,11 @@
 	lp = linebuf;
 	bp = getblock(tl, OREAD);
 	nl = nleft;
-	tl &= ~((BLKSIZE/2) - 1);
+	tl &= ~((BLKSIZE/sizeof(Rune)) - 1);
 	while(*lp++ = *bp++) {
 		nl -= sizeof(Rune);
 		if(nl == 0) {
-			bp = getblock(tl += BLKSIZE/2, OREAD);
+			bp = getblock(tl += BLKSIZE/sizeof(Rune), OREAD);
 			nl = nleft;
 		}
 	}
@@ -1012,7 +1012,7 @@
 	tl = tline;
 	bp = getblock(tl, OWRITE);
 	nl = nleft;
-	tl &= ~((BLKSIZE/2)-1);
+	tl &= ~((BLKSIZE/sizeof(Rune))-1);
 	while(*bp = *lp++) {
 		if(*bp++ == '\n') {
 			bp[-1] = 0;
@@ -1021,7 +1021,7 @@
 		}
 		nl -= sizeof(Rune);
 		if(nl == 0) {
-			tl += BLKSIZE/2;
+			tl += BLKSIZE/sizeof(Rune);
 			bp = getblock(tl, OWRITE);
 			nl = nleft;
 		}
@@ -1048,8 +1048,8 @@
 	static uchar ibuff[BLKSIZE];
 	static uchar obuff[BLKSIZE];
 
-	bno = atl / (BLKSIZE/2);
-	off = (atl<<1) & (BLKSIZE-1) & ~03;
+	bno = atl / (BLKSIZE/sizeof(Rune));
+	off = (atl*sizeof(Rune)) & (BLKSIZE-1) & ~03;
 	if(bno >= NBLK) {
 		lastc = '\n';
 		error(T);
@@ -1240,7 +1240,7 @@
 		if(c == '\\') {
 			c = getchr();
 			*p++ = ESCFLG;
-			if(p >= &rhsbuf[LBSIZE/2])
+			if(p >= &rhsbuf[nelem(rhsbuf)])
 				error(Q);
 		} else
 		if(c == '\n' && (!globp || !globp[0])) {
@@ -1251,7 +1251,7 @@
 		if(c == seof)
 			break;
 		*p++ = c;
-		if(p >= &rhsbuf[LBSIZE/2])
+		if(p >= &rhsbuf[nelem(rhsbuf)])
 			error(Q);
 	}
 	*p = 0;
--- a/sys/src/cmd/file.c
+++ b/sys/src/cmd/file.c
@@ -359,7 +359,7 @@
 		rb = malloc(nbuf+1);
 		memmove(rb, buf+2, nbuf);
 		p = (char*)buf;
-		e = p+nbuf-4;
+		e = p+sizeof(buf)-UTFmax-1;
 		for(i=0; i<nbuf && p < e; i+=2){
 			r = rb[i+1] | rb[i]<<8;
 			p += runetochar(p, &r);
@@ -376,7 +376,7 @@
 		rb = malloc(nbuf+1);
 		memmove(rb, buf+2, nbuf);
 		p = (char*)buf;
-		e = p+nbuf-4;
+		e = p+sizeof(buf)-UTFmax-1;
 		for(i=0; i<nbuf && p < e; i+=2){
 			r = rb[i] | rb[i+1]<<8;
 			p += runetochar(p, &r);
--- a/sys/src/cmd/ip/ftpfs/proto.c
+++ b/sys/src/cmd/ip/ftpfs/proto.c
@@ -1525,7 +1525,7 @@
 	if(*p == 0)
 		return nil;
 
-	to = malloc(3*strlen(from)+2);
+	to = malloc(UTFmax*strlen(from)+2);
 	if(to == nil)
 		return nil;
 	for(p = to; *from; from++){
--- a/sys/src/cmd/ip/httpd/wikipost.c
+++ b/sys/src/cmd/ip/httpd/wikipost.c
@@ -59,7 +59,7 @@
 	t = v;
 	while(*s){
 		/* in decoding error, assume latin1 */
-		if((n=chartorune(&r, s)) == 1 && r == 0x80)
+		if((n=chartorune(&r, s)) == 1 && r == Runeerror)
 			r = *s;
 		s += n;
 		t += runetochar(t, &r);
--- a/sys/src/cmd/join.c
+++ b/sys/src/cmd/join.c
@@ -286,7 +286,7 @@
 {
 	int i;
 	Rune *temp;
-	char buf[BUFSIZ];
+	char buf[BUFSIZ*UTFmax+1];
 
 	if (no <= 0) {	/* default case */
 		printf("%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
--- a/sys/src/cmd/postscript/common/rune.c
+++ b/sys/src/cmd/postscript/common/rune.c
@@ -7,6 +7,7 @@
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
@@ -13,10 +14,12 @@
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
@@ -27,7 +30,7 @@
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -72,6 +75,25 @@
 		return 3;
 	}
 
+ 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(unsigned char*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -85,11 +107,14 @@
 {
 	long c;
 
+	c = *rune;
+	if(c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * one character sequence
 	 *	00000-0007F => 00-7F
 	 */
-	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		return 1;
@@ -109,10 +134,22 @@
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if(c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
 }
 
 int
@@ -119,7 +156,7 @@
 runelen(long c)
 {
 	Rune rune;
-	char str[10];
+	char str[UTFmax];
 
 	rune = c;
 	return runetochar(str, &rune);
@@ -126,17 +163,41 @@
 }
 
 int
+runenlen(Rune *r, int nrune)
+{
+	int nb, c;
+
+	nb = 0;
+	while(nrune--) {
+		c = *r++;
+		if(c <= Rune1)
+			nb++;
+		else
+		if(c <= Rune2)
+			nb += 2;
+		else
+		if(c <= Rune3 || c > Runemax)
+			nb += 3;
+		else
+			nb += 4;
+	}
+	return nb;
+}
+
+int
 fullrune(char *str, int n)
 {
 	int c;
 
-	if(n > 0) {
-		c = *(unsigned char*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(unsigned char*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(UTFmax == 3 || c < T4)
+		return n >= 3;
+	return n >= 4;
 }
+
--- a/sys/src/cmd/postscript/common/rune.h
+++ b/sys/src/cmd/postscript/common/rune.h
@@ -14,6 +14,7 @@
 	UTFmax		= 3,		/* maximum bytes per rune */
 	Runesync	= 0x80,		/* cannot represent part of a utf sequence (<) */
 	Runeself	= 0x80,		/* rune and utf sequences are the same (<) */
-	Runeerror	= 0xFFFD,		/* decoding error in utf */
+	Runeerror	= 0xFFFD,	/* decoding error in utf */
+	Runemax		= 0xFFFF,	/* 16 bit rune */
 };
 #endif
--- a/sys/src/cmd/sam/cmd.c
+++ b/sys/src/cmd/sam/cmd.c
@@ -71,7 +71,7 @@
 inputc(void)
 {
 	int n, nbuf;
-	char buf[3];
+	char buf[UTFmax];
 	Rune r;
 
     Again:
--- a/sys/src/cmd/sam/regexp.c
+++ b/sys/src/cmd/sam/regexp.c
@@ -494,7 +494,7 @@
 			exprp++;	/* eat '-' */
 			if((c2 = nextrec()) == ']')
 				goto Error;
-			classp[n+0] = 0xFFFF;
+			classp[n+0] = Runemax;
 			classp[n+1] = c1;
 			classp[n+2] = c2;
 			n += 3;
@@ -516,7 +516,7 @@
 
 	p = class[classno];
 	while(*p){
-		if(*p == 0xFFFF){
+		if(*p == Runemax){
 			if(p[1]<=c && c<=p[2])
 				return !negate;
 			p += 3;
--- a/sys/src/cmd/samterm/mesg.c
+++ b/sys/src/cmd/samterm/mesg.c
@@ -429,7 +429,7 @@
 void
 outTslS(Tmesg type, int s1, long l1, Rune *s)
 {
-	char buf[DATASIZE*3+1];
+	char buf[DATASIZE*UTFmax+1];
 	char *c;
 
 	outstart(type);
--- a/sys/src/cmd/sed.c
+++ b/sys/src/cmd/sed.c
@@ -625,7 +625,7 @@
 	while ((r = *cp++) != '\0') {
 		if(r == '\\') {
 			if (rhs < end)
-				*rhs++ = 0xFFFF;
+				*rhs++ = Runemax;
 			else
 				return 0;
 			r = *cp++;
@@ -1055,7 +1055,7 @@
 			sp = place(sp, loc1, loc2);
 			continue;
 		}
-		if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') {
+		if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') {
 			n = c-'0';
 			if (subexp[n].rsp && subexp[n].rep) {
 				sp = place(sp, subexp[n].rsp, subexp[n].rep);
@@ -1336,7 +1336,7 @@
 arout(void)
 {
 	int	c;
-	char	*s;
+	char	*s, *e;
 	char	buf[128];
 	Rune	*p1;
 	Biobuf	*fi;
@@ -1347,7 +1347,7 @@
 				Bputrune(&fout, *p1);
 			Bputc(&fout, '\n');
 		} else {
-			for(s = buf, p1 = (*aptr)->text; *p1; p1++)
+			for(s = buf, e = buf+sizeof(buf)-UTFmax-1, p1 = (*aptr)->text; *p1 && s < e; p1++)
 				s += runetochar(s, p1);
 			*s = '\0';
 			if((fi = Bopen(buf, OREAD)) == 0)
--- a/sys/src/cmd/tcs/utf.c
+++ b/sys/src/cmd/tcs/utf.c
@@ -93,7 +93,7 @@
 			if(!fullisorune(buf+i, tot-i))
 				break;
 			c = isochartorune(&runes[j], buf+i);
-			if(runes[j] == Runeerror && c == 1){
+			if(runes[j] == Runeerror){
 				if(squawk)
 					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
 				if(clean){
--- a/sys/src/cmd/tr.c
+++ b/sys/src/cmd/tr.c
@@ -15,7 +15,7 @@
 #define	CLEARBIT(a,c)		((a)[(c)/8] &= ~bits[(c)&07])
 #define	BITSET(a,c)		((a)[(c)/8] & bits[(c)&07])
 
-#define	MAXRUNE	0xFFFF
+#define	MAXRUNE	Runemax
 
 uchar	f[(MAXRUNE+1)/8];
 uchar	t[(MAXRUNE+1)/8];
--- a/sys/src/cmd/tweak.c
+++ b/sys/src/cmd/tweak.c
@@ -803,13 +803,14 @@
 }
 
 int
-type(char *buf, char *tag)
+type(char *buf, int nbuf, char *tag)
 {
 	Rune r;
-	char *p;
+	char *p, *e;
 
 	esetcursor(&busy);
 	p = buf;
+	e = buf + nbuf-UTFmax-1;
 	for(;;){
 		*p = 0;
 		mesg("%s: %s", tag, buf);
@@ -827,7 +828,8 @@
 				--p;
 			break;
 		default:
-			p += runetochar(p, &r);
+			if(p < e)
+				p += runetochar(p, &r);
 		}
 	}
 }
@@ -846,7 +848,7 @@
 	Thing *nt;
 
 	buttons(Up);
-	if(type(buf, tag) == 0)
+	if(type(buf, sizeof(buf), tag) == 0)
 		return;
 	if(strcmp(tag, "file") == 0){
 		for(s=buf; *s; s++)
@@ -1174,7 +1176,7 @@
 	long l;
 
 	buttons(Up);
-	if(type(buf, tag) == 0)
+	if(type(buf, sizeof(buf), tag) == 0)
 		return;
 	if(strcmp(tag, "mag") == 0){
 		if(buf[0]<'0' || '9'<buf[0] || (l=atoi(buf))<=0 || l>Maxmag){
@@ -1806,7 +1808,7 @@
 			return;
 		}
 	}
-	if(type(buf, "char (hex or character or hex-hex)") == 0)
+	if(type(buf, sizeof(buf), "char (hex or character or hex-hex)") == 0)
 		return;
 	if(utflen(buf) == 1){
 		chartorune(&r, buf);
@@ -2000,7 +2002,7 @@
 	sel = emenuhit(3, &mouse, &menu3);
 	switch(sel){
 	case Mopen:
-		if(type(buf, "file")){
+		if(type(buf, sizeof(buf), "file")){
 			t = tget(buf);
 			if(t)
 				drawthing(t, 1);
--- a/sys/src/cmd/unicode.c
+++ b/sys/src/cmd/unicode.c
@@ -51,13 +51,13 @@
 			return "bad range";
 		}
 		min = strtoul(q, &q, 16);
-		if(min<0 || min>0xFFFF || *q!='-')
+		if(min<0 || min>Runemax || *q!='-')
 			goto err;
 		q++;
 		if(strchr(hex, *q) == 0)
 			goto err;
 		max = strtoul(q, &q, 16);
-		if(max<0 || max>0xFFFF || max<min || *q!=0)
+		if(max<0 || max>Runemax || max<min || *q!=0)
 			goto err;
 		i = 0;
 		do{
@@ -111,7 +111,7 @@
 			return "bad char";
 		}
 		m = strtoul(q, &q, 16);
-		if(m<0 || m>0xFFFF || *q!=0)
+		if(m<0 || m>Runemax || *q!=0)
 			goto err;
 		Bprint(&bout, "%C", m);
 		if(!text)
--- a/sys/src/cmd/unix/drawterm/libc/dofmt.c
+++ b/sys/src/cmd/unix/drawterm/libc/dofmt.c
@@ -528,12 +528,15 @@
 int
 __badfmt(Fmt *f)
 {
-	char x[3];
+	char x[2+UTFmax];
+	Rune r;
+	int n;
 
+	r = f->r;
 	x[0] = '%';
-	x[1] = f->r;
-	x[2] = '%';
-	f->prec = 3;
-	__fmtcpy(f, (const void*)x, 3, 3);
+	n = 1+runetochar(x+1, &r);	/* '%' plus the UTF encoding of the verb rune */
+	x[n++] = '%';
+	f->prec = n;
+	__fmtcpy(f, x, n, n);	/* same routine the replaced line called */
 	return 0;
 }
--- a/sys/src/cmd/unix/drawterm/libc/rune.c
+++ b/sys/src/cmd/unix/drawterm/libc/rune.c
@@ -8,6 +8,7 @@
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
@@ -14,10 +15,12 @@
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
@@ -28,7 +31,7 @@
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -73,6 +76,25 @@
 		return 3;
 	}
 
+ 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -86,11 +108,14 @@
 {
 	long c;
 
+	c = *rune;
+	if(c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * one character sequence
 	 *	00000-0007F => 00-7F
 	 */
-	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		return 1;
@@ -110,10 +135,22 @@
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if(c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
 }
 
 int
@@ -140,7 +177,10 @@
 		if(c <= Rune2)
 			nb += 2;
 		else
+		if(c <= Rune3 || c > Runemax)
 			nb += 3;
+		else
+			nb += 4;
 	}
 	return nb;
 }
@@ -150,13 +190,15 @@
 {
 	int c;
 
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(UTFmax == 3 || c < T4)
+		return n >= 3;
+	return n >= 4;
 }
+
--- a/sys/src/cmd/unix/drawterm/libc/utf.h
+++ b/sys/src/cmd/unix/drawterm/libc/utf.h
@@ -8,7 +8,8 @@
 	UTFmax		= 3,		/* maximum bytes per rune */
 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
-	Runeerror	= 0x80,		/* decoding error in UTF */
+	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax		= 0xFFFF,	/* 16 bit rune */
 };
 
 /*
--- a/sys/src/cmd/unix/u9fs/rune.c
+++ b/sys/src/cmd/unix/u9fs/rune.c
@@ -1,6 +1,7 @@
 #include	<plan9.h>
 
 char *argv0;
+
 enum
 {
 	Bit1	= 7,
@@ -8,6 +9,7 @@
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
@@ -14,21 +16,23 @@
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
 
-	Bad	= Runeerror
+	Bad	= Runeerror,
 };
 
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -73,6 +77,25 @@
 		return 3;
 	}
 
+ 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -86,11 +109,14 @@
 {
 	long c;
 
+	c = *rune;
+	if(c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * one character sequence
 	 *	00000-0007F => 00-7F
 	 */
-	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		return 1;
@@ -110,10 +136,22 @@
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if(c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
 }
 
 int
@@ -120,7 +158,7 @@
 runelen(long c)
 {
 	Rune rune;
-	char str[10];
+	char str[UTFmax];
 
 	rune = c;
 	return runetochar(str, &rune);
@@ -127,22 +165,41 @@
 }
 
 int
-utflen(char *s)
+runenlen(Rune *r, int nrune)
 {
-	int c;
-	long n;
-	Rune rune;
+	int nb, c;
 
-	n = 0;
-	for(;;) {
-		c = *(uchar*)s;
-		if(c < Runeself) {
-			if(c == 0)
-				return n;
-			s++;
-		} else
-			s += chartorune(&rune, s);
-		n++;
+	nb = 0;
+	while(nrune--) {
+		c = *r++;
+		if(c <= Rune1)
+			nb++;
+		else
+		if(c <= Rune2)
+			nb += 2;
+		else
+		if(c <= Rune3 || c > Runemax)
+			nb += 3;
+		else
+			nb += 4;
 	}
-	return 0;
+	return nb;
 }
+
+int
+fullrune(char *str, int n)
+{
+	int c;
+
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(UTFmax == 3 || c < T4)
+		return n >= 3;
+	return n >= 4;
+}
+
--- a/sys/src/cmd/upas/fs/mbox.c
+++ b/sys/src/cmd/upas/fs/mbox.c
@@ -1223,12 +1223,12 @@
 		return 0;
 
 	n += e-in;
-	*out = p = malloc(n+1);
+	*out = p = malloc(UTFmax*n+1);
 	if(p == nil)
 		return 0;
 
 	for(; in < e; in++){
-		r = (uchar)*in;
+		r = (*in) & 0xff;
 		p += runetochar(p, &r);
 	}
 	*p = 0;
--- a/sys/src/cmd/upas/vf/vf.c
+++ b/sys/src/cmd/upas/vf/vf.c
@@ -954,7 +954,7 @@
 {
 	String *s;
 	char decoded[1024];
-	char utfbuf[2*1024];
+	char utfbuf[UTFmax*1024];
 	int i, len;
 	char *e;
 	char *token;
--- a/sys/src/cmd/vnc/screen.c
+++ b/sys/src/cmd/vnc/screen.c
@@ -335,6 +335,8 @@
 		addflush(r);
 		curpos.x = *xp;
 		break;
+	case '\0':
+		break;
 	default:
 		p = memsubfontwidth(memdefont, buf);
 		w = p.x;
@@ -354,23 +356,19 @@
 void
 screenputs(char *s, int n)
 {
-	int i;
-	Rune r;
-	char buf[4];
+	static char rb[UTFmax+1];
+	static int nrb;
+	char *e;
 
 	drawlock();
-	while(n > 0){
-		i = chartorune(&r, s);
-		if(i == 0){
-			s++;
-			--n;
-			continue;
+	e = s + n;
+	while(s < e){
+		rb[nrb++] = *s++;
+		if(nrb >= UTFmax || fullrune(rb, nrb)){
+			rb[nrb] = 0;
+			screenputc(rb);
+			nrb = 0;
 		}
-		memmove(buf, s, i);
-		buf[i] = 0;
-		n -= i;
-		s += i;
-		screenputc(buf);
 	}
 	screenflush();
 	drawunlock();
--- a/sys/src/cmd/yacc.c
+++ b/sys/src/cmd/yacc.c
@@ -141,7 +141,7 @@
 
 char*	infile;			/* input file name */
 int	numbval;		/* value of an input number */
-char	tokname[NAMESIZE+4];	/* input token name, slop for runes and 0 */
+char	tokname[NAMESIZE+UTFmax+1];	/* input token name, slop for runes and 0 */
 
 	/* structure declarations */
 
--- a/sys/src/libbio/bgetrune.c
+++ b/sys/src/libbio/bgetrune.c
@@ -7,7 +7,7 @@
 {
 	int c, i;
 	Rune rune;
-	char str[4];
+	char str[UTFmax];
 
 	c = Bgetc(bp);
 	if(c < Runeself) {		/* one char */
--- a/sys/src/libbio/bputrune.c
+++ b/sys/src/libbio/bputrune.c
@@ -6,7 +6,7 @@
 Bputrune(Biobufhdr *bp, long c)
 {
 	Rune rune;
-	char str[4];
+	char str[UTFmax];
 	int n;
 
 	rune = c;
--- a/sys/src/libc/fmt/dofmt.c
+++ b/sys/src/libc/fmt/dofmt.c
@@ -512,12 +512,15 @@
 int
 _badfmt(Fmt *f)
 {
-	char x[3];
+	char x[2+UTFmax];
+	Rune r;
+	int n;
 
+	r = f->r;
 	x[0] = '%';
-	x[1] = f->r;
-	x[2] = '%';
-	f->prec = 3;
-	_fmtcpy(f, x, 3, 3);
+	n = 1+runetochar(x+1, &r);
+	x[n++] = '%';
+	f->prec = n;
+	_fmtcpy(f, x, n, n);
 	return 0;
 }
--- a/sys/src/libc/port/rune.c
+++ b/sys/src/libc/port/rune.c
@@ -8,6 +8,7 @@
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
@@ -14,10 +15,12 @@
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
@@ -28,7 +31,7 @@
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -73,6 +76,25 @@
 		return 3;
 	}
 
+ 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
 	/*
 	 * bad decoding
 	 */
@@ -86,11 +108,14 @@
 {
 	long c;
 
+	c = *rune;
+	if(c > Runemax)
+		c = Runeerror;
+
 	/*
 	 * one character sequence
 	 *	00000-0007F => 00-7F
 	 */
-	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		return 1;
@@ -110,10 +135,22 @@
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if(c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
 }
 
 int
@@ -120,7 +157,7 @@
 runelen(long c)
 {
 	Rune rune;
-	char str[10];
+	char str[UTFmax];
 
 	rune = c;
 	return runetochar(str, &rune);
@@ -140,7 +177,10 @@
 		if(c <= Rune2)
 			nb += 2;
 		else
+		if(c <= Rune3 || c > Runemax)
 			nb += 3;
+		else
+			nb += 4;
 	}
 	return nb;
 }
@@ -150,13 +190,15 @@
 {
 	int c;
 
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(UTFmax == 3 || c < T4)
+		return n >= 3;
+	return n >= 4;
 }
+
--- a/sys/src/libdraw/buildfont.c
+++ b/sys/src/libdraw/buildfont.c
@@ -70,7 +70,7 @@
 		}
 		max = strtol(s, &s, 0);
 		s = skip(s);
-		if(*s==0 || min>=65536 || max>=65536 || min>max){
+		if(*s==0 || min>Runemax || max>Runemax || min>max){
 			werrstr("illegal subfont range");
     Err3:
 			freefont(fnt);
--- a/sys/src/libdraw/event.c
+++ b/sys/src/libdraw/event.c
@@ -199,7 +199,7 @@
 ekeyslave(int fd)
 {
 	Rune r;
-	char t[3], k[10];
+	char t[1+UTFmax], k[10];
 	int kr, kn, w;
 
 	if(eforkslave(Ekeyboard) < MAXSLAVE)
@@ -215,10 +215,9 @@
 		}
 		w = chartorune(&r, k);
 		kn -= w;
+		memmove(t+1, k, w);
 		memmove(k, &k[w], kn);
-		t[1] = r;
-		t[2] = r>>8;
-		if(write(epipe[1], t, 3) != 3)
+		if(write(epipe[1], t, sizeof(t)) != sizeof(t))
 			break;
 	}
 breakout:;
@@ -302,7 +301,7 @@
 		s->head = (Ebuf *)1;
 		return;
 	}
-	if(i == Skeyboard && n != 3)
+	if(i == Skeyboard && n != (1+UTFmax))
 		drawerror(display, "events: protocol error: keyboard");
 	if(i == Smouse){
 		if(n < 1+1+2*12)
@@ -418,14 +417,13 @@
 ekbd(void)
 {
 	Ebuf *eb;
-	int c;
+	Rune r;
 
 	if(Skeyboard < 0)
 		drawerror(display, "events: keyboard not initialzed");
 	eb = ebread(&eslave[Skeyboard]);
-	c = eb->buf[0] + (eb->buf[1]<<8);
-	free(eb);
-	return c;
+	chartorune(&r, (char*)eb->buf);
+	return r;
 }
 
 void
--- a/sys/src/libhtml/lex.c
+++ b/sys/src/libhtml/lex.c
@@ -1310,9 +1310,9 @@
 		break;
 	case UTF_8:
 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
-		n = chartorune(&r, (char*)(buf+ts->i));
 		if(ok) {
-			if(warn && c == 0x80)
+			n = chartorune(&r, (char*)(buf+ts->i));
+			if(warn && c == Runeerror)
 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
 			ts->i += n;
 			c = r;
--- a/sys/src/libhtml/utils.c
+++ b/sys/src/libhtml/utils.c
@@ -535,7 +535,7 @@
 
 // Convert buf[0:n], Unicode characters,
 // into an emalloc'd null-terminated string in character set chset.
-// Use 0x80 for unconvertable characters.
+// Use Runeerror for unconvertible characters.
 uchar*
 fromStr(Rune* buf, int n, int chset)
 {
@@ -554,7 +554,7 @@
 		for(i = 0; i < n; i++) {
 			ch = buf[i];
 			if(ch > lim)
-				ch = 0x80;
+				ch = Runeerror;
 			ans[i] = ch;
 		}
 		ans[n] = 0;