ref: 6cadd03bbeace1c256ba875c2e6a877f924877cd
parent: 6d99096136278f06f6333f927da34105a8dfe0bf
author: cinap_lenrek <[email protected]>
date: Mon Dec 31 16:09:46 EST 2012
fix utf and rune handling in preparation for 32bit runes
--- a/sys/include/ape/utf.h
+++ b/sys/include/ape/utf.h
@@ -14,7 +14,8 @@
UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
- Runeerror = 0x80, /* decoding error in UTF */
+ Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0xFFFF, /* 16 bit rune */
};
/*
--- a/sys/include/libc.h
+++ b/sys/include/libc.h
@@ -45,6 +45,7 @@
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0xFFFF, /* 16 bit rune */
};
/*
--- a/sys/src/9/pc/cga.c
+++ b/sys/src/9/pc/cga.c
@@ -99,7 +99,9 @@
int i;
uchar *p;
- if(c == '\n'){
+ if(c == '\0')
+ return;
+ else if(c == '\n'){
cgapos = cgapos/Width;
cgapos = (cgapos+1)*Width;
}
@@ -138,8 +140,10 @@
static void
cgascreenputs(char* s, int n)
{
+ static char rb[UTFmax];
+ static int nrb;
+ char *e;
Rune r;
- int i;
if(!islo()){
/*
@@ -152,11 +156,14 @@
else
lock(&cgascreenlock);
- while(n > 0){
- i = chartorune(&r, s);
- cgascreenputc(r);
- s += i;
- n -= i;
+ e = s + n;
+ while(s < e){
+ rb[nrb++] = *s++;
+ if(nrb >= UTFmax || fullrune(rb, nrb)){
+ chartorune(&r, rb);
+ cgascreenputc(r);
+ nrb = 0;
+ }
}
unlock(&cgascreenlock);
--- a/sys/src/9/pc/vga.c
+++ b/sys/src/9/pc/vga.c
@@ -119,9 +119,10 @@
static void
vgascreenputs(char* s, int n)
{
- int i, gotdraw;
- Rune r;
- char buf[4];
+ static char rb[UTFmax+1];
+ static int nrb;
+ char *e;
+ int gotdraw;
VGAscr *scr;
Rectangle flushr;
@@ -146,13 +147,14 @@
flushr = Rect(10000, 10000, -10000, -10000);
- while(n > 0){
- i = chartorune(&r, s);
- memmove(buf, s, i);
- buf[i] = 0;
- n -= i;
- s += i;
- vgascreenputc(scr, buf, &flushr);
+ e = s + n;
+ while(s < e){
+ rb[nrb++] = *s++;
+ if(nrb >= UTFmax || fullrune(rb, nrb)){
+ rb[nrb] = 0;
+ vgascreenputc(scr, rb, &flushr);
+ nrb = 0;
+ }
}
flushmemscreen(flushr);
--- a/sys/src/9/port/lib.h
+++ b/sys/src/9/port/lib.h
@@ -38,7 +38,8 @@
UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
- Runeerror = 0x80, /* decoding error in UTF */
+ Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0xFFFF, /* 16 bit rune */
};
/*
--- a/sys/src/ape/lib/ap/gen/mbwc.c
+++ b/sys/src/ape/lib/ap/gen/mbwc.c
@@ -1,4 +1,5 @@
#include <stdlib.h>
+#include <utf.h>
/*
* Use the FSS-UTF transformation proposed by posix.
@@ -7,6 +8,7 @@
* Tx 10xxxxxx 6 free bits
* T1 110xxxxx 5 free bits
* T2 1110xxxx 4 free bits
+ * T3 11110xxx 3 free bits
*
* Encoding is as follows.
* From hex Thru hex Sequence Bits
@@ -13,6 +15,7 @@
* 00000000 0000007F T0 7
* 00000080 000007FF T1 Tx 11
* 00000800 0000FFFF T2 Tx Tx 16
+ * 00010000 0010FFFF T3 Tx Tx Tx 20 (and change)
*/
int
@@ -25,7 +28,7 @@
int
mbtowc(wchar_t *pwc, const char *s, size_t n)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
if(!s)
@@ -70,7 +73,25 @@
return 3;
}
- /*
+ if(n < 4)
+ goto bad;
+ if(UTFmax >= 4) { /* four byte sequence: T3 Tx Tx Tx */
+ c3 = (s[3] ^ 0x80) & 0xff;
+ if(c3 & 0xC0) /* fourth byte is not a Tx continuation byte */
+ goto bad;
+ if(c < 0xf8) {
+ l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff;
+ if(l < 0x10000) /* overlong form; 0x10000 itself is the first valid value */
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ if(pwc)
+ *pwc = l;
+ return 4;
+ }
+ }
+
+ /*
* bad decoding
*/
bad:
@@ -86,7 +107,10 @@
if(!s)
return 0;
- c = wchar & 0xFFFF;
+ c = wchar;
+ if(c > Runemax)
+ c = Runeerror;
+
if(c < 0x80) {
s[0] = c;
return 1;
@@ -98,10 +122,18 @@
return 2;
}
- s[0] = 0xE0 | (c >> 12);
- s[1] = 0x80 | ((c >> 6) & 0x3F);
- s[2] = 0x80 | (c & 0x3F);
- return 3;
+ if(c < 0x10000) {
+ s[0] = 0xE0 | (c >> 12);
+ s[1] = 0x80 | ((c >> 6) & 0x3F);
+ s[2] = 0x80 | (c & 0x3F);
+ return 3;
+ }
+
+ s[0] = 0xf0 | c >> 18;
+ s[1] = 0x80 | (c >> 12) & 0x3F;
+ s[2] = 0x80 | (c >> 6) & 0x3F;
+ s[3] = 0x80 | (c & 0x3F);
+ return 4;
}
size_t
@@ -117,7 +149,7 @@
break;
s++;
} else {
- d = mbtowc(pwcs, s, 3);
+ d = mbtowc(pwcs, s, UTFmax);
if(d <= 0)
return (size_t)((d<0) ? -1 : i);
s += d;
@@ -133,10 +165,10 @@
int i, d;
long c;
char *p, *pe;
- char buf[3];
+ char buf[UTFmax];
p = s;
- pe = p+n-3;
+ pe = p+n-UTFmax;
while(p < pe) {
c = *pwcs++;
if(c < 0x80)
@@ -146,17 +178,14 @@
if(c == 0)
return p-s;
}
- while(p < pe+3) {
+ while(p < pe+UTFmax) {
c = *pwcs++;
d = wctomb(buf, c);
- if(p+d <= pe+3) {
- *p++ = buf[0];
- if(d > 1) {
- *p++ = buf[2];
- if(d > 2)
- *p++ = buf[3];
- }
- }
+ if(p+d <= pe+UTFmax) {
+ for(i = 0; i < d; i++)
+ p[i] = buf[i];
+ p += d;
+ }
if(c == 0)
break;
}
--- a/sys/src/ape/lib/fmt/dofmt.c
+++ b/sys/src/ape/lib/fmt/dofmt.c
@@ -546,12 +546,15 @@
int
__badfmt(Fmt *f)
{
- char x[3];
+ char x[2+UTFmax];
+ Rune r;
+ int n;
+ r = f->r;
x[0] = '%';
- x[1] = f->r;
- x[2] = '%';
- f->prec = 3;
- __fmtcpy(f, (const void*)x, 3, 3);
+ n = 1+runetochar(x+1, &r);
+ x[n++] = '%';
+ f->prec = n;
+ __fmtcpy(f, x, n, n);
return 0;
}
--- a/sys/src/ape/lib/utf/rune.c
+++ b/sys/src/ape/lib/utf/rune.c
@@ -23,6 +23,7 @@
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
+ Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
@@ -29,10 +30,12 @@
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
@@ -43,7 +46,7 @@
int
chartorune(Rune *rune, char *str)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
/*
@@ -88,6 +91,25 @@
return 3;
}
+ /*
+ * four character sequence
+ * 10000-10FFFF => T4 Tx Tx Tx
+ */
+ if(UTFmax >= 4) {
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if(c3 & Testx)
+ goto bad;
+ if(c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if(l <= Rune3)
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+ }
+
/*
* bad decoding
*/
@@ -101,11 +123,14 @@
{
long c;
+ c = *rune;
+ if(c > Runemax)
+ c = Runeerror;
+
/*
* one character sequence
* 00000-0007F => 00-7F
*/
- c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
@@ -125,10 +150,22 @@
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
- return 3;
+ if(c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
}
int
@@ -135,7 +172,7 @@
runelen(long c)
{
Rune rune;
- char str[10];
+ char str[UTFmax];
rune = c;
return runetochar(str, &rune);
@@ -155,7 +192,10 @@
if(c <= Rune2)
nb += 2;
else
+ if(c <= Rune3 || c > Runemax)
nb += 3;
+ else
+ nb += 4;
}
return nb;
}
@@ -165,13 +205,15 @@
{
int c;
- if(n > 0) {
- c = *(uchar*)str;
- if(c < Tx)
- return 1;
- if(n > 1)
- if(c < T3 || n > 2)
- return 1;
- }
- return 0;
+ if(n <= 0)
+ return 0;
+ c = *(uchar*)str;
+ if(c < Tx)
+ return 1;
+ if(c < T3)
+ return n >= 2;
+ if(UTFmax == 3 || c < T4)
+ return n >= 3;
+ return n >= 4;
}
+
--- a/sys/src/cmd/1c/swt.c
+++ b/sys/src/cmd/1c/swt.c
@@ -244,26 +244,26 @@
}
long
-outlstring(ushort *s, long n)
+outlstring(Rune *s, long n)
{
- char buf[2];
- int c;
+ char buf[sizeof(Rune)];
+ int c, i;
long r;
- while(nstring & 1)
+ while(nstring % sizeof buf)
outstring("", 1);
r = nstring;
while(n > 0) {
c = *s++;
if(align(0, types[TCHAR], Aarg1)) {
- buf[0] = c>>8;
- buf[1] = c;
+ for(i = sizeof buf; i > 0; c >>= 8)
+ buf[--i] = c;
} else {
- buf[0] = c;
- buf[1] = c>>8;
+ for(i = 0; i < sizeof buf; c >>= 8)
+ buf[i++] = c;
}
- outstring(buf, 2);
- n -= sizeof(ushort);
+ outstring(buf, sizeof buf);
+ n -= sizeof buf;
}
return r;
}
--- a/sys/src/cmd/2c/swt.c
+++ b/sys/src/cmd/2c/swt.c
@@ -324,26 +324,26 @@
}
long
-outlstring(ushort *s, long n)
+outlstring(Rune *s, long n)
{
- char buf[2];
- int c;
+ char buf[sizeof(Rune)];
+ int c, i;
long r;
- while(nstring & 1)
+ while(nstring % sizeof buf)
outstring("", 1);
r = nstring;
while(n > 0) {
c = *s++;
if(align(0, types[TCHAR], Aarg1)) {
- buf[0] = c>>8;
- buf[1] = c;
+ for(i = sizeof buf; i > 0; c >>= 8)
+ buf[--i] = c;
} else {
- buf[0] = c;
- buf[1] = c>>8;
+ for(i = 0; i < sizeof buf; c >>= 8)
+ buf[i++] = c;
}
- outstring(buf, 2);
- n -= sizeof(ushort);
+ outstring(buf, sizeof buf);
+ n -= sizeof buf;
}
return r;
}
--- a/sys/src/cmd/acme/regx.c
+++ b/sys/src/cmd/acme/regx.c
@@ -487,7 +487,7 @@
exprp++; /* eat '-' */
if((c2 = nextrec()) == ']')
goto Error;
- classp[n+0] = 0xFFFF;
+ classp[n+0] = Runemax;
classp[n+1] = c1;
classp[n+2] = c2;
n += 3;
@@ -509,7 +509,7 @@
p = class[classno];
while(*p){
- if(*p == 0xFFFF){
+ if(*p == Runemax){
if(p[1]<=c && c<=p[2])
return !negate;
p += 3;
--- a/sys/src/cmd/auth/convkeys.c
+++ b/sys/src/cmd/auth/convkeys.c
@@ -121,7 +121,7 @@
for (; *s != '\0'; s += n) {
n = chartorune(&r, s);
- if (n == 1 && r == Runeerror)
+ if (r == Runeerror)
return 1;
}
return 0;
--- a/sys/src/cmd/bitsy/keyboard.c
+++ b/sys/src/cmd/bitsy/keyboard.c
@@ -395,7 +395,7 @@
if(strcmp(args[0], "keyboard:")==0 || strcmp(args[0], "scribble:")==0)
if(strcmp(args[1], "value") == 0){
n = atoi(args[2]);
- if(n <= 0xFFFF){
+ if(n <= Runemax){
r = n;
i = runetochar(str, &r);
write(kbdfd, str, i);
--- a/sys/src/cmd/bitsy/prompter.c
+++ b/sys/src/cmd/bitsy/prompter.c
@@ -282,7 +282,7 @@
n = atoi(args[2]);
if(n == '\033') /* Escape exits */
break;
- if(n <= 0xFFFF){
+ if(n <= Runemax){
r = n;
send(kbdctl->c, &r);
}
--- a/sys/src/cmd/cc/cc.h
+++ b/sys/src/cmd/cc/cc.h
@@ -51,7 +51,7 @@
double fconst; /* fp constant */
vlong vconst; /* non fp const */
char* cstring; /* character string */
- ushort* rstring; /* rune string */
+ Rune* rstring; /* rune string */
Sym* sym;
Type* type;
@@ -336,6 +336,8 @@
TFILE,
TOLD,
NALLTYPES,
+
+ TRUNE = sizeof(Rune)==4? TUINT: TUSHORT,
};
enum
{
@@ -740,7 +742,7 @@
void gextern(Sym*, Node*, long, long);
void ginit(void);
long outstring(char*, long);
-long outlstring(ushort*, long);
+long outlstring(Rune*, long);
void sextern(Sym*, Node*, long, long);
void xcom(Node*);
long exreg(Type*);
--- a/sys/src/cmd/cc/cc.y
+++ b/sys/src/cmd/cc/cc.y
@@ -855,9 +855,9 @@
LLSTRING
{
$$ = new(OLSTRING, Z, Z);
- $$->type = typ(TARRAY, types[TUSHORT]);
- $$->type->width = $1.l + sizeof(ushort);
- $$->rstring = (ushort*)$1.s;
+ $$->type = typ(TARRAY, types[TRUNE]);
+ $$->type->width = $1.l + sizeof(Rune);
+ $$->rstring = (Rune*)$1.s;
$$->sym = symstring;
$$->etype = TARRAY;
$$->class = CSTATIC;
@@ -867,16 +867,16 @@
char *s;
int n;
- n = $1->type->width - sizeof(ushort);
+ n = $1->type->width - sizeof(Rune);
s = alloc(n+$2.l+MAXALIGN);
memcpy(s, $1->rstring, n);
memcpy(s+n, $2.s, $2.l);
- *(ushort*)(s+n+$2.l) = 0;
+ *(Rune*)(s+n+$2.l) = 0;
$$ = $1;
$$->type->width += $2.l;
- $$->rstring = (ushort*)s;
+ $$->rstring = (Rune*)s;
}
zelist:
--- a/sys/src/cmd/cc/com.c
+++ b/sys/src/cmd/cc/com.c
@@ -633,10 +633,11 @@
break;
case OLSTRING:
- if(n->type->link != types[TUSHORT]) {
+ if(n->type->link != types[TRUNE]) {
o = outstring(0, 0);
while(o & 3) {
- outlstring(L"", sizeof(ushort));
+ Rune str[1] = {0};
+ outlstring(str, sizeof(Rune));
o = outlstring(0, 0);
}
}
--- a/sys/src/cmd/cc/dpchk.c
+++ b/sys/src/cmd/cc/dpchk.c
@@ -67,13 +67,14 @@
{
Bits flag;
int f;
- char *fmt;
+ char *fmt, *e;
Rune c;
fmt = fmtbuf;
+ e = fmtbuf + sizeof(fmtbuf)-UTFmax-1; /* runetochar below may emit up to UTFmax bytes */
flag = zbits;
nstar = 0;
- for(;;) {
+ while(fmt < e){
s += chartorune(&c, s);
fmt += runetochar(fmt, &c);
if(c == 0 || c >= nelem(flagbits))
@@ -175,7 +176,7 @@
{
Sym *s;
int n, c;
- char *t;
+ char *t, *e;
Rune r;
Type *ty;
@@ -225,6 +226,7 @@
if(c != '"')
goto bad;
t = fmtbuf;
+ e = t + sizeof(fmtbuf)-UTFmax-1; /* room for one more rune (up to UTFmax bytes) and the final NUL */
for(;;) {
r = getr();
if(r == ' ' || r == '\n')
@@ -231,6 +233,8 @@
goto bad;
if(r == '"')
break;
+ if(t >= e)
+ goto bad;
t += runetochar(t, &r);
}
*t = 0;
--- a/sys/src/cmd/cc/lex.c
+++ b/sys/src/cmd/cc/lex.c
@@ -467,7 +467,7 @@
yyerror("missing '");
peekc = c1;
}
- yylval.vval = convvtox(c, TUSHORT);
+ yylval.vval = convvtox(c, TRUNE);
return LUCONST;
}
if(c == '"') {
@@ -541,15 +541,15 @@
c = escchar('"', 1, 0);
if(c == EOF)
break;
- cp = allocn(cp, c1, sizeof(ushort));
- *(ushort*)(cp + c1) = c;
- c1 += sizeof(ushort);
+ cp = allocn(cp, c1, sizeof(Rune));
+ *(Rune*)(cp + c1) = c;
+ c1 += sizeof(Rune);
}
yylval.sval.l = c1;
do {
- cp = allocn(cp, c1, sizeof(ushort));
- *(ushort*)(cp + c1) = 0;
- c1 += sizeof(ushort);
+ cp = allocn(cp, c1, sizeof(Rune));
+ *(Rune*)(cp + c1) = 0;
+ c1 += sizeof(Rune);
} while(c1 & MAXALIGN);
yylval.sval.s = cp;
return LLSTRING;
@@ -1027,7 +1027,7 @@
} else
c = GETC();
for(;;) {
- if(!isspace(c))
+ if(c >= Runeself || !isspace(c))
return c;
if(c == '\n') {
lineno++;
--- a/sys/src/cmd/cc/pswt.c
+++ b/sys/src/cmd/cc/pswt.c
@@ -132,28 +132,28 @@
}
long
-outlstring(ushort *s, long n)
+outlstring(Rune *s, long n)
{
- char buf[2];
- int c;
+ char buf[sizeof(Rune)];
+ int c, i;
long r;
if(suppress)
return nstring;
- while(nstring & 1)
+ while(nstring % sizeof buf)
outstring("", 1);
r = nstring;
while(n > 0) {
c = *s++;
if(align(0, types[TCHAR], Aarg1)) {
- buf[0] = c>>8;
- buf[1] = c;
+ for(i = sizeof buf; i > 0; c >>= 8)
+ buf[--i] = c;
} else {
- buf[0] = c;
- buf[1] = c>>8;
+ for(i = 0; i < sizeof buf; c >>= 8)
+ buf[i++] = c;
}
- outstring(buf, 2);
- n -= sizeof(ushort);
+ outstring(buf, sizeof buf);
+ n -= sizeof buf;
}
return r;
}
--- a/sys/src/cmd/disk/9660/cdrdwr.c
+++ b/sys/src/cmd/disk/9660/cdrdwr.c
@@ -503,7 +503,6 @@
{
Rune r[256];
- strtorune(r, s);
Cputrs(cd, strtorune(r, s), size);
}
--- a/sys/src/cmd/disk/9660/jchar.c
+++ b/sys/src/cmd/disk/9660/jchar.c
@@ -45,8 +45,7 @@
if(utflen(s) > 64)
return 1;
- strtorune(r, s);
- for(p=r; *p; p++)
+ for(p=strtorune(r, s); *p; p++)
if(isjolietfrog(*p))
return 1;
return 0;
--- a/sys/src/cmd/ed.c
+++ b/sys/src/cmd/ed.c
@@ -54,7 +54,7 @@
int peekc;
int pflag;
int rescuing;
-Rune rhsbuf[LBSIZE/2];
+Rune rhsbuf[LBSIZE/sizeof(Rune)];
char savedfile[FNSIZE];
jmp_buf savej;
int subnewa;
@@ -990,11 +990,11 @@
lp = linebuf;
bp = getblock(tl, OREAD);
nl = nleft;
- tl &= ~((BLKSIZE/2) - 1);
+ tl &= ~((BLKSIZE/sizeof(Rune)) - 1);
while(*lp++ = *bp++) {
nl -= sizeof(Rune);
if(nl == 0) {
- bp = getblock(tl += BLKSIZE/2, OREAD);
+ bp = getblock(tl += BLKSIZE/sizeof(Rune), OREAD);
nl = nleft;
}
}
@@ -1012,7 +1012,7 @@
tl = tline;
bp = getblock(tl, OWRITE);
nl = nleft;
- tl &= ~((BLKSIZE/2)-1);
+ tl &= ~((BLKSIZE/sizeof(Rune))-1);
while(*bp = *lp++) {
if(*bp++ == '\n') {
bp[-1] = 0;
@@ -1021,7 +1021,7 @@
}
nl -= sizeof(Rune);
if(nl == 0) {
- tl += BLKSIZE/2;
+ tl += BLKSIZE/sizeof(Rune);
bp = getblock(tl, OWRITE);
nl = nleft;
}
@@ -1048,8 +1048,8 @@
static uchar ibuff[BLKSIZE];
static uchar obuff[BLKSIZE];
- bno = atl / (BLKSIZE/2);
- off = (atl<<1) & (BLKSIZE-1) & ~03;
+ bno = atl / (BLKSIZE/sizeof(Rune));
+ off = (atl*sizeof(Rune)) & (BLKSIZE-1) & ~03;
if(bno >= NBLK) {
lastc = '\n';
error(T);
@@ -1240,7 +1240,7 @@
if(c == '\\') {
c = getchr();
*p++ = ESCFLG;
- if(p >= &rhsbuf[LBSIZE/2])
+ if(p >= &rhsbuf[nelem(rhsbuf)])
error(Q);
} else
if(c == '\n' && (!globp || !globp[0])) {
@@ -1251,7 +1251,7 @@
if(c == seof)
break;
*p++ = c;
- if(p >= &rhsbuf[LBSIZE/2])
+ if(p >= &rhsbuf[nelem(rhsbuf)])
error(Q);
}
*p = 0;
--- a/sys/src/cmd/file.c
+++ b/sys/src/cmd/file.c
@@ -359,7 +359,7 @@
rb = malloc(nbuf+1);
memmove(rb, buf+2, nbuf);
p = (char*)buf;
- e = p+nbuf-4;
+ e = p+sizeof(buf)-UTFmax-1;
for(i=0; i<nbuf && p < e; i+=2){
r = rb[i+1] | rb[i]<<8;
p += runetochar(p, &r);
@@ -376,7 +376,7 @@
rb = malloc(nbuf+1);
memmove(rb, buf+2, nbuf);
p = (char*)buf;
- e = p+nbuf-4;
+ e = p+sizeof(buf)-UTFmax-1;
for(i=0; i<nbuf && p < e; i+=2){
r = rb[i] | rb[i+1]<<8;
p += runetochar(p, &r);
--- a/sys/src/cmd/ip/ftpfs/proto.c
+++ b/sys/src/cmd/ip/ftpfs/proto.c
@@ -1525,7 +1525,7 @@
if(*p == 0)
return nil;
- to = malloc(3*strlen(from)+2);
+ to = malloc(UTFmax*strlen(from)+2);
if(to == nil)
return nil;
for(p = to; *from; from++){
--- a/sys/src/cmd/ip/httpd/wikipost.c
+++ b/sys/src/cmd/ip/httpd/wikipost.c
@@ -59,7 +59,7 @@
t = v;
while(*s){
/* in decoding error, assume latin1 */
- if((n=chartorune(&r, s)) == 1 && r == 0x80)
+ if((n=chartorune(&r, s)) == 1 && r == Runeerror)
r = *s;
s += n;
t += runetochar(t, &r);
--- a/sys/src/cmd/join.c
+++ b/sys/src/cmd/join.c
@@ -286,7 +286,7 @@
{
int i;
Rune *temp;
- char buf[BUFSIZ];
+ char buf[BUFSIZ*UTFmax+1];
if (no <= 0) { /* default case */
printf("%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));
--- a/sys/src/cmd/postscript/common/rune.c
+++ b/sys/src/cmd/postscript/common/rune.c
@@ -7,6 +7,7 @@
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
+ Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
@@ -13,10 +14,12 @@
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
@@ -27,7 +30,7 @@
int
chartorune(Rune *rune, char *str)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
/*
@@ -72,6 +75,25 @@
return 3;
}
+ /*
+ * four character sequence
+ * 10000-10FFFF => T4 Tx Tx Tx
+ */
+ if(UTFmax >= 4) {
+ c3 = *(unsigned char*)(str+3) ^ Tx;
+ if(c3 & Testx)
+ goto bad;
+ if(c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if(l <= Rune3)
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+ }
+
/*
* bad decoding
*/
@@ -85,11 +107,14 @@
{
long c;
+ c = *rune;
+ if(c > Runemax)
+ c = Runeerror;
+
/*
* one character sequence
* 00000-0007F => 00-7F
*/
- c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
@@ -109,10 +134,22 @@
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
- return 3;
+ if(c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
}
int
@@ -119,7 +156,7 @@
runelen(long c)
{
Rune rune;
- char str[10];
+ char str[UTFmax];
rune = c;
return runetochar(str, &rune);
@@ -126,17 +163,41 @@
}
int
+runenlen(Rune *r, int nrune)
+{
+ int nb, c;
+
+ nb = 0;
+ while(nrune--) {
+ c = *r++;
+ if(c <= Rune1)
+ nb++;
+ else
+ if(c <= Rune2)
+ nb += 2;
+ else
+ if(c <= Rune3 || c > Runemax)
+ nb += 3;
+ else
+ nb += 4;
+ }
+ return nb;
+}
+
+int
fullrune(char *str, int n)
{
int c;
- if(n > 0) {
- c = *(unsigned char*)str;
- if(c < Tx)
- return 1;
- if(n > 1)
- if(c < T3 || n > 2)
- return 1;
- }
- return 0;
+ if(n <= 0)
+ return 0;
+ c = *(unsigned char*)str;
+ if(c < Tx)
+ return 1;
+ if(c < T3)
+ return n >= 2;
+ if(UTFmax == 3 || c < T4)
+ return n >= 3;
+ return n >= 4;
}
+
--- a/sys/src/cmd/postscript/common/rune.h
+++ b/sys/src/cmd/postscript/common/rune.h
@@ -14,6 +14,7 @@
UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a utf sequence (<) */
Runeself = 0x80, /* rune and utf sequences are the same (<) */
- Runeerror = 0xFFFD, /* decoding error in utf */
+ Runeerror = 0xFFFD, /* decoding error in utf */
+ Runemax = 0xFFFF, /* 16 bit rune */
};
#endif
--- a/sys/src/cmd/sam/cmd.c
+++ b/sys/src/cmd/sam/cmd.c
@@ -71,7 +71,7 @@
inputc(void)
{
int n, nbuf;
- char buf[3];
+ char buf[UTFmax];
Rune r;
Again:
--- a/sys/src/cmd/sam/regexp.c
+++ b/sys/src/cmd/sam/regexp.c
@@ -494,7 +494,7 @@
exprp++; /* eat '-' */
if((c2 = nextrec()) == ']')
goto Error;
- classp[n+0] = 0xFFFF;
+ classp[n+0] = Runemax;
classp[n+1] = c1;
classp[n+2] = c2;
n += 3;
@@ -516,7 +516,7 @@
p = class[classno];
while(*p){
- if(*p == 0xFFFF){
+ if(*p == Runemax){
if(p[1]<=c && c<=p[2])
return !negate;
p += 3;
--- a/sys/src/cmd/samterm/mesg.c
+++ b/sys/src/cmd/samterm/mesg.c
@@ -429,7 +429,7 @@
void
outTslS(Tmesg type, int s1, long l1, Rune *s)
{
- char buf[DATASIZE*3+1];
+ char buf[DATASIZE*UTFmax+1];
char *c;
outstart(type);
--- a/sys/src/cmd/sed.c
+++ b/sys/src/cmd/sed.c
@@ -625,7 +625,7 @@
while ((r = *cp++) != '\0') {
if(r == '\\') {
if (rhs < end)
- *rhs++ = 0xFFFF;
+ *rhs++ = Runemax;
else
return 0;
r = *cp++;
@@ -1055,7 +1055,7 @@
sp = place(sp, loc1, loc2);
continue;
}
- if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') {
+ if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') {
n = c-'0';
if (subexp[n].rsp && subexp[n].rep) {
sp = place(sp, subexp[n].rsp, subexp[n].rep);
@@ -1336,7 +1336,7 @@
arout(void)
{
int c;
- char *s;
+ char *s, *e;
char buf[128];
Rune *p1;
Biobuf *fi;
@@ -1347,7 +1347,7 @@
Bputrune(&fout, *p1);
Bputc(&fout, '\n');
} else {
- for(s = buf, p1 = (*aptr)->text; *p1; p1++)
+ for(s = buf, e = buf+sizeof(buf)-UTFmax-1, p1 = (*aptr)->text; *p1 && s < e; p1++)
s += runetochar(s, p1);
*s = '\0';
if((fi = Bopen(buf, OREAD)) == 0)
--- a/sys/src/cmd/tcs/utf.c
+++ b/sys/src/cmd/tcs/utf.c
@@ -93,7 +93,7 @@
if(!fullisorune(buf+i, tot-i))
break;
c = isochartorune(&runes[j], buf+i);
- if(runes[j] == Runeerror && c == 1){
+ if(runes[j] == Runeerror){
if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
if(clean){
--- a/sys/src/cmd/tr.c
+++ b/sys/src/cmd/tr.c
@@ -15,7 +15,7 @@
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
-#define MAXRUNE 0xFFFF
+#define MAXRUNE Runemax
uchar f[(MAXRUNE+1)/8];
uchar t[(MAXRUNE+1)/8];
--- a/sys/src/cmd/tweak.c
+++ b/sys/src/cmd/tweak.c
@@ -803,13 +803,14 @@
}
int
-type(char *buf, char *tag)
+type(char *buf, int nbuf, char *tag)
{
Rune r;
- char *p;
+ char *p, *e;
esetcursor(&busy);
p = buf;
+ e = buf + nbuf-UTFmax-1;
for(;;){
*p = 0;
mesg("%s: %s", tag, buf);
@@ -827,7 +828,8 @@
--p;
break;
default:
- p += runetochar(p, &r);
+ if(p < e)
+ p += runetochar(p, &r);
}
}
}
@@ -846,7 +848,7 @@
Thing *nt;
buttons(Up);
- if(type(buf, tag) == 0)
+ if(type(buf, sizeof(buf), tag) == 0)
return;
if(strcmp(tag, "file") == 0){
for(s=buf; *s; s++)
@@ -1174,7 +1176,7 @@
long l;
buttons(Up);
- if(type(buf, tag) == 0)
+ if(type(buf, sizeof(buf), tag) == 0)
return;
if(strcmp(tag, "mag") == 0){
if(buf[0]<'0' || '9'<buf[0] || (l=atoi(buf))<=0 || l>Maxmag){
@@ -1806,7 +1808,7 @@
return;
}
}
- if(type(buf, "char (hex or character or hex-hex)") == 0)
+ if(type(buf, sizeof(buf), "char (hex or character or hex-hex)") == 0)
return;
if(utflen(buf) == 1){
chartorune(&r, buf);
@@ -2000,7 +2002,7 @@
sel = emenuhit(3, &mouse, &menu3);
switch(sel){
case Mopen:
- if(type(buf, "file")){
+ if(type(buf, sizeof(buf), "file")){
t = tget(buf);
if(t)
drawthing(t, 1);
--- a/sys/src/cmd/unicode.c
+++ b/sys/src/cmd/unicode.c
@@ -51,13 +51,13 @@
return "bad range";
}
min = strtoul(q, &q, 16);
- if(min<0 || min>0xFFFF || *q!='-')
+ if(min<0 || min>Runemax || *q!='-')
goto err;
q++;
if(strchr(hex, *q) == 0)
goto err;
max = strtoul(q, &q, 16);
- if(max<0 || max>0xFFFF || max<min || *q!=0)
+ if(max<0 || max>Runemax || max<min || *q!=0)
goto err;
i = 0;
do{
@@ -111,7 +111,7 @@
return "bad char";
}
m = strtoul(q, &q, 16);
- if(m<0 || m>0xFFFF || *q!=0)
+ if(m<0 || m>Runemax || *q!=0)
goto err;
Bprint(&bout, "%C", m);
if(!text)
--- a/sys/src/cmd/unix/drawterm/libc/dofmt.c
+++ b/sys/src/cmd/unix/drawterm/libc/dofmt.c
@@ -528,12 +528,15 @@
int
__badfmt(Fmt *f)
{
- char x[3];
+ char x[2+UTFmax]; /* '%' + one encoded rune + '%' */
+ Rune r;
+ int n;
+ r = f->r;
x[0] = '%';
- x[1] = f->r;
- x[2] = '%';
- f->prec = 3;
- __fmtcpy(f, (const void*)x, 3, 3);
+ n = 1+runetochar(x+1, &r);
+ x[n++] = '%';
+ f->prec = n;
+ __fmtcpy(f, x, n, n); /* same callee as the removed line; _fmtcpy is libc's name */
return 0;
}
--- a/sys/src/cmd/unix/drawterm/libc/rune.c
+++ b/sys/src/cmd/unix/drawterm/libc/rune.c
@@ -8,6 +8,7 @@
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
+ Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
@@ -14,10 +15,12 @@
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
@@ -28,7 +31,7 @@
int
chartorune(Rune *rune, char *str)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
/*
@@ -73,6 +76,25 @@
return 3;
}
+ /*
+ * four character sequence
+ * 10000-10FFFF => T4 Tx Tx Tx
+ */
+ if(UTFmax >= 4) {
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if(c3 & Testx)
+ goto bad;
+ if(c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if(l <= Rune3)
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+ }
+
/*
* bad decoding
*/
@@ -86,11 +108,14 @@
{
long c;
+ c = *rune;
+ if(c > Runemax)
+ c = Runeerror;
+
/*
* one character sequence
* 00000-0007F => 00-7F
*/
- c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
@@ -110,10 +135,22 @@
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
- return 3;
+ if(c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
}
int
@@ -140,7 +177,10 @@
if(c <= Rune2)
nb += 2;
else
+ if(c <= Rune3 || c > Runemax)
nb += 3;
+ else
+ nb += 4;
}
return nb;
}
@@ -150,13 +190,15 @@
{
int c;
- if(n > 0) {
- c = *(uchar*)str;
- if(c < Tx)
- return 1;
- if(n > 1)
- if(c < T3 || n > 2)
- return 1;
- }
- return 0;
+ if(n <= 0)
+ return 0;
+ c = *(uchar*)str;
+ if(c < Tx)
+ return 1;
+ if(c < T3)
+ return n >= 2;
+ if(UTFmax == 3 || c < T4)
+ return n >= 3;
+ return n >= 4;
}
+
--- a/sys/src/cmd/unix/drawterm/libc/utf.h
+++ b/sys/src/cmd/unix/drawterm/libc/utf.h
@@ -8,7 +8,8 @@
UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
- Runeerror = 0x80, /* decoding error in UTF */
+ Runeerror = 0xFFFD, /* decoding error in UTF */
+ Runemax = 0xFFFF, /* 16 bit rune */
};
/*
--- a/sys/src/cmd/unix/u9fs/rune.c
+++ b/sys/src/cmd/unix/u9fs/rune.c
@@ -1,6 +1,7 @@
#include <plan9.h>
char *argv0;
+
enum
{
Bit1 = 7,
@@ -8,6 +9,7 @@
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
+ Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
@@ -14,21 +16,23 @@
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
- Bad = Runeerror
+ Bad = Runeerror,
};
int
chartorune(Rune *rune, char *str)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
/*
@@ -73,6 +77,25 @@
return 3;
}
+ /*
+ * four character sequence
+ * 10000-10FFFF => T4 Tx Tx Tx
+ */
+ if(UTFmax >= 4) {
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if(c3 & Testx)
+ goto bad;
+ if(c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if(l <= Rune3)
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+ }
+
/*
* bad decoding
*/
@@ -86,11 +109,14 @@
{
long c;
+ c = *rune;
+ if(c > Runemax)
+ c = Runeerror;
+
/*
* one character sequence
* 00000-0007F => 00-7F
*/
- c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
@@ -110,10 +136,22 @@
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
- return 3;
+ if(c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
}
int
@@ -120,7 +158,7 @@
runelen(long c)
{
Rune rune;
- char str[10];
+ char str[UTFmax];
rune = c;
return runetochar(str, &rune);
@@ -127,22 +165,41 @@
}
int
-utflen(char *s)
+runenlen(Rune *r, int nrune)
{
- int c;
- long n;
- Rune rune;
+ int nb, c;
- n = 0;
- for(;;) {
- c = *(uchar*)s;
- if(c < Runeself) {
- if(c == 0)
- return n;
- s++;
- } else
- s += chartorune(&rune, s);
- n++;
+ nb = 0;
+ while(nrune--) {
+ c = *r++;
+ if(c <= Rune1)
+ nb++;
+ else
+ if(c <= Rune2)
+ nb += 2;
+ else
+ if(c <= Rune3 || c > Runemax)
+ nb += 3;
+ else
+ nb += 4;
}
- return 0;
+ return nb;
}
+
+int
+fullrune(char *str, int n)
+{
+ int c;
+
+ if(n <= 0)
+ return 0;
+ c = *(uchar*)str;
+ if(c < Tx)
+ return 1;
+ if(c < T3)
+ return n >= 2;
+ if(UTFmax == 3 || c < T4)
+ return n >= 3;
+ return n >= 4;
+}
+
--- a/sys/src/cmd/upas/fs/mbox.c
+++ b/sys/src/cmd/upas/fs/mbox.c
@@ -1223,12 +1223,12 @@
return 0;
n += e-in;
- *out = p = malloc(n+1);
+ *out = p = malloc(UTFmax*n+1);
if(p == nil)
return 0;
for(; in < e; in++){
- r = (uchar)*in;
+ r = (*in) & 0xff;
p += runetochar(p, &r);
}
*p = 0;
--- a/sys/src/cmd/upas/vf/vf.c
+++ b/sys/src/cmd/upas/vf/vf.c
@@ -954,7 +954,7 @@
{
String *s;
char decoded[1024];
- char utfbuf[2*1024];
+ char utfbuf[UTFmax*1024];
int i, len;
char *e;
char *token;
--- a/sys/src/cmd/vnc/screen.c
+++ b/sys/src/cmd/vnc/screen.c
@@ -335,6 +335,8 @@
addflush(r);
curpos.x = *xp;
break;
+ case '\0':
+ break;
default:
p = memsubfontwidth(memdefont, buf);
w = p.x;
@@ -354,23 +356,19 @@
void
screenputs(char *s, int n)
{
- int i;
- Rune r;
- char buf[4];
+ static char rb[UTFmax+1];
+ static int nrb;
+ char *e;
drawlock();
- while(n > 0){
- i = chartorune(&r, s);
- if(i == 0){
- s++;
- --n;
- continue;
+ e = s + n;
+ while(s < e){
+ rb[nrb++] = *s++;
+ if(nrb >= UTFmax || fullrune(rb, nrb)){
+ rb[nrb] = 0;
+ screenputc(rb);
+ nrb = 0;
}
- memmove(buf, s, i);
- buf[i] = 0;
- n -= i;
- s += i;
- screenputc(buf);
}
screenflush();
drawunlock();
--- a/sys/src/cmd/yacc.c
+++ b/sys/src/cmd/yacc.c
@@ -141,7 +141,7 @@
char* infile; /* input file name */
int numbval; /* value of an input number */
-char tokname[NAMESIZE+4]; /* input token name, slop for runes and 0 */
+char tokname[NAMESIZE+UTFmax+1]; /* input token name, slop for runes and 0 */
/* structure declarations */
--- a/sys/src/libbio/bgetrune.c
+++ b/sys/src/libbio/bgetrune.c
@@ -7,7 +7,7 @@
{
int c, i;
Rune rune;
- char str[4];
+ char str[UTFmax];
c = Bgetc(bp);
if(c < Runeself) { /* one char */
--- a/sys/src/libbio/bputrune.c
+++ b/sys/src/libbio/bputrune.c
@@ -6,7 +6,7 @@
Bputrune(Biobufhdr *bp, long c)
{
Rune rune;
- char str[4];
+ char str[UTFmax];
int n;
rune = c;
--- a/sys/src/libc/fmt/dofmt.c
+++ b/sys/src/libc/fmt/dofmt.c
@@ -512,12 +512,15 @@
int
_badfmt(Fmt *f)
{
- char x[3];
+ char x[2+UTFmax];
+ Rune r;
+ int n;
+ r = f->r;
x[0] = '%';
- x[1] = f->r;
- x[2] = '%';
- f->prec = 3;
- _fmtcpy(f, x, 3, 3);
+ n = 1+runetochar(x+1, &r);
+ x[n++] = '%';
+ f->prec = n;
+ _fmtcpy(f, x, n, n);
return 0;
}
--- a/sys/src/libc/port/rune.c
+++ b/sys/src/libc/port/rune.c
@@ -8,6 +8,7 @@
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
+ Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
@@ -14,10 +15,12 @@
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
+ T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
- Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
- Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
- Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
+ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
+ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
+ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
+ Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
@@ -28,7 +31,7 @@
int
chartorune(Rune *rune, char *str)
{
- int c, c1, c2;
+ int c, c1, c2, c3;
long l;
/*
@@ -73,6 +76,25 @@
return 3;
}
+ /*
+ * four character sequence
+ * 10000-10FFFF => T4 Tx Tx Tx
+ */
+ if(UTFmax >= 4) {
+ c3 = *(uchar*)(str+3) ^ Tx;
+ if(c3 & Testx)
+ goto bad;
+ if(c < T5) {
+ l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+ if(l <= Rune3)
+ goto bad;
+ if(l > Runemax)
+ goto bad;
+ *rune = l;
+ return 4;
+ }
+ }
+
/*
* bad decoding
*/
@@ -86,11 +108,14 @@
{
long c;
+ c = *rune;
+ if(c > Runemax)
+ c = Runeerror;
+
/*
* one character sequence
* 00000-0007F => 00-7F
*/
- c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;
@@ -110,10 +135,22 @@
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
- str[0] = T3 | (c >> 2*Bitx);
- str[1] = Tx | ((c >> 1*Bitx) & Maskx);
- str[2] = Tx | (c & Maskx);
- return 3;
+ if(c <= Rune3) {
+ str[0] = T3 | (c >> 2*Bitx);
+ str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[2] = Tx | (c & Maskx);
+ return 3;
+ }
+
+ /*
+ * four character sequence
+ * 10000-1FFFFF => T4 Tx Tx Tx
+ */
+ str[0] = T4 | (c >> 3*Bitx);
+ str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+ str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+ str[3] = Tx | (c & Maskx);
+ return 4;
}
int
@@ -120,7 +157,7 @@
runelen(long c)
{
Rune rune;
- char str[10];
+ char str[UTFmax];
rune = c;
return runetochar(str, &rune);
@@ -140,7 +177,10 @@
if(c <= Rune2)
nb += 2;
else
+ if(c <= Rune3 || c > Runemax)
nb += 3;
+ else
+ nb += 4;
}
return nb;
}
@@ -150,13 +190,15 @@
{
int c;
- if(n > 0) {
- c = *(uchar*)str;
- if(c < Tx)
- return 1;
- if(n > 1)
- if(c < T3 || n > 2)
- return 1;
- }
- return 0;
+ if(n <= 0)
+ return 0;
+ c = *(uchar*)str;
+ if(c < Tx)
+ return 1;
+ if(c < T3)
+ return n >= 2;
+ if(UTFmax == 3 || c < T4)
+ return n >= 3;
+ return n >= 4;
}
+
--- a/sys/src/libdraw/buildfont.c
+++ b/sys/src/libdraw/buildfont.c
@@ -70,7 +70,7 @@
}
max = strtol(s, &s, 0);
s = skip(s);
- if(*s==0 || min>=65536 || max>=65536 || min>max){
+ if(*s==0 || min>Runemax || max>Runemax || min>max){
werrstr("illegal subfont range");
Err3:
freefont(fnt);
--- a/sys/src/libdraw/event.c
+++ b/sys/src/libdraw/event.c
@@ -199,7 +199,7 @@
ekeyslave(int fd)
{
Rune r;
- char t[3], k[10];
+ char t[1+UTFmax], k[10];
int kr, kn, w;
if(eforkslave(Ekeyboard) < MAXSLAVE)
@@ -215,10 +215,9 @@
}
w = chartorune(&r, k);
kn -= w;
+ memmove(t+1, k, w);
memmove(k, &k[w], kn);
- t[1] = r;
- t[2] = r>>8;
- if(write(epipe[1], t, 3) != 3)
+ if(write(epipe[1], t, sizeof(t)) != sizeof(t))
break;
}
breakout:;
@@ -302,7 +301,7 @@
s->head = (Ebuf *)1;
return;
}
- if(i == Skeyboard && n != 3)
+ if(i == Skeyboard && n != (1+UTFmax))
drawerror(display, "events: protocol error: keyboard");
if(i == Smouse){
if(n < 1+1+2*12)
@@ -418,14 +417,13 @@
ekbd(void)
{
Ebuf *eb;
- int c;
+ Rune r;
if(Skeyboard < 0)
drawerror(display, "events: keyboard not initialzed");
eb = ebread(&eslave[Skeyboard]);
- c = eb->buf[0] + (eb->buf[1]<<8);
- free(eb);
- return c;
+ chartorune(&r, (char*)eb->buf);
+ return r;
}
void
--- a/sys/src/libhtml/lex.c
+++ b/sys/src/libhtml/lex.c
@@ -1310,9 +1310,9 @@
break;
case UTF_8:
ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
- n = chartorune(&r, (char*)(buf+ts->i));
if(ok) {
- if(warn && c == 0x80)
+ n = chartorune(&r, (char*)(buf+ts->i));
+ if(warn && c == Runeerror)
fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
ts->i += n;
c = r;
--- a/sys/src/libhtml/utils.c
+++ b/sys/src/libhtml/utils.c
@@ -535,7 +535,7 @@
// Convert buf[0:n], Unicode characters,
// into an emalloc'd null-terminated string in character set chset.
-// Use 0x80 for unconvertable characters.
+// Use Runeerror for unconvertable characters.
uchar*
fromStr(Rune* buf, int n, int chset)
{
@@ -554,7 +554,7 @@
for(i = 0; i < n; i++) {
ch = buf[i];
if(ch > lim)
- ch = 0x80;
+ ch = Runeerror;
ans[i] = ch;
}
ans[n] = 0;