shithub: riscv

--- /dev/null

+++ b/lib/ucd/mkfile

@@ -1,0 +1,85 @@

+</$objtype/mkfile

+VERSION='15.0.0'

+URL='https://www.unicode.org/Public/'$VERSION'/ucd/'

+TXT=\

+	ArabicShaping.txt\

+	BidiBrackets.txt\

+	BidiMirroring.txt\

+	BidiTest.txt\

+	Blocks.txt\

+	CJKRadicals.txt\

+	CaseFolding.txt\

+	CompositionExclusions.txt\

+	DerivedAge.txt\

+	DerivedCoreProperties.txt\

+	DerivedNormalizationProps.txt\

+	EastAsianWidth.txt\

+	EmojiSources.txt\

+	EquivalentUnifiedIdeograph.txt\

+	HangulSyllableType.txt\

+	Index.txt\

+	IndicPositionalCategory.txt\

+	IndicSyllabicCategory.txt\

+	Jamo.txt\

+	LineBreak.txt\

+	NameAliases.txt\

+	NamedSequences.txt\

+	NamedSequencesProv.txt\

+	NamesList.txt\

+	NormalizationCorrections.txt\

+	NushuSources.txt\

+	PropList.txt\

+	PropertyAliases.txt\

+	PropertyValueAliases.txt\

+	ScriptExtensions.txt\

+	Scripts.txt\

+	SpecialCasing.txt\

+	StandardizedVariants.txt\

+	TangutSources.txt\

+	USourceData.txt\

+	UnicodeData.txt\

+	VerticalOrientation.txt\

+TEST=\

+	NormalizationTest.txt\

+	BidiCharacterTest.txt\

+PDF=\

+	USourceGlyphs.pdf\

+	USourceRSChart.pdf\

+AUX=\

+	WordBreakProperty.txt\

+	GraphemeBreakProperty.txt\

+ucd:V: UnicodeData.txt

+%.txt:

+	hget $URL^$target > $target >[2]/dev/null

+%.pdf:

+	hget $URL^$target > $target

+emoji-data.txt:

+	hget $URL^emoji/^$target > $target

+WordBreakProperty.txt:

+	hget $URL^'auxiliary/'^$target > $target

+GraphemeBreakProperty.txt:

+	hget $URL^'auxiliary/'^$target > $target

+WordBreakTest.txt:

+	hget $URL^'auxiliary/'^$target > $target

+GraphemeBreakTest.txt:

+	hget $URL^'auxiliary/'^$target > $target

+txt:V: $TXT

+pdf:V: $PDF

+test:V: $TEST

+all:V: $TXT $PDF $TEST

--- a/sys/include/libc.h

+++ b/sys/include/libc.h

@@ -77,6 +77,18 @@

 extern	long	runestrlen(Rune*);

 extern	Rune*	runestrstr(Rune*, Rune*);

+extern	int	runecomp(Rune*, Rune*, int);

+extern	int	runedecomp(Rune*, Rune*, int);

+extern	int	utfcomp(char*, char*, int);

+extern	int	utfdecomp(char*, char*, int);

+extern	char*	fullutfnorm(char*,int);

+extern	Rune*	fullrunenorm(Rune*,int);

+extern	Rune*	runewbreak(Rune*);

+extern	char*	utfwbreak(char*);

+extern	Rune*	runegbreak(Rune*);

+extern	char*	utfgbreak(char*);

 extern	Rune	tolowerrune(Rune);

 extern	Rune	totitlerune(Rune);

 extern	Rune	toupperrune(Rune);

--- a/sys/man/2/isalpharune

+++ b/sys/man/2/isalpharune

@@ -48,7 +48,11 @@

.PP

 The case-conversion routines return the character unchanged if it has no case.

 .SH SOURCE

-.B /sys/src/libc/port/runetype.c

+.B /sys/src/libc/port/mkrunetype.c

+.br

+.B /sys/src/libc/port/runeistype.c

+.br

+.B /sys/src/libc/port/runetotype.c

 .SH "SEE ALSO

 .IR ctype (2) ,

 .IR "The Unicode Standard" .

--- /dev/null

+++ b/sys/man/2/runecomp

@@ -1,0 +1,116 @@

+.TH RUNECOMP 2

+.SH NAME

+runecomp, runedecomp, fullrunenorm, runegbreak, runewbreak, utfcomp, utfdecomp, fullutfnorm, utfgbreak, utfwbreak \- multi-rune graphemes

+.SH SYNOPSIS

+.ta \w'\fLchar*xx'u

+.B #include <u.h>

+.br

+.B #include <libc.h>

+.PP

+.B

+int	runecomp(Rune *dst, Rune *src, int max)

+.PP

+.B

+int	runedecomp(Rune *dst, Rune *src, int max)

+.PP

+.B

+Rune*	fullrunenorm(Rune *s, int n)

+.PP

+.B

+Rune*	runegbreak(Rune *s)

+.PP

+.B

+Rune*	runewbreak(Rune *s)

+.PP

+.B

+int	utfcomp(char *dst, char *src, int max)

+.PP

+.B

+int	utfdecomp(char *dst, char *src, int max)

+.PP

+.B

+char*	fullutfnorm(char *s, int n)

+.PP

+.B

+char*	utfgbreak(char *s)

+.PP

+.B

+char*	utfwbreak(char *s)

+.SH DESCRIPTION

+These routines help in handling

+graphemes that may span multiple runes.

+.PP

+.IR Runecomp ,

+.IR runedecomp ,

+.IR utfcomp ,

+and

+.I utfdecomp

+perform Unicode® normalization on

+.IR src ,

+storing the result in

+.IR dst .

+No more than

+.I max

+elements will be written, and the resulting string

+will always be null terminated. The return value

+is always the total number of elements required to

+store the transformation. If this value is larger

+than the supplied

+.I max

+the caller can assume the result has been truncated.

+.I Runecomp

+and

+.I utfcomp

+perform NFC normalization while

+.I runedecomp

+and

+.I utfdecomp

+perform NFD normalization.

+.PP

+.I Fullrunenorm

+and

+.I fullutfnorm

+determine if enough elements are present in

+.I s

+to perform normalization. If enough are present,

+a pointer is returned to the first element that begins

+the next context. Otherwise

+.I s

+is returned. No more than

+.I n

+elements will be read. In order to find the boundary, the

+first element of the next context must be peeked.

+.PP

+.I Runegbreak

+and

+.I utfgbreak

+search

+.I s

+for the next grapheme break opportunity.

+If none is found before the end of the string,

+.I s

+is returned.

+.PP

+.I Runewbreak

+and

+.I utfwbreak

+search

+.I s

+for the next word break opportunity.

+If none is found before the end of the string,

+.I s

+is returned.

+.SH SOURCE

+.B /sys/src/libc/port/mkrunetype.c

+.br

+.B /sys/src/libc/port/runenorm.c

+.br

+.B /sys/src/libc/port/runebreak.c

+.SH SEE ALSO

+Unicode® Standard Annex #15

+.br

+Unicode® Standard Annex #29

+.br

+.IR rune (2) ,

+.IR utf (6) ,

+.IR tcs (1)

--- a/sys/src/libc/port/mkfile

+++ b/sys/src/libc/port/mkfile

@@ -62,6 +62,9 @@

 	rand.c\

 	readn.c\

 	rune.c\

+	runebreak.c\

+	runeistype.c\

+	runenorm.c\

 	runestrcat.c\

 	runestrchr.c\

 	runestrcmp.c\

@@ -74,7 +77,7 @@

 	runestrrchr.c\

 	runestrlen.c\

 	runestrstr.c\

-	runetype.c\

+	runetotype.c\

 	sin.c\

 	sinh.c\

 	sqrt.c\

@@ -127,3 +130,26 @@

 </sys/src/cmd/mksyslib

 profile.$O: /sys/include/tos.h

+runenorm.$O:	runenormdata runenorm.c

+runetotype.$O:	runetotypedata runetotype.c

+runeistype.$O:	runeistypedata runeistype.c

+runebreak.$O:	runebreakdata runebreak.c

+UCD=\

+	/lib/ucd/WordBreakProperty.txt\

+	/lib/ucd/GraphemeBreakProperty.txt\

+	/lib/ucd/emoji-data.txt\

+	/lib/ucd/CompositionExclusions.txt\

+	/lib/ucd/UnicodeData.txt\

+/lib/ucd/%:

+	cd /lib/ucd && mk $stem

+runenormdata runetotypedata runeistypedata runebreakdata:	mkrunetype.c $UCD

+	@{

+		eval `{grep '^[A-Z]' /$cputype/mkfile}

+		$CC $CFLAGS -o mkrunetype.$O mkrunetype.c

+		$LD $LDFLAGS -o $O.mkrunetype mkrunetype.$O

+		$O.mkrunetype

+	}

--- /dev/null

+++ b/sys/src/libc/port/mkrunetype.c

@@ -1,0 +1,748 @@

+#include <u.h>

+#include <libc.h>

+#include <bio.h>

+enum{

+	NRUNES = 1<<21

+};

+typedef struct Param Param;

+typedef struct Lvl Lvl;

+struct Lvl{	/* one level of the multi-level lookup table */

+	int bits;	/* number of rune bits consumed by this level */

+	int max;	/* 1<<bits, filled in by derive() */

+	int mask;	/* max-1, filled in by derive() */

+};

+struct Param{	/* bit split of a 21 bit rune across the three table levels */

+	Lvl idx1;	/* first index level (topmost bits) */

+	Lvl idx2;	/* second index level (middle bits) */

+	Lvl data;	/* data level (low bits, 21 - idx1.bits - idx2.bits) */

+	int round1max;	/* NRUNES/data.max, set by param() */

+};

+/*
+ * Fill in the fields of l that follow from l->bits:
+ * max becomes 1<<bits and mask becomes max-1.
+ */
+static void
+derive(Lvl *l)
+{
+	int span;
+
+	span = 1 << l->bits;
+	l->max = span;
+	l->mask = span - 1;
+}

+/*
+ * Configure p so that the first index level uses idx1 bits,
+ * the second uses idx2 bits and the data level gets the rest
+ * of the 21 bits.  At least one bit must remain for the data level.
+ */
+static void
+param(Param *p, int idx1, int idx2)
+{
+	int rest;
+
+	rest = 21 - idx1 - idx2;
+	assert(rest > 0);
+
+	p->idx1.bits = idx1;
+	derive(&p->idx1);
+	p->idx2.bits = idx2;
+	derive(&p->idx2);
+	p->data.bits = rest;
+	derive(&p->data);
+
+	p->round1max = NRUNES/p->data.max;
+}

+/*
+ * Look up x in a compacted table: the top bits of x select an
+ * entry in idx1, that plus the middle bits selects an entry in
+ * idx2, and that plus the low bits selects the value in data.
+ */
+static int
+lkup(Param *p, int *idx1, int *idx2, int *data, int x)
+{
+	int top, mid, low;
+
+	top = (x >> (p->data.bits + p->idx2.bits)) & p->idx1.mask;
+	mid = (x >> p->data.bits) & p->idx2.mask;
+	low = x & p->data.mask;
+	return data[idx2[idx1[top] + mid] + low];
+}

+/*
+ * Emit the len values of d as a static C array called name on fd,
+ * using the narrowest element type that holds every value, and
+ * return the size of the array in bytes.  If fd is negative
+ * nothing is written and only the size is computed.
+ */
+static int
+mkarrvar(int fd, char *name, int *d, int len)
+{
+	int i, sz;
+	int max, min;
+	char *t;
+
+	max = min = 0;
+	for(i = 0; i < len; i++){
+		if(d[i] > max)
+			max = d[i];
+		if(d[i] < min)
+			min = d[i];
+	}
+	if(min == 0){
+		if(max < 0xFF)
+			t = "uchar", sz = 1;
+		else if(max < 0xFFFF)
+			t = "ushort", sz = 2;
+		else
+			t = "uint", sz = 4;
+	} else {
+		/*
+		 * a signed type must hold both extremes; looking at max
+		 * alone would truncate large negative values.
+		 */
+		if(min >= -(1<<7) && max < 1<<7)
+			t = "char", sz = 1;
+		else if(min >= -(1<<15) && max < 1<<15)
+			t = "short", sz = 2;
+		else
+			t = "int", sz = 4;
+	}
+	if(fd < 0)
+		return sz * len;
+
+	fprint(fd, "static\n%s\t%s[%d] =\n{\n\t", t, name, len);
+	for(i = 0; i < len; i++){
+		fprint(fd, "%d,", d[i]);
+		if((i+1) % 16 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	return sz * len;
+}

+/*
+ * Emit an exception table called name on fd.  d holds n triples.
+ * With all set, each triple whose first element is non-zero is
+ * written as a row of three runes; a zero first element marks a
+ * dropped entry and is skipped.  Without all, every triple is
+ * written as a row holding only its last two elements.
+ * Returns the size in bytes of the rows actually written.
+ */
+static int
+mkexceptarr(int fd, char *name, int *d, int n, int all)
+{
+	int i, cols, rows;
+
+	cols = all ? 3 : 2;
+	rows = 0;
+	fprint(fd, "static\nRune %s[][%d] =\n{\n\t", name, cols);
+	for(i = 0; i < n*3; i += 3){
+		if(all && d[i] != 0){
+			fprint(fd, "{0x%X, 0x%X, 0x%X},", d[i], d[i+1], d[i+2]);
+			rows++;
+		} else if(!all){
+			fprint(fd, "{0x%X, 0x%X},", d[i+1], d[i+2]);
+			rows++;
+		}
+		if((i+3) % (8*3) == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	return rows * cols * sizeof(Rune);
+}

+/*
+ * Copy nidx chunks of chunksize ints from src into data, letting
+ * the tail of each chunk overlap the head of the following one
+ * when their contents match.  idx[i] receives the offset in data
+ * at which chunk i starts; idx[nidx] is the final length.
+ * Returns the number of ints stored in data.
+ */
+static int
+compact(int *data, int *idx, int nidx, int *src, int chunksize)
+{
+	int c, k, keep, overlap, ndata;
+	int *cur, *next;
+
+	ndata = 0;
+	idx[0] = 0;
+	cur = src;
+	for(c = 0; c < nidx; c++){
+		next = cur + chunksize;
+		overlap = 0;
+		/* the last chunk has no successor to overlap with */
+		if(c + 1 < nidx){
+			/* longest k with the last k ints of cur equal to the first k of next */
+			for(k = 1; k <= chunksize; k++)
+				if(memcmp(next - k, next, k * sizeof data[0]) == 0)
+					overlap = k;
+		}
+		keep = chunksize - overlap;
+		memmove(data + ndata, cur, keep * sizeof data[0]);
+		ndata += keep;
+		idx[c + 1] = idx[c] + keep;
+		cur = next;
+	}
+	return ndata;
+}

+/*
+ * Format an array name from fmt and label, emit d through
+ * mkarrvar and release the name again.
+ */
+static int
+mklabelarr(int fd, char *fmt, char *label, int *d, int len)
+{
+	char *name;
+	int size;
+
+	name = smprint(fmt, label);
+	if(name == nil)
+		sysfatal("smprint: %r");
+	size = mkarrvar(fd, name, d, len);
+	free(name);
+	return size;
+}
+
+/*
+ * Compact map (NRUNES entries) into a three level table laid out
+ * according to p and return its size in bytes.  When fd is
+ * non-negative the table is first checked against map, then the
+ * arrays and the accessor macros for label are written to fd;
+ * otherwise only the size is computed.
+ */
+static int
+mklkup(int fd, char *label, int *map, Param *p)
+{
+	static int data[NRUNES];
+	static int idx2[NRUNES];
+	static int idx2dest[NRUNES];
+	static int idx1[NRUNES];
+	int i, nidx2, ndata;
+	int size;
+
+	ndata = compact(data, idx2, p->round1max, map, p->data.max);
+	nidx2 = compact(idx2dest, idx1, p->idx1.max, idx2, p->idx2.max);
+
+	if(fd >= 0){
+		for(i = 0; i < NRUNES; i++)
+			if(map[i] != lkup(p, idx1, idx2dest, data, i))
+				sysfatal("mismatch in %s at %d %d %d\n", label, i, map[i], lkup(p, idx1, idx2dest, data, i));
+	}
+
+	/* the names used to be leaked on every call */
+	size = mklabelarr(fd, "_%sdata", label, data, ndata);
+	size += mklabelarr(fd, "_%sidx2", label, idx2dest, nidx2);
+	size += mklabelarr(fd, "_%sidx1", label, idx1, p->idx1.max);
+
+	if(fd >= 0){
+		fprint(fd, "\n");
+		fprint(fd, "#define %sindex1(x) (((x)>>(%d+%d))&0x%X)\n", label, p->data.bits, p->idx2.bits, p->idx1.mask);
+		fprint(fd, "#define %sindex2(x) (((x)>>%d)&0x%X)\n", label, p->data.bits, p->idx2.mask);
+		fprint(fd, "#define %soffset(x) ((x)&0x%X)\n", label, p->data.mask);
+		fprint(fd, "#define %slkup(x) (_%sdata[_%sidx2[_%sidx1[%sindex1(x)] + %sindex2(x)] + %soffset(x)] )\n\n",
+			label, label, label, label, label, label, label);
+	}
+	return size;
+}

+/*
+ * Try every split with 4..12 bits for the first index level and
+ * at least 4 bits for the second (at most 19 index bits in total),
+ * report the smallest resulting table on standard error and leave
+ * p configured for that split.
+ */
+static void
+mklkupmatrix(char *label, int *map, Param *p)
+{
+	int x, y, size;
+	int best, bx, by;
+
+	best = bx = by = -1;
+	for(x = 4; x <= 12; x++){
+		for(y = 4; x + y <= 19; y++){
+			param(p, x, y);
+			size = mklkup(-1, label, map, p);
+			if(best != -1 && size >= best)
+				continue;
+			best = size;
+			bx = x;
+			by = y;
+		}
+	}
+	assert(best != -1);
+	fprint(2, "label: %s best: %d %d (%d)\n", label, bx, by, best);
+	param(p, bx, by);
+}

+static int myismerged[NRUNES];

+static int mytoupper[NRUNES];

+static int mytolower[NRUNES];

+static int mytotitle[NRUNES];

+static int mybreak[NRUNES];

+enum{ DSTART = 0xEEEE };

+static int mydecomp[NRUNES];

+static int mydespecial[256*3];

+static int nspecial;

+static int myccc[NRUNES];

+typedef struct KV KV;

+struct KV{	/* recomposition entry: decomposed pair -> composed rune */

+	uint key;	/* first<<16 | second rune of the pair; 0 marks an empty hash slot */

+	uint val;	/* composed code point; set to 0 by markexclusions() when excluded */

+	ushort next;	/* 1-based index of the next entry in the overflow table, 0 if none */

+};

+static KV myrecomp[2000];

+static int nrecomp;

+static int recompext[256*3];

+static int nrecompext;

+/*
+ * Integer mixing function used to place recomposition keys
+ * in the hash table; runenorm.c repeats the same steps.
+ */
+static uint
+hash(uint x)
+{
+	uint h;
+
+	h = x;
+	h = (h ^ (h >> 16)) * 0x21f0aaad;
+	h = (h ^ (h >> 15)) * 0xd35a2d97;
+	return h ^ (h >> 15);
+}

+/*
+ * Build the recomposition hash table from myrecomp and write it
+ * to fd as _recompdata: 512 primary slots followed by the overflow
+ * entries, each stored as two uints (key, val | next<<16).
+ * _recompcoll is emitted as a pointer to the first overflow entry.
+ */
+static void
+mkrecomp(int fd)
+{
+	static KV vals[512];
+	static KV coll[1000];
+	KV *p;
+	int i, over, nvals, ntotal;
+
+	memset(vals, 0, sizeof vals);
+	memset(coll, 0, sizeof coll);
+	nvals = nelem(vals);
+
+	/* over is the 1-based index of the next free overflow entry */
+	over = 1;
+	for(i = 0; i < nrecomp; i++){
+		p = vals + (hash(myrecomp[i].key) % nelem(vals));
+		while(p->key != 0){
+			if(p->next == 0){
+				if(over > nelem(coll))
+					sysfatal("recomp overflow table full");
+				p->next = over++;
+			}
+			p = coll + p->next - 1;
+		}
+		p->key = myrecomp[i].key;
+		p->val = myrecomp[i].val;
+	}
+	fprint(2, "recomp map [%d][%d]: %d\n", nvals, over-1, (nvals + over-1) * (4+2+2));
+
+	/*
+	 * primary slots first, then the overflow entries in use.
+	 * counting entries up front also copes with an empty
+	 * overflow table, which the old pointer walk ran past.
+	 */
+	ntotal = nvals + over - 1;
+	fprint(fd, "static\nuint\t_recompdata[] =\n{\n\t");
+	for(i = 0; i < ntotal; i++){
+		p = i < nvals ? vals + i : coll + (i - nvals);
+		assert(p->val < 0xFFFF);
+		assert(p->next < 0xFFFF);
+		fprint(fd, "%udU,%udU,", p->key, p->val | ((uint)p->next<<16));
+		if((i+1) % 8 == 0)
+			fprint(fd, "\n\t");
+	}
+	fprint(fd, "\n};\n");
+	fprint(fd, "static uint *_recompcoll = _recompdata+%d*2;\n", nvals);
+}

+/*
+ * Create name for writing with the given permissions
+ * or exit with a diagnostic.
+ */
+static int
+ecreate(char *name, int mode)
+{
+	int fd;
+
+	fd = create(name, OWRITE, mode);
+	if(fd < 0)
+		sysfatal("could not create %s: %r", name);
+	return fd;
+}
+
+/*
+ * Write the lookup table for label to fd and
+ * report its size on standard error.
+ */
+static void
+emitlkup(int fd, char *label, int *map, Param *p)
+{
+	int size;
+
+	size = mklkup(fd, label, map, p);
+	fprint(2, "%s: %d\n", label, size);
+}
+
+/*
+ * Generate the four data files included by runetotype.c,
+ * runeistype.c, runenorm.c and runebreak.c from the tables
+ * filled in by main.
+ */
+static void
+mktables(void)
+{
+	Param p;
+	int tofd, isfd, normfd, breakfd;
+
+	tofd = ecreate("runetotypedata", 0664);
+	param(&p, 10, 7);
+	emitlkup(tofd, "upper", mytoupper, &p);
+	emitlkup(tofd, "lower", mytolower, &p);
+	emitlkup(tofd, "title", mytotitle, &p);
+	close(tofd);
+
+	isfd = ecreate("runeistypedata", 0664);
+	param(&p, 11, 6);
+	emitlkup(isfd, "merged", myismerged, &p);
+	/* bit assignments must match the ones main uses to fill myismerged */
+	fprint(isfd, "static\nenum {\n");
+	fprint(isfd, "\tL%s = %s,\n", "space", "1<<0");
+	fprint(isfd, "\tL%s = %s,\n", "alpha", "1<<1");
+	fprint(isfd, "\tL%s = %s,\n", "digit", "1<<2");
+	fprint(isfd, "\tL%s = %s,\n", "upper", "1<<3");
+	fprint(isfd, "\tL%s = %s,\n", "lower", "1<<4");
+	fprint(isfd, "\tL%s = %s,\n", "title", "1<<5");
+	fprint(isfd, "};\n");
+	close(isfd);
+
+	normfd = ecreate("runenormdata", 0664);
+	param(&p, 10, 7);
+	emitlkup(normfd, "decomp", mydecomp, &p);
+	param(&p, 9, 7);
+	emitlkup(normfd, "ccc", myccc, &p);
+	mkexceptarr(normfd, "_decompexceptions", mydespecial, nspecial, 0);
+	mkexceptarr(normfd, "_recompexceptions", recompext, nrecompext, 1);
+	mkrecomp(normfd);
+	close(normfd);
+
+	breakfd = ecreate("runebreakdata", 0644);
+	param(&p, 10, 6);
+	emitlkup(breakfd, "break", mybreak, &p);
+	close(breakfd);	/* was left open */
+}

+enum {

+	FIELD_CODE,

+	FIELD_NAME,

+	FIELD_CATEGORY,

+	FIELD_COMBINING,

+	FIELD_BIDIR,

+	FIELD_DECOMP,

+	FIELD_DECIMAL_DIG,

+	FIELD_DIG,

+	FIELD_NUMERIC_VAL,

+	FIELD_MIRRORED,

+	FIELD_UNICODE_1_NAME,

+	FIELD_COMMENT,

+	FIELD_UPPER,

+	FIELD_LOWER,

+	FIELD_TITLE,

+	NFIELDS,

+};

+/*
+ * Read the next line of UnicodeData.txt from in and split it in
+ * place at ';' into fields.  Returns 0 when no further line can
+ * be read and 1 otherwise; a line that does not have exactly
+ * NFIELDS fields is fatal.
+ */
+static int
+getunicodeline(Biobuf *in, char **fields)
+{
+	char *line;
+	int nf;
+
+	line = Brdline(in, '\n');
+	if(line == nil)
+		return 0;
+	/* replace the newline with a terminator */
+	line[Blinelen(in)-1] = '\0';
+	nf = getfields(line, fields, NFIELDS + 1, 0, ";");
+	if(nf != NFIELDS)
+		sysfatal("bad number of fields");
+	return 1;
+}

+/*
+ * strtoul that exits with a diagnostic when s does not
+ * start with a number in the given base.
+ */
+static int
+estrtoul(char *s, int base)
+{
+	char *end;
+	Rune v;
+
+	end = nil;
+	v = strtoul(s, &end, base);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return v;
+}

+enum {

+	OTHER,

+	Hebrew_Letter, Newline, Extend, Format,

+	Katakana, ALetter, MidLetter, MidNum,

+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,

+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,

+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,

+	EMOJIEX = 0xB0,

+};

+typedef struct Breakname Breakname;
+struct Breakname{
+	char *name;	/* text searched for in the property line */
+	int val;	/* class stored in mybreak on a match */
+};
+
+/*
+ * Parse the leading "XXXX" or "XXXX..YYYY" of a property line
+ * into *s and *e (equal for a single code point).  A range is
+ * split in place; the returned pointer is where the search for
+ * the property name starts.
+ */
+static char*
+breakrange(char *p, int *s, int *e)
+{
+	char *dot;
+
+	dot = strstr(p, "..");
+	if(dot == nil){
+		*s = *e = estrtoul(p, 16);
+		return p;
+	}
+	*dot = 0;
+	dot += 2;
+	*s = estrtoul(p, 16);
+	*e = estrtoul(dot, 16);
+	return dot;
+}
+
+/*
+ * Read the property file and record in mybreak, for every code
+ * point of each line, the value of the first entry of tab whose
+ * name occurs in the line (0 if none does).  With merge set the
+ * value is or'ed into the existing entry, otherwise it replaces it.
+ * what names the data in the diagnostic printed if file cannot
+ * be opened.
+ */
+static void
+loadbreak(char *file, char *what, Breakname *tab, int ntab, int merge)
+{
+	Biobuf *b;
+	char *p, *rest;
+	int i, s, e, v;
+
+	b = Bopen(file, OREAD);
+	if(b == nil)
+		sysfatal("could not load %s: %r", what);
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		rest = breakrange(p, &s, &e);
+		if(s < 0 || e >= NRUNES)
+			sysfatal("%s: code point out of range: %x..%x", file, s, e);
+		v = 0;
+		for(i = 0; i < ntab; i++){
+			if(strstr(rest, tab[i].name) != nil){
+				v = tab[i].val;
+				break;
+			}
+		}
+		for(i = s; i <= e; i++){
+			if(merge)
+				mybreak[i] |= v;
+			else
+				mybreak[i] = v;
+		}
+	}
+	Bterm(b);
+}
+
+/*
+ * Fill mybreak from the word break, grapheme break and emoji
+ * property files.  The word break class is stored first; the
+ * grapheme and emoji classes are or'ed on top of it.
+ */
+static void
+markbreak(void)
+{
+	/*
+	 * matching is by substring and the first hit wins, so a
+	 * name must come before any name that is a prefix of it:
+	 * ExtendNumLet before Extend, MidNumLet before MidNum.
+	 * MidNumLet used to be missing and was taken for MidNum.
+	 */
+	static Breakname word[] = {
+		{"ExtendNumLet", ExtendNumLet},
+		{"Hebrew_Letter", Hebrew_Letter},
+		{"Newline", Newline},
+		{"Extend", Extend},
+		{"Format", Format},
+		{"Katakana", Katakana},
+		{"ALetter", ALetter},
+		{"MidLetter", MidLetter},
+		{"MidNumLet", MidNumLet},
+		{"MidNum", MidNum},
+		{"Numeric", Numeric},
+		{"WSegSpace", WSegSpace},
+	};
+	static Breakname grapheme[] = {
+		{"; Prepend #", PREPEND},
+		{"; Control #", CONTROL},
+		{"; Extend #", EXTEND},
+		{"; Regional_Indicator #", REGION},
+		{"; SpacingMark #", SPACEMK},
+		{"; L #", L},
+		{"; V #", V},
+		{"; T #", T},
+		{"; LV #", LV},
+		{"; LVT #", LVT},
+	};
+	static Breakname emoji[] = {
+		{"; Extended_Pictographic", EMOJIEX},
+	};
+
+	loadbreak("/lib/ucd/WordBreakProperty.txt", "word breaks", word, nelem(word), 0);
+	loadbreak("/lib/ucd/GraphemeBreakProperty.txt", "Grapheme breaks", grapheme, nelem(grapheme), 1);
+	loadbreak("/lib/ucd/emoji-data.txt", "emoji-data", emoji, nelem(emoji), 1);
+}

+/*
+ * Drop every code point listed in CompositionExclusions.txt from
+ * the recomposition data: the first matching myrecomp entry gets
+ * its val cleared, or failing that the first matching recompext
+ * entry gets its first element cleared.
+ */
+static void
+markexclusions(void)
+{
+	Biobuf *b;
+	char *line;
+	int i, found;
+	uint cp;
+
+	b = Bopen("/lib/ucd/CompositionExclusions.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not load composition exclusions: %r");
+
+	for(;;){
+		line = Brdline(b, '\n');
+		if(line == nil)
+			break;
+		line[Blinelen(b)-1] = 0;
+		/* skip blank lines and comments */
+		if(line[0] == 0 || line[0] == '#')
+			continue;
+		cp = estrtoul(line, 16);
+
+		found = 0;
+		for(i = 0; i < nrecomp && !found; i++)
+			if(myrecomp[i].val == cp){
+				myrecomp[i].val = 0;
+				found = 1;
+			}
+		for(i = 0; i < nrecompext && !found; i++)
+			if(recompext[i*3] == cp){
+				recompext[i*3] = 0;
+				found = 1;
+			}
+	}
+	Bterm(b);
+}

+/*
+ * Read /lib/ucd/UnicodeData.txt, fill in the classification,
+ * case mapping, decomposition and combining class tables, apply
+ * the composition exclusions and break properties and write the
+ * generated data files.
+ */
+void
+main(int, char**)
+{
+	static char myisspace[NRUNES];
+	static char myisalpha[NRUNES];
+	static char myisdigit[NRUNES];
+	static char myisupper[NRUNES];
+	static char myislower[NRUNES];
+	static char myistitle[NRUNES];
+	Biobuf *in;
+	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
+	char *p, *d;
+	int i, code, last;
+	int decomp[2], *ip;
+	uchar b;
+
+	in = Bopen("/lib/ucd/UnicodeData.txt", OREAD);
+	if(in == nil)
+		sysfatal("can't open UnicodeData.txt: %r");
+
+	for(i = 0; i < NRUNES; i++){
+		mytoupper[i] = -1;
+		mytolower[i] = -1;
+		mytotitle[i] = -1;
+		mydecomp[i] = 0;
+		myccc[i] = 0;
+		mybreak[i] = 0;
+	}
+
+	myisspace['\t'] = 1;
+	myisspace['\n'] = 1;
+	myisspace['\r'] = 1;
+	myisspace['\f'] = 1;
+	myisspace['\v'] = 1;
+	myisspace[0x85] = 1;	/* control char, "next line" */
+	myisspace[0xfeff] = 1;	/* zero-width non-break space */
+
+	last = -1;
+	nspecial = nrecomp = nrecompext =  0;
+	while(getunicodeline(in, fields)){
+		code = estrtoul(fields[FIELD_CODE], 16);
+		if (code >= NRUNES)
+			sysfatal("code-point value too big: %x", code);
+		if(code <= last)
+			sysfatal("bad code sequence: %x then %x", last, code);
+		last = code;
+
+		p = fields[FIELD_CATEGORY];
+		if(strstr(fields[FIELD_NAME], ", First>") != nil){
+			/* a range is given as a First line followed by a Last line */
+			if(!getunicodeline(in, fields2))
+				sysfatal("range start at eof");
+			if (strstr(fields2[FIELD_NAME], ", Last>") == nil)
+				sysfatal("range start not followed by range end");
+			last = estrtoul(fields2[FIELD_CODE], 16);
+			if(last <= code)
+				sysfatal("range out of sequence: %x then %x", code, last);
+			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
+				sysfatal("range with mismatched category");
+		}
+
+		/* only decompositions without a <tag> are recorded */
+		d = fields[FIELD_DECOMP];
+		if(strlen(d) > 0 && strstr(d, "<") == nil){
+			decomp[0] = estrtoul(d, 16);
+			decomp[1] = 0;
+			d = strstr(d, " ");
+			if(d != nil)
+				decomp[1] = estrtoul(d+1, 16);
+
+			if(decomp[0] > 0xFFFF || decomp[1] > 0xFFFF){
+				/*
+				 * does not fit in two 16 bit halves: store
+				 * it in the exception table and leave the
+				 * table index in the upper half instead.
+				 */
+				if(nspecial >= nelem(mydespecial)/3)
+					sysfatal("too many decomposition exceptions");
+				ip = mydespecial + nspecial*3;
+				ip[0] = code;
+				ip[1] = decomp[0];
+				ip[2] = decomp[1];
+				mydecomp[code] = (DSTART+nspecial)<<16;
+				nspecial++;
+				/* singleton recompositions are verboden */
+				if(d != nil){
+					if(nrecompext >= nelem(recompext)/3)
+						sysfatal("too many recomposition exceptions");
+					ip = recompext + nrecompext*3;
+					ip[0] = code;
+					ip[1] = decomp[0];
+					ip[2] = decomp[1];
+					nrecompext++;
+				}
+			} else {
+				mydecomp[code] = decomp[0]<<16 | decomp[1];
+				/* singleton recompositions are verboden */
+				if(d != nil){
+					if(nrecomp >= nelem(myrecomp))
+						sysfatal("too many recompositions");
+					myrecomp[nrecomp++] = (KV){decomp[0]<<16 | decomp[1], code, 0};
+				}
+			}
+		}
+
+		for (; code <= last; code++){
+			if(p[0] == 'L')
+				myisalpha[code] = 1;
+			if(p[0] == 'Z')
+				myisspace[code] = 1;
+
+			if(strcmp(p, "Lu") == 0)
+				myisupper[code] = 1;
+			if(strcmp(p, "Ll") == 0)
+				myislower[code] = 1;
+
+			if(strcmp(p, "Lt") == 0)
+				myistitle[code] = 1;
+
+			if(strcmp(p, "Nd") == 0)
+				myisdigit[code] = 1;
+
+			if(fields[FIELD_UPPER][0] != '\0')
+				mytoupper[code] = estrtoul(fields[FIELD_UPPER], 16);
+
+			if(fields[FIELD_LOWER][0] != '\0')
+				mytolower[code] = estrtoul(fields[FIELD_LOWER], 16);
+
+			if(fields[FIELD_TITLE][0] != '\0')
+				mytotitle[code] = estrtoul(fields[FIELD_TITLE], 16);
+
+			myccc[code] = estrtoul(fields[FIELD_COMBINING], 10);
+		}
+	}
+
+	Bterm(in);
+	markexclusions();
+
+	/*
+	 * according to standard, if totitle(x) is not defined in ucd
+	 * but toupper(x) is, then totitle is defined to be toupper(x)
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytotitle[i] == -1
+		&& mytoupper[i] != -1
+		&& !myistitle[i])
+			mytotitle[i] = mytoupper[i];
+	}
+
+	/*
+	 * A couple corrections:
+	 * is*(to*(x)) should be true.
+	 * restore undefined transformations.
+	 * store offset instead of value, makes them sparse.
+	 */
+	for(i = 0; i < NRUNES; i++){
+		if(mytoupper[i] != -1)
+			myisupper[mytoupper[i]] = 1;
+		else
+			mytoupper[i] = i;
+
+		if(mytolower[i] != -1)
+			myislower[mytolower[i]] = 1;
+		else
+			mytolower[i] = i;
+
+		if(mytotitle[i] != -1)
+			myistitle[mytotitle[i]] = 1;
+		else
+			mytotitle[i] = i;
+
+		mytoupper[i] = mytoupper[i] - i;
+		mytolower[i] = mytolower[i] - i;
+		mytotitle[i] = mytotitle[i] - i;
+	}
+
+	/* fold the six predicates into one bit set per rune */
+	for(i = 0; i < NRUNES; i++){
+		b = 0;
+		if(myisspace[i])
+			b |= 1<<0;
+		if(myisalpha[i])
+			b |= 1<<1;
+		if(myisdigit[i])
+			b |= 1<<2;
+		if(myisupper[i])
+			b |= 1<<3;
+		if(myislower[i])
+			b |= 1<<4;
+		if(myistitle[i])
+			b |= 1<<5;
+
+		myismerged[i] = b;
+	}
+
+	markbreak();
+	mktables();
+	exits(nil);
+}

--- /dev/null

+++ b/sys/src/libc/port/runebreak.c

@@ -1,0 +1,293 @@

+#include <u.h>

+#include <libc.h>

+#include "runebreakdata"

+enum {	/* break classes as stored in the generated break table */

+	OTHER,

+	Hebrew_Letter, Newline, Extend, Format,	/* word break classes: low nibble, tested with IS() */

+	Katakana, ALetter, MidLetter, MidNum,

+	MidNumLet, Numeric, ExtendNumLet, WSegSpace,

+	PREPEND = 0x10, CONTROL = 0x20, EXTEND = 0x30, REGION = 0x40,	/* grapheme break classes: high nibble, tested with ISG() */

+	L = 0x50, V = 0x60, T = 0x70, LV = 0x80, LVT = 0x90, SPACEMK = 0xA0,

+	EMOJIEX = 0xB0,

+	ZWJ = 0x200DU,	/* rune value, compared against runes directly */

+	LINETAB = 0xB,	/* rune value, compared against runes directly */

+};

+#define IS(x, y) ((x&0xf) == y)

+#define ISG(x, y) ((x&0xf0) == y)

+/*
+ * Examine the first two runes of s.  If a grapheme break is
+ * allowed between them, return a pointer to the second rune;
+ * otherwise return a pointer past the runes that were joined.
+ * s itself is returned when the string ends before a break
+ * is found.
+ */
+Rune*
+runegbreak(Rune *s)
+{
+	Rune prev, cur;
+	uchar pc, cc;
+	Rune *q;
+
+	prev = s[0];
+	if(prev == 0)
+		return s;
+	q = s + 1;
+	cur = *q;
+	if(cur == 0)
+		return s;
+
+	pc = breaklkup(prev);
+	cc = breaklkup(cur);
+
+	/* never split CR LF, always break around other controls */
+	if(prev == '\r' && cur == '\n')
+		goto Join;
+	if(ISG(pc, CONTROL) || prev == '\r' || prev == '\n'
+	|| ISG(cc, CONTROL) || cur == '\r' || cur == '\n')
+		return q;
+
+	/* Hangul syllable sequences */
+	if(ISG(pc, L) && (ISG(cc, L) || ISG(cc, V) || ISG(cc, LV) || ISG(cc, LVT)))
+		goto Join;
+	if((ISG(pc, LV) || ISG(pc, V)) && (ISG(cc, V) || ISG(cc, T)))
+		goto Join;
+	if((ISG(pc, LVT) || ISG(pc, T)) && ISG(cc, T))
+		goto Join;
+
+	if(ISG(cc, SPACEMK) || ISG(pc, PREPEND))
+		goto Join;
+
+	/* pictograph, any number of extenders, ZWJ, pictograph */
+	if(ISG(pc, EMOJIEX) && (ISG(cc, EXTEND) || cur == ZWJ)){
+		for(; ISG(cc, EXTEND); cc = breaklkup(cur)){
+			cur = *++q;
+			if(cur == 0)
+				return s;
+		}
+		if(cur != ZWJ)
+			return q;
+		cur = *++q;
+		if(cur == 0)
+			return s;
+		cc = breaklkup(cur);
+		if(ISG(cc, EMOJIEX))
+			goto Join;
+		return q;
+	}
+
+	if(ISG(cc, EXTEND) || cur == ZWJ)
+		goto Join;
+	if(ISG(pc, REGION) && ISG(cc, REGION))
+		goto Join;
+	return q;
+
+Join:
+	/* the rune at q belongs to the cluster; step past it */
+	if(q[1] == 0)
+		return s;
+	return q + 1;
+}

+/*
+ * UTF-8 counterpart of runegbreak: examine the first two runes
+ * encoded at s.  If a grapheme break is allowed between them,
+ * return a pointer to the second; otherwise return a pointer
+ * past the runes that were joined.  s itself is returned when
+ * the string ends before a break is found.
+ */
+char*
+utfgbreak(char *s)
+{
+	Rune prev, cur;
+	uchar pc, cc;
+	char *q;
+
+	q = s + chartorune(&prev, s);
+	if(prev == 0)
+		return s;
+	chartorune(&cur, q);
+	if(cur == 0)
+		return s;
+
+	pc = breaklkup(prev);
+	cc = breaklkup(cur);
+
+	/* never split CR LF, always break around other controls */
+	if(prev == '\r' && cur == '\n')
+		goto Join;
+	if(ISG(pc, CONTROL) || prev == '\r' || prev == '\n'
+	|| ISG(cc, CONTROL) || cur == '\r' || cur == '\n')
+		return q;
+
+	/* Hangul syllable sequences */
+	if(ISG(pc, L) && (ISG(cc, L) || ISG(cc, V) || ISG(cc, LV) || ISG(cc, LVT)))
+		goto Join;
+	if((ISG(pc, LV) || ISG(pc, V)) && (ISG(cc, V) || ISG(cc, T)))
+		goto Join;
+	if((ISG(pc, LVT) || ISG(pc, T)) && ISG(cc, T))
+		goto Join;
+
+	if(ISG(cc, SPACEMK) || ISG(pc, PREPEND))
+		goto Join;
+
+	/* pictograph, any number of extenders, ZWJ, pictograph */
+	if(ISG(pc, EMOJIEX) && (ISG(cc, EXTEND) || cur == ZWJ)){
+		for(; ISG(cc, EXTEND); cc = breaklkup(cur)){
+			/* step over the rune at q, then decode the one after it */
+			q += chartorune(&cur, q);
+			chartorune(&cur, q);
+			if(cur == 0)
+				return s;
+		}
+		if(cur != ZWJ)
+			return q;
+		q += chartorune(&cur, q);
+		chartorune(&cur, q);
+		if(cur == 0)
+			return s;
+		cc = breaklkup(cur);
+		if(ISG(cc, EMOJIEX))
+			goto Join;
+		return q;
+	}
+
+	if(ISG(cc, EXTEND) || cur == ZWJ)
+		goto Join;
+	if(ISG(pc, REGION) && ISG(cc, REGION))
+		goto Join;
+	return q;
+
+Join:
+	/* the rune at q belongs to the cluster; step past it */
+	q += chartorune(&cur, q);
+	chartorune(&cur, q);
+	if(cur == 0)
+		return s;
+	return q;
+}

+#define AH(x) (IS(x, ALetter) || IS(x, Hebrew_Letter))

+#define MNLQ(x) (IS(x, MidNumLet) || x == '\'')

+/*
+ * Examine the first two runes of s (peeking at a third where a
+ * rule needs it).  If a word break is allowed between them,
+ * return a pointer to the second rune; otherwise return a
+ * pointer past the runes that were joined.  s itself is returned
+ * when the string ends before a break is found.
+ */
+Rune*
+runewbreak(Rune *s)
+{
+	Rune l, r;
+	uchar lt, rt;
+	Rune *p;
+	int mnlq;
+
+	p = s;
+	if((l = *p++) == 0)
+		return s;
+	if((r = *p) == 0)
+		return s;
+
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	/*
+	 * MidNumLet or single quote.  the quote has to be tested on
+	 * the rune itself; the MNLQ macro compared the break class
+	 * with '\'' and so could never match an apostrophe.
+	 */
+	mnlq = IS(rt, MidNumLet) || r == '\'';
+
+	if(l == '\r' && r == '\n')
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	/* the right hand test used to look at l a second time */
+	if(r == '\r' || r == '\n' || r == LINETAB)
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		/* p[1] may be read: *p is r, which is not the terminator */
+		if((IS(rt, MidLetter) || mnlq) && p[1] != 0 && AH(breaklkup(p[1])))
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && p[1] != 0 && IS(breaklkup(p[1]), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || mnlq) && p[1] != 0 && IS(breaklkup(p[1]), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p++;
+		if((r = *p) == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+	return p;
+
+Done:
+	/* the rune at p belongs to the word; step past it */
+	if(p[1] == 0)
+		return s;
+	return p + 1;
+}

+/*
+ * UTF-8 counterpart of runewbreak: examine the first two runes
+ * encoded at s (peeking at a third where a rule needs it).  If a
+ * word break is allowed between them, return a pointer to the
+ * second; otherwise return a pointer past the runes that were
+ * joined.  s itself is returned when the string ends before a
+ * break is found.
+ */
+char*
+utfwbreak(char *s)
+{
+	Rune l, r;
+	Rune peek;
+	uchar lt, rt;
+	char *p;
+	int n, mnlq;
+
+	p = s;
+	p += chartorune(&l, p);
+	if(l == 0)
+		return s;
+	n = chartorune(&r, p);
+	if(r == 0)
+		return s;
+	/* only look ahead once r is known not to be the terminator */
+	chartorune(&peek, p + n);
+
+	lt = breaklkup(l);
+	rt = breaklkup(r);
+	/*
+	 * MidNumLet or single quote.  the quote has to be tested on
+	 * the rune itself; the MNLQ macro compared the break class
+	 * with '\'' and so could never match an apostrophe.
+	 */
+	mnlq = IS(rt, MidNumLet) || r == '\'';
+
+	if(l == '\r' && r == '\n')
+		goto Done;
+	if(l == '\r' || l == '\n' || l == LINETAB)
+		return p;
+	/* the right hand test used to look at l a second time */
+	if(r == '\r' || r == '\n' || r == LINETAB)
+		return p;
+	if(IS(lt, WSegSpace) && IS(rt, WSegSpace))
+		goto Done;
+	if(IS(rt, Format) || IS(rt, Extend))
+		goto Done;
+	if(AH(lt)){
+		if(AH(rt))
+			goto Done;
+		if((IS(rt, MidLetter) || mnlq) && peek != 0 && AH(breaklkup(peek)))
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '\'')
+			goto Done;
+		if(IS(lt, Hebrew_Letter) && r == '"' && peek != 0 && IS(breaklkup(peek), Hebrew_Letter))
+			goto Done;
+		if(IS(rt, Numeric))
+			goto Done;
+	}
+	if(IS(lt, Numeric) && (AH(rt) || IS(rt, Numeric)))
+		goto Done;
+	if(IS(lt, Numeric) && (IS(rt, MidNum) || mnlq) && peek != 0 && IS(breaklkup(peek), Numeric))
+		goto Done;
+	if(IS(lt, Katakana) && IS(rt, Katakana))
+		goto Done;
+	if(AH(lt) || IS(lt, Numeric) || IS(lt, Katakana) || IS(lt, ExtendNumLet))
+		if(IS(rt, ExtendNumLet))
+			goto Done;
+	if(IS(lt, ExtendNumLet) && (AH(rt) || IS(rt, Numeric) || IS(rt, Katakana)))
+		goto Done;
+	if(ISG(lt, REGION)){
+		if(ISG(rt, REGION))
+			goto Done;
+		if(r != ZWJ)
+			return p;
+		p += chartorune(&r, p);
+		chartorune(&r, p);
+		if(r == 0)
+			return s;
+		rt = breaklkup(r);
+		if(ISG(rt, REGION))
+			goto Done;
+	}
+	return p;
+
+Done:
+	/* the rune at p belongs to the word; step past it */
+	p += chartorune(&r, p);
+	chartorune(&r, p);
+	if(r == 0)
+		return s;
+	return p;
+}

--- /dev/null

+++ b/sys/src/libc/port/runeistype.c

@@ -1,0 +1,40 @@

+#include <u.h>

+#include <libc.h>

+#include "runeistypedata"

+/* report whether the Lspace bit is set for c in the merged table */
+int
+isspacerune(Rune c)
+{
+	return (mergedlkup(c) & Lspace) != 0;
+}

+/* report whether the Lalpha bit is set for c in the merged table */
+int
+isalpharune(Rune c)
+{
+	return (mergedlkup(c) & Lalpha) != 0;
+}

+/* report whether the Ldigit bit is set for c in the merged table */
+int
+isdigitrune(Rune c)
+{
+	return (mergedlkup(c) & Ldigit) != 0;
+}

+/* report whether the Lupper bit is set for c in the merged table */
+int
+isupperrune(Rune c)
+{
+	return (mergedlkup(c) & Lupper) != 0;
+}

+/* report whether the Llower bit is set for c in the merged table */
+int
+islowerrune(Rune c)
+{
+	return (mergedlkup(c) & Llower) != 0;
+}

+/* report whether the Ltitle bit is set for c in the merged table */
+int
+istitlerune(Rune c)
+{
+	return (mergedlkup(c) & Ltitle) != 0;
+}

--- /dev/null

+++ b/sys/src/libc/port/runenorm.c

@@ -1,0 +1,334 @@

+#include <u.h>

+#include <libc.h>

+#include "runenormdata"

+/*
+ * Constants for arithmetic Hangul syllable composition and decomposition.
+ * Unicode Standard: Section 3.12 Conjoining Jamo Behavior
+ */
+enum {
+	/* how many leading, vowel and trailing indices exist */
+	LCount = 19,
+	VCount = 21,
+	TCount = 28,
+	NCount = VCount * TCount,	/* syllables sharing one leading consonant */
+	SCount = LCount * NCount,	/* all precomposed syllables */
+
+	/* first code point of each range */
+	SBase = 0xAC00,
+	LBase = 0x1100,
+	VBase = 0x1161,
+	TBase = 0x11A7,	/* index 0 means "no trailing consonant", so tests use > TBase */
+
+	/* last code point of each range (inclusive) */
+	SLast = SBase + SCount - 1,
+	LLast = LBase + LCount - 1,
+	VLast = VBase + VCount - 1,
+	TLast = TBase + TCount - 1,
+};

+/*
+ * Perform a single decomposition step on c and store the result in dst.
+ * Hangul syllables are split arithmetically: an LVT syllable becomes
+ * its LV syllable plus the trailing jamo, an LV syllable becomes its
+ * leading and vowel jamo.  Everything else goes through decomplkup,
+ * whose value packs the first rune in the upper 16 bits and the second
+ * in the lower 16 bits.  An upper half in [0xEEEE, 0xF8FF) with an empty
+ * lower half is instead an index (biased by 0xEEEE) into
+ * _decompexceptions.
+ * dst[1] is 0 when the step yields a single rune; dst[0] ends up 0 when
+ * the lookup yields 0 (presumably "no decomposition" -- confirm against
+ * the generated table).
+ */
+static void
+_runedecomp(Rune dst[2], Rune c)
+{
+	uint v, hi, lo;
+
+	if(c >= SBase && c <= SLast){
+		c -= SBase;
+		lo = c % TCount;
+		if(lo != 0){
+			/* LVT -> LV + T */
+			dst[0] = SBase + ((c / TCount) * TCount);
+			dst[1] = TBase + lo;
+		}else{
+			/* LV -> L + V */
+			dst[0] = LBase + (c / NCount);
+			dst[1] = VBase + ((c % NCount) / TCount);
+		}
+		return;
+	}
+
+	v = decomplkup(c);
+	hi = v >> 16;
+	lo = v & 0xFFFF;
+	if(lo != 0){
+		dst[0] = hi;
+		dst[1] = lo;
+		return;
+	}
+	if(hi >= 0xEEEE && hi < 0xF8FF){
+		memmove(dst, _decompexceptions[hi - 0xEEEE], 2*sizeof(Rune));
+		return;
+	}
+	dst[0] = hi;
+	dst[1] = 0;
+}

+/*
+ * Try to combine the pair r[0], r[1] into a single rune.
+ * Returns the composite, or 0 when the pair does not compose.
+ * Hangul is handled arithmetically; pairs with a member above 0xFFFF
+ * are searched linearly in _recompexceptions; all other pairs are
+ * looked up in the _recompdata hash table, following collision links
+ * into _recompcoll.
+ */
+static Rune
+_runerecomp(Rune r[2])
+{
+	uint key, h, i, link, *slot;
+	Rune first, second;
+
+	first = r[0];
+	second = r[1];
+
+	/* leading consonant + vowel -> LV syllable */
+	if(first >= LBase && first <= LLast){
+		if(second >= VBase && second <= VLast){
+			h = (first - LBase) * NCount + (second - VBase) * TCount;
+			return SBase + h;
+		}
+		return 0;
+	}
+
+	/* LV syllable + trailing consonant -> LVT syllable */
+	if(first >= SBase && first <= SLast && (first - SBase) % TCount == 0){
+		if(second <= TBase || second > TLast)
+			return 0;
+		return first + (second - TBase);
+	}
+
+	/* the hash key only has room for two 16 bit runes */
+	if(first > 0xFFFF || second > 0xFFFF){
+		i = 0;
+		while(i < nelem(_recompexceptions)){
+			if(_recompexceptions[i][1] == first && _recompexceptions[i][2] == second)
+				return  _recompexceptions[i][0];
+			i++;
+		}
+		return 0;
+	}
+
+	key = first<<16 | second;
+	h = key;
+	h ^= h >> 16;
+	h *= 0x21f0aaad;
+	h ^= h >> 15;
+	h *= 0xd35a2d97;
+	h ^= h >> 15;
+
+	/*
+	 * Each entry is two words: the key, then the composite in the low
+	 * 16 bits and a 1-based collision link in the high 16 bits.
+	 */
+	for(slot = _recompdata + (h%512)*2; slot[0] != key; slot = _recompcoll + (link-1)*2){
+		link = slot[1]>>16;
+		if(link == 0)
+			return 0;
+	}
+	return slot[1] & 0xFFFF;
+}

+/*
+ * Put the len runes at a into canonical order: a stable bubble sort
+ * on canonical combining class in which a rune of class 0 is never
+ * moved.  Two neighbours are exchanged only when the second one has a
+ * non-zero class that is smaller than the class of the first.
+ *
+ * The previous test, ccclkup(a[i]) > ccclkup(a[i+1]) > 0, parsed as
+ * (x > y) > 0 and therefore also swapped a class 0 rune in front of a
+ * combining mark.
+ */
+static void
+runecccsort(Rune *a, int len)
+{
+	Rune r;
+	int i, swapped;
+	int ca, cb;
+
+	do {
+		swapped = 0;
+		for(i = 0; i < len - 1; i++){
+			ca = ccclkup(a[i]);
+			cb = ccclkup(a[i+1]);
+			if(cb > 0 && ca > cb){
+				r = a[i];
+				a[i] = a[i+1];
+				a[i+1] = r;
+				swapped = 1;
+			}
+		}
+	} while(swapped);
+}

+/*
+ * Decide whether the n bytes at s hold enough context to normalize
+ * the sequence that begins at s.  Returns s when more input is needed.
+ *
+ * If the first rune is a Hangul leading jamo or precomposed syllable,
+ * runes are consumed until one is neither a vowel nor a trailing jamo;
+ * the pointer just past that rune is returned, provided bytes remain
+ * after it.  Otherwise runes are consumed until one with combining
+ * class 0 is found and the pointer just past it is returned.
+ *
+ * The loop condition used to read "n > 0 && V || T", which groups as
+ * "(n > 0 && V) || T"; the jamo tests are now parenthesized so the
+ * length guard covers both of them.
+ */
+char*
+fullutfnorm(char *s, int n)
+{
+	Rune r, peek;
+	char *p, *p2;
+
+	p = s;
+	if(fullrune(p, n) == 0)
+		return s;
+	p += chartorune(&r, p);
+	n -= (p - s);
+
+	if((r >= LBase && r <= LLast) || (r >= SBase && r <= SLast)){
+		do {
+			if(fullrune(p, n) == 0)
+				return s;
+			p2 = p + chartorune(&peek, p);
+			n -= (p2 - p);
+			p = p2;
+		} while(n > 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast)));
+		if(n <= 0)
+			return s;
+		return p;
+	}
+
+	do {
+		if(fullrune(p, n) == 0)
+			return s;
+		p2 = p + chartorune(&peek, p);
+		n -= (p2 - p);
+		p = p2;
+		if(ccclkup(peek) == 0)
+			return p;
+	} while(n > 0);
+	return s;
+}

+/*
+ * Decide whether the n runes at r hold enough context to normalize
+ * the sequence that begins at r.  Returns r when more input is needed
+ * (including when n <= 0).
+ *
+ * If the first rune is a Hangul leading jamo or precomposed syllable,
+ * the result points at the first following rune that is neither a
+ * vowel nor a trailing jamo.  Otherwise it points at the first
+ * following rune whose combining class is 0.
+ *
+ * The jamo loop used to read "p < e && V || T", which groups as
+ * "(p < e && V) || T" and so examined *p at or beyond r+n.  The jamo
+ * tests are now parenthesized and nothing outside r[0..n) is read.
+ */
+Rune*
+fullrunenorm(Rune *r, int n)
+{
+	Rune *e, *p;
+
+	if(n <= 0)
+		return r;
+	p = r;
+	e = p + n;
+	if((*p >= LBase && *p <= LLast) || (*p >= SBase && *p <= SLast)){
+		p++;
+		while(p < e && ((*p >= VBase && *p <= VLast) || (*p > TBase && *p <= TLast)))
+			p++;
+		if(p >= e)
+			return r;
+		return p;
+	}
+	for(; p + 1 < e; p++)
+		if(ccclkup(p[1]) == 0)
+			return p + 1;
+	return r;
+}

+/*
+ * Shared engine behind runecomp, runedecomp, utfcomp and utfdecomp.
+ *
+ * When src is non-nil the NUL-terminated rune string src is normalized
+ * into dst; otherwise the NUL-terminated UTF-8 string ssrc is
+ * normalized into sdst.  max is the capacity of the destination in
+ * runes or bytes respectively, terminator included.  With compose set
+ * the reordered sequence is recomposed, otherwise it is left
+ * decomposed.
+ *
+ * Returns the size (runes or bytes, terminator excluded) of the full
+ * result; output that does not fit is counted but not stored.  The
+ * destination is NUL-terminated whenever max > 0.
+ *
+ * The input is handled one segment at a time on a 32 rune work area:
+ * the decomposition of the first rune grows downwards from the middle,
+ * the jamo or combining marks that follow it grow upwards.  Segments
+ * that would exceed the work area are cut short.
+ */
+static int
+runenorm(Rune *dst, Rune *src, char *sdst, char *ssrc, int max, int compose)
+{
+	Rune c, r[2], _stack[32];
+	Rune *p, *stack, *stackend, *sp, *tp;
+	char *strp, *strstop;
+	Rune *rp, *rrp;
+	Rune *stop;
+	Rune peek;
+	int w, w2, size;
+	int mode;
+
+	/* the unused input is pointed at an empty string so the loop test works for both */
+	if(src){
+		mode = 1;
+		p = src;
+		stop = dst + (max - 1);
+		strp = "";
+		strstop = nil;
+	} else {
+		mode = 0;
+		p = L"";
+		stop = nil;
+		strp = ssrc;
+		strstop = sdst + (max - 1);
+	}
+
+	stack = _stack + nelem(_stack)/2;
+	stackend = _stack + nelem(_stack);
+	size = 0;
+	w = w2 = 0;
+	while(*strp || *p){
+		if(mode)
+			c = *p;
+		else
+			w = chartorune(&c, strp);
+
+		/* fully decompose c; second halves are pushed below stack */
+		sp = stack - 1;
+		tp = stack;
+		_runedecomp(r, c);
+		while(r[0] != 0){
+			c = r[0];
+			if(r[1] != 0){
+				*sp-- = r[1];
+				if(sp == _stack)
+					break;
+			}
+			_runedecomp(r, c);
+		}
+		*sp = c;
+
+		if(mode)
+			peek = p[1];
+		else
+			w2 = chartorune(&peek, strp+w);
+
+		/* a Hangul start collects the vowel and trailing jamo that follow */
+		if((*sp >= LBase && *sp <= LLast) || (*sp >= SBase && *sp <= SLast)){
+			while(peek != 0 && ((peek >= VBase && peek <= VLast) || (peek > TBase && peek <= TLast))){
+				*tp++ = peek;
+				if(mode){
+					p++;
+					peek = p[1];
+				} else {
+					strp += w;
+					w = w2;
+					w2 = chartorune(&peek, strp+w);
+				}
+				if(tp == stackend)
+					break;
+			}
+		}
+
+		/*
+		 * Collect the combining marks that follow, decomposing each
+		 * one level.  The tp < stackend test matters when the jamo
+		 * loop above already filled the work area: without it the
+		 * single-rune pushes below would write past _stack.
+		 */
+		while(tp < stackend && peek != 0 && ccclkup(peek) != 0){
+			_runedecomp(r, peek);
+			if(r[1] != 0){
+				if(tp+1 >= stackend)
+					break;
+				*tp++ = r[0];
+				*tp++ = r[1];
+			} else if(r[0] != 0)
+				*tp++ = r[0];
+			else
+				*tp++ = peek;
+
+			if(mode){
+				p++;
+				peek = p[1];
+			} else {
+				strp += w;
+				w = w2;
+				w2 = chartorune(&peek, strp+w);
+			}
+			if(tp == stackend)
+				break;
+		}
+
+		runecccsort(sp, tp - sp);
+
+		if(compose && ccclkup(*sp) == 0){
+			for(rp = sp + 1; rp < tp; rp++){
+				r[0] = *sp;
+				r[1] = *rp;
+				c = _runerecomp(r);
+				if(c != 0){
+					/* drop *rp: store the composite, slide the rest up, advance sp */
+					*sp = c;
+					for(rrp = rp; rrp > sp; rrp--)
+						*rrp = rrp[-1];
+					sp++;
+				} else while(rp + 1 < tp && ccclkup(*rp) == ccclkup(*(rp+1)))
+					rp++;	/* skip the marks of the same class as the one that failed */
+			}
+		}
+
+		/* emit the segment; size keeps counting even when nothing fits */
+		for(; sp < tp; sp++){
+			if(mode){
+				if(dst < stop)
+					*dst++ = *sp;
+				size++;
+			} else {
+				w2 = runelen(*sp);
+				if(sdst+w2 < strstop)
+					sdst += runetochar(sdst, sp);
+				size += w2;
+			}
+		}
+		if(mode)
+			p++;
+		else
+			strp += w;
+	}
+
+	/* a destination without capacity has no room for the terminator either */
+	if(max > 0){
+		if(mode)
+			*dst = 0;
+		else
+			*sdst = 0;
+	}
+	return size;
+}

+/*
+ * Normalize the rune string src into dst (capacity max runes) with
+ * recomposition enabled.  Returns the rune count of the full result,
+ * which runenorm keeps counting even for runes that did not fit.
+ */
+int
+runecomp(Rune *dst, Rune *src, int max)
+{
+	int size;
+
+	size = runenorm(dst, src, nil, nil, max, 1);
+	return size;
+}

+/*
+ * Normalize the rune string src into dst (capacity max runes) with
+ * recomposition disabled.  Returns the rune count of the full result,
+ * which runenorm keeps counting even for runes that did not fit.
+ */
+int
+runedecomp(Rune *dst, Rune *src, int max)
+{
+	int size;
+
+	size = runenorm(dst, src, nil, nil, max, 0);
+	return size;
+}

+/*
+ * Normalize the UTF-8 string src into dst (capacity max bytes) with
+ * recomposition enabled.  Returns the byte count of the full result,
+ * which runenorm keeps counting even for runes that did not fit.
+ */
+int
+utfcomp(char *dst, char *src, int max)
+{
+	int size;
+
+	size = runenorm(nil, nil, dst, src, max, 1);
+	return size;
+}

+/*
+ * Normalize the UTF-8 string src into dst (capacity max bytes) with
+ * recomposition disabled.  Returns the byte count of the full result,
+ * which runenorm keeps counting even for runes that did not fit.
+ */
+int
+utfdecomp(char *dst, char *src, int max)
+{
+	int size;
+
+	size = runenorm(nil, nil, dst, src, max, 0);
+	return size;
+}

--- /dev/null

+++ b/sys/src/libc/port/runetotype.c

@@ -1,0 +1,22 @@

+#include <u.h>

+#include <libc.h>

+#include "runetotypedata"

+/* Returns c shifted by the offset upperlkup yields for it. */
+Rune
+toupperrune(Rune c)
+{
+	c += upperlkup(c);
+	return c;
+}

+/* Returns c shifted by the offset lowerlkup yields for it. */
+Rune
+tolowerrune(Rune c)
+{
+	c += lowerlkup(c);
+	return c;
+}

+/* Returns c shifted by the offset titlelkup yields for it. */
+Rune
+totitlerune(Rune c)
+{
+	c += titlelkup(c);
+	return c;
+}

--- a/sys/src/libc/test/mkfile

+++ b/sys/src/libc/test/mkfile

@@ -3,6 +3,14 @@

 TEST=\

 	date\

 	pow\

+	runebreak\

+	runenorm\

 	strchr\

 </sys/src/cmd/mktest

+/lib/ucd/%:

+	cd /lib/ucd && mk $stem

+runebreak.test:	/lib/ucd/GraphemeBreakTest.txt /lib/ucd/WordBreakTest.txt

+runenorm.test: /lib/ucd/NormalizationTest.txt

--- /dev/null

+++ b/sys/src/libc/test/runebreak.c

@@ -1,0 +1,112 @@

+#include <u.h>

+#include <libc.h>

+#include <bio.h>

+/*
+ * Parse s as a hexadecimal code point.
+ * Exits through sysfatal when s does not start with a number.
+ */
+static int
+estrtoul(char *s)
+{
+	Rune cp;
+	char *end;
+
+	cp = strtoul(s, &end, 16);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return cp;
+}

+/*
+ * Run the rune based break finder fn on r and its UTF-8 counterpart
+ * fn2 on the same text, and print a complaint when the prefixes they
+ * select differ.  Returns what fn returned.
+ *
+ * The comparison covers the whole prefix: its length in runes and all
+ * of its bytes.  Earlier the rune count was passed to memcmp as a byte
+ * count, so only part of the prefix was compared and a UTF-8 prefix of
+ * a different length could go unnoticed.
+ */
+static Rune*
+check(Rune *r, Rune* (*fn)(Rune*), char* (*fn2)(char*))
+{
+	Rune *r2, *tmp;
+	char *p, *p2;
+	long nr;
+
+	p = smprint("%S", r);
+	if(p == nil)
+		sysfatal("smprint: %r");
+	r2 = fn(r);
+	p2 = fn2(p);
+
+	/* convert the UTF-8 prefix back to runes so both results can be compared */
+	tmp = runesmprint("%.*s", (int)(p2-p), p);
+	if(tmp == nil)
+		sysfatal("runesmprint: %r");
+	nr = r2 - r;
+	if(runestrlen(tmp) != nr || memcmp(r, tmp, nr*sizeof(Rune)) != 0)
+		print("utf mismatch\n");
+
+	free(p);
+	free(tmp);
+	return r2;
+}

+/*
+ * Run every test line of a Unicode break test file through fn (runes)
+ * and fn2 (UTF-8).
+ *
+ * A line is a sequence of hexadecimal code points separated by the
+ * operators '÷' and '×'; text after '#' is ignored.  The code points
+ * are gathered in stack, the operators in ops, and the break finders
+ * are then stepped through the code points while the operators are
+ * consulted.  Disagreements are printed; they do not abort the run.
+ *
+ * stack and ops have one element more than pieces so that the
+ * terminator written after each append always stays in bounds.
+ */
+static void
+run(char *file, Rune* (*fn)(Rune*), char* (*fn2)(char*))
+{
+	Biobuf *b;
+	char *p, *dot;
+	char *pieces[16];
+	int i, j, n;
+	Rune stack[16+1], ops[16+1];
+	int nstack, nops;
+	Rune r, *rp, *rp2;
+	char *line;
+
+	b = Bopen(file, OREAD);
+	if(b == nil)
+		sysfatal("could not open %s: %r", file);
+
+	for(;(p = Brdline(b, '\n')) != nil; free(line)){
+		p[Blinelen(b)-1] = 0;
+		/* keep an untouched copy for diagnostics; getfields modifies p */
+		line = strdup(p);
+		if(p[0] == 0 || p[0] == '#')
+			continue;
+		if((dot = strstr(p, "#")) != nil)
+			*dot = 0;
+		n = getfields(p, pieces, nelem(pieces), 0, " ");
+		nstack = nops = 0;
+		for(i = 0; i < n; i++){
+			chartorune(&r, pieces[i]);
+			if(r != L'÷' && r != L'×'){
+				r = estrtoul(pieces[i]);
+				stack[nstack++] = r;
+				stack[nstack] = 0;
+			} else {
+				ops[nops++] = r;
+				ops[nops] = 0;
+			}
+		}
+
+		/* ops[0] and the last operator are skipped by the loop bounds */
+		rp = stack;
+		for(i = 1; i < nops-1;){
+			rp2 = check(rp, fn, fn2);
+			switch(ops[i]){
+			case L'÷':
+				if(rp2 != rp+1){
+					print("break fail %X %X || %s\n", rp[0], rp[1], line);
+					goto Break;
+				}
+				rp++;
+				i++;
+				break;
+			case L'×':
+				if(rp2 - rp == 0){
+					for(j = i; j < nops - 1; j++)
+						if(ops[j] !=  L'×')
+							print("skipped %d %d %s\n", i, nops, line);
+					goto Break;
+				}
+				for(; rp < (rp2-1); rp++, i++){
+					if(ops[i] != L'×')
+						print("skipped %d %d %s\n", i, nops, line);
+				}
+				rp = rp2;
+				i++;
+				break;
+			}
+		}
+Break:
+		;
+	}
+	Bterm(b);
+}

+/*
+ * Runs the grapheme and word break test files through both the rune
+ * and the UTF-8 break finders.  The arguments are unused; the second
+ * one is declared char** to match the argument vector main receives.
+ */
+void
+main(int, char**)
+{
+	run("/lib/ucd/GraphemeBreakTest.txt", runegbreak, utfgbreak);
+	run("/lib/ucd/WordBreakTest.txt", runewbreak, utfwbreak);
+	exits(nil);
+}

--- /dev/null

+++ b/sys/src/libc/test/runenorm.c

@@ -1,0 +1,92 @@

+#include <u.h>

+#include <libc.h>

+#include <bio.h>

+/*
+ * Parse s as a hexadecimal code point.
+ * Exits through sysfatal when s does not start with a number.
+ */
+static int
+estrtoul(char *s)
+{
+	Rune cp;
+	char *end;
+
+	cp = strtoul(s, &end, 16);
+	if(end == s)
+		sysfatal("bad code point hex string");
+	return cp;
+}

+/*
+ * Split field, a space separated list of hexadecimal code points, and
+ * store the values NUL-terminated in dst, which has room for ndst
+ * runes.  At most ndst-1 tokens are split off so the terminator always
+ * fits.  Returns the number of runes stored.
+ */
+static int
+parserunes(char *field, Rune *dst, int ndst)
+{
+	char *toks[32];
+	int i, n, max;
+
+	max = nelem(toks);
+	if(max > ndst - 1)
+		max = ndst - 1;
+	n = getfields(field, toks, max, 0, " ");
+	for(i = 0; i < n; i++)
+		dst[i] = estrtoul(toks[i]);
+	dst[n] = 0;
+	return n;
+}
+
+/*
+ * Runs every line of NormalizationTest.txt through runecomp,
+ * runedecomp, utfcomp and utfdecomp.  The first three ';' separated
+ * columns of a line are used as source, expected composed form and
+ * expected decomposed form.  Mismatches are printed with a bit mask
+ * (1 runecomp, 2 runedecomp, 4 utfcomp, 8 utfdecomp); a wrong returned
+ * size trips an assert.
+ */
+void
+main(int, char**)
+{
+	Rune buffer1[64];
+	Rune buffer2[64];
+	char utfbuff1[128];
+	char utfbuff2[128];
+	char srctmp[128], tmp1[128], tmp2[128];
+	char *fields[10];
+	char *p;
+	int n, n2;
+	int i;
+	uint fail;
+	Biobuf *b;
+	struct {
+		Rune src[32];
+		Rune nfc[32];
+		Rune nfd[32];
+	} test;
+
+	b = Bopen("/lib/ucd/NormalizationTest.txt", OREAD);
+	if(b == nil)
+		sysfatal("could not open NormalizationTest.txt: %r");
+
+	while((p = Brdline(b, '\n')) != nil){
+		p[Blinelen(b)-1] = 0;
+		if(p[0] == 0 || p[0] == '#' || p[0] == '@')
+			continue;
+		if(getfields(p, fields, 6 + 1, 0, ";") < 3)
+			sysfatal("malformed test line");
+		parserunes(fields[0], test.src, nelem(test.src));
+		parserunes(fields[1], test.nfc, nelem(test.nfc));
+		/* i, the number of expected decomposed runes, is shown in the diagnostics */
+		i = parserunes(fields[2], test.nfd, nelem(test.nfd));
+
+		n = runecomp(buffer1, test.src, nelem(buffer1));
+		n2 = runedecomp(buffer2, test.src, nelem(buffer2));
+		fail = 0;
+		if(runestrcmp(buffer1, test.nfc) != 0)
+			fail |= 1<<0;
+		if(runestrcmp(buffer2, test.nfd) != 0)
+			fail |= 1<<1;
+		if(fail)
+			print("%d %d %S %S %S %S %S\n", fail, i, test.src, test.nfd, test.nfc, buffer2, buffer1);
+		assert(n == runestrlen(test.nfc));
+		assert(n2 == runestrlen(test.nfd));
+
+		/* same checks again through the UTF-8 entry points */
+		snprint(srctmp, sizeof srctmp, "%S", test.src);
+		snprint(tmp1, sizeof tmp1, "%S", test.nfc);
+		snprint(tmp2, sizeof tmp2, "%S", test.nfd);
+		n = utfcomp(utfbuff1, srctmp, nelem(utfbuff1));
+		n2 = utfdecomp(utfbuff2, srctmp, nelem(utfbuff2));
+		if(strcmp(utfbuff1, tmp1) != 0)
+			fail |= 1<<2;
+		if(strcmp(utfbuff2, tmp2) != 0)
+			fail |= 1<<3;
+		if(fail)
+			print("%d %d %s %s %s %s %s\n", fail, i, srctmp, tmp2, tmp1, utfbuff2, utfbuff1);
+		assert(n == strlen(tmp1));
+		assert(n2 == strlen(tmp2));
+	}
+	Bterm(b);
+	exits(nil);
+}