shithub: riscv

Download patch

ref: 6152c8d41d889d28175dc23566920e05f6a3e179
parent: 35fcb73f86518e00fb002c98d0d0a39eef9209fa
author: cinap_lenrek <cinap_lenrek@centraldogma>
date: Sun Sep 18 16:35:57 EDT 2011

html2ms: rewrite from scratch

--- a/sys/src/cmd/html2ms.c
+++ b/sys/src/cmd/html2ms.c
@@ -3,599 +3,538 @@
 #include <ctype.h>
 #include <bio.h>
 
-enum
-{
-	SSIZE = 10,
+typedef struct Tag Tag;
+typedef struct Attr Attr;
+typedef struct Text Text;
 
-	/* list types */
-	Lordered = 0,
-	Lunordered,
-	Lmenu,
-	Ldir,
-
+struct Attr {	/* one attribute parsed from inside a tag by parseattr */
+	char	attr[64];	/* attribute name, NUL-terminated; longer names are truncated */
+	char	val[256-64];	/* attribute value, NUL-terminated; empty when no =value follows */
 };
 
-Biobuf in, out;
-int lastc = '\n';
-int inpre = 0;
+struct Tag {
+	Tag	*up;
+	char	tag[32];
+	Attr	attr[16];
+	int	nattr;
+	int	opening;
+	int	closing;
 
-/* stack for fonts */
-char *fontstack[SSIZE];
-char *font = "R";
-int fsp;
-
-/* stack for lists */
-struct
-{
-	int	type;
-	int	ord;
-} liststack[SSIZE];
-int lsp;
-
-int quoting;
-
-typedef struct Goobie Goobie;
-struct Goobie
-{
-	char *name;
-	void (*f)(Goobie*, char*);
-	void (*ef)(Goobie*, char*);
+	void	(*close)(Text *, Tag *);
+	union {
+		void	*aux;
+		int	restore;
+	};
 };
 
-void	eatwhite(void);
-void	escape(void);
-
-typedef void Action(Goobie*, char*);
-
-Action	g_ignore;
-Action	g_unexpected;
-Action	g_title;
-Action	g_p;
-Action	g_h;
-Action	g_li;
-Action	g_list, g_listend;
-Action	g_pre;
-Action	g_fpush, g_fpop;
-Action	g_indent, g_exdent;
-Action	g_dt;
-Action	g_display;
-Action	g_displayend;
-Action	g_table, g_tableend, g_caption, g_captionend;
-Action	g_br, g_hr;
-
-Goobie gtab[] =
-{
-	"!--",		g_ignore,	g_unexpected,
-	"!doctype",	g_ignore,	g_unexpected,
-	"a",		g_ignore,	g_ignore,
-	"address",	g_display,	g_displayend,
-	"b",		g_fpush,	g_fpop,
-	"base",		g_ignore,	g_unexpected,
-	"blink",	g_ignore,	g_ignore,
-	"blockquote",	g_ignore,	g_ignore,
-	"body",		g_ignore,	g_ignore,
-	"br",		g_br,		g_unexpected,
-	"caption",	g_caption,	g_captionend,
-	"center",	g_ignore,	g_ignore,
-	"cite",		g_ignore,	g_ignore,
-	"code",		g_ignore,	g_ignore,
-	"dd",		g_ignore,	g_unexpected,
-	"dfn",		g_ignore,	g_ignore,
-	"dir",		g_list,		g_listend,
-	"dl",		g_indent,	g_exdent,
-	"dt",		g_dt,		g_unexpected,
-	"em",		g_ignore,	g_ignore,
-	"font",		g_ignore,	g_ignore,
-	"form",		g_ignore,	g_ignore,
-	"h1",		g_h,		g_p,
-	"h2",		g_h,		g_p,
-	"h3",		g_h,		g_p,
-	"h4",		g_h,		g_p,
-	"h5",		g_h,		g_p,
-	"h6",		g_h,		g_p,
-	"head",		g_ignore,	g_ignore,
-	"hr",		g_hr,		g_unexpected,
-	"html",		g_ignore,	g_ignore,
-	"i",		g_fpush,	g_fpop,
-	"input",	g_ignore,	g_unexpected,
-	"img",		g_ignore,	g_unexpected,
-	"isindex",	g_ignore,	g_unexpected,
-	"kbd",		g_fpush,	g_fpop,
-	"key",		g_ignore,	g_ignore,
-	"li",		g_li,		g_unexpected,
-	"link",		g_ignore,	g_unexpected,
-	"listing",	g_ignore,	g_ignore,
-	"menu",		g_list,		g_listend,
-	"meta",		g_ignore,	g_unexpected,
-	"nextid",	g_ignore,	g_unexpected,
-	"ol",		g_list,		g_listend,
-	"option",	g_ignore,	g_unexpected,
-	"p",		g_p,		g_ignore,
-	"plaintext",	g_ignore,	g_unexpected,
-	"pre",		g_pre,		g_displayend,
-	"samp",		g_ignore,	g_ignore,
-	"select",	g_ignore,	g_ignore,
-	"strong",	g_ignore,	g_ignore,
-	"table",	g_table,	g_tableend,
-	"textarea",	g_ignore,	g_ignore,
-	"title",	g_title,	g_ignore,
-	"tt",		g_fpush,	g_fpop,
-	"u",		g_ignore,	g_ignore,
-	"ul",		g_list,		g_listend,
-	"var",		g_ignore,	g_ignore,
-	"xmp",		g_ignore,	g_ignore,
-	0,		0,	0,
+struct Text {	/* output state shared by parsetext and the tag handlers */
+	char	font;	/* current troff font letter (R, I or B); 0 until the first font tag */
+	int	pre;	/* nonzero inside <pre>: whitespace is passed through, not collapsed */
+	int	pos;	/* approximate column on the current output line; emit() starts a new line when nonzero */
+	int	space;	/* collapsed whitespace is pending before the next printed character */
+	int	output;	/* zero while text is discarded (set to 0 by ongarbage for head, style, script) */
 };
 
-typedef struct Entity Entity;
-struct Entity
-{
-	char *name;
-	Rune value;
-};
+void eatwhite(void);
+Tag *parsetext(Text *, Tag *);
+int parsetag(Tag *);
+int parseattr(Attr *);
 
-Entity pl_entity[]=
-{
-"#SPACE", L' ', "#RS",   L'\n', "#RE",   L'\r', "quot",   L'"',
-"AElig",  L'Æ', "Aacute", L'Á', "Acirc",  L'Â', "Agrave", L'À', "Aring",  L'Å',
-"Atilde", L'Ã', "Auml",   L'Ä', "Ccedil", L'Ç', "ETH",    L'Ð', "Eacute", L'É',
-"Ecirc",  L'Ê', "Egrave", L'È', "Euml",   L'Ë', "Iacute", L'Í', "Icirc",  L'Î',
-"Igrave", L'Ì', "Iuml",   L'Ï', "Ntilde", L'Ñ', "Oacute", L'Ó', "Ocirc",  L'Ô',
-"Ograve", L'Ò', "Oslash", L'Ø', "Otilde", L'Õ', "Ouml",   L'Ö', "THORN",  L'Þ',
-"Uacute", L'Ú', "Ucirc",  L'Û', "Ugrave", L'Ù', "Uuml",   L'Ü', "Yacute", L'Ý',
-"aacute", L'á', "acirc",  L'â', "aelig",  L'æ', "agrave", L'à', "amp",    L'&',
-"aring",  L'å', "atilde", L'ã', "auml",   L'ä', "ccedil", L'ç', "eacute", L'é',
-"ecirc",  L'ê', "egrave", L'è', "eth",    L'ð', "euml",   L'ë', "gt",     L'>',
-"iacute", L'í', "icirc",  L'î', "igrave", L'ì', "iuml",   L'ï', "lt",     L'<',
-"ntilde", L'ñ', "oacute", L'ó', "ocirc",  L'ô', "ograve", L'ò', "oslash", L'ø',
-"otilde", L'õ', "ouml",   L'ö', "szlig",  L'ß', "thorn",  L'þ', "uacute", L'ú',
-"ucirc",  L'û', "ugrave", L'ù', "uuml",   L'ü', "yacute", L'ý', "yuml",   L'ÿ',
-0
-};
+Biobuf in, out;
 
-int
-cistrcmp(char *a, char *b)
+void
+emit(Text *text, char *fmt, ...)
 {
-	int c, d;
+	va_list a;
 
-	for(;; a++, b++){
-		d = tolower(*a);
-		c = d - tolower(*b);
-		if(c)
-			break;
-		if(d == 0)
-			break;
+	if(text->pos > 0){
+		text->pos = 0;
+		Bputc(&out, '\n');
 	}
-	return c;
+	va_start(a, fmt);
+	Bvprint(&out, fmt, a);
+	va_end(a);
 }
 
-int
-readupto(char *buf, int n, char d, char notme)
-{
-	char *p;
-	int c;
-
-	buf[0] = 0;
-	for(p = buf;; p++){
-		c = Bgetc(&in);
-		if(c < 0){
-			*p = 0;
-			return -1;
-		}
-		if(c == notme){
-			Bungetc(&in);
-			return -1;
-		}
-		if(c == d){
-			*p = 0;
-			return 0;
-		}
-		*p = c;
-		if(p == buf + n){
-			*p = 0;
-			Bprint(&out, "<%s", buf);
-			return -1;
-		}
-	}
-}
-
 void
-dogoobie(void)
+restoreoutput(Text *text, Tag *tag)
 {
-	char *arg, *type;
-	Goobie *g;
-	char buf[1024];
-	int closing;
-
-	if(readupto(buf, sizeof(buf), '>', '<') < 0){
-		Bprint(&out, "<%s", buf);
-		return;
-	}
-	type = buf;
-	if(*type == '/'){
-		type++;
-		closing = 1;
-	} else
-		closing = 0;
-	arg = strchr(type, ' ');
-	if(arg == 0)
-		arg = strchr(type, '\r');
-	if(arg == 0)
-		arg = strchr(type, '\n');
-	if(arg)
-		*arg++ = 0;
-	for(g = gtab; g->name; g++)
-		if(cistrcmp(type, g->name) == 0){
-			if(closing){
-				if(g->ef){
-					(*g->ef)(g, arg);
-					return;
-				}
-			} else {
-				if(g->f){
-					(*g->f)(g, arg);
-					return;
-				}
-			}
-		}
-	if(closing)
-		type--;
-	if(arg)
-		Bprint(&out, "<%s %s>\n", type, arg);
-	else
-		Bprint(&out, "<%s>\n", type);
+	text->output = tag->restore;
 }
 
 void
-main(void)
+ongarbage(Text *text, Tag *tag)
 {
-	int c, pos;
-
-	Binit(&in, 0, OREAD);
-	Binit(&out, 1, OWRITE);
-
-	pos = 0;
-	for(;;){
-		c = Bgetc(&in);
-		if(c < 0)
-			return;
-		switch(c){
-		case '<':
-			dogoobie();
-			break;
-		case '&':
-			escape();
-			break;
-		case '\r':
-			pos = 0;
-			break;
-		case '\n':
-			if(quoting){
-				Bputc(&out, '"');
-				quoting = 0;
-			}
-			if(lastc != '\n')
-				Bputc(&out, '\n');
-			/* can't emit leading spaces in filled troff docs */
-			if (!inpre)
-				eatwhite();
-			lastc = c;
-			break;
-		default:
-			++pos;
-			if(!inpre && isascii(c) && isspace(c) && pos > 80){
-				Bputc(&out, '\n');
-				eatwhite();
-				pos = 0;
-			}else
-				Bputc(&out, c);
-			lastc = c;
-			break;
-		}
-	}
+	tag->restore = text->output;
+	tag->close = restoreoutput;
+	text->output = 0;
 }
 
 void
-escape(void)
+onp(Text *text, Tag *)
 {
-	int c;
-	Entity *e;
-	char buf[8];
-
-	if(readupto(buf, sizeof(buf), ';', '\n') < 0){
-		Bprint(&out, "&%s", buf);
-		return;
-	}
-	for(e = pl_entity; e->name; e++)
-		if(strcmp(buf, e->name) == 0){
-			Bprint(&out, "%C", e->value);
-			return;
-		}
-	if(*buf == '#'){
-		c = atoi(buf+1);
-		if(isascii(c) && isprint(c)){
-			Bputc(&out, c);
-			return;
-		}
-	}
-	Bprint(&out, "&%s;", buf);
+	emit(text, ".LP\n");
 }
 
-/*
- * whitespace is not significant to HTML, but newlines
- * and leading spaces are significant to troff.
- */
 void
-eatwhite(void)
+restorepre(Text *text, Tag *tag)
 {
-	int c;
-
-	for(;;){
-		c = Bgetc(&in);
-		if(c < 0)
-			break;
-		if(!isspace(c)){
-			Bungetc(&in);
-			break;
-		}
-	}
+	text->pre = tag->restore;
+	emit(text, ".DE\n");
 }
 
-/*
- *  print at start of line
- */
 void
-printsol(char *fmt, ...)
+onpre(Text *text, Tag *tag)
 {
-	va_list arg;
-
-	if(quoting){
-		Bputc(&out, '"');
-		quoting = 0;
-	}
-	if(lastc != '\n')
-		Bputc(&out, '\n');
-	va_start(arg, fmt);
-	Bvprint(&out, fmt, arg);
-	va_end(arg);
-	lastc = '\n';
+	tag->restore = text->pre;
+	tag->close = restorepre;
+	text->pre = 1;
+	emit(text, ".DS L\n");
 }
 
 void
-g_ignore(Goobie *g, char *arg)
+onli(Text *text, Tag *tag)
 {
-	USED(g, arg);
+	if(tag->up && cistrcmp(tag->up->tag, "ol") == 0)
+		emit(text, ".IP\n");
+	else
+		emit(text, ".IP \\(bu\n");
+	if(tag->up)
+		tag->up->close = onp;
 }
 
 void
-g_unexpected(Goobie *g, char *arg)
+onh(Text *text, Tag *tag)
 {
-	USED(arg);
-	fprint(2, "unexpected %s ending\n", g->name);
+	emit(text, ".SH %c\n", tag->tag[1]);
+	tag->close = onp;
 }
 
 void
-g_title(Goobie *g, char *arg)
+onbr(Text *text, Tag *tag)
 {
-	USED(arg);
-	printsol(".TL\n", g->name);
+	tag->closing = 1;
+	emit(text, ".br\n");
+	if(cistrcmp(tag->tag, "hr") == 0)
+		emit(text, "\\l'5i'\n.br\n");
 }
 
 void
-g_p(Goobie *g, char *arg)
+restorefont(Text *text, Tag *tag)
 {
-	USED(arg);
-	printsol(".LP\n", g->name);
+	text->font = tag->restore;
+	text->pos += Bprint(&out, "\\f%c", text->font);
 }
 
 void
-g_h(Goobie *g, char *arg)
+onfont(Text *text, Tag *tag)
 {
-	USED(arg);
-	printsol(".SH %c\n", g->name[1]);
+	if(text->font == 0)
+		text->font = 'R';
+	tag->restore = text->font;
+	tag->close = restorefont;
+	if(cistrcmp(tag->tag, "i") == 0)
+		text->font = 'I';
+	else if(cistrcmp(tag->tag, "b") == 0)
+		text->font = 'B';
+	text->pos += Bprint(&out, "\\f%c", text->font);
 }
 
+struct {	/* handlers parsetext runs when a tag opens */
+	char	*tag;	/* tag name, compared with cistrcmp (case-insensitive) */
+	void	(*open)(Text *, Tag *);	/* may set tag->close to run when the tag ends */
+} ontag[] = {	/* scanned in order; the first matching name wins */
+	"br",		onbr,
+	"hr",		onbr,
+	"b",		onfont,
+	"i",		onfont,
+	"p",		onp,
+	"h1",		onh,
+	"h2",		onh,
+	"h3",		onh,
+	"h4",		onh,
+	"h5",		onh,	"h6",		onh,
+	"li",		onli,
+	"pre",		onpre,
+	"head",		ongarbage,
+	"style",	ongarbage,
+	"script",	ongarbage,
+};
+
 void
-g_list(Goobie *g, char *arg)
+eatwhite(void)
 {
-	USED(arg);
+	int c;
 
-	if(lsp != SSIZE){
-		switch(g->name[0]){
-		case 'o':
-			liststack[lsp].type  = Lordered;
-			liststack[lsp].ord = 0;
-			break;
-		default:
-			liststack[lsp].type = Lunordered;
-			break;
+	while((c = Bgetc(&in)) > 0){
+		if(strchr("\n\r\t ", c) == nil){
+			Bungetc(&in);
+			return;
 		}
 	}
-	lsp++;
 }
 
 void
-g_br(Goobie *g, char *arg)
+parsecomment(void)
 {
-	USED(g, arg);
-	printsol(".br\n");
-}
+	char buf[64];
+	int n, c;
 
-void
-g_li(Goobie *g, char *arg)
-{
-	USED(g, arg);
-	if(lsp <= 0 || lsp > SSIZE){
-		printsol(".IP \\(bu\n");
-		return;
+	n = 0;
+	eatwhite();
+	while((c = Bgetc(&in)) > 0){
+		if(c == '>')
+			return;
+		if(n == 0 && c == '-'){
+			while((c = Bgetc(&in)) > 0){
+				if(c == '-')
+					if(Bgetc(&in) == '-')
+						if(Bgetc(&in) == '>')
+							return;
+			}
+		}
+		if(n+1 < sizeof(buf)){
+			buf[n++] = c;
+			if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
+				continue;
+			while((c = Bgetc(&in)) > 0){
+				if(c == ']')
+					if(Bgetc(&in) == ']')
+						if(Bgetc(&in) == '>')
+							return;
+			}
+		}
 	}
-	switch(liststack[lsp-1].type){
-	case Lunordered:
-		printsol(".IP \\(bu\n");
-		break;
-	case Lordered:
-		printsol(".IP %d\n", ++liststack[lsp-1].ord);
-		break;
-	}
 }
 
-void
-g_listend(Goobie *g, char *arg)
+int
+parseattr(Attr *a)
 {
-	USED(g, arg);
-	if(--lsp < 0)
-		lsp = 0;
-	printsol(".LP\n");
-}
+	int q, c, n;
 
-void
-g_display(Goobie *g, char *arg)
-{
-	USED(g, arg);
-	printsol(".DS\n");
+	n = 0;
+	eatwhite();
+	while((c = Bgetc(&in)) > 0){
+		if(strchr("</>=?!", c)){
+			Bungetc(&in);
+			break;
+		}
+		if(strchr("\n\r\t ", c))
+			break;
+		if(n < sizeof(a->attr)-1)
+			a->attr[n++] = c;
+	}
+	if(n == 0)
+		return 0;
+	a->attr[n] = 0;
+	n = 0;
+	eatwhite();
+	if(Bgetc(&in) == '='){
+		eatwhite();
+		c = Bgetc(&in);
+		if(strchr("'\"", c)){
+			q = c;
+			while((c = Bgetc(&in)) > 0){
+				if(c == q)
+					break;
+				if(n < sizeof(a->val)-1)
+					a->val[n++] = c;
+			}
+		} else {
+			Bungetc(&in);
+			while((c = Bgetc(&in)) > 0){
+				if(strchr("\n\r\t </>?!", c)){
+					Bungetc(&in);
+					break;
+				}
+				if(n < sizeof(a->val)-1)
+					a->val[n++] = c;
+			}
+		}
+	} else
+		Bungetc(&in);
+	a->val[n] = 0;
+	return 1;
 }
 
-void
-g_pre(Goobie *g, char *arg)
+int
+parsetag(Tag *t)
 {
-	USED(g, arg);
-	printsol(".DS L\n");
-	inpre = 1;
-}
+	int n, c;
 
-void
-g_displayend(Goobie *g, char *arg)
-{
-	USED(g, arg);
-	printsol(".DE\n");
-	inpre = 0;
-}
+	t->nattr = 0;
+	t->opening = 1;
+	t->closing = 0;
 
-void
-g_fpush(Goobie *g, char *arg)
-{
-	USED(arg);
-	if(fsp < SSIZE)
-		fontstack[fsp] = font;
-	fsp++;
-	switch(g->name[0]){
-	case 'b':
-		font = "B";
-		break;
-	case 'i':
-		font = "I";
-		break;
-	case 'k':		/* kbd */
-	case 't':		/* tt */
-		font = "(CW";
-		break;
+	n = 0;
+	eatwhite();
+	while((c = Bgetc(&in)) > 0){
+		if(c == '>')
+			break;
+		if(strchr("\n\r\t ", c)){
+			if(parseattr(t->attr + t->nattr))
+				if(t->nattr < nelem(t->attr)-1)
+					t->nattr++;
+			continue;
+		}
+		if(n == 0 && strchr("?!", c)){
+			parsecomment();
+			return 0;
+		}
+		if(c == '/'){
+			if(n == 0){
+				t->opening = 0;
+				t->closing = 1;
+			} else
+				t->closing = 1;
+			continue;
+		}
+		if(n < sizeof(t->tag)-1)
+			t->tag[n++] = c;
 	}
-	Bprint(&out, "\\f%s", font);
+	t->tag[n] = 0;
+	return n > 0;
 }
 
-void
-g_fpop(Goobie *g, char *arg)
+struct {	/* named character references understood by parserune */
+	char	*entity;	/* name without the leading '&' and trailing ';' */
+	Rune	rune;	/* code point the name expands to */
+} entities[] = {	/* keep sorted in strcmp order: parserune stops at the first larger name */
+	"AElig", 198,	"Aacute", 193,	"Acirc", 194,	"Agrave", 192,	
+	"Alpha", 913,	"Aring", 197,	"Atilde", 195,	"Auml", 196,	
+	"Beta", 914,	"Ccedil", 199,	"Chi", 935,	"Dagger", 8225,	
+	"Delta", 916,	"ETH", 208,	"Eacute", 201,	"Ecirc", 202,	
+	"Egrave", 200,	"Epsilon", 917,	"Eta", 919,	"Euml", 203,	
+	"Gamma", 915,	"Iacute", 205,	"Icirc", 206,	"Igrave", 204,	
+	"Iota", 921,	"Iuml", 207,	"Kappa", 922,	"Lambda", 923,	
+	"Mu", 924,	"Ntilde", 209,	"Nu", 925,	"OElig", 338,	
+	"Oacute", 211,	"Ocirc", 212,	"Ograve", 210,	"Omega", 937,	
+	"Omicron", 927,	"Oslash", 216,	"Otilde", 213,	"Ouml", 214,	
+	"Phi", 934,	"Pi", 928,	"Prime", 8243,	"Psi", 936,	
+	"Rho", 929,	"Scaron", 352,	"Sigma", 931,	"THORN", 222,	
+	"Tau", 932,	"Theta", 920,	"Uacute", 218,	"Ucirc", 219,	
+	"Ugrave", 217,	"Upsilon", 933,	"Uuml", 220,	"Xi", 926,	
+	"Yacute", 221,	"Yuml", 376,	"Zeta", 918,	"aacute", 225,	
+	"acirc", 226,	"acute", 180,	"aelig", 230,	"agrave", 224,	
+	"alefsym", 8501,"alpha", 945,	"amp", 38,	"and", 8743,	
+	"ang", 8736,	"aring", 229,	"asymp", 8776,	"atilde", 227,	
+	"auml", 228,	"bdquo", 8222,	"beta", 946,	"brvbar", 166,	
+	"bull", 8226,	"cap", 8745,	"ccedil", 231,	"cdots", 8943,	
+	"cedil", 184,	"cent", 162,	"chi", 967,	"circ", 710,	
+	"clubs", 9827,	"cong", 8773,	"copy", 169,	"crarr", 8629,	
+	"cup", 8746,	"curren", 164,	"dArr", 8659,	"dagger", 8224,	
+	"darr", 8595,	"ddots", 8945,	"deg", 176,	"delta", 948,	
+	"diams", 9830,	"divide", 247,	"eacute", 233,	"ecirc", 234,	
+	"egrave", 232,	"emdash", 8212,	"empty", 8709,	"emsp", 8195,	
+	"endash", 8211,	"ensp", 8194,	"epsilon", 949,	"equiv", 8801,	
+	"eta", 951,	"eth", 240,	"euml", 235,	"euro", 8364,	
+	"exist", 8707,	"fnof", 402,	"forall", 8704,	"frac12", 189,	
+	"frac14", 188,	"frac34", 190,	"frasl", 8260,	"gamma", 947,	
+	"ge", 8805,	"gt", 62,	"hArr", 8660,	"harr", 8596,	
+	"hearts", 9829,	"hellip", 8230,	"iacute", 237,	"icirc", 238,	
+	"iexcl", 161,	"igrave", 236,	"image", 8465,	"infin", 8734,	
+	"int", 8747,	"iota", 953,	"iquest", 191,	"isin", 8712,	
+	"iuml", 239,	"kappa", 954,	"lArr", 8656,	"lambda", 955,	
+	"lang", 9001,	"laquo", 171,	"larr", 8592,	"lceil", 8968,	
+	"ldots", 8230,	"ldquo", 8220,	"le", 8804,	"lfloor", 8970,	
+	"lowast", 8727,	"loz", 9674,	"lrm", 8206,	"lsaquo", 8249,	
+	"lsquo", 8216,	"lt", 60,	"macr", 175,	"mdash", 8212,	
+	"micro", 181,	"middot", 183,	"minus", 8722,	"mu", 956,	
+	"nabla", 8711,	"nbsp", 160,	"ndash", 8211,	"ne", 8800,	
+	"ni", 8715,	"not", 172,	"notin", 8713,	"nsub", 8836,	
+	"ntilde", 241,	"nu", 957,	"oacute", 243,	"ocirc", 244,	
+	"oelig", 339,	"ograve", 242,	"oline", 8254,	"omega", 969,	
+	"omicron", 959,	"oplus", 8853,	"or", 8744,	"ordf", 170,	
+	"ordm", 186,	"oslash", 248,	"otilde", 245,	"otimes", 8855,	
+	"ouml", 246,	"para", 182,	"part", 8706,	"permil", 8240,	
+	"perp", 8869,	"phi", 966,	"pi", 960,	"piv", 982,	
+	"plusmn", 177,	"pound", 163,	"prime", 8242,	"prod", 8719,	
+	"prop", 8733,	"psi", 968,	"quad", 8193,	"quot", 34,	
+	"rArr", 8658,	"radic", 8730,	"rang", 9002,	"raquo", 187,	
+	"rarr", 8594,	"rceil", 8969,	"rdquo", 8221,	"real", 8476,	
+	"reg", 174,	"rfloor", 8971,	"rho", 961,	"rlm", 8207,	
+	"rsaquo", 8250,	"rsquo", 8217,	"sbquo", 8218,	"scaron", 353,	
+	"sdot", 8901,	"sect", 167,	"shy", 173,	"sigma", 963,	
+	"sigmaf", 962,	"sim", 8764,	"sp", 8194,	"spades", 9824,	
+	"sub", 8834,	"sube", 8838,	"sum", 8721,	"sup", 8835,	
+	"sup1", 185,	"sup2", 178,	"sup3", 179,	"supe", 8839,	
+	"szlig", 223,	"tau", 964,	"there4", 8756,	"theta", 952,	
+	"thetasym", 977,"thinsp", 8201,	"thorn", 254,	"tilde", 732,	
+	"times", 215,	"trade", 8482,	"uArr", 8657,	"uacute", 250,	
+	"uarr", 8593,	"ucirc", 251,	"ugrave", 249,	"uml", 168,	
+	"upsih", 978,	"upsilon", 965,	"uuml", 252,	"varepsilon", 1013,	
+	"varphi", 981,	"varpi", 982,	"varrho", 1009,	"vdots", 8942,	
+	"vsigma", 962,	"vtheta", 977,	"weierp", 8472,	"xi", 958,	
+	"yacute", 253,	"yen", 165,	"yuml", 255,	"zeta", 950,	
+	"zwj", 8205,	"zwnj", 8204,
+};
+
+Rune
+parserune(int c)
 {
-	USED(g, arg);
-	fsp--;
-	if(fsp < SSIZE)
-		font = fontstack[fsp];
-	else
-		font = "R";
+	char buf[10];
+	int i, n;
+	Rune r;
 
-	Bprint(&out, "\\f%s", font);
+	n = 0;
+	if(c == '&'){
+		while((c = Bgetc(&in)) > 0){
+			if(strchr("\n\r\t ;</>", c)){
+				if(c != ';')
+					Bungetc(&in);
+				if(n == 0)
+					return '&';
+				break;
+			}
+			if(n == sizeof(buf)-1)
+				break;
+			buf[n++] = c;
+		}
+		buf[n] = 0;
+		if(buf[0] == '#')
+			return atoi(buf+1);
+		for(i=0; i<nelem(entities); i++){
+			n = strcmp(buf, entities[i].entity);
+			if(n == 0)
+				return entities[i].rune;
+			if(n < 0)
+				break;
+		}
+	} else {
+		do {
+			buf[n++] = c;
+			if(fullrune(buf, n)){
+				chartorune(&r, buf);
+				return r;
+			}
+			if(n >= UTFmax)
+				break;
+		} while((c = Bgetc(&in)) > 0);
+	}
+	return 0xFFFD;
 }
 
-void
-g_indent(Goobie *g, char *arg)
+Rune
+substrune(Rune r)
 {
-	USED(g, arg);
-	printsol(".RS\n");
+	switch(r){
+	case 0x2019:
+	case 0x2018:
+		return '\'';
+	case 0x201c:
+	case 0x201d:
+		return '"';
+	default:
+		return r;
+	}
 }
 
 void
-g_exdent(Goobie *g, char *arg)
+debugtag(Tag *tag, char *dbg)
 {
-	USED(g, arg);
-	printsol(".RE\n");
-}
+	if(1) return;
 
-void
-g_dt(Goobie *g, char *arg)
-{
-	USED(g, arg);
-	printsol(".IP \"");
-	quoting = 1;
+	if(tag == nil)
+		return;
+	debugtag(tag->up, nil);
+	fprint(2, "%s %s%s", tag->tag, dbg ? dbg : " > ", dbg ? "\n" : "");
 }
 
-void
-g_hr(Goobie *g, char *arg)
+
+Tag*
+parsetext(Text *text, Tag *tag)
 {
-	USED(g, arg);
-	printsol(".br\n");
-	printsol("\\l'5i'\n");
-}
+	Tag *rtag;
+	Rune r;
+	int c;
 
+	rtag = tag;
+	debugtag(tag, "open");
+	if(tag == nil || tag->closing == 0){
+		while((c = Bgetc(&in)) > 0){
+			if(c == '<'){
+				Tag t;
 
-/*
-<table border>
-<caption><font size="+1"><b>Cumulative Class Data</b></font></caption>
-<tr><th rowspan=2>DOSE<br>mg/kg</th><th colspan=2>PARALYSIS</th><th colspan=2>DEATH</th>
-</tr>
-<tr><th width=80>Number</th><th width=80>Percent</th><th width=80>Number</th><th width=80>Percent</th>
-</tr>
-<tr align=center>
-<td>0.1</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.2</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.3</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.4</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.5</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.6</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.7</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.8</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.8 oral</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-</table>
-*/
-
-void
-g_table(Goobie *g, char *arg)
-{
-	USED(g, arg);
-	printsol(".TS\ncenter ;\n");
+				memset(&t, 0, sizeof(t));
+				if(parsetag(&t)){
+					if(t.opening){
+						t.up = tag;
+						for(c = 0; c < nelem(ontag); c++){
+							if(cistrcmp(t.tag, ontag[c].tag) == 0){
+								ontag[c].open(text, &t);
+								break;
+							}
+						}
+						rtag = parsetext(text, &t);
+						if(rtag == &t)
+							rtag = tag;
+						else
+							break;
+					} else if(t.closing){
+						while(rtag && cistrcmp(rtag->tag, t.tag))
+							rtag = rtag->up;
+						if(rtag == nil)
+							rtag = tag;
+						else
+							break;
+					}
+				}
+				continue;
+			}
+			if(!text->output)
+				continue;
+			r = substrune(parserune(c));
+			switch(r){
+			case '\n':
+			case '\r':
+			case ' ':
+			case '\t':
+				if(text->pre == 0){
+					text->space = 1;
+					continue;
+				}
+			default:
+				if(r == '\n' || r == '\r')
+					text->pos = 0;
+				if(text->space){
+					text->space = 0;
+					if(text->pos >= 70){
+						text->pos = 0;
+						Bputc(&out, '\n');
+					} else if(text->pos > 0){
+						text->pos++;
+						Bputc(&out, ' ');
+					}
+				}
+				if(text->pos == 0 && r == '.'){
+					text->pos++;
+					Bputc(&out, ' ');
+				}
+				text->pos++;
+				if(r == 0xA0){
+					r = ' ';
+					Bputc(&out, '\\');
+				}
+				Bprint(&out, "%C", r);
+			}
+		}
+	}
+	debugtag(tag, "close");
+	if(tag && tag->close)
+		tag->close(text, tag);
+	return rtag;
 }
 
 void
-g_tableend(Goobie *g, char *arg)
+main(void)
 {
-	USED(g, arg);
-	printsol(".TE\n");
-}
+	Text text;
 
-void
-g_caption(Goobie *g, char *arg)
-{
-	USED(g, arg);
-}
+	Binit(&in, 0, OREAD);
+	Binit(&out, 1, OWRITE);
 
-void
-g_captionend(Goobie *g, char *arg)
-{
-	USED(g, arg);
+	memset(&text, 0, sizeof(text));
+	text.output = 1;
+	parsetext(&text, nil);
+	emit(&text, "\n");
 }
--- a/sys/src/cmd/page.c
+++ b/sys/src/cmd/page.c
@@ -626,7 +626,7 @@
 	else if(cistrncmp(buf, "<?xml", 5) == 0 ||
 		cistrncmp(buf, "<!DOCTYPE", 9) == 0 ||
 		cistrncmp(buf, "<HTML", 5) == 0){
-		p->data = "htmlfmt -c utf8 | lp -dstdout";
+		p->data = "html2ms | troff -ms | lp -dstdout";
 		p->open = popengs;
 	}
 	else if(memcmp(buf, "\xF7\x02\x01\x83\x92\xC0\x1C;", 8) == 0){