ref: 6152c8d41d889d28175dc23566920e05f6a3e179
parent: 35fcb73f86518e00fb002c98d0d0a39eef9209fa
author: cinap_lenrek <cinap_lenrek@centraldogma>
date: Sun Sep 18 16:35:57 EDT 2011
html2ms: rewrite from scratch
--- a/sys/src/cmd/html2ms.c
+++ b/sys/src/cmd/html2ms.c
@@ -3,599 +3,538 @@
#include <ctype.h>
#include <bio.h>
-enum
-{
- SSIZE = 10,
+typedef struct Tag Tag;
+typedef struct Attr Attr;
+typedef struct Text Text;
- /* list types */
- Lordered = 0,
- Lunordered,
- Lmenu,
- Ldir,
-
+/*
+ * One attribute of a tag. parseattr stores the name and the value
+ * as NUL-terminated strings, truncating whatever does not fit.
+ */
+struct Attr {
+ /* attribute name */
+ char attr[64];
+ /* attribute value; empty when the attribute has no "=value" part */
+ char val[256-64];
};
-Biobuf in, out;
-int lastc = '\n';
-int inpre = 0;
+/*
+ * A parsed tag. Tags live on the stack of parsetext, one per
+ * nesting level, and are chained to their parent through up.
+ */
+struct Tag {
+ /* enclosing tag, nil at the top level */
+ Tag *up;
+ /* tag name as it appeared in the input (case is not folded) */
+ char tag[32];
+ /* attributes collected by parsetag; nattr of them are valid */
+ Attr attr[16];
+ int nattr;
+ /* set by parsetag: 0 only for tags written as </name> */
+ int opening;
+ /* nonzero for </name> and for <name/>; onbr also sets it so br/hr get no content */
+ int closing;
-/* stack for fonts */
-char *fontstack[SSIZE];
-char *font = "R";
-int fsp;
-
-/* stack for lists */
-struct
-{
- int type;
- int ord;
-} liststack[SSIZE];
-int lsp;
-
-int quoting;
-
-typedef struct Goobie Goobie;
-struct Goobie
-{
- char *name;
- void (*f)(Goobie*, char*);
- void (*ef)(Goobie*, char*);
+ /* optional handler called by parsetext when it is done with this tag */
+ void (*close)(Text *, Tag *);
+ /* per-tag scratch for the handlers; restore holds the state to put back in close */
+ union {
+ void *aux;
+ int restore;
+ };
};
-void eatwhite(void);
-void escape(void);
-
-typedef void Action(Goobie*, char*);
-
-Action g_ignore;
-Action g_unexpected;
-Action g_title;
-Action g_p;
-Action g_h;
-Action g_li;
-Action g_list, g_listend;
-Action g_pre;
-Action g_fpush, g_fpop;
-Action g_indent, g_exdent;
-Action g_dt;
-Action g_display;
-Action g_displayend;
-Action g_table, g_tableend, g_caption, g_captionend;
-Action g_br, g_hr;
-
-Goobie gtab[] =
-{
- "!--", g_ignore, g_unexpected,
- "!doctype", g_ignore, g_unexpected,
- "a", g_ignore, g_ignore,
- "address", g_display, g_displayend,
- "b", g_fpush, g_fpop,
- "base", g_ignore, g_unexpected,
- "blink", g_ignore, g_ignore,
- "blockquote", g_ignore, g_ignore,
- "body", g_ignore, g_ignore,
- "br", g_br, g_unexpected,
- "caption", g_caption, g_captionend,
- "center", g_ignore, g_ignore,
- "cite", g_ignore, g_ignore,
- "code", g_ignore, g_ignore,
- "dd", g_ignore, g_unexpected,
- "dfn", g_ignore, g_ignore,
- "dir", g_list, g_listend,
- "dl", g_indent, g_exdent,
- "dt", g_dt, g_unexpected,
- "em", g_ignore, g_ignore,
- "font", g_ignore, g_ignore,
- "form", g_ignore, g_ignore,
- "h1", g_h, g_p,
- "h2", g_h, g_p,
- "h3", g_h, g_p,
- "h4", g_h, g_p,
- "h5", g_h, g_p,
- "h6", g_h, g_p,
- "head", g_ignore, g_ignore,
- "hr", g_hr, g_unexpected,
- "html", g_ignore, g_ignore,
- "i", g_fpush, g_fpop,
- "input", g_ignore, g_unexpected,
- "img", g_ignore, g_unexpected,
- "isindex", g_ignore, g_unexpected,
- "kbd", g_fpush, g_fpop,
- "key", g_ignore, g_ignore,
- "li", g_li, g_unexpected,
- "link", g_ignore, g_unexpected,
- "listing", g_ignore, g_ignore,
- "menu", g_list, g_listend,
- "meta", g_ignore, g_unexpected,
- "nextid", g_ignore, g_unexpected,
- "ol", g_list, g_listend,
- "option", g_ignore, g_unexpected,
- "p", g_p, g_ignore,
- "plaintext", g_ignore, g_unexpected,
- "pre", g_pre, g_displayend,
- "samp", g_ignore, g_ignore,
- "select", g_ignore, g_ignore,
- "strong", g_ignore, g_ignore,
- "table", g_table, g_tableend,
- "textarea", g_ignore, g_ignore,
- "title", g_title, g_ignore,
- "tt", g_fpush, g_fpop,
- "u", g_ignore, g_ignore,
- "ul", g_list, g_listend,
- "var", g_ignore, g_ignore,
- "xmp", g_ignore, g_ignore,
- 0, 0, 0,
+/*
+ * Output state shared by all nesting levels of parsetext.
+ */
+struct Text {
+ /* current troff font letter; 0 until onfont first initialises it to 'R' */
+ char font;
+ /* nonzero inside <pre>: whitespace is copied instead of collapsed */
+ int pre;
+ /* characters written on the current output line; 0 means at start of line */
+ int pos;
+ /* nonzero when collapsed whitespace is pending before the next character */
+ int space;
+ /* zero while text is being discarded (see ongarbage) */
+ int output;
};
-typedef struct Entity Entity;
-struct Entity
-{
- char *name;
- Rune value;
-};
+void eatwhite(void);
+Tag *parsetext(Text *, Tag *);
+int parsetag(Tag *);
+int parseattr(Attr *);
-Entity pl_entity[]=
-{
-"#SPACE", L' ', "#RS", L'\n', "#RE", L'\r', "quot", L'"',
-"AElig", L'Æ', "Aacute", L'Á', "Acirc", L'Â', "Agrave", L'À', "Aring", L'Å',
-"Atilde", L'Ã', "Auml", L'Ä', "Ccedil", L'Ç', "ETH", L'Ð', "Eacute", L'É',
-"Ecirc", L'Ê', "Egrave", L'È', "Euml", L'Ë', "Iacute", L'Í', "Icirc", L'Î',
-"Igrave", L'Ì', "Iuml", L'Ï', "Ntilde", L'Ñ', "Oacute", L'Ó', "Ocirc", L'Ô',
-"Ograve", L'Ò', "Oslash", L'Ø', "Otilde", L'Õ', "Ouml", L'Ö', "THORN", L'Þ',
-"Uacute", L'Ú', "Ucirc", L'Û', "Ugrave", L'Ù', "Uuml", L'Ü', "Yacute", L'Ý',
-"aacute", L'á', "acirc", L'â', "aelig", L'æ', "agrave", L'à', "amp", L'&',
-"aring", L'å', "atilde", L'ã', "auml", L'ä', "ccedil", L'ç', "eacute", L'é',
-"ecirc", L'ê', "egrave", L'è', "eth", L'ð', "euml", L'ë', "gt", L'>',
-"iacute", L'í', "icirc", L'î', "igrave", L'ì', "iuml", L'ï', "lt", L'<',
-"ntilde", L'ñ', "oacute", L'ó', "ocirc", L'ô', "ograve", L'ò', "oslash", L'ø',
-"otilde", L'õ', "ouml", L'ö', "szlig", L'ß', "thorn", L'þ', "uacute", L'ú',
-"ucirc", L'û', "ugrave", L'ù', "uuml", L'ü', "yacute", L'ý', "yuml", L'ÿ',
-0
-};
+Biobuf in, out;
-int
-cistrcmp(char *a, char *b)
+/*
+ * Print a formatted line (normally a troff request) to the output.
+ * If a text line is in progress it is ended first, so that what is
+ * printed here starts at the beginning of a line.
+ */
+void
+emit(Text *text, char *fmt, ...)
{
- int c, d;
+	va_list args;
- for(;; a++, b++){
- d = tolower(*a);
- c = d - tolower(*b);
- if(c)
- break;
- if(d == 0)
- break;
+	if(text->pos > 0){
+		Bputc(&out, '\n');
+		text->pos = 0;
}
- return c;
+	va_start(args, fmt);
+	Bvprint(&out, fmt, args);
+	va_end(args);
}
-int
-readupto(char *buf, int n, char d, char notme)
-{
- char *p;
- int c;
-
- buf[0] = 0;
- for(p = buf;; p++){
- c = Bgetc(&in);
- if(c < 0){
- *p = 0;
- return -1;
- }
- if(c == notme){
- Bungetc(&in);
- return -1;
- }
- if(c == d){
- *p = 0;
- return 0;
- }
- *p = c;
- if(p == buf + n){
- *p = 0;
- Bprint(&out, "<%s", buf);
- return -1;
- }
- }
-}
-
+/*
+ * Close handler installed by ongarbage: put back the value of
+ * text->output that was saved in tag->restore.
+ */
void
-dogoobie(void)
+restoreoutput(Text *text, Tag *tag)
{
- char *arg, *type;
- Goobie *g;
- char buf[1024];
- int closing;
-
- if(readupto(buf, sizeof(buf), '>', '<') < 0){
- Bprint(&out, "<%s", buf);
- return;
- }
- type = buf;
- if(*type == '/'){
- type++;
- closing = 1;
- } else
- closing = 0;
- arg = strchr(type, ' ');
- if(arg == 0)
- arg = strchr(type, '\r');
- if(arg == 0)
- arg = strchr(type, '\n');
- if(arg)
- *arg++ = 0;
- for(g = gtab; g->name; g++)
- if(cistrcmp(type, g->name) == 0){
- if(closing){
- if(g->ef){
- (*g->ef)(g, arg);
- return;
- }
- } else {
- if(g->f){
- (*g->f)(g, arg);
- return;
- }
- }
- }
- if(closing)
- type--;
- if(arg)
- Bprint(&out, "<%s %s>\n", type, arg);
- else
- Bprint(&out, "<%s>\n", type);
+ text->output = tag->restore;
}
+/*
+ * Open handler for head, style and script: remember the current
+ * text->output in the tag, arrange for restoreoutput to put it back
+ * when the tag is closed, and switch text output off meanwhile.
+ */
void
-main(void)
+ongarbage(Text *text, Tag *tag)
{
- int c, pos;
-
- Binit(&in, 0, OREAD);
- Binit(&out, 1, OWRITE);
-
- pos = 0;
- for(;;){
- c = Bgetc(&in);
- if(c < 0)
- return;
- switch(c){
- case '<':
- dogoobie();
- break;
- case '&':
- escape();
- break;
- case '\r':
- pos = 0;
- break;
- case '\n':
- if(quoting){
- Bputc(&out, '"');
- quoting = 0;
- }
- if(lastc != '\n')
- Bputc(&out, '\n');
- /* can't emit leading spaces in filled troff docs */
- if (!inpre)
- eatwhite();
- lastc = c;
- break;
- default:
- ++pos;
- if(!inpre && isascii(c) && isspace(c) && pos > 80){
- Bputc(&out, '\n');
- eatwhite();
- pos = 0;
- }else
- Bputc(&out, c);
- lastc = c;
- break;
- }
- }
+ tag->restore = text->output;
+ tag->close = restoreoutput;
+ text->output = 0;
}
+/*
+ * Emit a .LP request. Used as the open handler for <p> and as the
+ * close handler that onh and onli install; the tag is not looked at.
+ */
void
-escape(void)
+onp(Text *text, Tag *)
{
- int c;
- Entity *e;
- char buf[8];
-
- if(readupto(buf, sizeof(buf), ';', '\n') < 0){
- Bprint(&out, "&%s", buf);
- return;
- }
- for(e = pl_entity; e->name; e++)
- if(strcmp(buf, e->name) == 0){
- Bprint(&out, "%C", e->value);
- return;
- }
- if(*buf == '#'){
- c = atoi(buf+1);
- if(isascii(c) && isprint(c)){
- Bputc(&out, c);
- return;
- }
- }
- Bprint(&out, "&%s;", buf);
+ emit(text, ".LP\n");
}
-/*
- * whitespace is not significant to HTML, but newlines
- * and leading spaces are significant to troff.
- */
+/*
+ * Close handler installed by onpre: restore the saved text->pre
+ * and end the display with .DE.
+ */
void
-eatwhite(void)
+restorepre(Text *text, Tag *tag)
{
- int c;
-
- for(;;){
- c = Bgetc(&in);
- if(c < 0)
- break;
- if(!isspace(c)){
- Bungetc(&in);
- break;
- }
- }
+ text->pre = tag->restore;
+ emit(text, ".DE\n");
}
-/*
- * print at start of line
- */
+/*
+ * Open handler for <pre>: save text->pre in the tag, register
+ * restorepre as its close handler, turn preformatted mode on and
+ * start a display with ".DS L".
+ */
void
-printsol(char *fmt, ...)
+onpre(Text *text, Tag *tag)
{
- va_list arg;
-
- if(quoting){
- Bputc(&out, '"');
- quoting = 0;
- }
- if(lastc != '\n')
- Bputc(&out, '\n');
- va_start(arg, fmt);
- Bvprint(&out, fmt, arg);
- va_end(arg);
- lastc = '\n';
+ tag->restore = text->pre;
+ tag->close = restorepre;
+ text->pre = 1;
+ emit(text, ".DS L\n");
}
+/*
+ * Open handler for <li>: start an indented paragraph, with a bullet
+ * unless the enclosing tag is <ol>. The enclosing tag, if there is
+ * one, gets onp as its close handler so that a .LP follows the list.
+ */
void
-g_ignore(Goobie *g, char *arg)
+onli(Text *text, Tag *tag)
{
- USED(g, arg);
+	Tag *list;
+
+	list = tag->up;
+	if(list == nil){
+		emit(text, ".IP \\(bu\n");
+		return;
+	}
+	if(cistrcmp(list->tag, "ol") == 0)
+		emit(text, ".IP\n");
+	else
+		emit(text, ".IP \\(bu\n");
+	list->close = onp;
}
+/*
+ * Open handler for the heading tags: emit .SH followed by the second
+ * character of the tag name (the digit of h1..h5), and install onp
+ * as close handler so that a .LP follows the heading.
+ */
void
-g_unexpected(Goobie *g, char *arg)
+onh(Text *text, Tag *tag)
{
- USED(arg);
- fprint(2, "unexpected %s ending\n", g->name);
+ emit(text, ".SH %c\n", tag->tag[1]);
+ tag->close = onp;
}
+/*
+ * Open handler for <br> and <hr>: force a line break, and for hr
+ * also draw a 5 inch horizontal line followed by another break.
+ */
void
-g_title(Goobie *g, char *arg)
+onbr(Text *text, Tag *tag)
{
- USED(arg);
- printsol(".TL\n", g->name);
+ /* these tags have no content: marking the tag closing makes parsetext skip reading text for it */
+ tag->closing = 1;
+ emit(text, ".br\n");
+ if(cistrcmp(tag->tag, "hr") == 0)
+ emit(text, "\\l'5i'\n.br\n");
}
+/*
+ * Close handler installed by onfont: switch back to the font saved
+ * in the tag and write the corresponding \f escape inline.
+ */
void
-g_p(Goobie *g, char *arg)
+restorefont(Text *text, Tag *tag)
{
- USED(arg);
- printsol(".LP\n", g->name);
+ text->font = tag->restore;
+ /* the escape is written into the current line, so its length is added to pos */
+ text->pos += Bprint(&out, "\\f%c", text->font);
}
+/*
+ * Open handler for <b> and <i>: save the current font in the tag,
+ * register restorefont as close handler, select font B or I and
+ * write the \f escape inline.
+ */
void
-g_h(Goobie *g, char *arg)
+onfont(Text *text, Tag *tag)
{
- USED(arg);
- printsol(".SH %c\n", g->name[1]);
+ /* font is 0 until the first font change; treat that as roman */
+ if(text->font == 0)
+ text->font = 'R';
+ tag->restore = text->font;
+ tag->close = restorefont;
+ if(cistrcmp(tag->tag, "i") == 0)
+ text->font = 'I';
+ else if(cistrcmp(tag->tag, "b") == 0)
+ text->font = 'B';
+ /* the escape is written into the current line, so its length is added to pos */
+ text->pos += Bprint(&out, "\\f%c", text->font);
}
+/*
+ * Handlers for opening tags. parsetext compares the name of every
+ * opening tag case-insensitively with the entries below and calls
+ * the first one that matches; tags that are not listed get no
+ * handler and only their text content is processed.
+ *
+ * h6 is listed like the other heading levels; onh passes the digit
+ * of the tag name on to .SH.
+ */
+struct {
+	char *tag;
+	void (*open)(Text *, Tag *);
+} ontag[] = {
+	"br", onbr,
+	"hr", onbr,
+	"b", onfont,
+	"i", onfont,
+	"p", onp,
+	"h1", onh,
+	"h2", onh,
+	"h3", onh,
+	"h4", onh,
+	"h5", onh,
+	"h6", onh,
+	"li", onli,
+	"pre", onpre,
+	"head", ongarbage,
+	"style", ongarbage,
+	"script", ongarbage,
+};
+
+/*
+ * Skip newlines, carriage returns, tabs and blanks on the input.
+ * The first other character is pushed back; the function also
+ * returns when Bgetc yields a value that is not positive.
+ */
void
-g_list(Goobie *g, char *arg)
+eatwhite(void)
{
- USED(arg);
+	int ch;
- if(lsp != SSIZE){
- switch(g->name[0]){
- case 'o':
- liststack[lsp].type = Lordered;
- liststack[lsp].ord = 0;
- break;
- default:
- liststack[lsp].type = Lunordered;
- break;
+	for(;;){
+		ch = Bgetc(&in);
+		if(ch <= 0)
+			return;
+		if(strchr("\n\r\t ", ch) == nil){
+			Bungetc(&in);
+			return;
}
}
- lsp++;
}
+/*
+ * Skip a markup declaration. Called by parsetag after it has read
+ * "<!" or "<?". Three forms are recognised:
+ *	<!-- ... -->		skipped up to the closing "-->"
+ *	<![CDATA[ ... ]]>	skipped up to the closing "]]>"
+ *	anything else		skipped up to the next '>'
+ * Input that ends inside the construct simply ends the skip.
+ *
+ * The terminators are found by counting the run of '-' (or ']')
+ * that precedes a '>', so that endings with extra characters such
+ * as "--->" or "]]]>" are recognised too; matching the three
+ * characters one after another would miss them and swallow the
+ * document up to the next terminator.
+ */
void
-g_br(Goobie *g, char *arg)
+parsecomment(void)
{
- USED(g, arg);
- printsol(".br\n");
-}
+	char buf[64];
+	int n, c, run;
-void
-g_li(Goobie *g, char *arg)
-{
- USED(g, arg);
- if(lsp <= 0 || lsp > SSIZE){
- printsol(".IP \\(bu\n");
- return;
+	n = 0;
+	eatwhite();
+	while((c = Bgetc(&in)) > 0){
+		if(c == '>')
+			return;
+		if(n == 0 && c == '-'){
+			/* comment: run counts the dashes seen just before c */
+			run = 0;
+			while((c = Bgetc(&in)) > 0){
+				if(c == '>' && run >= 2)
+					return;
+				run = (c == '-') ? run+1 : 0;
+			}
+			return;
+		}
+		if(n+1 < sizeof(buf)){
+			buf[n++] = c;
+			if(n != 7 || cistrncmp(buf, "[CDATA[", 7))
+				continue;
+			/* CDATA section: run counts the ']' seen just before c */
+			run = 0;
+			while((c = Bgetc(&in)) > 0){
+				if(c == '>' && run >= 2)
+					return;
+				run = (c == ']') ? run+1 : 0;
+			}
+			return;
+		}
}
- switch(liststack[lsp-1].type){
- case Lunordered:
- printsol(".IP \\(bu\n");
- break;
- case Lordered:
- printsol(".IP %d\n", ++liststack[lsp-1].ord);
- break;
- }
}
-void
-g_listend(Goobie *g, char *arg)
+/*
+ * Read one attribute, "name", "name=value", "name='value'" or
+ * "name="value"", from the input into a. Returns 1 if an attribute
+ * was read and 0 if no name character was found, in which case a is
+ * left untouched. Name and value are truncated to the size of their
+ * buffers; the excess input is still consumed.
+ */
+int
+parseattr(Attr *a)
{
- USED(g, arg);
- if(--lsp < 0)
- lsp = 0;
- printsol(".LP\n");
-}
+ int q, c, n;
-void
-g_display(Goobie *g, char *arg)
-{
- USED(g, arg);
- printsol(".DS\n");
+ n = 0;
+ eatwhite();
+ /* name: ends at whitespace (consumed) or at one of </>=?! (pushed back) */
+ while((c = Bgetc(&in)) > 0){
+ if(strchr("</>=?!", c)){
+ Bungetc(&in);
+ break;
+ }
+ if(strchr("\n\r\t ", c))
+ break;
+ if(n < sizeof(a->attr)-1)
+ a->attr[n++] = c;
+ }
+ if(n == 0)
+ return 0;
+ a->attr[n] = 0;
+ n = 0;
+ eatwhite();
+ if(Bgetc(&in) == '='){
+ eatwhite();
+ c = Bgetc(&in);
+ if(strchr("'\"", c)){
+ /* quoted value: everything up to the matching quote, which is consumed */
+ q = c;
+ while((c = Bgetc(&in)) > 0){
+ if(c == q)
+ break;
+ if(n < sizeof(a->val)-1)
+ a->val[n++] = c;
+ }
+ } else {
+ /* unquoted value: ends at whitespace or one of </>?!, which is pushed back */
+ Bungetc(&in);
+ while((c = Bgetc(&in)) > 0){
+ if(strchr("\n\r\t </>?!", c)){
+ Bungetc(&in);
+ break;
+ }
+ if(n < sizeof(a->val)-1)
+ a->val[n++] = c;
+ }
+ }
+ } else
+ /* no value: give the character back and leave val empty */
+ Bungetc(&in);
+ a->val[n] = 0;
+ return 1;
}
-void
-g_pre(Goobie *g, char *arg)
+/*
+ * Parse the rest of a tag into t; parsetext calls this after it has
+ * read the '<'. Sets t->tag, t->attr/nattr, t->opening and
+ * t->closing. Returns nonzero if a tag name was read, and 0 for an
+ * empty name or for "<!...>" / "<?...>" constructs, which are
+ * skipped with parsecomment.
+ *
+ * NOTE(review): an unquoted attribute value containing '/' (such as
+ * href=a/b) is cut at the '/' by parseattr, which pushes it back;
+ * this loop then takes the '/' as a self-closing mark and appends
+ * the following characters to the tag name.
+ */
+int
+parsetag(Tag *t)
{
- USED(g, arg);
- printsol(".DS L\n");
- inpre = 1;
-}
+ int n, c;
-void
-g_displayend(Goobie *g, char *arg)
-{
- USED(g, arg);
- printsol(".DE\n");
- inpre = 0;
-}
+ t->nattr = 0;
+ t->opening = 1;
+ t->closing = 0;
-void
-g_fpush(Goobie *g, char *arg)
-{
- USED(arg);
- if(fsp < SSIZE)
- fontstack[fsp] = font;
- fsp++;
- switch(g->name[0]){
- case 'b':
- font = "B";
- break;
- case 'i':
- font = "I";
- break;
- case 'k': /* kbd */
- case 't': /* tt */
- font = "(CW";
- break;
+ n = 0;
+ eatwhite();
+ while((c = Bgetc(&in)) > 0){
+ if(c == '>')
+ break;
+ /* whitespace introduces an attribute; once the table is full the last slot is reused */
+ if(strchr("\n\r\t ", c)){
+ if(parseattr(t->attr + t->nattr))
+ if(t->nattr < nelem(t->attr)-1)
+ t->nattr++;
+ continue;
+ }
+ if(n == 0 && strchr("?!", c)){
+ parsecomment();
+ return 0;
+ }
+ /* a '/' before the name makes a closing tag, a later one marks <name/> */
+ if(c == '/'){
+ if(n == 0){
+ t->opening = 0;
+ t->closing = 1;
+ } else
+ t->closing = 1;
+ continue;
+ }
+ /* anything else belongs to the tag name; overlong names are truncated */
+ if(n < sizeof(t->tag)-1)
+ t->tag[n++] = c;
}
- Bprint(&out, "\\f%s", font);
+ t->tag[n] = 0;
+ return n > 0;
}
-void
-g_fpop(Goobie *g, char *arg)
+/*
+ * Named character references and the runes they stand for.
+ *
+ * The table must be kept sorted by name in strcmp (byte value)
+ * order, upper case before lower case: parserune scans it from the
+ * start and gives up at the first entry that compares greater than
+ * the name it is looking for.
+ *
+ * "apos" is included because it is what XHTML and HTML5 documents
+ * use for the apostrophe.
+ *
+ * NOTE(review): varepsilon has the same value as isin (8712, the
+ * element-of sign); confirm that this is intended.
+ */
+struct {
+	char *entity;
+	Rune rune;
+} entities[] = {
+	"AElig", 198, "Aacute", 193, "Acirc", 194, "Agrave", 192,
+	"Alpha", 913, "Aring", 197, "Atilde", 195, "Auml", 196,
+	"Beta", 914, "Ccedil", 199, "Chi", 935, "Dagger", 8225,
+	"Delta", 916, "ETH", 208, "Eacute", 201, "Ecirc", 202,
+	"Egrave", 200, "Epsilon", 917, "Eta", 919, "Euml", 203,
+	"Gamma", 915, "Iacute", 205, "Icirc", 206, "Igrave", 204,
+	"Iota", 921, "Iuml", 207, "Kappa", 922, "Lambda", 923,
+	"Mu", 924, "Ntilde", 209, "Nu", 925, "OElig", 338,
+	"Oacute", 211, "Ocirc", 212, "Ograve", 210, "Omega", 937,
+	"Omicron", 927, "Oslash", 216, "Otilde", 213, "Ouml", 214,
+	"Phi", 934, "Pi", 928, "Prime", 8243, "Psi", 936,
+	"Rho", 929, "Scaron", 352, "Sigma", 931, "THORN", 222,
+	"Tau", 932, "Theta", 920, "Uacute", 218, "Ucirc", 219,
+	"Ugrave", 217, "Upsilon", 933, "Uuml", 220, "Xi", 926,
+	"Yacute", 221, "Yuml", 376, "Zeta", 918, "aacute", 225,
+	"acirc", 226, "acute", 180, "aelig", 230, "agrave", 224,
+	"alefsym", 8501, "alpha", 945, "amp", 38, "and", 8743,
+	"ang", 8736, "apos", 39, "aring", 229, "asymp", 8776, "atilde", 227,
+	"auml", 228, "bdquo", 8222, "beta", 946, "brvbar", 166,
+	"bull", 8226, "cap", 8745, "ccedil", 231, "cdots", 8943,
+	"cedil", 184, "cent", 162, "chi", 967, "circ", 710,
+	"clubs", 9827, "cong", 8773, "copy", 169, "crarr", 8629,
+	"cup", 8746, "curren", 164, "dArr", 8659, "dagger", 8224,
+	"darr", 8595, "ddots", 8945, "deg", 176, "delta", 948,
+	"diams", 9830, "divide", 247, "eacute", 233, "ecirc", 234,
+	"egrave", 232, "emdash", 8212, "empty", 8709, "emsp", 8195,
+	"endash", 8211, "ensp", 8194, "epsilon", 949, "equiv", 8801,
+	"eta", 951, "eth", 240, "euml", 235, "euro", 8364,
+	"exist", 8707, "fnof", 402, "forall", 8704, "frac12", 189,
+	"frac14", 188, "frac34", 190, "frasl", 8260, "gamma", 947,
+	"ge", 8805, "gt", 62, "hArr", 8660, "harr", 8596,
+	"hearts", 9829, "hellip", 8230, "iacute", 237, "icirc", 238,
+	"iexcl", 161, "igrave", 236, "image", 8465, "infin", 8734,
+	"int", 8747, "iota", 953, "iquest", 191, "isin", 8712,
+	"iuml", 239, "kappa", 954, "lArr", 8656, "lambda", 955,
+	"lang", 9001, "laquo", 171, "larr", 8592, "lceil", 8968,
+	"ldots", 8230, "ldquo", 8220, "le", 8804, "lfloor", 8970,
+	"lowast", 8727, "loz", 9674, "lrm", 8206, "lsaquo", 8249,
+	"lsquo", 8216, "lt", 60, "macr", 175, "mdash", 8212,
+	"micro", 181, "middot", 183, "minus", 8722, "mu", 956,
+	"nabla", 8711, "nbsp", 160, "ndash", 8211, "ne", 8800,
+	"ni", 8715, "not", 172, "notin", 8713, "nsub", 8836,
+	"ntilde", 241, "nu", 957, "oacute", 243, "ocirc", 244,
+	"oelig", 339, "ograve", 242, "oline", 8254, "omega", 969,
+	"omicron", 959, "oplus", 8853, "or", 8744, "ordf", 170,
+	"ordm", 186, "oslash", 248, "otilde", 245, "otimes", 8855,
+	"ouml", 246, "para", 182, "part", 8706, "permil", 8240,
+	"perp", 8869, "phi", 966, "pi", 960, "piv", 982,
+	"plusmn", 177, "pound", 163, "prime", 8242, "prod", 8719,
+	"prop", 8733, "psi", 968, "quad", 8193, "quot", 34,
+	"rArr", 8658, "radic", 8730, "rang", 9002, "raquo", 187,
+	"rarr", 8594, "rceil", 8969, "rdquo", 8221, "real", 8476,
+	"reg", 174, "rfloor", 8971, "rho", 961, "rlm", 8207,
+	"rsaquo", 8250, "rsquo", 8217, "sbquo", 8218, "scaron", 353,
+	"sdot", 8901, "sect", 167, "shy", 173, "sigma", 963,
+	"sigmaf", 962, "sim", 8764, "sp", 8194, "spades", 9824,
+	"sub", 8834, "sube", 8838, "sum", 8721, "sup", 8835,
+	"sup1", 185, "sup2", 178, "sup3", 179, "supe", 8839,
+	"szlig", 223, "tau", 964, "there4", 8756, "theta", 952,
+	"thetasym", 977, "thinsp", 8201, "thorn", 254, "tilde", 732,
+	"times", 215, "trade", 8482, "uArr", 8657, "uacute", 250,
+	"uarr", 8593, "ucirc", 251, "ugrave", 249, "uml", 168,
+	"upsih", 978, "upsilon", 965, "uuml", 252, "varepsilon", 8712,
+	"varphi", 981, "varpi", 982, "varrho", 1009, "vdots", 8942,
+	"vsigma", 962, "vtheta", 977, "weierp", 8472, "xi", 958,
+	"yacute", 253, "yen", 165, "yuml", 255, "zeta", 950,
+	"zwj", 8205, "zwnj", 8204,
+};
+
+/*
+ * Turn the input starting with the already read byte c into one rune.
+ *
+ * If c is '&' a character reference is decoded: "&name;" through the
+ * entities table, "&#ddd;" as a decimal and "&#xhh;" as a hexadecimal
+ * code point. The terminating ';' is consumed, any other terminator
+ * (whitespace, '<', '/', '>') is pushed back. A lone '&' is returned
+ * as itself. Unknown names and numeric references that are zero,
+ * malformed or do not fit a Rune yield U+FFFD.
+ *
+ * Otherwise c is taken as the first byte of a UTF-8 sequence, which
+ * is completed from the input; an incomplete sequence yields U+FFFD.
+ */
+Rune
+parserune(int c)
{
- USED(g, arg);
- fsp--;
- if(fsp < SSIZE)
- font = fontstack[fsp];
- else
- font = "R";
+	/* room for the longest name in entities ("varepsilon") and for "#x10FFFF" */
+	char buf[16];
+	int i, n;
+	long v;
+	Rune r;
- Bprint(&out, "\\f%s", font);
+	n = 0;
+	if(c == '&'){
+		while((c = Bgetc(&in)) > 0){
+			if(strchr("\n\r\t ;</>", c)){
+				if(c != ';')
+					Bungetc(&in);
+				break;
+			}
+			if(n == sizeof(buf)-1)
+				break;
+			buf[n++] = c;
+		}
+		/* nothing followed the '&' (this includes end of input): plain ampersand */
+		if(n == 0)
+			return '&';
+		buf[n] = 0;
+		if(buf[0] == '#'){
+			/*
+			 * The base is given explicitly: a leading zero must not
+			 * select octal and "x" selects hexadecimal, neither of
+			 * which atoi gets right.
+			 */
+			if(buf[1] == 'x' || buf[1] == 'X')
+				v = strtol(buf+2, nil, 16);
+			else
+				v = strtol(buf+1, nil, 10);
+			r = v;
+			if(v <= 0 || v > 0x10FFFF || r != v)
+				return 0xFFFD;
+			return r;
+		}
+		/* entities is sorted, so stop at the first name greater than buf */
+		for(i=0; i<nelem(entities); i++){
+			n = strcmp(buf, entities[i].entity);
+			if(n == 0)
+				return entities[i].rune;
+			if(n < 0)
+				break;
+		}
+	} else {
+		do {
+			buf[n++] = c;
+			if(fullrune(buf, n)){
+				chartorune(&r, buf);
+				return r;
+			}
+			if(n >= UTFmax)
+				break;
+		} while((c = Bgetc(&in)) > 0);
+	}
+	return 0xFFFD;
}
-void
-g_indent(Goobie *g, char *arg)
+/*
+ * Replace typographic quotes by their ASCII counterparts: U+2018 and
+ * U+2019 become an apostrophe, U+201C and U+201D a double quote.
+ * Every other rune is returned unchanged.
+ */
+Rune
+substrune(Rune r)
{
- USED(g, arg);
- printsol(".RS\n");
+	if(r == 0x2018 || r == 0x2019)
+		return '\'';
+	if(r == 0x201c || r == 0x201d)
+		return '"';
+	return r;
}
+/*
+ * Debugging aid: print the chain of tag names from the outermost
+ * tag down to tag on file descriptor 2, the ancestors separated by
+ * " > " and tag itself followed by dbg and a newline. The recursive
+ * calls pass nil for dbg, which selects the separator form.
+ */
void
-g_exdent(Goobie *g, char *arg)
+debugtag(Tag *tag, char *dbg)
{
- USED(g, arg);
- printsol(".RE\n");
-}
+ /* tracing is compiled in but switched off; change the condition to enable it */
+ if(1) return;
-void
-g_dt(Goobie *g, char *arg)
-{
- USED(g, arg);
- printsol(".IP \"");
- quoting = 1;
+ if(tag == nil)
+ return;
+ debugtag(tag->up, nil);
+ fprint(2, "%s %s%s", tag->tag, dbg ? dbg : " > ", dbg ? "\n" : "");
}
-void
-g_hr(Goobie *g, char *arg)
+
+/*
+ * Convert the content of tag (nil for the top level of the document)
+ * and write it to the output. Nested opening tags are handed to
+ * their handler from ontag and processed by a recursive call; tag's
+ * own close handler, if any, runs before returning.
+ *
+ * The return value tells the caller which tag was ended by the
+ * closing tag that stopped the processing: tag itself when its own
+ * closing tag (or the end of input) was seen, or one of its
+ * ancestors when a closing tag for that ancestor turned up first.
+ * In the latter case each level returns the same pointer so that
+ * everything below the ancestor is unwound. Closing tags that match
+ * no open tag are ignored.
+ *
+ * Text is written with runs of whitespace collapsed to one blank and
+ * lines broken at a blank once they have reached 70 characters;
+ * inside <pre> whitespace is copied as it is. To keep text from
+ * being taken for troff input
+ *	- a '.' or '\'' at the start of a line is preceded by the
+ *	  zero-width \&, which neither causes a break nor shifts the text,
+ *	- a backslash is written as \e,
+ *	- a no-break space is written as an unpaddable "\ ".
+ * After a newline copied in <pre> mode pos is reset to 0, so that
+ * the next character is known to start a line and emit does not add
+ * an empty line.
+ */
+Tag*
+parsetext(Text *text, Tag *tag)
{
- USED(g, arg);
- printsol(".br\n");
- printsol("\\l'5i'\n");
-}
+	Tag *rtag;
+	Rune r;
+	int c, i;
+	rtag = tag;
+	debugtag(tag, "open");
+	if(tag == nil || tag->closing == 0){
+		while((c = Bgetc(&in)) > 0){
+			if(c == '<'){
+				Tag t;
-/*
-<table border>
-<caption><font size="+1"><b>Cumulative Class Data</b></font></caption>
-<tr><th rowspan=2>DOSE<br>mg/kg</th><th colspan=2>PARALYSIS</th><th colspan=2>DEATH</th>
-</tr>
-<tr><th width=80>Number</th><th width=80>Percent</th><th width=80>Number</th><th width=80>Percent</th>
-</tr>
-<tr align=center>
-<td>0.1</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.2</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.3</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.4</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.5</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.6</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.7</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.8</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-<tr align=center>
-<td>0.8 oral</td><td><br></td> <td><br></td> <td><br></td> <td><br></td>
-</tr>
-</table>
-*/
-
-void
-g_table(Goobie *g, char *arg)
-{
- USED(g, arg);
- printsol(".TS\ncenter ;\n");
+				memset(&t, 0, sizeof(t));
+				if(parsetag(&t)){
+					if(t.opening){
+						t.up = tag;
+						for(i = 0; i < nelem(ontag); i++){
+							if(cistrcmp(t.tag, ontag[i].tag) == 0){
+								ontag[i].open(text, &t);
+								break;
+							}
+						}
+						rtag = parsetext(text, &t);
+						/* the child ended normally: go on with our own content */
+						if(rtag == &t)
+							rtag = tag;
+						else
+							break;
+					} else if(t.closing){
+						/* find the open tag this closing tag belongs to */
+						while(rtag && cistrcmp(rtag->tag, t.tag))
+							rtag = rtag->up;
+						if(rtag == nil)
+							rtag = tag;
+						else
+							break;
+					}
+				}
+				continue;
+			}
+			if(!text->output)
+				continue;
+			r = substrune(parserune(c));
+			switch(r){
+			case '\n':
+			case '\r':
+			case ' ':
+			case '\t':
+				if(text->pre == 0){
+					text->space = 1;
+					continue;
+				}
+				/* preformatted: fall through and copy the whitespace */
+			default:
+				if(r == '\n' || r == '\r')
+					text->pos = 0;
+				if(text->space){
+					text->space = 0;
+					if(text->pos >= 70){
+						text->pos = 0;
+						Bputc(&out, '\n');
+					} else if(text->pos > 0){
+						text->pos++;
+						Bputc(&out, ' ');
+					}
+				}
+				/* both '.' and '\'' are troff control characters at the start of a line */
+				if(text->pos == 0 && (r == '.' || r == '\''))
+					Bprint(&out, "\\&");
+				text->pos++;
+				if(r == '\\'){
+					Bprint(&out, "\\e");
+				} else {
+					if(r == 0xA0){
+						r = ' ';
+						Bputc(&out, '\\');
+					}
+					Bprint(&out, "%C", r);
+				}
+				/* a copied newline leaves the output at the start of a line */
+				if(r == '\n' || r == '\r')
+					text->pos = 0;
+			}
+		}
+	}
+	debugtag(tag, "close");
+	if(tag && tag->close)
+		tag->close(text, tag);
+	return rtag;
}
+/*
+ * Read HTML from standard input and write ms macros for troff to
+ * standard output.
+ */
void
-g_tableend(Goobie *g, char *arg)
+main(void)
{
- USED(g, arg);
- printsol(".TE\n");
-}
+	Text text;
-void
-g_caption(Goobie *g, char *arg)
-{
- USED(g, arg);
-}
+	Binit(&in, 0, OREAD);
+	Binit(&out, 1, OWRITE);
-void
-g_captionend(Goobie *g, char *arg)
-{
- USED(g, arg);
+	memset(&text, 0, sizeof(text));
+	text.output = 1;
+	parsetext(&text, nil);
+	/* ends the last text line, if one is still open */
+	emit(&text, "\n");
+	/*
+	 * Returning from main would terminate the program with the exit
+	 * status "main", which callers take for a failure; exit explicitly
+	 * with an empty status instead.
+	 */
+	exits(nil);
}
--- a/sys/src/cmd/page.c
+++ b/sys/src/cmd/page.c
@@ -626,7 +626,7 @@
else if(cistrncmp(buf, "<?xml", 5) == 0 ||
cistrncmp(buf, "<!DOCTYPE", 9) == 0 ||
cistrncmp(buf, "<HTML", 5) == 0){
- p->data = "htmlfmt -c utf8 | lp -dstdout";
+ p->data = "html2ms | troff -ms | lp -dstdout";
p->open = popengs;
}
else if(memcmp(buf, "\xF7\x02\x01\x83\x92\xC0\x1C;", 8) == 0){