ref: 7853a37e63357a9729be86d00236094878ea0460
dir: /sys/src/cmd/tcs/html.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "hdr.h"
#include "conv.h"

/* One HTML character entity: its name and the rune it stands for. */
typedef struct Hchar Hchar;
struct Hchar
{
	char *s;
	Rune r;
};

/*
 * Names beginning with _ are names we recognize
 * (without the underscore) but will not generate,
 * because they are nonstandard.
 */
static Hchar byname[] =
{
	{"AElig", 198},
	{"Aacute", 193},
	{"Acirc", 194},
	{"Agrave", 192},
	{"Alpha", 913},
	{"Aring", 197},
	{"Atilde", 195},
	{"Auml", 196},
	{"Beta", 914},
	{"Ccedil", 199},
	{"Chi", 935},
	{"Dagger", 8225},
	{"Delta", 916},
	{"ETH", 208},
	{"Eacute", 201},
	{"Ecirc", 202},
	{"Egrave", 200},
	{"Epsilon", 917},
	{"Eta", 919},
	{"Euml", 203},
	{"Gamma", 915},
	{"Iacute", 205},
	{"Icirc", 206},
	{"Igrave", 204},
	{"Iota", 921},
	{"Iuml", 207},
	{"Kappa", 922},
	{"Lambda", 923},
	{"Mu", 924},
	{"Ntilde", 209},
	{"Nu", 925},
	{"OElig", 338},
	{"Oacute", 211},
	{"Ocirc", 212},
	{"Ograve", 210},
	{"Omega", 937},
	{"Omicron", 927},
	{"Oslash", 216},
	{"Otilde", 213},
	{"Ouml", 214},
	{"Phi", 934},
	{"Pi", 928},
	{"Prime", 8243},
	{"Psi", 936},
	{"Rho", 929},
	{"Scaron", 352},
	{"Sigma", 931},
	{"THORN", 222},
	{"Tau", 932},
	{"Theta", 920},
	{"Uacute", 218},
	{"Ucirc", 219},
	{"Ugrave", 217},
	{"Upsilon", 933},
	{"Uuml", 220},
	{"Xi", 926},
	{"Yacute", 221},
	{"Yuml", 376},
	{"Zeta", 918},
	{"aacute", 225},
	{"acirc", 226},
	{"acute", 180},
	{"aelig", 230},
	{"agrave", 224},
	{"alefsym", 8501},
	{"alpha", 945},
	{"amp", 38},
	{"and", 8743},
	{"ang", 8736},
	{"apos", 39},
	{"aring", 229},
	{"asymp", 8776},
	{"atilde", 227},
	{"auml", 228},
	{"bdquo", 8222},
	{"beta", 946},
	{"brvbar", 166},
	{"bull", 8226},
	{"cap", 8745},
	{"ccedil", 231},
	{"cdots", 8943},
	{"cedil", 184},
	{"cent", 162},
	{"chi", 967},
	{"circ", 710},
	{"clubs", 9827},
	{"cong", 8773},
	{"copy", 169},
	{"crarr", 8629},
	{"cup", 8746},
	{"curren", 164},
	{"dArr", 8659},
	{"dagger", 8224},
	{"darr", 8595},
	{"ddots", 8945},
	{"deg", 176},
	{"delta", 948},
	{"diams", 9830},
	{"divide", 247},
	{"eacute", 233},
	{"ecirc", 234},
	{"egrave", 232},
	{"_emdash", 8212},	/* non-standard but commonly used */
	{"empty", 8709},
	{"emsp", 8195},
	{"_endash", 8211},	/* non-standard but commonly used */
	{"ensp", 8194},
	{"epsilon", 949},
	{"equiv", 8801},
	{"eta", 951},
	{"eth", 240},
	{"euml", 235},
	{"euro", 8364},
	{"exist", 8707},
	{"fnof", 402},
	{"forall", 8704},
	{"frac12", 189},
	{"frac14", 188},
	{"frac34", 190},
	{"frasl", 8260},
	{"gamma", 947},
	{"ge", 8805},
	{"gt", 62},
	{"hArr", 8660},
	{"harr", 8596},
	{"hearts", 9829},
	{"hellip", 8230},
	{"iacute", 237},
	{"icirc", 238},
	{"iexcl", 161},
	{"igrave", 236},
	{"image", 8465},
	{"infin", 8734},
	{"int", 8747},
	{"iota", 953},
	{"iquest", 191},
	{"isin", 8712},
	{"iuml", 239},
	{"kappa", 954},
	{"lArr", 8656},
	{"lambda", 955},
	{"lang", 9001},
	{"laquo", 171},
	{"larr", 8592},
	{"lceil", 8968},
	{"_ldots", 8230},
	{"ldquo", 8220},
	{"le", 8804},
	{"lfloor", 8970},
	{"lowast", 8727},
	{"loz", 9674},
	{"lrm", 8206},
	{"lsaquo", 8249},
	{"lsquo", 8216},
	{"lt", 60},
	{"macr", 175},
	{"mdash", 8212},
	{"micro", 181},
	{"middot", 183},
	{"minus", 8722},
	{"mu", 956},
	{"nabla", 8711},
	{"nbsp", 160},
	{"ndash", 8211},
	{"ne", 8800},
	{"ni", 8715},
	{"not", 172},
	{"notin", 8713},
	{"nsub", 8836},
	{"ntilde", 241},
	{"nu", 957},
	{"oacute", 243},
	{"ocirc", 244},
	{"oelig", 339},
	{"ograve", 242},
	{"oline", 8254},
	{"omega", 969},
	{"omicron", 959},
	{"oplus", 8853},
	{"or", 8744},
	{"ordf", 170},
	{"ordm", 186},
	{"oslash", 248},
	{"otilde", 245},
	{"otimes", 8855},
	{"ouml", 246},
	{"para", 182},
	{"part", 8706},
	{"permil", 8240},
	{"perp", 8869},
	{"phi", 966},
	{"pi", 960},
	{"piv", 982},
	{"plusmn", 177},
	{"pound", 163},
	{"prime", 8242},
	{"prod", 8719},
	{"prop", 8733},
	{"psi", 968},
	{"quad", 8193},
	{"quot", 34},
	{"rArr", 8658},
	{"radic", 8730},
	{"rang", 9002},
	{"raquo", 187},
	{"rarr", 8594},
	{"rceil", 8969},
	{"rdquo", 8221},
	{"real", 8476},
	{"reg", 174},
	{"rfloor", 8971},
	{"rho", 961},
	{"rlm", 8207},
	{"rsaquo", 8250},
	{"rsquo", 8217},
	{"sbquo", 8218},
	{"scaron", 353},
	{"sdot", 8901},
	{"sect", 167},
	{"shy", 173},
	{"sigma", 963},
	{"sigmaf", 962},
	{"sim", 8764},
	{"_sp", 8194},
	{"spades", 9824},
	{"sub", 8834},
	{"sube", 8838},
	{"sum", 8721},
	{"sup", 8835},
	{"sup1", 185},
	{"sup2", 178},
	{"sup3", 179},
	{"supe", 8839},
	{"szlig", 223},
	{"tau", 964},
	{"there4", 8756},
	{"theta", 952},
	{"thetasym", 977},
	{"thinsp", 8201},
	{"thorn", 254},
	{"tilde", 732},
	{"times", 215},
	{"trade", 8482},
	{"uArr", 8657},
	{"uacute", 250},
	{"uarr", 8593},
	{"ucirc", 251},
	{"ugrave", 249},
	{"uml", 168},
	{"upsih", 978},
	{"upsilon", 965},
	{"uuml", 252},
	{"_varepsilon", 8712},
	{"varphi", 981},
	{"_varpi", 982},
	{"varrho", 1009},
	{"vdots", 8942},
	{"_vsigma", 962},
	{"_vtheta", 977},
	{"weierp", 8472},
	{"xi", 958},
	{"yacute", 253},
	{"yen", 165},
	{"yuml", 255},
	{"zeta", 950},
	{"zwj", 8205},
	{"zwnj", 8204}
};

/* Copy of byname, sorted by rune by html_init; used to generate names. */
static Hchar byrune[nelem(byname)];

/* qsort comparator: order Hchar entries by name. */
static int
hnamecmp(const void *va, const void *vb)
{
	Hchar *a, *b;

	a = (Hchar*)va;
	b = (Hchar*)vb;
	return strcmp(a->s, b->s);
}

/*
 * qsort comparator: order Hchar entries by rune.
 * Compares explicitly instead of subtracting so the result
 * does not depend on whether Rune is a signed or unsigned type.
 */
static int
hrunecmp(const void *va, const void *vb)
{
	Hchar *a, *b;

	a = (Hchar*)va;
	b = (Hchar*)vb;
	if(a->r < b->r)
		return -1;
	if(a->r > b->r)
		return 1;
	return 0;
}

/*
 * Build the two lookup tables on first use; later calls return at once.
 * byname ends up sorted by name with the leading underscores stripped;
 * byrune ends up sorted by rune with the underscore entries remapped
 * to Runeerror, which findbyrune refuses to look up.
 */
static void
html_init(void)
{
	static int init;
	int i;

	if(init)
		return;
	init = 1;
	memmove(byrune, byname, sizeof byrune);

	/* Eliminate names we aren't allowed to generate. */
	for(i=0; i<nelem(byrune); i++){
		if(byrune[i].s[0] == '_'){
			byrune[i].r = Runeerror;
			byname[i].s++;
		}
	}

	qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
	qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
}

/*
 * Binary search of byname for the entity name s (no leading '&').
 * Returns its rune, or Runeerror if the name is unknown.
 */
static Rune
findbyname(char *s)
{
	Hchar *h;
	int n, m, x;

	h = byname;
	n = nelem(byname);
	while(n > 0){
		m = n/2;
		x = strcmp(h[m].s, s);
		if(x == 0)
			return h[m].r;
		if(x < 0){
			h += m+1;
			n -= m+1;
		}else
			n = m;
	}
	return Runeerror;
}

/*
 * Binary search of byrune for r.
 * Returns the entity name, or nil if r has no name or is Runeerror.
 */
static char*
findbyrune(Rune r)
{
	Hchar *h;
	int n, m;

	if(r == Runeerror)
		return nil;
	h = byrune;
	n = nelem(byrune);
	while(n > 0){
		m = n/2;
		if(h[m].r == r)
			return h[m].s;
		if(h[m].r < r){
			h += m+1;
			n -= m+1;
		}else
			n = m;
	}
	return nil;
}

/*
 * Read text from fd and pass it to OUT as runes, replacing named
 * (&name;) and numeric (&#ddd; &#xhh;) character references by the
 * runes they denote.  References that decode to one of < > & " '
 * are re-emitted in named form rather than decoded, and anything
 * that does not parse as a reference is passed through unchanged.
 * x is unused.
 */
void
html_in(int fd, long *x, struct convert *out)
{
	char buf[100], *p;
	Biobuf b;
	Rune rbuf[N];
	Rune *r, *er;
	int c, s, i;
	long v;

	USED(x);

	html_init();
	r = rbuf;
	er = rbuf+N;
	Binit(&b, fd, OREAD);
	while((c = Bgetrune(&b)) != Beof){
		if(r >= er){
			OUT(out, rbuf, r-rbuf);
			r = rbuf;
		}
		if(c == '&'){
			/* collect the reference text in buf; s records whether a ';' ended it */
			s = 0;
			buf[0] = c;
			for(i=1; i<nelem(buf)-1;){
				c = Bgetc(&b);
				if(c == Beof)
					break;
				if(strchr(";&</> \t\r\n", c)){
					if(c != ';')
						Bungetc(&b);
					else
						s = 1;
					break;
				}
				buf[i++] = c;
			}
			buf[i] = 0;
			if(i > 1){
				if((c = findbyname(buf+1)) != Runeerror)
					goto out;
				if(i > 2 && buf[1] == '#'){
					/*
					 * Parse into a long and range-check before narrowing,
					 * so an oversized value cannot truncate into a valid rune.
					 */
					if(i > 3 && strchr("xX", buf[2]))
						v = strtol(buf+3, &p, 16);
					else
						v = strtol(buf+2, &p, 10);
					if(*p || v < 0 || v >= NRUNE)
						goto bad;
					c = v;
					goto out;
				}
			}
		bad:
			/* emit the bytes in buf (plus the ';' if one was consumed) as runes */
			if(s)
				buf[i++] = ';';
			for(p=buf; p<buf+i; ){
				p += chartorune(r++, p);
				if(r >= er){
					OUT(out, rbuf, r-rbuf);
					r = rbuf;
				}
			}
			continue;
		out:
			/*
			 * Keep markup characters escaped.  c must be checked against 0
			 * first: strchr matches the string's terminating NUL, and
			 * findbyrune(0) is nil, which must not be handed to %s.
			 */
			if(c != 0 && (c & 0x7f) == c && strchr("<>&\"'", c)){
				s = ';';
				i = sprint(buf, "&%s", findbyrune(c));
				goto bad;
			}
		}
		*r++ = c;
	}
	if(r > rbuf)
		OUT(out, rbuf, r-rbuf);
	OUT(out, rbuf, 0);
}

/*
 * Write the n runes at r to file descriptor 1.  Runes below Runeself
 * are written as is; others are written as &name; when findbyrune
 * knows a name, else as a decimal numeric reference.  x is unused.
 *
 * use biobuf because can use more than UTFmax bytes per rune
 */
void
html_out(Rune *r, int n, long *x)
{
	char *s;
	Biobuf b;
	Rune *er;

	USED(x);
	html_init();
	Binit(&b, 1, OWRITE);
	er = r+n;
	for(; r<er; r++){
		if(*r < Runeself)
			Bputrune(&b, *r);
		else if((s = findbyrune(*r)) != nil)
			Bprint(&b, "&%s;", s);
		else
			Bprint(&b, "&#%d;", *r);
	}
	Bflush(&b);
}