shithub: riscv

ref: 7853a37e63357a9729be86d00236094878ea0460
dir: /sys/src/cmd/tcs/html.c/

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "hdr.h"
#include "conv.h"

typedef struct Hchar Hchar;
/* One HTML character entity: a name paired with the rune it stands for. */
struct Hchar
{
	char *s;	/* entity name, without the leading '&' or trailing ';' */
	Rune r;		/* code point the name maps to */
};

/*
 * Table of named HTML character entities and their code points.
 *
 * Names beginning with _ are names we recognize
 * (without the underscore) but will not generate,
 * because they are nonstandard.
 *
 * html_init strips the underscores and sorts this table by name
 * before it is searched, so the order of the entries here only
 * matters for readability.
 */
static Hchar byname[] =
{
	{"AElig", 198},
	{"Aacute", 193},
	{"Acirc", 194},
	{"Agrave", 192},
	{"Alpha", 913},
	{"Aring", 197},
	{"Atilde", 195},
	{"Auml", 196},
	{"Beta", 914},
	{"Ccedil", 199},
	{"Chi", 935},
	{"Dagger", 8225},
	{"Delta", 916},
	{"ETH", 208},
	{"Eacute", 201},
	{"Ecirc", 202},
	{"Egrave", 200},
	{"Epsilon", 917},
	{"Eta", 919},
	{"Euml", 203},
	{"Gamma", 915},
	{"Iacute", 205},
	{"Icirc", 206},
	{"Igrave", 204},
	{"Iota", 921},
	{"Iuml", 207},
	{"Kappa", 922},
	{"Lambda", 923},
	{"Mu", 924},
	{"Ntilde", 209},
	{"Nu", 925},
	{"OElig", 338},
	{"Oacute", 211},
	{"Ocirc", 212},
	{"Ograve", 210},
	{"Omega", 937},
	{"Omicron", 927},
	{"Oslash", 216},
	{"Otilde", 213},
	{"Ouml", 214},
	{"Phi", 934},
	{"Pi", 928},
	{"Prime", 8243},
	{"Psi", 936},
	{"Rho", 929},
	{"Scaron", 352},
	{"Sigma", 931},
	{"THORN", 222},
	{"Tau", 932},
	{"Theta", 920},
	{"Uacute", 218},
	{"Ucirc", 219},
	{"Ugrave", 217},
	{"Upsilon", 933},
	{"Uuml", 220},
	{"Xi", 926},
	{"Yacute", 221},
	{"Yuml", 376},
	{"Zeta", 918},
	{"aacute", 225},
	{"acirc", 226},
	{"acute", 180},
	{"aelig", 230},
	{"agrave", 224},
	{"alefsym", 8501},
	{"alpha", 945},
	{"amp", 38},
	{"and", 8743},
	{"ang", 8736},
	{"apos", 39},
	{"aring", 229},
	{"asymp", 8776},
	{"atilde", 227},
	{"auml", 228},
	{"bdquo", 8222},
	{"beta", 946},
	{"brvbar", 166},
	{"bull", 8226},
	{"cap", 8745},
	{"ccedil", 231},
	{"cdots", 8943},
	{"cedil", 184},
	{"cent", 162},
	{"chi", 967},
	{"circ", 710},
	{"clubs", 9827},
	{"cong", 8773},
	{"copy", 169},
	{"crarr", 8629},
	{"cup", 8746},
	{"curren", 164},
	{"dArr", 8659},
	{"dagger", 8224},
	{"darr", 8595},
	{"ddots", 8945},
	{"deg", 176},
	{"delta", 948},
	{"diams", 9830},
	{"divide", 247},
	{"eacute", 233},
	{"ecirc", 234},
	{"egrave", 232},
	{"_emdash", 8212},	/* non-standard but commonly used */
	{"empty", 8709},
	{"emsp", 8195},
	{"_endash", 8211},	/* non-standard but commonly used */
	{"ensp", 8194},
	{"epsilon", 949},
	{"equiv", 8801},
	{"eta", 951},
	{"eth", 240},
	{"euml", 235},
	{"euro", 8364},
	{"exist", 8707},
	{"fnof", 402},
	{"forall", 8704},
	{"frac12", 189},
	{"frac14", 188},
	{"frac34", 190},
	{"frasl", 8260},
	{"gamma", 947},
	{"ge", 8805},
	{"gt", 62},
	{"hArr", 8660},
	{"harr", 8596},
	{"hearts", 9829},
	{"hellip", 8230},
	{"iacute", 237},
	{"icirc", 238},
	{"iexcl", 161},
	{"igrave", 236},
	{"image", 8465},
	{"infin", 8734},
	{"int", 8747},
	{"iota", 953},
	{"iquest", 191},
	{"isin", 8712},
	{"iuml", 239},
	{"kappa", 954},
	{"lArr", 8656},
	{"lambda", 955},
	{"lang", 9001},
	{"laquo", 171},
	{"larr", 8592},
	{"lceil", 8968},
	{"_ldots", 8230},
	{"ldquo", 8220},
	{"le", 8804},
	{"lfloor", 8970},
	{"lowast", 8727},
	{"loz", 9674},
	{"lrm", 8206},
	{"lsaquo", 8249},
	{"lsquo", 8216},
	{"lt", 60},
	{"macr", 175},
	{"mdash", 8212},
	{"micro", 181},
	{"middot", 183},
	{"minus", 8722},
	{"mu", 956},
	{"nabla", 8711},
	{"nbsp", 160},
	{"ndash", 8211},
	{"ne", 8800},
	{"ni", 8715},
	{"not", 172},
	{"notin", 8713},
	{"nsub", 8836},
	{"ntilde", 241},
	{"nu", 957},
	{"oacute", 243},
	{"ocirc", 244},
	{"oelig", 339},
	{"ograve", 242},
	{"oline", 8254},
	{"omega", 969},
	{"omicron", 959},
	{"oplus", 8853},
	{"or", 8744},
	{"ordf", 170},
	{"ordm", 186},
	{"oslash", 248},
	{"otilde", 245},
	{"otimes", 8855},
	{"ouml", 246},
	{"para", 182},
	{"part", 8706},
	{"permil", 8240},
	{"perp", 8869},
	{"phi", 966},
	{"pi", 960},
	{"piv", 982},
	{"plusmn", 177},
	{"pound", 163},
	{"prime", 8242},
	{"prod", 8719},
	{"prop", 8733},
	{"psi", 968},
	{"quad", 8193},
	{"quot", 34},
	{"rArr", 8658},
	{"radic", 8730},
	{"rang", 9002},
	{"raquo", 187},
	{"rarr", 8594},
	{"rceil", 8969},
	{"rdquo", 8221},
	{"real", 8476},
	{"reg", 174},
	{"rfloor", 8971},
	{"rho", 961},
	{"rlm", 8207},
	{"rsaquo", 8250},
	{"rsquo", 8217},
	{"sbquo", 8218},
	{"scaron", 353},
	{"sdot", 8901},
	{"sect", 167},
	{"shy", 173},
	{"sigma", 963},
	{"sigmaf", 962},
	{"sim", 8764},
	{"_sp", 8194},
	{"spades", 9824},
	{"sub", 8834},
	{"sube", 8838},
	{"sum", 8721},
	{"sup", 8835},
	{"sup1", 185},
	{"sup2", 178},
	{"sup3", 179},
	{"supe", 8839},
	{"szlig", 223},
	{"tau", 964},
	{"there4", 8756},
	{"theta", 952},
	{"thetasym", 977},
	{"thinsp", 8201},
	{"thorn", 254},
	{"tilde", 732},
	{"times", 215},
	{"trade", 8482},
	{"uArr", 8657},
	{"uacute", 250},
	{"uarr", 8593},
	{"ucirc", 251},
	{"ugrave", 249},
	{"uml", 168},
	{"upsih", 978},
	{"upsilon", 965},
	{"uuml", 252},
	{"_varepsilon", 1013},	/* U+03F5 lunate epsilon; was 8712, the "element of" sign */
	{"varphi", 981},
	{"_varpi", 982},
	{"varrho", 1009},
	{"vdots", 8942},
	{"_vsigma", 962},
	{"_vtheta", 977},
	{"weierp", 8472},
	{"xi", 958},
	{"yacute", 253},
	{"yen", 165},
	{"yuml", 255},
	{"zeta", 950},
	{"zwj", 8205},
	{"zwnj", 8204}
};

/*
 * Copy of byname, filled in and sorted by rune in html_init,
 * used to map a rune back to its entity name.
 */
static Hchar byrune[nelem(byname)];

/*
 * qsort comparison: order two Hchar entries by entity name.
 */
static int
hnamecmp(const void *va, const void *vb)
{
	return strcmp(((Hchar*)va)->s, ((Hchar*)vb)->s);
}

/*
 * qsort comparison: order two Hchar entries by rune value.
 */
static int
hrunecmp(const void *va, const void *vb)
{
	return ((Hchar*)va)->r - ((Hchar*)vb)->r;
}

static void
html_init(void)
{
	static int init;
	int i;
	
	if(init)
		return;
	init = 1;
	memmove(byrune, byname, sizeof byrune);
	
	/* Eliminate names we aren't allowed to generate. */
	for(i=0; i<nelem(byrune); i++){
		if(byrune[i].s[0] == '_'){
			byrune[i].r = Runeerror;
			byname[i].s++;
		}
	}
	
	qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
	qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
}

/*
 * Binary search of byname for the entity name s.
 * Returns the matching rune, or Runeerror if s is not in the table.
 * The table must already have been sorted by html_init.
 */
static Rune
findbyname(char *s)
{
	int lo, hi, mid, cmp;

	lo = 0;
	hi = nelem(byname);
	while(lo < hi){
		mid = lo + (hi-lo)/2;
		cmp = strcmp(byname[mid].s, s);
		if(cmp == 0)
			return byname[mid].r;
		if(cmp < 0)
			lo = mid+1;
		else
			hi = mid;
	}
	return Runeerror;
}

/*
 * Binary search of byrune for the rune r.
 * Returns the entity name, or nil if there is none.
 * Runeerror is rejected up front: html_init uses it to mark
 * the entries that must never be generated.
 */
static char*
findbyrune(Rune r)
{
	int lo, hi, mid;

	if(r == Runeerror)
		return nil;

	lo = 0;
	hi = nelem(byrune);
	while(lo < hi){
		mid = lo + (hi-lo)/2;
		if(byrune[mid].r == r)
			return byrune[mid].s;
		if(byrune[mid].r < r)
			lo = mid+1;
		else
			hi = mid;
	}
	return nil;
}

/*
 * Read HTML text from fd, decode character references, and pass the
 * resulting runes to out through OUT in batches of at most N runes.
 *
 * A reference is '&' followed by a name or by '#' and a decimal or
 * ("#x"/"#X") hexadecimal number, ended by ';' or by one of
 * & < / > space tab CR LF (which is left in the input).
 * References that are unknown, malformed or out of range are copied
 * through unchanged.  References that decode to one of < > & " '
 * are not decoded; they are emitted as the named entity, with a
 * terminating ';', so the markup-significant characters stay escaped.
 *
 * x is unused.  After the last batch OUT is called once more with a
 * count of 0.
 */
void
html_in(int fd, long *x, struct convert *out)
{
	char buf[100], *p;
	Biobuf b;
	Rune rbuf[N];
	Rune *r, *er;
	long v;
	int c, s, i;
	
	USED(x);
	
	html_init();
	r = rbuf;
	er = rbuf+N;
	Binit(&b, fd, OREAD);
	while((c = Bgetrune(&b)) != Beof){
		/* make sure there is room for at least one more rune */
		if(r >= er){
			OUT(out, rbuf, r-rbuf);
			r = rbuf;
		}
		if(c == '&'){
			/*
			 * Collect the bytes of the reference into buf, starting
			 * with the '&'.  s records whether it ended with ';'.
			 */
			s = 0;
			buf[0] = c;
			for(i=1; i<nelem(buf)-1;){
				c = Bgetc(&b);
				if(c == Beof)
					break;
				if(strchr(";&</> \t\r\n", c)){
					if(c != ';')
						Bungetc(&b);
					else
						s = 1;
					break;
				}
				buf[i++] = c;
			}
			buf[i] = 0;
			if(i > 1){
				if((c = findbyname(buf+1)) != Runeerror)
					goto out;
				if(i > 2 && buf[1] == '#'){
					/*
					 * Range-check the value as a long before narrowing
					 * it to int, so that an oversized number cannot
					 * wrap around into a valid rune.
					 */
					if(i > 3 && strchr("xX", buf[2]))
						v = strtol(buf+3, &p, 16);
					else
						v = strtol(buf+2, &p, 10);
					if(*p || v >= NRUNE || v < 0)
						goto bad;
					c = v;
					goto out;
				}
			}
		bad:
			/* copy the text in buf through as is, restoring the ';' if there was one */
			if(s)
				buf[i++] = ';';
			for(p=buf; p<buf+i; ){
				p += chartorune(r++, p);
				if(r >= er){
					OUT(out, rbuf, r-rbuf);
					r = rbuf;
				}
			}
			continue;
		out:
			/*
			 * Keep markup-significant characters escaped.  c must be
			 * checked against 0 explicitly: strchr would otherwise
			 * match the string's terminating NUL, and findbyrune has
			 * no name for rune 0.
			 */
			if(c != 0 && (c & 0x7f) == c && strchr("<>&\"'", c)){
				s = ';';
				i = sprint(buf, "&%s", findbyrune(c));
				goto bad;
			}
		}
		*r++ = c;
	}
	if(r > rbuf)
		OUT(out, rbuf, r-rbuf);
	OUT(out, rbuf, 0);
}

/*
 * Write the n runes at r to file descriptor 1 as HTML text.
 * Runes below Runeself are written as they are; any other rune is
 * written as a named entity when findbyrune knows one, and as a
 * decimal numeric reference otherwise.  x is unused.
 *
 * use biobuf because can use more than UTFmax bytes per rune
 */
void
html_out(Rune *r, int n, long *x)
{
	char *s;
	Biobuf b;
	Rune *er;
	
	USED(x);
	html_init();
	Binit(&b, 1, OWRITE);
	er = r+n;
	for(; r<er; r++){
		if(*r < Runeself)
			Bputrune(&b, *r);
		else if((s = findbyrune(*r)) != nil)
			Bprint(&b, "&%s;", s);
		else
			Bprint(&b, "&#%d;", *r);
	}
	/*
	 * Bterm rather than Bflush: b lives on this stack frame, and a
	 * buffer open for writing is otherwise flushed again at exit,
	 * long after the frame is gone.  Bterm flushes and also retires
	 * the buffer; it does not close fd 1, since b came from Binit.
	 */
	Bterm(&b);
}