shithub: riscv

--- /dev/null

+++ b/sys/man/1/uhtml

@@ -1,0 +1,46 @@

+.TH UHTML 1

+.SH NAME

+uhtml \- convert foreign character set HTML file to unicode

+.SH SYNOPSIS

+.B uhtml

+[

+.B -p

+] [

+.B -c

+.I charset

+] [

+.I file

+]

+.SH DESCRIPTION

+HTML comes in various character set encodings

+and has special forms to encode characters. To

+make it easier to process html, uhtml is used

+to normalize it to a unicode only form.

+.LP

+Uhtml detects the character set of the html input

+.I file

+and calls

+.IR tcs (1)

+to convert it to utf replacing html-entity forms

+by their unicode character representations except for

+.B lt

+.B gt

+.B amp

+.B quot

+and

+.BR apos .

+The converted html is written to

+standard output. If no

+.I file

+was given, it is read from standard input. If the

+.B -p

+option is given, the detected character set is printed and

+the program exits without conversion.

+In case character set detection fails, the default (utf)

+is assumed. This default can be changed with the

+.B -c

+option.

+.SH SOURCE

+.B /sys/src/cmd/uhtml.c

+.SH SEE ALSO

+.IR tcs (1)

--- a/sys/src/cmd/html2ms.c

+++ b/sys/src/cmd/html2ms.c

@@ -680,6 +680,8 @@

 			return '>';

 		if(strcmp(buf, "quot") == 0)

 			return '"';

+		if(strcmp(buf, "apos") == 0)

+			return '\'';

 		if(strcmp(buf, "amp") == 0)

 			return '&';

 		/* use tcs -f html to handle the rest. */

--- a/sys/src/cmd/mothra/rdhtml.c

+++ b/sys/src/cmd/mothra/rdhtml.c

@@ -272,6 +272,8 @@

 				*t++='>';

 			else if(strcmp(u, "quot") == 0)

 				*t++='"';

+			else if(strcmp(u, "apos") == 0)

+				*t++='\'';

 			else if(strcmp(u, "amp") == 0)

 				*t++='&';

 			else {

--- a/sys/src/cmd/tcs/html.c

+++ b/sys/src/cmd/tcs/html.c

@@ -11,8 +11,6 @@

 	Rune r;

};

-/* &lt;, &gt;, &quot;, &amp; intentionally omitted */

/*

  * Names beginning with _ are names we recognize

  * (without the underscore) but will not generate,

@@ -86,7 +84,7 @@

 	{"agrave", 224},

 	{"alefsym", 8501},

 	{"alpha", 945},

-	/*	{"amp", 38},	*/

+	{"amp", 38},

 	{"and", 8743},

 	{"ang", 8736},

 	{"aring", 229},

@@ -141,7 +139,7 @@

 	{"frasl", 8260},

 	{"gamma", 947},

 	{"ge", 8805},

-	/*	{"gt", 62},	*/

+	{"gt", 62},

 	{"hArr", 8660},

 	{"harr", 8596},

 	{"hearts", 9829},

@@ -173,7 +171,7 @@

 	{"lrm", 8206},

 	{"lsaquo", 8249},

 	{"lsquo", 8216},

-	/*	{"lt", 60},	*/

+	{"lt", 60},

 	{"macr", 175},

 	{"mdash", 8212},

 	{"micro", 181},

@@ -219,7 +217,7 @@

 	{"prop", 8733},

 	{"psi", 968},

 	{"quad", 8193},

-	/*	{"quot", 34},	*/

+	{"quot", 34},

 	{"rArr", 8658},

 	{"radic", 8730},

 	{"rang", 9002},

@@ -416,10 +414,8 @@

 			buf[i] = 0;

 			if(i > 1){

-				if((c = findbyname(buf+1)) != Runeerror){

-					*r++ = c;

-					continue;

-				}

+				if((c = findbyname(buf+1)) != Runeerror)

+					goto out;

 				if(i > 2 && buf[1] == '#'){

 					if(i > 3 && strchr("xX", buf[2]))

 						c = strtol(buf+3, &p, 16);

@@ -427,8 +423,7 @@

 						c = strtol(buf+2, &p, 10);

 					if(*p || c >= NRUNE || c < 0)

 						goto bad;

-					*r++ = c;

-					continue;

+					goto out;

 		bad:

@@ -442,6 +437,12 @@

 			continue;

+		out:

+			if(strchr("<>&\"'", c)){

+				s = ';';

+				i = sprint(buf, "&%s", findbyrune(c));

+				goto bad;

+			}

 		*r++ = c;

--- a/sys/src/cmd/uhtml.c

+++ b/sys/src/cmd/uhtml.c

@@ -41,7 +41,7 @@

 main(int argc, char *argv[])

 	int pfd[2], pflag = 0;

-	char *arg[4], *s;

+	char *arg[4], *s, *p;

 	ARGBEGIN {

 	case 'h':

@@ -59,21 +59,32 @@

 		if(open(*argv, OREAD) != 1)

 			sysfatal("open: %r");

-	if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)

+	if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)

 		sysfatal("read: %r");

 	buf[nbuf] = 0;

-	/* useless BOM marker */

-	if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)

-		memmove(buf, buf+3, nbuf-3);

-	for(;;){

-		if(s = cistrstr(buf, "encoding="))

+	p = buf;

+	while(nbuf > 0){

+		if(memcmp(p, "\xEF\xBB\xBF", 3)==0){

+			p += 3;

+			cset = "utf";

+			break;

+		}

+		if(memcmp(p, "\xFE\xFF", 2) == 0){

+			p += 2;

+			cset = "unicode-be";

+			break;

+		}

+		if(memcmp(p, "\xFF\xFE", 2) == 0){

+			p += 2;

+			cset = "unicode-le";

+			break;

+		}

+		if(s = cistrstr(p, "encoding="))

 			if(s = strval(s+9)){

 				cset = s;

 				break;

-		if(s = cistrstr(buf, "charset="))

+		if(s = cistrstr(p, "charset="))

 			if(s = strval(s+8)){

 				cset = s;

 				break;

@@ -80,6 +91,7 @@

 		break;

+	nbuf -= p - buf;

 	if(pflag){

 		print("%s\n", cset);

@@ -86,15 +98,15 @@

 		exits(0);

-	if(pipe(pfd) < 0)

-		sysfatal("pipe: %r");

 	if(nbuf == 0){

-		write(1, buf, 0);

+		write(1, p, 0);

 		exits(0);

-	switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){

+	if(pipe(pfd) < 0)

+		sysfatal("pipe: %r");

+	switch(rfork(RFFDG|RFREND|RFPROC)){

 	case -1:

 		sysfatal("fork: %r");

 	case 0:

@@ -114,10 +126,13 @@

 	close(pfd[1]);

 	while(nbuf > 0){

-		if(write(1, buf, nbuf) != nbuf)

+		if(write(1, p, nbuf) != nbuf)

 			sysfatal("write: %r");

-		if((nbuf = read(0, buf, sizeof(buf))) < 0)

+		p = buf;

+		if((nbuf = read(0, p, sizeof(buf))) < 0)

 			sysfatal("read: %r");

+	close(1);

+	waitpid();

 	exits(0);