shithub: riscv

Download patch

ref: 13304b7b967c6172cfaa6b31dd4f92348056ed1a
parent: 6d6880cec936a13e67e43357538394a5c7f09010
author: cinap_lenrek <cinap_lenrek@centraldogma>
date: Sat Sep 24 13:06:45 EDT 2011

html2ms, tcs, mothra, uhtml: treat &apos; as special entity, add uhtml(1)

--- /dev/null
+++ b/sys/man/1/uhtml
@@ -1,0 +1,46 @@
+.TH UHTML 1
+.SH NAME
+uhtml \- convert foreign character set HTML file to unicode
+.SH SYNOPSIS
+.B uhtml
+[
+.B -p
+] [
+.B -c
+.I charset
+] [
+.I file
+]
+.SH DESCRIPTION
+HTML comes in various character set encodings
+and has special forms to encode characters. To
+make it easier to process html, uhtml is used
+to normalize it to a unicode only form.
+.LP
+Uhtml detects the character set of the html input
+.I file
+and calls
+.IR tcs (1)
+to convert it to utf replacing html-entity forms
+by their unicode character representations except for
+.BR lt ,
+.BR gt ,
+.BR amp ,
+.BR quot
+and
+.BR apos .
+The converted html is written to
+standard output. If no
+.I file
+was given, it is read from standard input. If the
+.B -p
+option is given, the detected character set is printed and
+the program exits without conversion.
+In case character set detection fails, the default (utf)
+is assumed. This default can be changed with the
+.B -c
+option.
+.SH SOURCE
+.B /sys/src/cmd/uhtml.c
+.SH SEE ALSO
+.IR tcs (1)
--- a/sys/src/cmd/html2ms.c
+++ b/sys/src/cmd/html2ms.c
@@ -680,6 +680,8 @@
 			return '>';
 		if(strcmp(buf, "quot") == 0)
 			return '"';
+		if(strcmp(buf, "apos") == 0)
+			return '\'';
 		if(strcmp(buf, "amp") == 0)
 			return '&';
 		/* use tcs -f html to handle the rest. */
--- a/sys/src/cmd/mothra/rdhtml.c
+++ b/sys/src/cmd/mothra/rdhtml.c
@@ -272,6 +272,8 @@
 				*t++='>';
 			else if(strcmp(u, "quot") == 0)
 				*t++='"';
+			else if(strcmp(u, "apos") == 0)
+				*t++='\'';
 			else if(strcmp(u, "amp") == 0)
 				*t++='&';
 			else {
--- a/sys/src/cmd/tcs/html.c
+++ b/sys/src/cmd/tcs/html.c
@@ -11,8 +11,6 @@
 	Rune r;
 };
 
-/* &lt;, &gt;, &quot;, &amp; intentionally omitted */
-
 /*
  * Names beginning with _ are names we recognize
  * (without the underscore) but will not generate,
@@ -86,7 +84,7 @@
 	{"agrave", 224},
 	{"alefsym", 8501},
 	{"alpha", 945},
-	/*	{"amp", 38},	*/
+	{"amp", 38},
 	{"and", 8743},
 	{"ang", 8736},
 	{"aring", 229},
@@ -141,7 +139,7 @@
 	{"frasl", 8260},
 	{"gamma", 947},
 	{"ge", 8805},
-	/*	{"gt", 62},	*/
+	{"gt", 62},
 	{"hArr", 8660},
 	{"harr", 8596},
 	{"hearts", 9829},
@@ -173,7 +171,7 @@
 	{"lrm", 8206},
 	{"lsaquo", 8249},
 	{"lsquo", 8216},
-	/*	{"lt", 60},	*/
+	{"lt", 60},
 	{"macr", 175},
 	{"mdash", 8212},
 	{"micro", 181},
@@ -219,7 +217,7 @@
 	{"prop", 8733},
 	{"psi", 968},
 	{"quad", 8193},
-	/*	{"quot", 34},	*/
+	{"quot", 34},
 	{"rArr", 8658},
 	{"radic", 8730},
 	{"rang", 9002},
@@ -416,10 +414,8 @@
 			}
 			buf[i] = 0;
 			if(i > 1){
-				if((c = findbyname(buf+1)) != Runeerror){
-					*r++ = c;
-					continue;
-				}
+				if((c = findbyname(buf+1)) != Runeerror)
+					goto out;
 				if(i > 2 && buf[1] == '#'){
 					if(i > 3 && strchr("xX", buf[2]))
 						c = strtol(buf+3, &p, 16);
@@ -427,8 +423,7 @@
 						c = strtol(buf+2, &p, 10);
 					if(*p || c >= NRUNE || c < 0)
 						goto bad;
-					*r++ = c;
-					continue;
+					goto out;
 				}
 			}
 		bad:
@@ -442,6 +437,12 @@
 				}
 			}
 			continue;
+		out:
+			if(strchr("<>&\"'", c)){
+				s = ';';
+				i = sprint(buf, "&%s", findbyrune(c));
+				goto bad;
+			}
 		}
 		*r++ = c;
 	}
--- a/sys/src/cmd/uhtml.c
+++ b/sys/src/cmd/uhtml.c
@@ -41,7 +41,7 @@
 main(int argc, char *argv[])
 {
 	int pfd[2], pflag = 0;
-	char *arg[4], *s;
+	char *arg[4], *s, *p;
 
 	ARGBEGIN {
 	case 'h':
@@ -59,21 +59,32 @@
 		if(open(*argv, OREAD) != 1)
 			sysfatal("open: %r");
 	}
-	if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
+	if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
 		sysfatal("read: %r");
 	buf[nbuf] = 0;
-
-	/* useless BOM marker */
-	if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
-		memmove(buf, buf+3, nbuf-3);
-
-	for(;;){
-		if(s = cistrstr(buf, "encoding="))
+	p = buf;
+	while(nbuf > 0){
+		if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
+			p += 3;
+			cset = "utf";
+			break;
+		}
+		if(memcmp(p, "\xFE\xFF", 2) == 0){
+			p += 2;
+			cset = "unicode-be";
+			break;
+		}
+		if(memcmp(p, "\xFF\xFE", 2) == 0){
+			p += 2;
+			cset = "unicode-le";
+			break;
+		}
+		if(s = cistrstr(p, "encoding="))
 			if(s = strval(s+9)){
 				cset = s;
 				break;
 			}
-		if(s = cistrstr(buf, "charset="))
+		if(s = cistrstr(p, "charset="))
 			if(s = strval(s+8)){
 				cset = s;
 				break;
@@ -80,6 +91,7 @@
 			}
 		break;
 	}
+	nbuf -= p - buf;
 
 	if(pflag){
 		print("%s\n", cset);
@@ -86,15 +98,15 @@
 		exits(0);
 	}
 
-	if(pipe(pfd) < 0)
-		sysfatal("pipe: %r");
-
 	if(nbuf == 0){
-		write(1, buf, 0);
+		write(1, p, 0);
 		exits(0);
 	}
 
-	switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){
+	if(pipe(pfd) < 0)
+		sysfatal("pipe: %r");
+
+	switch(rfork(RFFDG|RFREND|RFPROC)){
 	case -1:
 		sysfatal("fork: %r");
 	case 0:
@@ -114,10 +126,13 @@
 	close(pfd[1]);
 
 	while(nbuf > 0){
-		if(write(1, buf, nbuf) != nbuf)
+		if(write(1, p, nbuf) != nbuf)
 			sysfatal("write: %r");
-		if((nbuf = read(0, buf, sizeof(buf))) < 0)
+		p = buf;
+		if((nbuf = read(0, p, sizeof(buf))) < 0)
 			sysfatal("read: %r");
 	}
+	close(1);
+	waitpid();
 	exits(0);
 }