ref: 13304b7b967c6172cfaa6b31dd4f92348056ed1a
parent: 6d6880cec936a13e67e43357538394a5c7f09010
author: cinap_lenrek <cinap_lenrek@centraldogma>
date: Sat Sep 24 13:06:45 EDT 2011
html2ms, tcs, mothra, uhtml: treat ' as special entity, add uhtml(1)
--- /dev/null
+++ b/sys/man/1/uhtml
@@ -1,0 +1,46 @@
+.TH UHTML 1
+.SH NAME
+uhtml \- convert foreign character set HTML file to unicode
+.SH SYNOPSIS
+.B uhtml
+[
+.B -p
+] [
+.B -c
+.I charset
+] [
+.I file
+]
+.SH DESCRIPTION
+HTML comes in various character set encodings
+and has special forms to encode characters. To
+make it easier to process html, uhtml is used
+to normalize it to a unicode only form.
+.LP
+Uhtml detects the character set of the html input
+.I file
+and calls
+.IR tcs (1)
+to convert it to utf, replacing html-entity forms
+by their unicode character representations except for
+.BR lt ,
+.BR gt ,
+.BR amp ,
+.BR quot
+and
+.BR apos .
+The converted html is written to
+standard output. If no
+.I file
+is given, input is read from standard input. If the
+.B -p
+option is given, the detected character set is printed and
+the program exits without conversion.
+In case character set detection fails, the default (utf)
+is assumed. This default can be changed with the
+.B -c
+option.
+.SH SOURCE
+.B /sys/src/cmd/uhtml.c
+.SH SEE ALSO
+.IR tcs (1)
--- a/sys/src/cmd/html2ms.c
+++ b/sys/src/cmd/html2ms.c
@@ -680,6 +680,8 @@
return '>';
if(strcmp(buf, "quot") == 0)
return '"';
+ if(strcmp(buf, "apos") == 0)
+ return '\'';
if(strcmp(buf, "amp") == 0)
return '&';
/* use tcs -f html to handle the rest. */
--- a/sys/src/cmd/mothra/rdhtml.c
+++ b/sys/src/cmd/mothra/rdhtml.c
@@ -272,6 +272,8 @@
*t++='>';
else if(strcmp(u, "quot") == 0)
*t++='"';
+ else if(strcmp(u, "apos") == 0)
+ *t++='\'';
else if(strcmp(u, "amp") == 0)
*t++='&';
else {
--- a/sys/src/cmd/tcs/html.c
+++ b/sys/src/cmd/tcs/html.c
@@ -11,8 +11,6 @@
Rune r;
};
-/* <, >, ", & intentionally omitted */
-
/*
* Names beginning with _ are names we recognize
* (without the underscore) but will not generate,
@@ -86,7 +84,7 @@
{"agrave", 224},
{"alefsym", 8501},
{"alpha", 945},
- /* {"amp", 38}, */
+ {"amp", 38},
{"and", 8743},
{"ang", 8736},
{"aring", 229},
@@ -141,7 +139,7 @@
{"frasl", 8260},
{"gamma", 947},
{"ge", 8805},
- /* {"gt", 62}, */
+ {"gt", 62},
{"hArr", 8660},
{"harr", 8596},
{"hearts", 9829},
@@ -173,7 +171,7 @@
{"lrm", 8206},
{"lsaquo", 8249},
{"lsquo", 8216},
- /* {"lt", 60}, */
+ {"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"micro", 181},
@@ -219,7 +217,7 @@
{"prop", 8733},
{"psi", 968},
{"quad", 8193},
- /* {"quot", 34}, */
+ {"quot", 34},
{"rArr", 8658},
{"radic", 8730},
{"rang", 9002},
@@ -416,10 +414,8 @@
}
buf[i] = 0;
if(i > 1){
- if((c = findbyname(buf+1)) != Runeerror){
- *r++ = c;
- continue;
- }
+ if((c = findbyname(buf+1)) != Runeerror)
+ goto out;
if(i > 2 && buf[1] == '#'){
if(i > 3 && strchr("xX", buf[2]))
c = strtol(buf+3, &p, 16);
@@ -427,8 +423,7 @@
c = strtol(buf+2, &p, 10);
if(*p || c >= NRUNE || c < 0)
goto bad;
- *r++ = c;
- continue;
+ goto out;
}
}
bad:
@@ -442,6 +437,12 @@
}
}
continue;
+ out:
+ if(strchr("<>&\"'", c)){
+ s = ';';
+ i = sprint(buf, "&%s", findbyrune(c));
+ goto bad;
+ }
}
*r++ = c;
}
--- a/sys/src/cmd/uhtml.c
+++ b/sys/src/cmd/uhtml.c
@@ -41,7 +41,7 @@
main(int argc, char *argv[])
{
int pfd[2], pflag = 0;
- char *arg[4], *s;
+ char *arg[4], *s, *p;
ARGBEGIN {
case 'h':
@@ -59,21 +59,32 @@
if(open(*argv, OREAD) != 1)
sysfatal("open: %r");
}
- if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
+ if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
sysfatal("read: %r");
buf[nbuf] = 0;
-
- /* useless BOM marker */
- if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
- memmove(buf, buf+3, nbuf-3);
-
- for(;;){
- if(s = cistrstr(buf, "encoding="))
+ p = buf;
+ while(nbuf > 0){
+ if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
+ p += 3;
+ cset = "utf";
+ break;
+ }
+ if(memcmp(p, "\xFE\xFF", 2) == 0){
+ p += 2;
+ cset = "unicode-be";
+ break;
+ }
+ if(memcmp(p, "\xFF\xFE", 2) == 0){
+ p += 2;
+ cset = "unicode-le";
+ break;
+ }
+ if(s = cistrstr(p, "encoding="))
if(s = strval(s+9)){
cset = s;
break;
}
- if(s = cistrstr(buf, "charset="))
+ if(s = cistrstr(p, "charset="))
if(s = strval(s+8)){
cset = s;
break;
@@ -80,6 +91,7 @@
}
break;
}
+ nbuf -= p - buf;
if(pflag){
print("%s\n", cset);
@@ -86,15 +98,15 @@
exits(0);
}
- if(pipe(pfd) < 0)
- sysfatal("pipe: %r");
-
if(nbuf == 0){
- write(1, buf, 0);
+ write(1, p, 0);
exits(0);
}
- switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){
+ if(pipe(pfd) < 0)
+ sysfatal("pipe: %r");
+
+ switch(rfork(RFFDG|RFREND|RFPROC)){
case -1:
sysfatal("fork: %r");
case 0:
@@ -114,10 +126,13 @@
close(pfd[1]);
while(nbuf > 0){
- if(write(1, buf, nbuf) != nbuf)
+ if(write(1, p, nbuf) != nbuf)
sysfatal("write: %r");
- if((nbuf = read(0, buf, sizeof(buf))) < 0)
+ p = buf;
+ if((nbuf = read(0, p, sizeof(buf))) < 0)
sysfatal("read: %r");
}
+ close(1);
+ waitpid();
exits(0);
}