shithub: riscv

Download patch

ref: 679b092ee02429b444b3e8995f6db11b42008dad
parent: 66f76c28212d3a25d4b039de2ce817fc74c5ca1e
author: cinap_lenrek <[email protected]>
date: Sun May 11 22:38:53 EDT 2014

htmlfmt: use uhtml for character set conversion

--- a/sys/src/cmd/htmlfmt/dat.h
+++ b/sys/src/cmd/htmlfmt/dat.h
@@ -28,12 +28,10 @@
 extern	char*	url;
 extern	int		aflag;
 extern	int		width;
-extern	int		defcharset;
 
 extern	char*	loadhtml(int);
 
 extern	char*	readfile(char*, char*, int*);
-extern	int	charset(char*);
 extern	void*	emalloc(ulong);
 extern	char*	estrdup(char*);
 extern	char*	estrstrdup(char*, char*);
--- a/sys/src/cmd/htmlfmt/html.c
+++ b/sys/src/cmd/htmlfmt/html.c
@@ -285,40 +285,13 @@
 	free(t);
 }
 
-/*
- * Somewhat of a hack.  Not a full parse, just looks for strings in the beginning
- * of the document (cistrstr only looks at first somewhat bytes).
- */
-int
-charset(char *s)
-{
-	char *meta, *emeta, *charset;
-
-	if(defcharset == 0)
-		defcharset = ISO_8859_1;
-	meta = cistrstr(s, "<meta");
-	if(meta == nil)
-		return defcharset;
-	for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
-		;
-	charset = cistrstr(s, "charset=");
-	if(charset == nil)
-		return defcharset;
-	charset += 8;
-	if(*charset == '"')
-		charset++;
-	if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
-		return UTF_8;
-	return defcharset;
-}
-
 void
 rendertext(URLwin *u, Bytes *b)
 {
 	Rune *rurl;
 
-	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
-	u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
+	rurl = toStr((uchar*)u->url, strlen(u->url), UTF_8);
+	u->items = parsehtml(b->b, b->n, rurl, u->type, UTF_8, &u->docinfo);
 //	free(rurl);
 
 	rerender(u);
--- a/sys/src/cmd/htmlfmt/main.c
+++ b/sys/src/cmd/htmlfmt/main.c
@@ -8,8 +8,35 @@
 char *url = "";
 int aflag;
 int width = 70;
-int defcharset;
+char *defcharset = "latin1";
 
+int
+uhtml(int fd)
+{
+	int p[2];
+	/* Put a /bin/uhtml child between the input and fd; if pipe or fork fails, fd is returned untouched. */
+	if(pipe(p) < 0)
+		return fd;
+	switch(fork()){
+	case -1:
+		break;	/* no child was created; just close the pipe below */
+	case 0:
+		dup(fd, 0);	/* child: the original input becomes fd 0 */
+		dup(p[1], 1);	/* child: the pipe's p[1] end becomes fd 1 */
+		close(p[1]);
+		close(p[0]);
+		execl("/bin/uhtml", "uhtml", "-c", defcharset, nil);	/* defcharset: "latin1" unless set with -c */
+		execl("/bin/cat", "cat", nil);	/* reached only if the uhtml exec failed */
+		exits("exec");
+	default:
+		dup(p[0], fd);	/* parent: fd now refers to the pipe's p[0] end */
+		break;
+	}
+	close(p[0]);	/* the pipe descriptors themselves are no longer needed here */
+	close(p[1]);
+	return fd;	/* same number as passed in, so callers can write loadhtml(uhtml(fd)) */
+}
+
 void
 usage(void)
 {
@@ -21,7 +48,7 @@
 main(int argc, char *argv[])
 {
 	int i, fd;
-	char *p, *err, *file;
+	char *err, *file;
 	char errbuf[ERRMAX];
 
 	ARGBEGIN{
@@ -29,9 +56,7 @@
 		aflag++;
 		break;
 	case 'c':
-		p = smprint("<meta charset=\"%s\">", EARGF(usage()));
-		defcharset = charset(p);
-		free(p);
+		defcharset = EARGF(usage());
 		break;
 	case 'l': case 'w':
 		err = EARGF(usage());
@@ -50,7 +75,7 @@
 	err = nil;
 	file = "<stdin>";
 	if(argc == 0)
-		err = loadhtml(0);
+		err = loadhtml(uhtml(0));
 	else
 		for(i=0; err==nil && i<argc; i++){
 			file = argv[i];
@@ -60,7 +85,7 @@
 				err = errbuf;
 				break;
 			}
-			err = loadhtml(fd);
+			err = loadhtml(uhtml(fd));
 			close(fd);
 			if(err)
 				break;