ref: 51c7856350c5c542269f01cf98a7385adbe515a1
parent: 6f5e6eff590d3e8b7303bbd5591e505b355a7b79
author: cinap_lenrek <cinap_lenrek@centraldogma>
date: Wed Oct 5 00:47:53 EDT 2011
uhtml: assume latin1 if not valid utf8
--- a/sys/src/cmd/uhtml.c
+++ b/sys/src/cmd/uhtml.c
@@ -3,8 +3,8 @@
#include <ctype.h>
int nbuf;
-char buf[4096+1];
-char *cset = "utf";
+char buf[64*1024+1];
+char *cset = nil;
void
usage(void)
@@ -40,8 +40,9 @@
void
main(int argc, char *argv[])
{
- int pfd[2], pflag = 0;
+ int n, pfd[2], pflag = 0;
char *arg[4], *s, *p;
+ Rune r;
ARGBEGIN {
case 'h':
@@ -59,26 +60,30 @@
if(open(*argv, OREAD) != 1)
sysfatal("open: %r");
}
- if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
- sysfatal("read: %r");
- buf[nbuf] = 0;
+ nbuf = 0;
p = buf;
- while(nbuf > 0){
- if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
- p += 3;
- cset = "utf";
+ while(nbuf < sizeof(buf)-1){
+ if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
break;
+ nbuf += n;
+ buf[nbuf] = 0;
+ if(nbuf == n){
+ if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
+ p += 3;
+ cset = "utf";
+ break;
+ }
+ if(memcmp(p, "\xFE\xFF", 2) == 0){
+ p += 2;
+ cset = "unicode-be";
+ break;
+ }
+ if(memcmp(p, "\xFF\xFE", 2) == 0){
+ p += 2;
+ cset = "unicode-le";
+ break;
+ }
}
- if(memcmp(p, "\xFE\xFF", 2) == 0){
- p += 2;
- cset = "unicode-be";
- break;
- }
- if(memcmp(p, "\xFF\xFE", 2) == 0){
- p += 2;
- cset = "unicode-le";
- break;
- }
if(s = cistrstr(p, "encoding="))
if(s = strval(s+9)){
cset = s;
@@ -89,9 +94,20 @@
cset = s;
break;
}
- break;
}
nbuf -= p - buf;
+
+ if(cset == nil){
+ cset = "utf";
+ s = p;
+ while(s+UTFmax < p+nbuf){
+ s += chartorune(&r, s);
+ if(r == Runeerror){
+ cset = "latin1";
+ break;
+ }
+ }
+ }
if(pflag){
print("%s\n", cset);