ref: 84d1ef146386cfe9c26cc897d458e832930ed3e0
dir: /sys/src/cmd/htmlfmt/html.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <regexp.h>
#include <html.h>
#include <ctype.h>
#include "dat.h"

/* Matches a leading scheme://name[.:name...] prefix of an absolute URL. */
char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
	"://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
Reprog *urlprog;	/* compiled urlexpr; built on first use by baseurl */

/*
 * Word-wrapping state shared by render, renderrunes and emitword.
 */
int inword = 0;	/* non-zero while renderrunes is scanning inside a word */
int col = 0;	/* runes emitted so far on the current output line */
int wordi = 0;	/* index in the rune string where the current word began */

/*
 * Read everything from fd, parse it as HTML and write the formatted
 * text to file descriptor 1.  Always returns nil.
 */
char*
loadhtml(int fd)
{
	URLwin *u;
	Bytes *b;
	int n;
	char buf[4096];

	u = emalloc(sizeof(URLwin));
	u->infd = fd;
	u->outfd = 1;
	u->url = estrdup(url);
	u->type = TextHtml;

	b = emalloc(sizeof(Bytes));
	while((n = read(fd, buf, sizeof buf)) > 0)
		growbytes(b, buf, n);
	if(b->b == nil){
		/*
		 * Empty file: nothing was parsed, so the only things to
		 * release are the allocations made above.
		 */
		free(u->url);
		free(u);
		free(b);
		return nil;
	}
	rendertext(u, b);

	/* freeurlwin does not release the url copy made above */
	free(u->url);
	u->url = nil;
	freeurlwin(u);

	/* release the input only after the items and docinfo are gone */
	free(b->b);
	free(b);
	return nil;
}

/*
 * Convert the first n runes of r to a newly allocated, NUL-terminated
 * byte string.  For n == 0 the result is a 1-byte buffer from emalloc.
 * Calls error() if the conversion cannot allocate.  The caller frees
 * the result.
 */
char*
runetobyte(Rune *r, int n)
{
	char *s;

	if(n == 0)
		return emalloc(1);
	s = smprint("%.*S", n, r);
	if(s == nil)
		error("malloc failed");
	return s;
}

/*
 * Report whether c is one of the punctuation characters that should
 * attach directly to the preceding word (no space before it).
 */
int
closingpunct(char c)
{
	return strchr(".,:;'\")]}>!?", c) != nil;
}

/*
 * Append the nr runes at r to b as a single word.
 *
 * A space is placed before the word unless b is empty, b already ends
 * in white space, or the word begins with closing punctuation.  If the
 * word (plus that space) would run past the global width, a newline is
 * emitted instead.  col is advanced by the number of runes written and
 * inword is cleared.
 */
void
emitword(Bytes *b, Rune *r, int nr)
{
	char *s;
	int space;

	if(nr == 0)
		return;
	/* runetobyte reports allocation failure instead of returning nil */
	s = runetobyte(r, nr);
	space = b->n > 0 && !isspace(b->b[b->n-1]) && !closingpunct(*s);
	if(col > 0 && col+space+nr > width){
		growbytes(b, "\n", 1);
		space = 0;
		col = 0;
	}
	if(space && col > 0){
		growbytes(b, " ", 1);
		col++;
	}
	growbytes(b, s, strlen(s));
	col += nr;
	free(s);
	inword = 0;
}

/*
 * Append the NUL-terminated rune string r to b, wrapping words.
 *
 * Spaces end the current word.  A newline ends the current word,
 * resets col and is copied to b unless b is still empty or already
 * ends in two newlines, so the output never starts with a blank line
 * and never has more than one blank line in a row.
 */
void
renderrunes(Bytes *b, Rune *r)
{
	int i, n;

	n = runestrlen(r);
	for(i=0; i<n; i++){
		switch(r[i]){
		case '\n':
			if(inword)
				emitword(b, r+wordi, i-wordi);
			col = 0;
			if(b->n == 0)
				break;	/* don't start with blank lines */
			if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
				growbytes(b, "\n", 1);
			break;
		case ' ':
			if(inword)
				emitword(b, r+wordi, i-wordi);
			break;
		default:
			if(!inword)
				wordi = i;
			inword = 1;
			break;
		}
	}
	/* flush a word that runs to the end of the string */
	if(inword)
		emitword(b, r+wordi, i-wordi);
}

/*
 * Format the arguments print-style into a rune string and pass the
 * result through renderrunes.
 */
void
renderbytes(Bytes *b, char *fmt, ...)
{
	Rune *r;
	va_list arg;

	va_start(arg, fmt);
	r = runevsmprint(fmt, arg);
	va_end(arg);
	if(r == nil)
		error("malloc failed");
	renderrunes(b, r);
	free(r);
}

/*
 * Return a newly allocated base for url, or nil if url is nil or does
 * not match urlexpr.
 *
 * The base is url cut at its last '/', provided that slash is not
 * inside the part matched by urlexpr; otherwise it is just the matched
 * part.  The caller frees the result.
 */
char*
baseurl(char *url)
{
	char *base, *slash;
	Resub rs[10];

	if(url == nil)
		return nil;
	if(urlprog == nil){
		urlprog = regcomp(urlexpr);
		if(urlprog == nil)
			error("can't compile URL regexp");
	}
	memset(rs, 0, sizeof rs);
	if(regexec(urlprog, url, rs, nelem(rs)) == 0)
		return nil;
	base = estrdup(url);
	slash = strrchr(base, '/');
	if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
		*slash = '\0';
	else
		base[rs[0].ep-rs[0].sp] = '\0';
	return base;
}

/*
 * Build a printable URL for the link target rhref found in document u.
 *
 * If rhref matches urlexpr it is returned as is (converted to bytes).
 * Otherwise, when u->url has a base, the result is that base followed
 * by rhref, with a '/' inserted when the base does not end in one and
 * rhref does not start with one.  A nil rhref yields "NULL URL".
 * The result is always newly allocated; the caller frees it.
 */
char*
fullurl(URLwin *u, Rune *rhref)
{
	char *base, *href, *hrefbase;
	char *result;

	if(rhref == nil)
		return estrdup("NULL URL");
	href = runetobyte(rhref, runestrlen(rhref));
	hrefbase = baseurl(href);
	result = nil;
	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
		result = estrdup(base);
		if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
			result = eappend(result, "/", "");
		free(base);
	}
	if(href){
		if(result)
			result = eappend(result, "", href);
		else
			result = estrdup(href);
	}
	free(hrefbase);
	/* result holds its own copy of href by now */
	free(href);
	if(result == nil)
		return estrdup("***unknown***");
	return result;
}

/*
 * Append a plain-text rendering of the item list to t.
 *
 * The wrapping state (inword, col, wordi) is reset on entry, including
 * on the recursive calls made for table cells and floats.  curanchor
 * is the anchor id most recently seen by the caller; when an item
 * carries a different non-zero anchor id and aflag is set, the
 * anchor's URL is printed in brackets after the item.  The output is
 * left ending in a newline if it is not empty.
 */
void
render(URLwin *u, Bytes *t, Item *items, int curanchor)
{
	Item *il;
	Itext *it;
	Ifloat *ifl;
	Ispacer *is;
	Itable *ita;
	Iimage *im;
	Anchor *a;
	Table *tab;
	Tablecell *cell;
	char *href;

	inword = 0;
	col = 0;
	wordi = 0;

	for(il=items; il!=nil; il=il->next){
		if(il->state & IFbrk)
			renderbytes(t, "\n");
		if(il->state & IFbrksp)
			renderbytes(t, "\n");

		switch(il->tag){
		case Itexttag:
			it = (Itext*)il;
			if(it->state & IFwrap)
				renderrunes(t, it->s);
			else
				emitword(t, it->s, runestrlen(it->s));
			break;
		case Iruletag:
			if(t->n>0 && t->b[t->n-1]!='\n')
				renderbytes(t, "\n");
			renderbytes(t, "=======\n");
			break;
		case Iimagetag:
			if(!aflag)
				break;
			im = (Iimage*)il;
			if(im->imsrc){
				href = fullurl(u, im->imsrc);
				renderbytes(t, "[image %s]", href);
				free(href);
			}
			break;
		case Iformfieldtag:
			if(aflag)
				renderbytes(t, "[formfield]");
			break;
		case Itabletag:
			ita = (Itable*)il;
			tab = ita->table;
			for(cell=tab->cells; cell!=nil; cell=cell->next){
				render(u, t, cell->content, curanchor);
			}
			if(t->n>0 && t->b[t->n-1]!='\n')
				renderbytes(t, "\n");
			break;
		case Ifloattag:
			ifl = (Ifloat*)il;
			render(u, t, ifl->item, curanchor);
			break;
		case Ispacertag:
			is = (Ispacer*)il;
			if(is->spkind != ISPnull)
				renderbytes(t, " ");
			break;
		default:
			error("unknown item tag %d\n", il->tag);
		}

		if(il->anchorid != 0 && il->anchorid!=curanchor){
			/* anchors are only printed with aflag; skip the search otherwise */
			if(aflag){
				for(a=u->docinfo->anchors; a!=nil; a=a->next){
					if(a->index == il->anchorid){
						href = fullurl(u, a->href);
						renderbytes(t, "[%s]", href);
						free(href);
						break;
					}
				}
			}
			curanchor = il->anchorid;
		}
	}
	if(t->n>0 && t->b[t->n-1]!='\n')
		renderbytes(t, "\n");
}

/*
 * Render u->items into a temporary buffer and write the result, if
 * any, to u->outfd.
 */
void
rerender(URLwin *u)
{
	Bytes *t;

	t = emalloc(sizeof(Bytes));

	render(u, t, u->items, 0);

	if(t->n)
		write(u->outfd, (char*)t->b, t->n);
	free(t->b);
	free(t);
}

/*
 * Guess the character set of the NUL-terminated document s.
 *
 * Somewhat of a hack: this is not a full parse.  If the text contains
 * a "<meta" tag and a "charset=" whose value, optionally quoted,
 * starts with "utf-8" or "utf8" (in any case), the answer is UTF_8.
 * In every other case it is defcharset, which is set to ISO_8859_1
 * here if it was still 0.
 */
int
charset(char *s)
{
	char *cs;

	if(defcharset == 0)
		defcharset = ISO_8859_1;
	if(cistrstr(s, "<meta") == nil)
		return defcharset;
	cs = cistrstr(s, "charset=");
	if(cs == nil)
		return defcharset;
	cs += 8;	/* strlen("charset=") */
	if(*cs == '"' || *cs == '\'')
		cs++;
	/* cistrncmp returns 0 when the strings match */
	if(cistrncmp(cs, "utf-8", 5) == 0 || cistrncmp(cs, "utf8", 4) == 0)
		return UTF_8;
	return defcharset;
}

/*
 * Parse the HTML held in b into u->items and u->docinfo, then render
 * it to u->outfd.
 */
void
rendertext(URLwin *u, Bytes *b)
{
	Rune *rurl;

	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
	u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
	/*
	 * NOTE(review): rurl is deliberately not freed here (the original
	 * had the free commented out); presumably parsehtml keeps the
	 * pointer - confirm before freeing it.
	 */
	rerender(u);
}

/*
 * Release the parsed items and document info held by u, then u
 * itself.  u->url is not freed here.
 */
void
freeurlwin(URLwin *u)
{
	freeitems(u->items);
	u->items = nil;
	freedocinfo(u->docinfo);
	u->docinfo = nil;
	free(u);
}