shithub: riscv

ref: 81d393942d8834b6e071ab0957b655a99e737486
dir: /sys/src/cmd/htmlfmt/html.c/

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <regexp.h>
#include <html.h>
#include <ctype.h>
#include "dat.h"

char urlexpr[] =
	"^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
	"://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
Reprog	*urlprog;

int inword = 0;
int col = 0;
int wordi = 0;

char*
loadhtml(int fd)
{
	URLwin *u;
	Bytes *b;
	int n;
	char buf[4096];

	u = emalloc(sizeof(URLwin));
	u->infd = fd;
	u->outfd = 1;
	u->url = estrdup(url);
	u->type = TextHtml;

	b = emalloc(sizeof(Bytes));
	while((n = read(fd, buf, sizeof buf)) > 0)
		growbytes(b, buf, n);
	if(b->b == nil)
		return nil;	/* empty file */
	rendertext(u, b);
	freeurlwin(u);
	return nil;
}

char*
runetobyte(Rune *r, int n)
{
	char *s;

	if(n == 0)
		return emalloc(1);
	s = smprint("%.*S", n, r);
	if(s == nil)
		error("malloc failed");
	return s;
}

int
closingpunct(char c)
{
	return strchr(".,:;'\")]}>!?", c) != nil;
}

void
emitword(Bytes *b, Rune *r, int nr)
{
	char *s;
	int space;

	if(nr == 0)
		return;
	s = smprint("%.*S", nr, r);
	space = b->n > 0 && !isspace(b->b[b->n-1]) && !closingpunct(*s);
	if(col > 0 && col+space+nr > width){
		growbytes(b, "\n", 1);
		space = 0;
		col = 0;
	}
	if(space && col > 0){
		growbytes(b, " ", 1);
		col++;
	}
	growbytes(b, s, strlen(s));
	col += nr;
	free(s);
	inword = 0;
}

void
renderrunes(Bytes *b, Rune *r)
{
	int i, n;

	n = runestrlen(r);
	for(i=0; i<n; i++){
		switch(r[i]){
		case '\n':
			if(inword)
				emitword(b, r+wordi, i-wordi);
			col = 0;
			if(b->n == 0)
				break;	/* don't start with blank lines */
			if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
				growbytes(b, "\n", 1);
			break;
		case ' ':
			if(inword)
				emitword(b, r+wordi, i-wordi);
			break;
		default:
			if(!inword)
				wordi = i;
			inword = 1;
			break;
		}
	}
	if(inword)
		emitword(b, r+wordi, i-wordi);
}

void
renderbytes(Bytes *b, char *fmt, ...)
{
	Rune *r;
	va_list arg;

	va_start(arg, fmt);
	r = runevsmprint(fmt, arg);
	va_end(arg);
	renderrunes(b, r);
	free(r);
}

char*
baseurl(char *url)
{
	char *base, *slash;
	Resub rs[10];

	if(url == nil)
		return nil;
	if(urlprog == nil){
		urlprog = regcomp(urlexpr);
		if(urlprog == nil)
			error("can't compile URL regexp");
	}
	memset(rs, 0, sizeof rs);
	if(regexec(urlprog, url, rs, nelem(rs)) == 0)
		return nil;
	base = estrdup(url);
	slash = strrchr(base, '/');
	if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
		*slash = '\0';
	else
		base[rs[0].ep-rs[0].sp] = '\0';
	return base;
}

char*
fullurl(URLwin *u, Rune *rhref)
{
	char *base, *href, *hrefbase;
	char *result;

	if(rhref == nil)
		return estrdup("NULL URL");
	href = runetobyte(rhref, runestrlen(rhref));
	hrefbase = baseurl(href);
	result = nil;
	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
		result = estrdup(base);
		if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
			result = eappend(result, "/", "");
		free(base);
	}
	if(href){
		if(result)
			result = eappend(result, "", href);
		else
			result = estrdup(href);
	}
	free(hrefbase);
	if(result == nil)
		return estrdup("***unknown***");
	return result;
}

void
render(URLwin *u, Bytes *t, Item *items, int curanchor)
{
	Item *il;
	Itext *it;
	Ifloat *ifl;
	Ispacer *is;
	Itable *ita;
	Iimage *im;
	Anchor *a;
	Table *tab;
	Tablecell *cell;
	char *href;

	inword = 0;
	col = 0;
	wordi = 0;

	for(il=items; il!=nil; il=il->next){
		if(il->state & IFbrk)
			renderbytes(t, "\n");
		if(il->state & IFbrksp)
			renderbytes(t, "\n");

		switch(il->tag){
		case Itexttag:
			it = (Itext*)il;
			if(it->state & IFwrap)
				renderrunes(t, it->s);
			else
				emitword(t, it->s, runestrlen(it->s));
			break;
		case Iruletag:
			if(t->n>0 && t->b[t->n-1]!='\n')
				renderbytes(t, "\n");
			renderbytes(t, "=======\n");
			break;
		case Iimagetag:
			if(!aflag)
				break;
			im = (Iimage*)il;
			if(im->imsrc){
				href = fullurl(u, im->imsrc);
				renderbytes(t, "[image %s]", href);
				free(href);
			}
			break;
		case Iformfieldtag:
			if(aflag)
				renderbytes(t, "[formfield]");
			break;
		case Itabletag:
			ita = (Itable*)il;
			tab = ita->table;
			for(cell=tab->cells; cell!=nil; cell=cell->next){
				render(u, t, cell->content, curanchor);
			}
			if(t->n>0 && t->b[t->n-1]!='\n')
				renderbytes(t, "\n");
			break;
		case Ifloattag:
			ifl = (Ifloat*)il;
			render(u, t, ifl->item, curanchor);
			break;
		case Ispacertag:
			is = (Ispacer*)il;
			if(is->spkind != ISPnull)
				renderbytes(t, " ");
			break;
		default:
			error("unknown item tag %d\n", il->tag);
		}
		if(il->anchorid != 0 && il->anchorid!=curanchor){
			for(a=u->docinfo->anchors; a!=nil; a=a->next)
				if(aflag && a->index == il->anchorid){
					href = fullurl(u, a->href);
					renderbytes(t, "[%s]", href);
					free(href);
					break;
				}
			curanchor = il->anchorid;
		}
	}
	if(t->n>0 && t->b[t->n-1]!='\n')
		renderbytes(t, "\n");
}

void
rerender(URLwin *u)
{
	Bytes *t;

	t = emalloc(sizeof(Bytes));

	render(u, t, u->items, 0);

	if(t->n)
		write(u->outfd, (char*)t->b, t->n);
	free(t->b);
	free(t);
}

/*
 * Somewhat of a hack.  Not a full parse, just looks for strings in the beginning
 * of the document (cistrstr only looks at first somewhat bytes).
 */
int
charset(char *s)
{
	char *meta, *emeta, *charset;

	if(defcharset == 0)
		defcharset = ISO_8859_1;
	meta = cistrstr(s, "<meta");
	if(meta == nil)
		return defcharset;
	for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
		;
	charset = cistrstr(s, "charset=");
	if(charset == nil)
		return defcharset;
	charset += 8;
	if(*charset == '"')
		charset++;
	if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
		return UTF_8;
	return defcharset;
}

void
rendertext(URLwin *u, Bytes *b)
{
	Rune *rurl;

	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
	u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
//	free(rurl);

	rerender(u);
}


void
freeurlwin(URLwin *u)
{
	freeitems(u->items);
	u->items = nil;
	freedocinfo(u->docinfo);
	u->docinfo = nil;
	free(u);
}