shithub: pdffs

Download patch

ref: b574ea6ce2f6fb2aa129da26687d4affeb1faabc
parent: 8a7f9b4b5c158944978efb13bc03dfd6e42899d3
author: Noam Preil <[email protected]>
date: Mon Jul 19 20:33:14 EDT 2021

Significantly improved text output

--- a/main.c
+++ b/main.c
@@ -17,17 +17,21 @@
 	threadexitsall("usage");
 }
 
-static void
+static int /* renders one page to fd 1; returns pagerender()'s result (0 on failure) */
 dumppage(Object *page)
 {
+	int ret, off;
 	Page p;
-	pageinit(&p);
-	if(pagerender(&p, page) && p.buf.sz != 0)
-		write(1, p.buf.b, p.buf.sz);
+	pageinit(&p, page);
+	ret = pagerender(&p);
+	/* every content stream ends its text with "\n\0", so a single %s would stop after the first stream; print each NUL-terminated piece */
+	for(off = 0; ret && off < p.buf.sz; off += strlen((char*)p.buf.b + off) + 1)
+		fprint(1, "%s", (char*)p.buf.b + off);
 	pagefree(&p);
+	return ret;
 }
 
-static void
+static int
 dumppages(Object *pages)
 {
 	Object *page, *kids, *type;
@@ -39,13 +43,19 @@
 		// Must be a dict, either Page or Pages
 		type = dictget(page, "Type");
 		// MUST be a name.
-		if(strcmp(type->name, "Pages") == 0)
-			dumppages(page);
-		else if(strcmp(type->name, "Page") == 0)
-			dumppage(page);
+		if(strcmp(type->name, "Pages") == 0){
+			if(!dumppages(page))
+				return 0;
+		}
+		else if(strcmp(type->name, "Page") == 0){
+			if(!dumppage(page))
+				return 0;
+			print("\n");
+		}
 		else
 			sysfatal("Unexpected page node type '%s'", type->name);
 	}
+	return 1;
 }
 
 
--- a/misc.c
+++ b/misc.c
@@ -50,6 +50,7 @@
 		return fmtprint(f, "%g", o->num.d);
 
 	case Ostr:
+	case Oop:
 		if(isutf8(o->str, o->len))
 			return fmtprint(f, "%q", o->str);
 		return fmtprint(f, "<%.*H>", o->len, o->str);
--- a/object.c
+++ b/object.c
@@ -8,6 +8,52 @@
 Object *pdfarray(Pdf *pdf, Stream *s);
 Object *pdfdict(Pdf *pdf, Stream *s);
 
+/* returns 1 if str is at the beginning of the stream and is
+	followed either by whitespace or, if delim is nonzero, a delimiter.
+	strlen(str) must be in [1, 15]: str plus the one following
+	byte are read into a 16-byte buffer; other lengths return 0.
+	on match, the stream is left right after the string (the following
+	byte is pushed back). otherwise, the stream position is unchanged. */
+static int
+sismatch(Stream *s, char *str, int delim)
+{
+	long len = strlen(str);
+	vlong off = Soffset(s);
+	char b[16];
+	if(len == 0 || len >= (long)sizeof(b)) /* len + 1 bytes must fit in b */
+		return 0;
+	if(Sread(s, b, len + 1) == len + 1 && memcmp(b, str, len) == 0 && (isws(b[len]) || (delim && isdelim(b[len])))){
+		Sungetc(s);
+		return 1;
+	}
+
+	Sseek(s, off, 0);
+	return 0;
+}
+
+char * /* strdup'd token of at most 6 bytes (caller frees), or nil if none could be read */
+suntilend(Stream *s)
+{
+	int sz, c, full = 0;
+	char buf[8];
+	for(sz = 0; sz < 7;){
+		c = Sgetc(s);
+		if(c < 0 || isws(c) || isdelim(c)){
+			/* end of input ends a non-empty token just like whitespace or a delimiter */
+			full = sz > 0 || c >= 0;
+			if(c >= 0)
+				Sungetc(s); /* leave the terminator for the next read */
+			break;
+		}
+		buf[sz] = c;
+		sz += 1;
+	}
+	if(!full) /* seven bytes without a terminator, or nothing before end of input */
+		return nil;
+	buf[sz] = 0;
+	return strdup(buf);
+}
+
 /* General function to parse an object of any type. */
 Object *
 pdfobj(Pdf *pdf, Stream *s)
@@ -14,9 +60,8 @@
 {
 	Object *o, *o2;
 	vlong off;
-	int c, tf;
+	int c;
 	Xref xref;
-	char b[16];
 
 	o = o2 = nil;
 	do; while(isws(c = Sgetc(s)));
@@ -23,6 +68,30 @@
 	if(c < 0)
 		goto err;
 
+	if(isascii(c) && isalpha(c)){
+		Sungetc(s);
+		// bool, null, or op
+		if(sismatch(s, "null", 1)){
+			fprint(1, "NULL\n");
+			return &null;
+		}
+		if((o = calloc(1, sizeof(*o))) == nil)
+			goto err;
+		o->type = Obool;
+		o->pdf = pdf;
+		if(sismatch(s, "true", 1)){
+			o->bool = 1;
+			return o;
+		}
+		if(sismatch(s, "false", 1)){
+			o->bool = 0;
+			return o;
+		}
+		o->type = Oop;
+		o->str = suntilend(s);
+		return o;
+	}
+
 	switch(c){
 	case '<': /* dictionary or a string */
 		c = Sgetc(s);
@@ -33,7 +102,8 @@
 				off = Soffset(s);
 				do; while(isws(Sgetc(s)));
 				Sungetc(s);
-				if(Sread(s, b, 7) == 7 && memcmp(b, "stream", 6) == 0 && isws(c = b[6])){
+				if(sismatch(s, "stream", 0)){
+					c = Sgetc(s);
 					/* there IS a stream */
 					if(c == '\r' && (c = Sgetc(s)) < 0)
 						goto err;
@@ -71,45 +141,8 @@
 			o->pdf = pdf;
 		return o;
 
-	case 'n':
-		off = Soffset(s);
-		if(Sgetc(s) == 'u' && Sgetc(s) == 'l' && Sgetc(s) == 'l' && (isws(c = Sgetc(s)) || isdelim(c))){
-			Sungetc(s);
-			return &null;
-		}
-		Sseek(s, off, 0);
-		c = 'n';
-		goto unexpected;
-
-	case 't':
-		off = Soffset(s);
-		tf = 1;
-		if(Sgetc(s) == 'r' && Sgetc(s) == 'u' && Sgetc(s) == 'e' && (isws(c = Sgetc(s)) || isdelim(c)))
-			goto bool;
-		Sseek(s, off, 0);
-		c = 't';
-		goto unexpected;
-
-	case 'f':
-		off = Soffset(s);
-		tf = 0;
-		if(Sgetc(s) == 'a' && Sgetc(s) == 'l' && Sgetc(s) == 's' && Sgetc(s) == 'e' && (isws(c = Sgetc(s)) || isdelim(c)))
-			goto bool;
-		Sseek(s, off, 0);
-		c = 'f';
-		goto unexpected;
-bool:
-		Sungetc(s);
-		if((o = calloc(1, sizeof(*o))) == nil)
-			goto err;
-		o->type = Obool;
-		o->pdf = pdf;
-		o->bool = tf;
-		return o;
-
 	default:
 		if(!isdigit(c) && c != '-'){
-unexpected:
 			Sungetc(s);
 			werrstr("unexpected char '%c' at %#x+%#x (%d left)", c, Sobjoffset(s), Soffset(s), Ssize(s));
 			goto err;
@@ -178,6 +211,7 @@
 		return;
 
 	case Ostr:
+	case Oop:
 	case Oname:
 		free(o->str);
 		break;
--- a/op.c
+++ b/op.c
@@ -12,6 +12,30 @@
 
 typedef struct Op Op;
 
+static void
+matidentity(double *arr) /* writes the six values {1,0, 0,1, 0,0} into arr[0..5] */
+{
+	double src[6] = {
+					1, 0,
+					0, 1,
+					0, 0
+	};
+	memcpy(arr, src, sizeof(double) * 6);
+}
+
+static void
+matmult(double *m1, double *m2, double *out) /* out = m1 x m2, each a 3x3 matrix stored as {a,b,c,d,e,f} with third column (0,0,1) */
+{
+	double result[6]; /* computed in a scratch array, so out may be the same array as m1 or m2 */
+	result[0] = m1[0] * m2[0] + m1[1] * m2[2];
+	result[1] = m1[0] * m2[1] + m1[1] * m2[3];
+	result[2] = m1[2] * m2[0] + m1[3] * m2[2];
+	result[3] = m1[2] * m2[1] + m1[3] * m2[3];
+	result[4] = m1[4] * m2[0] + m1[5] * m2[2] + m2[4]; /* translation row also picks up m2's translation */
+	result[5] = m1[4] * m2[1] + m1[5] * m2[3] + m2[5];
+	memcpy(out, result, sizeof(double) * 6);
+}
+
 struct Op {
 	char *s;
 	int (*f)(Op *op, Page *p);
@@ -20,6 +44,16 @@
 };
 
 static int
+flagless(Op *op) /* returns 1 if op->flags is 0; otherwise prints a message to fd 2 and returns 0 */
+{
+	if(op->flags != 0){
+		fprint(2, "Op '%s' expected no flags\n", op->s);
+		return 0;
+	}
+	return 1;
+}
+
+static int
 cobegin(Op *op, Page *p)
 {
 	USED(op, p);
@@ -36,29 +70,54 @@
 static int
 gspush(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	USED(op);
+	GS *r = realloc(p->GS, sizeof(GS) * (p->nGS + 1)); /* grow the graphics state stack by one slot */
+	if(r == nil)
+		return 0; /* p->GS was not overwritten, so the old stack is still usable */
+	p->GS = r;
+	p->nGS += 1;
+	p->GSactive = &p->GS[p->nGS - 1];
+	*(p->GSactive) = p->GS[p->nGS - 2]; /* new top starts as a struct copy of the previous top. NOTE(review): the copy shares the Font.widths pointer, which gsfree() frees per entry — confirm no double free */
+	return 1;
 }
 
 static int
 gspop(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	GS *r; /* shrunken graphics state stack */
+	USED(op);
+	if(p->nGS < 2 || (r = realloc(p->GS, sizeof(GS) * (p->nGS - 1))) == nil) /* never pop the last state: GSactive must stay at GS[nGS - 1] */
+		return 0;
+	p->GS = r;
+	p->nGS -= 1;
+	p->GSactive = &p->GS[p->nGS - 1];
+	return 1;
 }
 
+/* the six operands on the stack give a,b,c,d,e,f for the matrix
+	[a b 0]
+	[c d 0]
+	[e f 1]
+ That matrix is premultiplied with the active state's current matrix:
+ newCTM = input x oldCTM
+ (8.3.4). Returns flagless(op).
+ */
 static int
 gsctm(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	double input[6];
+	int i;
+	for(i = 0; i < 6; i += 1)
+		input[i] = arrayget(p->stack, i)->num.d;
+	matmult(input, p->GSactive->CTM, p->GSactive->CTM); /* matmult works in a scratch array, so writing back into CTM is fine */
+	return flagless(op);
 }
 
 static int
 gswidth(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	p->GSactive->LW = arrayget(p->stack, 0)->num.i; /* stores operand 0 as the line width; NOTE(review): read via num.i into an int field — confirm fractional widths are not needed */
+	return flagless(op);
 }
 
 static int
@@ -99,8 +158,8 @@
 static int
 gsflatness(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	p->GSactive->FL = arrayget(p->stack, 0)->num.d; /* stores operand 0 in the active state's FL field */
+	return flagless(op);
 }
 
 static int
@@ -114,7 +173,7 @@
 pcmove(Op *op, Page *p)
 {
 	USED(op, p);
-	return 0;
+	return 1;
 }
 
 static int
@@ -121,7 +180,7 @@
 pcline(Op *op, Page *p)
 {
 	USED(op, p);
-	return 0;
+	return 1;
 }
 
 static int
@@ -128,7 +187,7 @@
 pccurve(Op *op, Page *p)
 {
 	USED(op, p);
-	return 0;
+	return 1;
 }
 
 static int
@@ -149,7 +208,7 @@
 ppstroke(Op *op, Page *p)
 {
 	USED(op, p);
-	return 0;
+	return 1;
 }
 
 static int
@@ -218,8 +277,21 @@
 static int
 cgray(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	int value = 255 * arrayget(p->stack, 0)->num.d; /* gray operand scaled to a byte; NOTE(review): not clamped — assumes the operand lies in [0, 1] */
+	int i;
+	u32int *color;
+	if(op->flags & Nonstroking){ /* the flag selects which colour and colour space get updated */
+		color = &p->GSactive->NSC;
+		p->GSactive->NSCS = DeviceGray;
+	} else{
+		color = &p->GSactive->SC;
+		p->GSactive->SCS = DeviceGray;
+	}
+	*color = 0;
+	for(i = 0; i < 3; i += 1)
+		*color = (*color | value) << 8; /* after three rounds value sits in each of the upper three bytes */
+	*color |= 255; /* lowest byte set to 255 */
+	return 1;
 }
 
 static int
@@ -295,21 +367,51 @@
 static int
 tslead(Op *op, Page *p)
 {
-	int d = arrayget(p->stack, 0)->num.d / 20;
-	while(d > 0){
-		d -= 1;
-		if(bufput(&p->buf, (uchar*)"\n", 1) == -1)
-			sysfatal("OOM");
+	p->TS.TL = arrayget(p->stack, 0)->num.d; /* records operand 0 as the leading; tpmove0() moves down by this amount */
+	return flagless(op);
+}
+
+static int
+fontwidths(Page *p) /* rebuilds the active font's width table from its FirstChar/LastChar/Widths entries; returns 0 only when Widths is missing */
+{
+	Object *o;
+	int i;
+	free(p->GSactive->Font.widths); /* drop the previous font's table... */
+	p->GSactive->Font.widths = nil; /* ...and never leave the freed pointer behind for a later free() */
+	o = dictget(p->GSactive->Font.font, "FirstChar");
+	if(o == nil)
+		return 1;
+	p->GSactive->Font.first = o->num.i;
+	p->GSactive->Font.last = dictget(p->GSactive->Font.font, "LastChar")->num.i;
+	p->GSactive->Font.widths = calloc(p->GSactive->Font.last - p->GSactive->Font.first + 1, sizeof(int)); /* zeroed, so entries a short Widths array does not cover read as 0 */
+	if(p->GSactive->Font.widths == nil){
+		print("Failed to allocate for (%d, %d): %d\n", p->GSactive->Font.first, p->GSactive->Font.last, p->GSactive->Font.last - p->GSactive->Font.first + 1);
+		return 1;
 	}
-	USED(op, p);
-	return 0;
+	o = dictget(p->GSactive->Font.font, "Widths");
+	if(o == nil)
+		return 0;
+	for(i = 0; i < arraylen(o) && i <= p->GSactive->Font.last - p->GSactive->Font.first; i += 1) /* never write past the last - first + 1 allocated entries */
+		p->GSactive->Font.widths[i] = arrayget(o, i)->num.i;
+	o = dictget(p->GSactive->Font.font, "FontDescriptor");
+	p->GSactive->Font.defwidth = dictget(o, "MissingWidth")->num.i; /* width tchar() uses for codes outside [first, last] */
+	return 1;
 }
 
 static int
 tsfontsz(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	char *name = arrayget(p->stack, 0)->name; /* operand 0: name looked up in the page's Resources/Font dictionary */
+	p->GSactive->Font.font = dictget(dictget(dictget(p->obj, "Resources"), "Font"), name);
+	if(p->GSactive->Font.font == nil){
+		werrstr("Font not found: '%s'", name);
+		return 0;
+	}
+	p->GSactive->Font.enc = dictget(p->GSactive->Font.font, "Encoding");
+	if(p->GSactive->Font.enc)
+		p->GSactive->Font.enc = dictget(p->GSactive->Font.enc, "Differences"); /* only the Differences entry is kept; writepatched() walks it */
+	p->GSactive->Font.size = arrayget(p->stack, 1)->num.d; /* operand 1: font size */
+	return fontwidths(p) && flagless(op);
 }
 
 static int
@@ -329,79 +431,182 @@
 static int
 tobegin(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	if(p->TS.inobj){
+		werrstr("Text objects must not be nested");
+		return 0;
+	}
+	matidentity(p->TS.Tm); /* both text matrices start each text object as identity */
+	matidentity(p->TS.Tlm);
+	p->TS.inobj = 1;
+	p->GSactive->Font.font = nil; /* NOTE(review): clears the font object but leaves Font.enc and Font.widths as they were — confirm intended */
+	return flagless(op);
 }
 
 static int
 toend(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	if(!p->TS.inobj){
+		werrstr("ET found without BT");
+		return 0;
+	}
+	p->TS.inobj = 0; /* leaving the text object lets tobegin() accept the next one */
+	return flagless(op);
 }
 
 static int
+tmove(Page *p, double x, double y, int tlm) /* translates by (x, y); always returns 1 */
+{
+	double shift[6] = {1, 0, 0, 1, x, y};
+	if(tlm){
+		matmult(shift, p->TS.Tlm, p->TS.Tlm); /* move the line matrix, then restart Tm from it */
+		memcpy(p->TS.Tm, p->TS.Tlm, sizeof(double) * 6);
+	} else{
+		matmult(shift, p->TS.Tm, p->TS.Tm); /* move only Tm; Tlm is left alone */
+	}
+	return 1;
+}
+
+static int
 tpmove(Op *op, Page *p)
 {
 	Object *x, *y;
 	x = arrayget(p->stack, 0);
 	y = arrayget(p->stack, 1);
-	if(y->num.d != 0){
-		if(bufput(&p->buf, (uchar*)"\n", 1) == -1)
-			sysfatal("OOM");
-	}
-	else if(x->num.d < 50)
-		if(bufput(&p->buf, (uchar*)" ", 1) == -1)
-			sysfatal("OOM");
-	USED(op, p);
-	return 0;
+	if(op->flags & Leading)
+		p->TS.TL = -y->num.d; /* the Leading variant also records the negated y offset as the leading */
+	return tmove(p, x->num.d, y->num.d, 1); /* shift the line matrix by (x, y) and restart Tm from it */
 }
 
 static int
 tpmatrix(Op *op, Page *p)
 {
-	USED(op, p);
-	return 0;
+	int i;
+	for(i = 0; i < 6; i += 1){
+		p->TS.Tm[i] = arrayget(p->stack, i)->num.d; /* the six operands replace Tm outright (no multiplication) */
+		p->TS.Tlm[i] = p->TS.Tm[i]; /* and Tlm gets the same values */
+	}
+	return flagless(op);
 }
 
 static int
 tpmove0(Op *op, Page *p)
 {
-	USED(op, p);
-	if(bufput(&p->buf, (uchar*)"\n", 1) == -1)
-		sysfatal("OOM");
-	return 0;
+	return tmove(p, 0, 0 - p->TS.TL, 1) && flagless(op); /* shift the line matrix by (0, -TL), i.e. by the stored leading */
 }
 
 static int
+writepatched(Page *p, uchar c) /* appends the text for character code c to p->buf; returns 1 on success, 0 if bufput() came up short */
+{
+	int i, len, d = 0; /* d: the character code the next name in the array applies to */
+	Object *o;
+	if(p->GSactive->Font.enc != nil){
+		len = arraylen(p->GSactive->Font.enc);
+		for(i = 0; i < len; i += 1){
+			o = arrayget(p->GSactive->Font.enc, i);
+			if(o->type == Onum)
+				d = o->num.i; /* a number restarts the code counter */
+			else if(d == c){ /* this entry names the glyph for c: emit a replacement */
+				if(strcmp(o->name, "endash") == 0)
+					return bufput(&p->buf, (uchar*)"-", 1) == 1;
+				if(strcmp(o->name, "fi") == 0)
+					return bufput(&p->buf, (uchar*)"fi", 2) == 2;
+				if(strcmp(o->name, "ff") == 0)
+					return bufput(&p->buf, (uchar*)"ff", 2) == 2;
+				if(strcmp(o->name, "ffi") == 0)
+					return bufput(&p->buf, (uchar*)"ffi", 3) == 3;
+				if(strcmp(o->name, "bullet") == 0)
+					return bufput(&p->buf, (uchar*)"•", strlen("•")) == 3;
+				if(strcmp(o->name, "quotedblleft") == 0)
+					return bufput(&p->buf, (uchar*)"\"", 1) == 1;
+				if(strcmp(o->name, "quotedblright") == 0)
+					return bufput(&p->buf, (uchar*)"\"", 1) == 1;
+				if(strcmp(o->name, "quoteleft") == 0)
+					return bufput(&p->buf, (uchar*)"'", 1) == 1;
+				if(strcmp(o->name, "quoteright") == 0)
+					return bufput(&p->buf, (uchar*)"'", 1) == 1;
+				fprint(2, "TODO: recognize glyph name '%s'\n", o->name);
+				return 1; /* unknown glyph name: nothing is written, but it still counts as success */
+			} else
+				d += 1; /* a name for some other code: the next name applies to the following code */
+		}
+	}
+	return bufput(&p->buf, (uchar*)&c, 1) == 1; /* no remapping applies: write the byte unchanged */
+}
+
+/* Renders one character / glyph and updates the text state; returns 0 if writing to p->buf fails */
+static int
+tchar(Page *p, ulong c)
+{
+	double Trm[6] = {p->GSactive->Font.size, 0, 0, p->GSactive->Font.size, 0, 0};
+	double tx;
+	int i;
+	matmult(Trm, p->TS.Tm, Trm); /* Trm = font-size scale x Tm x CTM; Trm[4], Trm[5] are this glyph's position */
+	matmult(Trm, p->GSactive->CTM, Trm);
+	tx = p->GSactive->Font.size / 1000; /* widths are divided by 1000 and scaled by the font size */
+	if(c >= p->GSactive->Font.first && c <= p->GSactive->Font.last)
+		tx = tx * (double)p->GSactive->Font.widths[c - p->GSactive->Font.first]; /* NOTE(review): assumes Font.widths is non-nil whenever c is in range — confirm */
+	else
+		tx = tx * (double)p->GSactive->Font.defwidth;
+	// Check if whitespace is needed
+	if(p->buf.sz > 1){
+		if(p->TS.y != Trm[5]){
+			for(i = 0; i < (int)((p->TS.y - Trm[5]) / p->GSactive->Font.size); i += 1) /* one newline per font-size step of downward movement; NOTE(review): divides by Font.size — confirm it cannot be 0 */
+				if(bufput(&p->buf, (uchar*)"\n", 1) != 1)
+					return 0;
+		}
+		if(Trm[4] - p->TS.x > 2.5){ /* a gap of more than 2.5 units after the previous glyph becomes a space */
+			if(bufput(&p->buf, (uchar*)" ", 1) != 1)
+				return 0;
+		}
+	}
+	if(!writepatched(p, c) || !tmove(p, tx, 0, 0)) /* emit the glyph, then advance Tm by its width */
+		return 0;
+	p->TS.x = Trm[4] + tx; /* remember where this glyph ended for the next whitespace check */
+	p->TS.y = Trm[5];
+	return 1;
+}
+
+static int
+tstr(Page *p, char *str, ulong len) /* shows the len bytes of str one character at a time; returns 0 as soon as tchar() fails */
+{
+	ulong i;
+	for(i = 0; i < len; i += 1)
+		if(!tchar(p, (uchar)str[i])) /* go through uchar so bytes >= 0x80 are not sign-extended into huge ulong codes */
+			return 0;
+	return 1;
+}
+
+static int
 thshow(Op *op, Page *p)
 {
+	if(op->flags != 0){ /* only the flagless form is implemented so far */
+		fprint(2, "TODO: thshow != Tj\n");
+		return 0;
+	}
 	Object *o = arrayget(p->stack, 0);
-	if(bufput(&p->buf, (uchar*)o->str, o->len) == -1)
-		sysfatal("OOM");
-	USED(op);
-	return 0;
+	if(!tstr(p, o->str, o->len)) /* operand 0 is the string to show */
+		return 0;
+	return 1;
 }
 
 static int
 thshowarr(Op *op, Page *p)
 {
-	Object *arr = arrayget(p->stack, 0);
-	Object *o;
+	Object *o, *arr = arrayget(p->stack, 0); /* operand 0: array mixing strings and numeric adjustments */
 	int i;
 	for(i = 0; i < arraylen(arr); i += 1){
 		o = arrayget(arr, i);
 		if(o->type == Ostr){
-			if(bufput(&p->buf, (uchar*)o->str, o->len) == -1)
-				sysfatal("OOM");
+			if(!tstr(p, o->str, o->len))
+				return 0;
 		}
-		else if(o->num.d < -150){
-			if(bufput(&p->buf, (uchar*)" ", 1) == -1)
-				sysfatal("OOM");
+		else{
+			double shift = 0 - (p->GSactive->Font.size * o->num.d / 1000); /* non-strings are read as numbers: thousandths of the font size, applied with the sign flipped */
+			if(!tmove(p, shift, 0, 0))
+				return 0;
 		}
 	}
-	USED(op);
-	return 0;
+	return flagless(op);
 }
 
 static int
@@ -746,7 +951,7 @@
 opignore(Op *op, Page *p)
 {
 	USED(op, p);
-	return 1;
+	return 0;
 }
 
 static Op ops[] = {
@@ -833,8 +1038,8 @@
 	/* 9.4.2 Text position operators */
 	{"Td", tpmove, 2,},           /* move, next line */
 	{"TD", tpmove, 2, Leading,},  /* move, next line, leading */
-	{"Tm", tpmatrix, 6,},         /* (line) matrix */
-	{"T*", tpmove0, 0, Leading,}, /* move, next line, leading */
+	{"Tm", tpmatrix, 6,},         /* set Tm and Tlm */
+	{"T*", tpmove0, 0,}, /* move, next line, leading */
 
 	/* 9.4.3 Text showing operators */
 	{"Tj", thshow, 1,},                /* show string */
@@ -902,24 +1107,15 @@
 	{nil, nil, 0,},
 };
 
-// If an op is found at the current position in the stream, the associated Op is
-// returned and the stream is advanced. Otherwise, nil is returned and the stream
-// is left unchanged.
 Op *
-opfind(Stream *s)
+opfind(char *name) /* returns the ops[] entry whose s equals name exactly, or nil when there is none */
 {
-	int i;
-	uint len;
+	int i = 0;
 	Op *op;
-	char *b = (char*)s->buf.b + s->buf.off;
-	i = 0;
 	while(ops[i].s != nil){
 		op = &ops[i];
-		len = strlen(op->s);
-		if(strncmp(op->s, b, len) == 0 && (isws(b[len]) || isdelim(b[len]))){
-			s->buf.off += len;
+		if(strcmp(op->s, name) == 0) /* name must be non-nil: it goes straight to strcmp */
 			return op;
-		}
 		i += 1;
 	}
 	return nil;
@@ -926,32 +1122,77 @@
 }
 
 void
-pageinit(Page *page)
+pageinit(Page *page, Object *o) /* prepares page for rendering the page object o; leaves every field pagefree() and the text ops read in a defined state */
 {
 	bufinit(&page->buf, 0, 0);
 	// Stack is per-content-stream, so we don't create it here
 	page->stack = nil;
+	page->obj = o;
+	page->GS = page->GSactive = nil; /* no graphics state stack yet; pagerender() allocates it */
+	page->nGS = page->TS.inobj = 0;
+	page->TS.x = page->TS.y = page->TS.TL = 0; /* TL is read by tpmove0() even if no op has set it */
 }
 
 void
-pagefree(Page *p)
+gsinit(Page *p, GS *gs) /* fills in starting values for *gs; p is currently unused */
 {
-	buffree(&p->buf);
-	pdfobjfree(p->stack);
+	USED(p);
+	/* todo: actually initialize the full state */
+	/* CTM maps user coords to device coords.
+	TODO: use mediabox and screen info to init CTM
+	*/
+	matidentity(gs->CTM);
+	gs->LW = 1;
+	gs->LC = 0;
+	gs->LJ = 0;
+	gs->ML = 10;
+	gs->SCS = gs->NSCS = DeviceGray;
+	// Alpha is lowest byte; this is (0, 0, 0, 255) == black
+	gs->SC = gs->NSC = 255;
+	gs->Font.font = nil;
+	gs->Font.enc = nil;
+	gs->Font.widths = nil; /* Font.size, first, last and defwidth are not set here; tsfontsz()/fontwidths() assign them */
 }
 
-static void
-stackreset(Object *stack)
+void
+gsfree(GS gs) /* releases what one graphics state entry holds; gs is passed by value */
 {
+	free(gs.Font.widths);
+	pdfobjfree(gs.Font.font); /* NOTE(review): Font.font comes from a dictget() on the page's resources in tsfontsz() — confirm this state owns it */
+	gs.Font.font = nil; /* these three assignments only change the local copy, not the caller's entry */
+	gs.Font.enc = nil;
+	gs.Font.widths = nil;
+}
+
+void
+pagegsclean(Page *p) /* frees every entry of the graphics state stack and the stack itself, leaving it empty */
+{
 	int i;
-	for(i = 0; i < stack->array.ne; i += 1)
-		pdfobjfree(stack->array.e[i]);
-	stack->array.ne = 0;
-	free(stack->array.e);
-	stack->array.e = nil;
+	p->GSactive = nil;
+	for(i = 0; i < p->nGS; i += 1)
+		gsfree(p->GS[i]);
+	free(p->GS);
+	p->GS = nil; /* with GS nil and nGS 0 a second call does nothing harmful */
+	p->nGS = 0;
 }
 
-static void
+static int
+stackreset(Page *p) /* replaces the operand stack with a new array; returns 0 if arraynew() gives nil */
+{
+	pdfobjfree(p->stack); /* releases the stack used by the op that just ran */
+	p->stack = arraynew(p->obj->pdf);
+	return p->stack != nil;
+}
+
+void
+pagefree(Page *p) /* releases the output buffer, the operand stack and the graphics state stack */
+{
+	buffree(&p->buf);
+	pdfobjfree(p->stack);
+	pagegsclean(p); /* also covers states left behind when pagerender() returned early */
+}
+
+static int
 pagerendercontent(Page *p, Object *content)
 {
 	Stream *s;
@@ -964,43 +1205,63 @@
 	}
 	p->stack = arraynew(content->pdf);
 	if(p->stack == nil)
-		return;
+		return 0;
 	while(s->buf.off != s->buf.sz){
 		while(isws(s->buf.b[s->buf.off]) && s->buf.off != s->buf.sz)
 			s->buf.off += 1;
 		if(s->buf.off == s->buf.sz)
 			break;
-		op = opfind(s);
-		if(op != nil){
-			op->f(op, p);
-			stackreset(p->stack);
-		} else{
-			o = pdfobj(content->pdf, s);
-			if(o == nil){
-				fprint(2, "failed to read operand: %r\n");
-				break;
+		o = pdfobj(content->pdf, s);
+		if(o == nil)
+			return 0;
+		if(o->type == Oop){
+			op = opfind(o->str);
+			if(op == nil){
+				fprint(2, "Unknown op: %s\n", o->str);
+				pdfobjfree(o);
+				return 0;
 			}
+			pdfobjfree(o);
+			if(!op->f(op, p)){
+				fprint(2, "'%s' failed!\n", op->s);
+				return 0;
+			}
+			if(!stackreset(p))
+				return 0;
+		} else{
 			if(!arrayadd(p->stack, o)){
 				fprint(2, "Failed to push operand to stack: %r\n");
-				break;
+				return 0;
 			}
 		}
 	}
-	if(bufput(&p->buf, (uchar*)"\n", 1) == -1)
-		sysfatal("OOM");
+	if(bufput(&p->buf, (uchar*)"\n\0", 2) != 2)
+		return 0;
 	Sclose(s);
+	return 1;
 }
 
 int
-pagerender(Page *p, Object *o)
+pagerender(Page *p) /* runs every content stream of p->obj; returns 1 on success, 0 on failure (pagefree() then releases what is left) */
 {
 	Object *content;
 	int i;
-	content = dictget(o, "Contents");
-	if(content->type == Oarray)
+	p->nGS = 0; /* keeps pagegsclean() from walking GS if the allocation below fails */
+	if((p->GS = p->GSactive = malloc(sizeof(GS))) == nil){ /* GSactive must be valid before the first op runs, not only after a gspush() */
+		werrstr("Out of memory");
+		return 0;
+	}
+	p->nGS = 1;
+	gsinit(p, p->GS);
+	content = dictget(p->obj, "Contents");
+	if(content->type == Oarray){
 		for(i = 0; i < arraylen(content); i += 1)
-			pagerendercontent(p, arrayget(content, i));
+			if(!pagerendercontent(p, arrayget(content, i)))
+				return 0;
+	}
 	else if(content->type != Onull)
-		pagerendercontent(p, content);
+		if(!pagerendercontent(p, content))
+			return 0;
+	pagegsclean(p);
 	return 1;
 }
--- a/pdf.h
+++ b/pdf.h
@@ -8,10 +8,12 @@
 	Ostream, /* 7.3.8 */
 	Onull,   /* 7.3.9 */
 	Oindir,  /* 7.3.10 */
+	Oop,     /* 7.8.2 */
 };
 
 typedef struct Buffer Buffer;
 typedef struct Filter Filter;
+typedef struct TS TS;
 typedef struct GS GS;
 typedef struct GSD GSD;
 typedef struct GSFont GSFont;
@@ -35,11 +37,6 @@
 	void *(*memimage)(Buffer *b);
 };
 
-struct Page {
-	Object *stack;
-	Buffer buf;
-};
-
 struct Filter {
 	char *name;
 	int (*readall)(void *aux, Buffer *bi, Buffer *bo);
@@ -104,16 +101,36 @@
 struct GSFont {
 	Object *font;
 	double size;
+	Object *enc; /* the font's Encoding/Differences array, as stored by tsfontsz(). TODO: drop enc, use the encoding table */
+	struct{
+		// If a character c is in [first, last], replace it with values[c], which may be multibyte.
+		int first, last;
+		char **values;
+	} encoding;
+	struct{
+		int first, last; /* character codes covered by widths, taken from FirstChar and LastChar */
+		int *widths; /* widths[c - first] is the width of code c; tchar() scales it by size / 1000 */
+		int defwidth; /* width used for codes outside [first, last]; read from MissingWidth */
+	};
 };
 
+/* Color spaces (8.6.3 / table 61), grouped by family; stored in GS.SCS and GS.NSCS */
+typedef enum ColorSpace {
+	DeviceGray, DeviceRGB, DeviceCMYK, /* Device family */
+	CalGray, CalRGB, Lab, ICCBased, /* CIE-based family */
+	Pattern, Indexed, Separation, DeviceN, /* Special family */
+} ColorSpace;
+
 struct GS {
+	double CTM[6]; /* current transformation matrix ; 8.3 */
 	Object *BG, *UCR, *UCR2, *TR, *TR2, *HT, *BM, *SMask, *UseBlackPTComp, *HTO;
 	int LW, LC, LJ, ML, RI, OP, op, OPM, SA, AIS, TK;
-	double SM, CA, ca;
-	struct {
-		GSFont *Font;
-		int nFont;
+	double SM, CA, ca, FL; /* FL is written by gsflatness() */
+	struct{ /* coloring info */
+		ColorSpace SCS, NSCS; /* stroking color space and nonstroking color space */
+		u32int SC, NSC; /* stroking and nonstroking colors; cgray() packs three color bytes above a low byte of 255 */
 	};
+	GSFont Font; /* font selected by tsfontsz(), held by value in each state */
 	struct {
 		GSD *d;
 		int nd;
@@ -120,6 +137,29 @@
 	};
 };
 
+struct TS {
+	double Tm[6]; /* text matrix */
+	double Tlm[6]; /* text line matrix */
+	/* Tracks if we're in a text object; nesting is verboten */
+	int inobj;
+	double TL; /* leading: set by tslead() and by tpmove() with the Leading flag; tpmove0() moves down by it */
+	/* Temporary, for pdf2txt functionality: tracks the last character's position so we know whether whitespace is needed */
+	double x, y;
+};
+
+struct Page {
+	Object *obj; /* the page object handed to pageinit() */
+	Object *stack; /* operands collected for the next op; replaced after each op runs */
+	Buffer buf; /* text produced while rendering */
+	/* The graphical state stack. GSactive is always a shortcut for the top of the stack, GS[nGS - 1] */
+	struct{
+		GS *GS;
+		GS *GSactive;
+		int nGS;
+	};
+	TS TS; /* text state, kept per page rather than per graphics state */
+};
+
 struct Pdf {
 	Stream *s;
 	Xref *xref;
@@ -246,8 +286,8 @@
 int bufget(Buffer *b, uchar *d, int sz);
 void bufdump(Buffer *b);
 
-void pageinit(Page *p);
-int pagerender(Page *p, Object *o);
+void pageinit(Page *p, Object *o);
+int pagerender(Page *p);
 void pagefree(Page *p);
 
 #pragma varargck type "O" Object*