shithub: xml-9atom

Download patch

ref: 4e09200f7e4bf8791c9c720922734e0759206b72
parent: 524f6890a72aa3629cc91ecf5c505ffa52f7eda4
author: sirjofri <[email protected]>
date: Fri Aug 2 09:19:50 EDT 2024

adds libxml support for processing instructions and comments

this also adds support for the processing-instruction() and comment() nodes in libxpath

--- a/README
+++ b/README
@@ -33,6 +33,10 @@
 To get a good overview of how these features work, check the man page.
 
 - Namespace support for Elems and Attrs, which supersedes Fstripnamespace
+- Processing instructions are Elems whose names start with '?'
+- Comments are now full nodes with their name set to nil and their pcdata set to the comment text
+
+Caution: The introduction of processing instruction and comment nodes can lead to unexpected behaviour with malformed XML files, such as ones containing <?procinst/> or <?procinst></procinst>.
 
 
 Libxpath
--- a/libxml/mkfile
+++ b/libxml/mkfile
@@ -4,6 +4,7 @@
 
 OFILES=\
 	xmlattr.$O\
+	xmlcomment.$O\
 	xmlelem.$O\
 	xmlfind.$O\
 	xmlfree.$O\
--- a/libxml/state-machine.h
+++ b/libxml/state-machine.h
@@ -6,6 +6,7 @@
 	Tequal,
 	Tendblk,
 	Tnulblk,
+	Tcomment,
 	NumToks
 };
 
@@ -28,9 +29,10 @@
 	Apcdata	= 3,
 	Aattr	= 4,
 	Avalue	= 5,
-	Aup	= 6,
+	Aup 	= 6,
 	Adown	= 7,
 	Acheck	= 8,
+	Acomment = 9,
 	NumActions
 };
 
@@ -39,7 +41,7 @@
 	[Twhite]	"white",	[Topen]		"open",
 	[Tname]		"name",		[Tclose]	"close",
 	[Tequal]	"equal",	[Tendblk]	"endblk",
-	[Tnulblk]	"nulblk"
+	[Tnulblk]	"nulblk",	[Tcomment]	"comment",
 };
 
 static char *
@@ -56,28 +58,28 @@
 	[Apcdata]	"pcdata",	[Aattr]		"attr",
 	[Avalue]	"value",	[Aelem]		"elem",
 	[Aup]		"up",		[Adown]		"down",
-	[Acheck]	"check"
+	[Acheck]	"check",	[Acomment]	"comment",
 };
 
 
-static int statab[7][7] = {	/* Parser state transition table */
-/* 			Twhite	Topen	Tname	Tclose	Tequal	Tendblk	Tnulblk */ 
-	[Slost]     {	Slost, 	Sopened,Slost, 	Slost, 	Slost, 	Slost,	Slost },
-	[Sopened]   {	0, 	0, 	Snamed,	0, 	0, 	0, 	0 },
-	[Snamed]    {	Snamed, 0, 	Sattred,Sendblk,0, 	Slost,	Slost },            
-	[Sattred]   {	Sattred, 0, 	0, 	0, 	Sequed, 0, 	0 },
-	[Sequed]    {	Sequed, 0, 	Snamed,	0, 	0, 	0, 	0 },
-	[Sendblk]   {	0, 	0, 	Sclosed,0, 	0, 	0, 	0 },
-	[Sclosed]   {	0,	0,	0,	Slost,	0,	0,	0 },          
+static int statab[7][8] = {	/* Parser state transition table */
+/*           Twhite	  Topen	   Tname    Tclose   Tequal  Tendblk Tnulblk Tcomment */ 
+ [Slost]   { Slost,   Sopened, Slost,   Slost,   Slost,  Slost,  Slost,  Slost },
+ [Sopened] { 0,       0,       Snamed,  0,       0,      0,      0,      0 },
+ [Snamed]  { Snamed,  0,       Sattred, Sendblk, 0,      Slost,  Slost,  0 },
+ [Sattred] { Sattred, 0,       0,       0,       Sequed, 0,      0,      0 },
+ [Sequed]  { Sequed,  0,       Snamed,  0,       0,      0,      0,      0 },
+ [Sendblk] { 0,       0,       Sclosed, 0,       0,      0,      0,      0 },
+ [Sclosed] { 0,       0,       0,       Slost,   0,      0,      0,      0 },
 };
 
-static int acttab[7][7] = {	/* Parser action table */
-/* 			Twhite	Topen	Tname	Tclose	Tequal	Tendblk	Tnulblk */         
-	[Slost]     {	Apcdata, Anop, 	Apcdata, Apcdata, Apcdata, Aup,	Apcdata },
-	[Sopened]   {	0, 	0, 	Aelem,	0, 	0, 	0, 	0 },
-	[Snamed]    {	Anop,	0, 	Aattr,	Adown,	0, 	Anop,	Anop },            
-	[Sattred]   {	Anop, 	0, 	0, 	0, 	Anop,	0, 	0 },
-	[Sequed]    {	Anop, 	0, 	Avalue,	0, 	0, 	0, 	0 },
-	[Sendblk]   {	0, 	0, 	Acheck, 0, 	0, 	0, 	0 },
-	[Sclosed]   {	0,	0,	0,	Anop,	0,	0,	0 },          
+static int acttab[7][8] = {	/* Parser action table */
+/*           Twhite   Topen Tname    Tclose   Tequal   Tendblk Tnulblk  Tcomment */         
+ [Slost]   { Apcdata, Anop, Apcdata, Apcdata, Apcdata, Aup,    Apcdata, Acomment },
+ [Sopened] { 0,       0,    Aelem,   0,       0,       0,      0,       0 },
+ [Snamed]  { Anop,    0,    Aattr,   Adown,   0,       Anop,   Anop,    0 },
+ [Sattred] { Anop,    0,    0,       0,       Anop,    0,      0,       0 },
+ [Sequed]  { Anop,    0,    Avalue,  0,       0,       0,      0,       0 },
+ [Sendblk] { 0,       0,    Acheck,  0,       0,       0,      0,       0 },
+ [Sclosed] { 0,       0,    0,       Anop,    0,       0,      0,       0 },
 };
--- /dev/null
+++ b/libxml/xmlcomment.c
@@ -1,0 +1,29 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Elem *
+xmlcomment(Xml *xp, Elem **root, Elem *parent, char *comment)
+{
+	Elem *ep, *t;
+	
+	ep = xmlcalloc(xp, sizeof(Elem), 1);
+	if (!ep)
+		sysfatal("no memory - %r");
+	if (! *root)
+		*root = ep;
+	else {
+		for (t = *root; t->next; t = t->next)
+			continue;
+		t->next = ep;
+	}
+	ep->ns = nil;
+	ep->parent = parent;
+	ep->name = nil;
+	if (comment) {
+		ep->pcdata = xmlstrdup(xp, comment, 0);
+		if (!ep->pcdata)
+			sysfatal("no memory - %r");
+	}
+	return ep;
+}
--- a/libxml/xmlparse.c
+++ b/libxml/xmlparse.c
@@ -11,7 +11,7 @@
 	Grain = 16
 };
 
-#define isname1(c)	(isalpha((c)) || c == '_')	/* FIXME: not enforced yet */
+#define isname1(c)	(isalpha((c)) || c == '_' || c == '?')	/* FIXME: not enforced yet */
 #define isnameN(r)	(isalpharune((r)) || isdigitrune((r)) || r == L'_' || r == L'-' || r == L'.' || r == L':')
 #define Roundup(x, g)	(((x) + (unsigned)(g-1)) & ~((unsigned)(g-1)))
 
@@ -219,15 +219,25 @@
 }
 
 static int
-comment(State *st)
+comment(State *st, Lexbuf *lb)
 {
 	long r;
 	int startline;
+	char *p;
 
 	startline = st->line;
+	
+	/* trim leading whitespace */
+	while ((r = get(st)) != -1 && isspacerune(r))
+		continue;
+	unget(st, r);
+	
+	if (lb->buf)
+		lb->buf[0] = 0;
 	do{
-		if(get(st) == -1)
+		if((r = get(st)) == -1)
 			break;
+		growrune(st, lb, r);
 	}while(match(st, L"--") == 1);
 
 	r = get(st);
@@ -239,7 +249,11 @@
 		failed(st, "'--' illegal in a comment (re: line %d)", startline);
 		return Twhite;
 	}
-	return Twhite;
+	/* trim trailing whitespace */
+	p = strrchr(lb->buf, 0);
+	for (p--; p >= lb->buf && isspace(*p); p--)
+		*p = 0;
+	return Tcomment;
 }
 
 static int
@@ -344,14 +358,11 @@
 			r = get(st);
 			switch(r){
 			case L'?':
-				while((r = get(st)) != -1 && r != L'>')
-					continue;
-				if(r == -1)
-					return -1;
-				return Twhite;
+				unget(st, r);
+				return Topen;
 			case L'!':
 				if(match(st, L"--") == 0)
-					return comment(st);
+					return comment(st, lb);
 				if(match(st, L"DOCTYPE ") == 0)
 					return doctype(st, lb);
 				if(match(st, L"[CDATA[") == 0)
@@ -385,6 +396,18 @@
 					return Tnulblk;
 				unget(st, r);
 				continue;
+			case '?':
+				r = get(st);
+				if (r == '>')
+					return Tnulblk;
+				growrune(st, lb, '?');
+				do
+					growrune(st, lb, r);
+				while ((r = get(st)) != -1 && isnameN(r));
+				if (r == -1)
+					return -1;
+				unget(st, r);
+				return Tname;
 			case '\'':
 			case '"':		/* attribute value */
 				q = r;
@@ -468,6 +491,12 @@
 			if(!isname1(lb->buf[0]))
 				failed(st, "'%s' is an illegal element name", lb->buf);
 			assert((ep = xmlelem(st->xml, &root, parent, lb->buf)) != nil);
+			ep->line = st->line;
+			break;
+		case Acomment:
+			if(xmldebug == 1)
+				fprint(2, "%-3d %*.scomment '%s'\n", st->line, depth, "", lb->buf);
+			assert((ep = xmlcomment(st->xml, &root, parent, lb->buf)) != nil);
 			ep->line = st->line;
 			break;
 		case Apcdata:
--- a/libxml/xmlprint.c
+++ b/libxml/xmlprint.c
@@ -38,10 +38,20 @@
 
 	for(; ep; ep = ep->next){
 		ns = ep->ns ? ep->ns->name : nil;
-		Bprint(bp, "%*s<", in, "");
-		if (ns)
-			Bprint(bp, "%s:", ns);
-		Bprint(bp, "%s", ep->name);
+		if (ep->name) {
+			/* node */
+			Bprint(bp, "%*s<", in, "");
+			if (ns)
+				Bprint(bp, "%s:", ns);
+			Bprint(bp, "%s", ep->name);
+		} else {
+			/* comment */
+			Bprint(bp, "%*s<!--", in, "");
+			if (ep->pcdata)
+				Bprint(bp, "%s", ep->pcdata);
+			Bprint(bp, "-->\n");
+			continue;
+		}
 
 		for (ap = ep->attrs; ap; ap = ap->next){
 			ns = ap->ns ? ap->ns->name : nil;
@@ -69,6 +79,8 @@
 				prval(bp, ep->pcdata);
 				Bprint(bp, "\n%*s</%s>\n", in, "", ep->name);
 			}
+			else if(ep->name[0] == '?')
+				Bprint(bp, "?>\n");
 			else
 				Bprint(bp, "/>\n");
 		}
--- a/libxpath/dat.h
+++ b/libxpath/dat.h
@@ -66,6 +66,7 @@
 Func* addfunc(Name*, void (*f)(XpResult*, Elem*));
 void initfuncs(void);
 
+void appendresult(XpResult*, XpResult);
 void buildsinglestring(XpResult*, char*);
 void buildsingleelem(XpResult*, Elem*);
 void buildsinglenum(XpResult*, int);
--- a/libxpath/fns.c
+++ b/libxpath/fns.c
@@ -13,7 +13,7 @@
 	
 	i = 0;
 	for (p = p->child; p; p = p->next) {
-		if (strcmp(p->name, e->name) == 0)
+		if (p->name && e->name && strcmp(p->name, e->name) == 0)
 			i++;
 		if (p == e)
 			return i;
@@ -51,13 +51,33 @@
 void
 fprocinst(XpResult *r, Elem *ep)
 {
-	fprint(2, "function processing-instruction()");
+	Elem *e;
+	
+	for (e = ep->child; e; e = e->next) {
+		if (e->name && e->name[0] == '?')
+			goto Found;
+	}
+	return;
+Found:
+	buildsingleelem(r, e);
 }
 
 void
 fcomment(XpResult *r, Elem *ep)
 {
-	fprint(2, "function comment()");
+	Elem *e;
+	XpResult t;
+	
+	for (e = ep->child; e; e = e->next) {
+		if (!e->name) {
+			if (r->type != Xstring) {
+				r->type = Xstring;
+				r->num = 0;
+			}
+			buildsinglestring(&t, e->pcdata);
+			appendresult(r, t);
+		}
+	}
 }
 
 void
--- a/libxpath/test/t.c
+++ b/libxpath/test/t.c
@@ -25,7 +25,9 @@
 	{ "/html/2", 1, 0, Xnum, "2" },
 	{ "/html/'hello'", 1, 0, Xstring, "hello" },
 	{ "/[inval]", 0, 1, 0, nil },
-	{ "position()", 1, 0, Xnum, "number" },
+	{ "position()", 1, 0, Xnum, "number 1" },
+	{ "/html/processing-instruction()", 1, 0, Xelem, "?pi attr=value" },
+	{ "/html/comment()", 1, 0, Xstring, "comment with many words" },
 	{ nil, 0, 0, 0, nil },
 };
 
@@ -53,7 +55,10 @@
 	print("el: <%s", e->name);
 	for (a = e->attrs; a; a = a->next)
 		print(" %s='%s'", a->name, a->value);
-	print(" />\n");
+	if (e->name[0] == '?')
+		print(" ?>\n");
+	else
+		print(" />\n");
 }
 
 void
@@ -153,12 +158,17 @@
 	if (fd < 0)
 		sysfatal("unable to test: %r");
 	
+	xmldebug = 0;
 	x = xmlparse(fd, 8192, 0);
+	if (!x)
+		sysfatal("parse error: %r");
 	
 	close(fd);
 	
-//	xmldebug = 1;
+//	xmlprint(x, 2);
+//	exits(nil);
 	
+	xmldebug = 0;
 	for (t = tests; t->s; t++) {
 		runtest(t);
 	}
--- a/libxpath/test/test.xml
+++ b/libxpath/test/test.xml
@@ -1,4 +1,6 @@
 <html lang='en'>
+	<?pi attr="value"?>
+	<!--comment with many words-->
 	<a href='help.php'>Some link</a>
 	<a href='whatever.php'>Other link</a>
 	<a href='p.php' bref='p.php'>Second link</a>
--- a/libxpath/xmllookpath.c
+++ b/libxpath/xmllookpath.c
@@ -37,7 +37,7 @@
 	return "invalid";
 }
 
-static void
+void
 appendresult(XpResult *a, XpResult b)
 {
 	int n;
@@ -290,7 +290,7 @@
 	if (n->type == Ndescself) {
 		/* descendant or self */
 		for (Elem *e = ep->child; e; e = e->next) {
-			if (strcmp(e->name, n->name->name) == 0
+			if (e->name && strcmp(e->name, n->name->name) == 0
 			 && evalcond(e, n->cond)) {
 			 	/* if found, proceed with next rule */
 				appendresult(&r, recurse(e, n->chain));
@@ -303,7 +303,7 @@
 	
 	if (n->type == Nchild) {
 		for (Elem *e = ep->child; e; e = e->next)
-			if (strcmp(e->name, n->name->name) == 0
+			if (e->name && strcmp(e->name, n->name->name) == 0
 			 && evalcond(e, n->cond)) {
 				appendresult(&r, recurse(e, n->chain));
 			}
--- a/libxpath/xmlpathl.l
+++ b/libxpath/xmlpathl.l
@@ -39,8 +39,8 @@
 }
 %}
 
-A	[a-zA-Z_.]
-AN	[a-zA-Z0-9_.]
+A	[a-zA-Z_.-]
+AN	[a-zA-Z0-9_.-]
 D	[0-9]
 LIT	[/@=*()'!<>]
 Q	[^']
--- a/xml
+++ b/xml
@@ -66,6 +66,8 @@
 Attr*	xmlattr(Xml *xp, Attr **root, Elem *parent,
 		char *name, char *value)
 .PB
+Elem*	xmlcomment(Xml *xp, Elem **root, Elem *parent, char *comment)
+.PB
 Elem*	xmlelem(Xml *xp, Elem **root, Elem *parent, char *name)
 .PB
 Elem*	xmlfind(Xml *xp, Elem *ep, char *path)
@@ -148,6 +150,15 @@
 but requires both an atribute name and value,
 and returns the address of the new attribute.
 .PP
+.I Xmlcomment
+is equivalent to
+.IR xmlelem ,
+but comment nodes have their name set to
+.I nil
+and
+.I pcdata
+holds the comment text.
+.PP
 .I Xmllook
 descends through the tree rooted at
 .I ep
@@ -195,6 +206,11 @@
 The current namespace implementation supersedes
 .I Fstripnamespace
 and is probably buggy.
+.PP
+XML element names can start with
+.IR ? ,
+which marks the element as a processing instruction.
+Processing instruction support is probably buggy.
 .PP
 A SAX model parser will probably be needed sometime (e.g. for Ebooks).
 .PP
--- a/xml.h
+++ b/xml.h
@@ -56,6 +56,7 @@
 extern int xmldebug;
 
 Attr*	xmlattr(Xml *, Attr **, Elem *, char *, char *);
+Elem*	xmlcomment(Xml *, Elem **, Elem *, char *);
 Elem*	xmlelem(Xml *, Elem **, Elem *, char *);
 Elem*	xmlfind(Xml *, Elem *, char *);
 void	xmlfree(Xml *);