ref: 4e09200f7e4bf8791c9c720922734e0759206b72
parent: 524f6890a72aa3629cc91ecf5c505ffa52f7eda4
author: sirjofri <[email protected]>
date: Fri Aug 2 09:19:50 EDT 2024
adds libxml support for processing instructions and comments. This also adds support for the processing-instruction() and comment() nodes in libxpath.
--- a/README
+++ b/README
@@ -33,6 +33,10 @@
To get a good overview of how these features work, check the man page.
- Namespace support for Elems and Attrs, which supersedes Fstripnamespace
+- Processing instructions are Elems whose names start with '?'
+- Comments are now full nodes with their name set to nil and their pcdata set to the comment text
+
+Caution: The introduction of processing instruction and comment nodes can lead to weird behaviour with malformed XML files, such as ones containing <?procinst/> or <?procinst></procinst>.
Libxpath
--- a/libxml/mkfile
+++ b/libxml/mkfile
@@ -4,6 +4,7 @@
OFILES=\
xmlattr.$O\
+ xmlcomment.$O\
xmlelem.$O\
xmlfind.$O\
xmlfree.$O\
--- a/libxml/state-machine.h
+++ b/libxml/state-machine.h
@@ -6,6 +6,7 @@
Tequal,
Tendblk,
Tnulblk,
+ Tcomment,
NumToks
};
@@ -28,9 +29,10 @@
Apcdata = 3,
Aattr = 4,
Avalue = 5,
- Aup = 6,
+ Aup = 6,
Adown = 7,
Acheck = 8,
+ Acomment = 9,
NumActions
};
@@ -39,7 +41,7 @@
[Twhite] "white", [Topen] "open",
[Tname] "name", [Tclose] "close",
[Tequal] "equal", [Tendblk] "endblk",
- [Tnulblk] "nulblk"
+ [Tnulblk] "nulblk", [Tcomment] "comment",
};
static char *
@@ -56,28 +58,28 @@
[Apcdata] "pcdata", [Aattr] "attr",
[Avalue] "value", [Aelem] "elem",
[Aup] "up", [Adown] "down",
- [Acheck] "check"
+ [Acheck] "check", [Acomment] "comment",
};
-static int statab[7][7] = { /* Parser state transition table */
-/* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk */
- [Slost] { Slost, Sopened,Slost, Slost, Slost, Slost, Slost },
- [Sopened] { 0, 0, Snamed, 0, 0, 0, 0 },
- [Snamed] { Snamed, 0, Sattred,Sendblk,0, Slost, Slost },
- [Sattred] { Sattred, 0, 0, 0, Sequed, 0, 0 },
- [Sequed] { Sequed, 0, Snamed, 0, 0, 0, 0 },
- [Sendblk] { 0, 0, Sclosed,0, 0, 0, 0 },
- [Sclosed] { 0, 0, 0, Slost, 0, 0, 0 },
+static int statab[7][8] = { /* Parser state transition table */
+/* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk Tcomment */
+ [Slost] { Slost, Sopened, Slost, Slost, Slost, Slost, Slost, Slost },
+ [Sopened] { 0, 0, Snamed, 0, 0, 0, 0, 0 },
+ [Snamed] { Snamed, 0, Sattred, Sendblk, 0, Slost, Slost, 0 },
+ [Sattred] { Sattred, 0, 0, 0, Sequed, 0, 0, 0 },
+ [Sequed] { Sequed, 0, Snamed, 0, 0, 0, 0, 0 },
+ [Sendblk] { 0, 0, Sclosed, 0, 0, 0, 0, 0 },
+ [Sclosed] { 0, 0, 0, Slost, 0, 0, 0, 0 },
};
-static int acttab[7][7] = { /* Parser action table */
-/* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk */
- [Slost] { Apcdata, Anop, Apcdata, Apcdata, Apcdata, Aup, Apcdata },
- [Sopened] { 0, 0, Aelem, 0, 0, 0, 0 },
- [Snamed] { Anop, 0, Aattr, Adown, 0, Anop, Anop },
- [Sattred] { Anop, 0, 0, 0, Anop, 0, 0 },
- [Sequed] { Anop, 0, Avalue, 0, 0, 0, 0 },
- [Sendblk] { 0, 0, Acheck, 0, 0, 0, 0 },
- [Sclosed] { 0, 0, 0, Anop, 0, 0, 0 },
+static int acttab[7][8] = { /* Parser action table */
+/* Twhite Topen Tname Tclose Tequal Tendblk Tnulblk Tcomment */
+ [Slost] { Apcdata, Anop, Apcdata, Apcdata, Apcdata, Aup, Apcdata, Acomment },
+ [Sopened] { 0, 0, Aelem, 0, 0, 0, 0, 0 },
+ [Snamed] { Anop, 0, Aattr, Adown, 0, Anop, Anop, 0 },
+ [Sattred] { Anop, 0, 0, 0, Anop, 0, 0, 0 },
+ [Sequed] { Anop, 0, Avalue, 0, 0, 0, 0, 0 },
+ [Sendblk] { 0, 0, Acheck, 0, 0, 0, 0, 0 },
+ [Sclosed] { 0, 0, 0, Anop, 0, 0, 0, 0 },
};
--- /dev/null
+++ b/libxml/xmlcomment.c
@@ -1,0 +1,29 @@
+#include <u.h>
+#include <libc.h>
+#include "xml.h"
+
+Elem *
+xmlcomment(Xml *xp, Elem **root, Elem *parent, char *comment)
+{ /* create a comment node and append it to the sibling list at *root */
+ Elem *ep, *t;
+
+ ep = xmlcalloc(xp, sizeof(Elem), 1);
+ if (!ep)
+ sysfatal("no memory - %r"); /* allocation failure is fatal */
+ if (! *root)
+ *root = ep; /* empty list: the new node becomes its head */
+ else {
+ for (t = *root; t->next; t = t->next) /* walk to the last sibling */
+ continue;
+ t->next = ep;
+ }
+ ep->ns = nil;
+ ep->parent = parent;
+ ep->name = nil; /* a nil name is what marks an Elem as a comment */
+ if (comment) { /* a nil comment leaves pcdata as xmlcalloc returned it */
+ ep->pcdata = xmlstrdup(xp, comment, 0); /* presumably a copy owned by xp - confirm */
+ if (!ep->pcdata)
+ sysfatal("no memory - %r");
+ }
+ return ep; /* the new node; ep->line is left for the caller to set */
+}
--- a/libxml/xmlparse.c
+++ b/libxml/xmlparse.c
@@ -11,7 +11,7 @@
Grain = 16
};
-#define isname1(c) (isalpha((c)) || c == '_') /* FIXME: not enforced yet */
+#define isname1(c) (isalpha((c)) || c == '_' || c == '?') /* FIXME: not enforced yet */
#define isnameN(r) (isalpharune((r)) || isdigitrune((r)) || r == L'_' || r == L'-' || r == L'.' || r == L':')
#define Roundup(x, g) (((x) + (unsigned)(g-1)) & ~((unsigned)(g-1)))
@@ -219,15 +219,25 @@
}
static int
-comment(State *st)
+comment(State *st, Lexbuf *lb)
{
long r;
int startline;
+ char *p;
startline = st->line;
+
+ /* trim leading whitespace */
+ while ((r = get(st)) != -1 && isspacerune(r))
+ continue;
+ unget(st, r);
+
+ if (lb->buf)
+ lb->buf[0] = 0;
do{
- if(get(st) == -1)
+ if((r = get(st)) == -1)
break;
+ growrune(st, lb, r);
}while(match(st, L"--") == 1);
r = get(st);
@@ -239,7 +249,11 @@
failed(st, "'--' illegal in a comment (re: line %d)", startline);
return Twhite;
}
- return Twhite;
+ /* trim trailing whitespace */
+ p = strrchr(lb->buf, 0);
+ for (p--; p >= lb->buf && isspace(*p); p--)
+ *p = 0;
+ return Tcomment;
}
static int
@@ -344,14 +358,11 @@
r = get(st);
switch(r){
case L'?':
- while((r = get(st)) != -1 && r != L'>')
- continue;
- if(r == -1)
- return -1;
- return Twhite;
+ unget(st, r);
+ return Topen;
case L'!':
if(match(st, L"--") == 0)
- return comment(st);
+ return comment(st, lb);
if(match(st, L"DOCTYPE ") == 0)
return doctype(st, lb);
if(match(st, L"[CDATA[") == 0)
@@ -385,6 +396,18 @@
return Tnulblk;
unget(st, r);
continue;
+ case '?':
+ r = get(st);
+ if (r == '>')
+ return Tnulblk;
+ growrune(st, lb, '?');
+ do
+ growrune(st, lb, r);
+ while ((r = get(st)) != -1 && isnameN(r));
+ if (r == -1)
+ return -1;
+ unget(st, r);
+ return Tname;
case '\'':
case '"': /* attribute value */
q = r;
@@ -468,6 +491,12 @@
if(!isname1(lb->buf[0]))
failed(st, "'%s' is an illegal element name", lb->buf);
assert((ep = xmlelem(st->xml, &root, parent, lb->buf)) != nil);
+ ep->line = st->line;
+ break;
+ case Acomment:
+ if(xmldebug == 1)
+ fprint(2, "%-3d %*.scomment '%s'\n", st->line, depth, "", lb->buf);
+ assert((ep = xmlcomment(st->xml, &root, parent, lb->buf)) != nil);
ep->line = st->line;
break;
case Apcdata:
--- a/libxml/xmlprint.c
+++ b/libxml/xmlprint.c
@@ -38,10 +38,20 @@
for(; ep; ep = ep->next){
ns = ep->ns ? ep->ns->name : nil;
- Bprint(bp, "%*s<", in, "");
- if (ns)
- Bprint(bp, "%s:", ns);
- Bprint(bp, "%s", ep->name);
+ if (ep->name) {
+ /* node */
+ Bprint(bp, "%*s<", in, "");
+ if (ns)
+ Bprint(bp, "%s:", ns);
+ Bprint(bp, "%s", ep->name);
+ } else {
+ /* comment */
+ Bprint(bp, "%*s<!--", in, "");
+ if (ep->pcdata)
+ Bprint(bp, "%s", ep->pcdata);
+ Bprint(bp, "-->\n");
+ continue;
+ }
for (ap = ep->attrs; ap; ap = ap->next){
ns = ap->ns ? ap->ns->name : nil;
@@ -69,6 +79,8 @@
prval(bp, ep->pcdata);
Bprint(bp, "\n%*s</%s>\n", in, "", ep->name);
}
+ else if(ep->name[0] == '?')
+ Bprint(bp, "?>\n");
else
Bprint(bp, "/>\n");
}
--- a/libxpath/dat.h
+++ b/libxpath/dat.h
@@ -66,6 +66,7 @@
Func* addfunc(Name*, void (*f)(XpResult*, Elem*));
void initfuncs(void);
+void appendresult(XpResult*, XpResult);
void buildsinglestring(XpResult*, char*);
void buildsingleelem(XpResult*, Elem*);
void buildsinglenum(XpResult*, int);
--- a/libxpath/fns.c
+++ b/libxpath/fns.c
@@ -13,7 +13,7 @@
i = 0;
for (p = p->child; p; p = p->next) {
- if (strcmp(p->name, e->name) == 0)
+ if (p->name && e->name && strcmp(p->name, e->name) == 0)
i++;
if (p == e)
return i;
@@ -51,13 +51,33 @@
void
fprocinst(XpResult *r, Elem *ep)
{
- fprint(2, "function processing-instruction()");
+ Elem *e;
+
+ for (e = ep->child; e; e = e->next) {
+ if (e->name && e->name[0] == '?')
+ goto Found;
+ }
+ return;
+Found:
+ buildsingleelem(r, e);
}
void
fcomment(XpResult *r, Elem *ep)
{
- fprint(2, "function comment()");
+ Elem *e;
+ XpResult t;
+
+ for (e = ep->child; e; e = e->next) {
+ if (!e->name) {
+ if (r->type != Xstring) {
+ r->type = Xstring;
+ r->num = 0;
+ }
+ buildsinglestring(&t, e->pcdata);
+ appendresult(r, t);
+ }
+ }
}
void
--- a/libxpath/test/t.c
+++ b/libxpath/test/t.c
@@ -25,7 +25,9 @@
{ "/html/2", 1, 0, Xnum, "2" },
{ "/html/'hello'", 1, 0, Xstring, "hello" },
{ "/[inval]", 0, 1, 0, nil },
- { "position()", 1, 0, Xnum, "number" },
+ { "position()", 1, 0, Xnum, "number 1" },
+ { "/html/processing-instruction()", 1, 0, Xelem, "?pi attr=value" },
+ { "/html/comment()", 1, 0, Xstring, "comment with many words" },
{ nil, 0, 0, 0, nil },
};
@@ -53,7 +55,10 @@
print("el: <%s", e->name);
for (a = e->attrs; a; a = a->next)
print(" %s='%s'", a->name, a->value);
- print(" />\n");
+ if (e->name[0] == '?')
+ print(" ?>\n");
+ else
+ print(" />\n");
}
void
@@ -153,12 +158,17 @@
if (fd < 0)
sysfatal("unable to test: %r");
+ xmldebug = 0;
x = xmlparse(fd, 8192, 0);
+ if (!x)
+ sysfatal("parse error: %r");
close(fd);
-// xmldebug = 1;
+// xmlprint(x, 2);
+// exits(nil);
+ xmldebug = 0;
for (t = tests; t->s; t++) {
runtest(t);
}
--- a/libxpath/test/test.xml
+++ b/libxpath/test/test.xml
@@ -1,4 +1,6 @@
<html lang='en'>
+ <?pi attr="value"?>
+ <!--comment with many words-->
<a href='help.php'>Some link</a>
<a href='whatever.php'>Other link</a>
<a href='p.php' bref='p.php'>Second link</a>
--- a/libxpath/xmllookpath.c
+++ b/libxpath/xmllookpath.c
@@ -37,7 +37,7 @@
return "invalid";
}
-static void
+void
appendresult(XpResult *a, XpResult b)
{
int n;
@@ -290,7 +290,7 @@
if (n->type == Ndescself) {
/* descendant or self */
for (Elem *e = ep->child; e; e = e->next) {
- if (strcmp(e->name, n->name->name) == 0
+ if (e->name && strcmp(e->name, n->name->name) == 0
&& evalcond(e, n->cond)) {
/* if found, proceed with next rule */
appendresult(&r, recurse(e, n->chain));
@@ -303,7 +303,7 @@
if (n->type == Nchild) {
for (Elem *e = ep->child; e; e = e->next)
- if (strcmp(e->name, n->name->name) == 0
+ if (e->name && strcmp(e->name, n->name->name) == 0
&& evalcond(e, n->cond)) {
appendresult(&r, recurse(e, n->chain));
}
--- a/libxpath/xmlpathl.l
+++ b/libxpath/xmlpathl.l
@@ -39,8 +39,8 @@
}
%}
-A [a-zA-Z_.]
-AN [a-zA-Z0-9_.]
+A [a-zA-Z_.-]
+AN [a-zA-Z0-9_.-]
D [0-9]
LIT [/@=*()'!<>]
Q [^']
--- a/xml
+++ b/xml
@@ -66,6 +66,8 @@
Attr* xmlattr(Xml *xp, Attr **root, Elem *parent,
char *name, char *value)
.PB
+Elem* xmlcomment(Xml *xp, Elem **root, Elem *parent, char *comment)
+.PB
Elem* xmlelem(Xml *xp, Elem **root, Elem *parent, char *name)
.PB
Elem* xmlfind(Xml *xp, Elem *ep, char *path)
@@ -148,6 +150,15 @@
but requires both an atribute name and value,
and returns the address of the new attribute.
.PP
+.I Xmlcomment
+is equivalent to
+.IR xmlelem ,
+but comment nodes have their name set to
+.I nil
+and
+.I pcdata
+is the comment.
+.PP
.I Xmllook
descends through the tree rooted at
.I ep
@@ -195,6 +206,11 @@
The current namespace implementation supersedes
.I Fstripnamespace
and is probably buggy.
+.PP
+An XML element name can start with
+.IR ? ,
+which makes the element a processing instruction.
+Processing instruction support is probably buggy.
.PP
A SAX model parser will probably be needed sometime (e.g. for Ebooks).
.PP
--- a/xml.h
+++ b/xml.h
@@ -56,6 +56,7 @@
extern int xmldebug;
Attr* xmlattr(Xml *, Attr **, Elem *, char *, char *);
+Elem* xmlcomment(Xml *, Elem **, Elem *, char *);
Elem* xmlelem(Xml *, Elem **, Elem *, char *);
Elem* xmlfind(Xml *, Elem *, char *);
void xmlfree(Xml *);