ref: 9ae083d81642be3a3ee7ff1e8d28fb9381bb1abf
parent: f94167ebeebc108f393b3b2ec279cee2afae56e7
author: spew <devnull@localhost>
date: Thu Feb 2 16:21:34 EST 2017
libregexp: simplify regular expression vm implementation Make the logic around who has priority over the final match simpler by merging the priority generation and match fields in a smarter way. Move the creation of new thread matches up to the top to avoid jumping all over the place.
--- a/sys/src/libregexp/regcomp.c
+++ b/sys/src/libregexp/regcomp.c
@@ -156,17 +156,6 @@
return plex;
}
-static int
-maxthreads(Renode *tree)
-{
- tree = tree->left;
- if(tree->op == TCAT)
- tree = tree->left;
- if(tree->op == TBOL)
- return 2;
- return -1;
-}
-
static Reprog*
regcomp1(char *regstr, int nl, int lit)
{
@@ -187,7 +176,7 @@
return nil;
}
- maxthr = regstrlen;
+ maxthr = regstrlen + 1;
parsetr = node(&plex, TSUB, e0(&plex), nil);
// prtree(parsetr, 0, 1);
@@ -304,12 +293,13 @@
static Reinst*
compile(Renode *parsetr, Reprog *reprog, int nl)
{
- Reinst *reinst;
+ Reinst *reinst, *end;
int sub;
sub = 0;
reinst = (Reinst*)(reprog+1);
- compile1(parsetr, reinst, &sub, nl);
+ end = compile1(parsetr, reinst, &sub, nl);
+ assert(reinst + reprog->len == end);
return reinst;
}
--- a/sys/src/libregexp/regexec.c
+++ b/sys/src/libregexp/regexec.c
@@ -4,30 +4,31 @@
#include "regimpl.h"
typedef struct RethreadQ RethreadQ;
-struct RethreadQ
-{
+struct RethreadQ {
Rethread *head;
Rethread **tail;
};
int
-regexec(Reprog *prog, char *str, Resub *sem, int msize)
+regexec(Reprog *p, char *str, Resub *sem, int msize)
{
RethreadQ lists[2], *clist, *nlist, *tmp;
- Rethread *t, *next, *pooltop, *avail;
- Reinst *curinst;
+ Rethread *t, *next, *pool, *avail;
+ Reinst *ci;
Rune r;
char *sp, *ep, endc;
- int i, match, first, gen, matchpri, pri;
+ int i, matchgen, gen;
if(msize > NSUBEXPM)
msize = NSUBEXPM;
- if(prog->startinst->gen != 0) {
- for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++)
- curinst->gen = 0;
+ if(p->startinst->gen != 0) {
+ for(ci = p->startinst; ci < p->startinst + p->len; ci++)
+ ci->gen = 0;
}
+ memset(p->threads, 0, sizeof(Rethread)*p->nthr);
+
clist = lists;
clist->head = nil;
clist->tail = &clist->head;
@@ -35,10 +36,10 @@
nlist->head = nil;
nlist->tail = &nlist->head;
- pooltop = prog->threads + prog->nthr;
+ pool = p->threads;
avail = nil;
- pri = matchpri = gen = match = 0;
+ gen = matchgen = 0;
sp = str;
ep = nil;
endc = '\0';
@@ -51,26 +52,40 @@
*sem->ep = '\0';
}
}
- r = Runemax + 1;
- for(; r != L'\0'; sp += i) {
- gen++;
+ for(r = 1; r != L'\0'; sp += i) {
i = chartorune(&r, sp);
- first = 1;
+ gen++;
+ if(matchgen == 0) {
+ if(avail == nil) {
+ assert(pool < p->threads + p->nthr);
+ t = pool++;
+ } else {
+ t = avail;
+ avail = avail->next;
+ }
+ t->i = p->startinst;
+ if(msize > 0)
+ memset(t->sem, 0, sizeof(Resub)*msize);
+ t->next = nil;
+ t->gen = gen;
+ *clist->tail = t;
+ clist->tail = &t->next;
+ }
t = clist->head;
if(t == nil)
- goto Start;
- curinst = t->pc;
+ break;
+ ci = t->i;
Again:
- if(curinst->gen == gen)
+ if(ci->gen == gen || matchgen && t->gen > matchgen)
goto Done;
- curinst->gen = gen;
- switch(curinst->op) {
+ ci->gen = gen;
+ switch(ci->op) {
case ORUNE:
- if(r != curinst->r)
+ if(r != ci->r)
goto Done;
case OANY: /* fallthrough */
next = t->next;
- t->pc = curinst + 1;
+ t->i = ci + 1;
t->next = nil;
*nlist->tail = t;
nlist->tail = &t->next;
@@ -77,18 +92,18 @@
if(next == nil)
break;
t = next;
- curinst = t->pc;
+ ci = t->i;
goto Again;
case OCLASS:
Class:
- if(r < curinst->r)
+ if(r < ci->r)
goto Done;
- if(r > curinst->r1) {
- curinst++;
+ if(r > ci->r1) {
+ ci++;
goto Class;
}
next = t->next;
- t->pc = curinst->a;
+ t->i = ci->a;
t->next = nil;
*nlist->tail = t;
nlist->tail = &t->next;
@@ -95,56 +110,53 @@
if(next == nil)
break;
t = next;
- curinst = t->pc;
+ ci = t->i;
goto Again;
case ONOTNL:
if(r != L'\n') {
- curinst++;
+ ci++;
goto Again;
}
goto Done;
case OBOL:
if(sp == str || sp[-1] == '\n') {
- curinst++;
+ ci++;
goto Again;
}
goto Done;
case OEOL:
if(r == L'\n' || r == L'\0' && ep == nil) {
- curinst++;
+ ci++;
goto Again;
}
goto Done;
case OJMP:
- curinst = curinst->a;
+ ci = ci->a;
goto Again;
case OSPLIT:
- if(avail == nil)
- next = --pooltop;
- else {
+ if(avail == nil) {
+ assert(pool < p->threads + p->nthr);
+ next = pool++;
+ } else {
next = avail;
avail = avail->next;
}
- next->pc = curinst->b;
+ next->i = ci->b;
if(msize > 0)
memcpy(next->sem, t->sem, sizeof(Resub)*msize);
- next->pri = t->pri;
next->next = t->next;
+ next->gen = t->gen;
t->next = next;
- curinst = curinst->a;
+ ci = ci->a;
goto Again;
case OSAVE:
- if(curinst->sub < msize)
- t->sem[curinst->sub].sp = sp;
- curinst++;
+ if(ci->sub < msize)
+ t->sem[ci->sub].sp = sp;
+ ci++;
goto Again;
case OUNSAVE:
- if(curinst->sub == 0) {
- /* "Highest" priority is the left-most longest. */
- if (t->pri > matchpri)
- goto Done;
- match = 1;
- matchpri = t->pri;
+ if(ci->sub == 0) {
+ matchgen = t->gen;
if(sem != nil && msize > 0) {
memcpy(sem, t->sem, sizeof(Resub)*msize);
sem->ep = sp;
@@ -151,9 +163,9 @@
}
goto Done;
}
- if(curinst->sub < msize)
- t->sem[curinst->sub].ep = sp;
- curinst++;
+ if(ci->sub < msize)
+ t->sem[ci->sub].ep = sp;
+ ci++;
goto Again;
Done:
next = t->next;
@@ -162,30 +174,9 @@
if(next == nil)
break;
t = next;
- curinst = t->pc;
+ ci = t->i;
goto Again;
}
-Start:
- /* Start again once if we haven't found anything. */
- if(first == 1 && match == 0) {
- first = 0;
- if(avail == nil)
- t = --pooltop;
- else {
- t = avail;
- avail = avail->next;
- }
- if(msize > 0)
- memset(t->sem, 0, sizeof(Resub)*msize);
- /* "Lower" priority thread */
- t->pri = matchpri = pri++;
- t->next = nil;
- curinst = prog->startinst;
- goto Again;
- }
- /* If we have a match and no extant threads, we are done. */
- if(match == 1 && nlist->head == nil)
- break;
tmp = clist;
clist = nlist;
nlist = tmp;
@@ -194,5 +185,5 @@
}
if(ep != nil)
*ep = endc;
- return match;
+ return matchgen > 0 ? 1 : 0;
}
--- a/sys/src/libregexp/regimpl.h
+++ b/sys/src/libregexp/regimpl.h
@@ -1,5 +1,4 @@
-enum
-{
+enum {
LANY = 0,
LBOL,
LCLASS,
@@ -30,8 +29,7 @@
typedef struct Parselex Parselex;
typedef struct Renode Renode;
-struct Parselex
-{
+struct Parselex {
/* Parse */
Renode *next;
Renode *nodes;
@@ -50,8 +48,8 @@
Rune cpairs[400+2];
int nc;
};
-struct Renode
-{
+
+struct Renode {
int op;
Renode *left;
Rune r;
@@ -63,15 +61,15 @@
};
int nclass;
};
-struct Rethread
-{
- Reinst *pc;
+
+struct Rethread {
+ Reinst *i;
Resub sem[NSUBEXPM];
- int pri;
Rethread *next;
+ int gen;
};
-struct Reinst
-{
+
+struct Reinst {
char op;
int gen;
Reinst *a;
--- a/sys/src/libregexp/rregexec.c
+++ b/sys/src/libregexp/rregexec.c
@@ -4,29 +4,30 @@
#include "regimpl.h"
typedef struct RethreadQ RethreadQ;
-struct RethreadQ
-{
+struct RethreadQ {
Rethread *head;
Rethread **tail;
};
int
-rregexec(Reprog *prog, Rune *str, Resub *sem, int msize)
+rregexec(Reprog *p, Rune *str, Resub *sem, int msize)
{
RethreadQ lists[2], *clist, *nlist, *tmp;
- Rethread *t, *next, *pooltop, *avail;
- Reinst *curinst;
- Rune *rsp, *rep, endr, last;
- int match, first, gen, pri, matchpri;
+ Rethread *t, *next, *pool, *avail;
+ Reinst *ci;
+ Rune *rsp, *rep, endr, r;
+ int matchgen, gen;
if(msize > NSUBEXPM)
msize = NSUBEXPM;
- if(prog->startinst->gen != 0) {
- for(curinst = prog->startinst; curinst < prog->startinst + prog->len; curinst++)
- curinst->gen = 0;
+ if(p->startinst->gen != 0) {
+ for(ci = p->startinst; ci < p->startinst + p->len; ci++)
+ ci->gen = 0;
}
+ memset(p->threads, 0, sizeof(Rethread)*p->nthr);
+
clist = lists;
clist->head = nil;
clist->tail = &clist->head;
@@ -34,10 +35,10 @@
nlist->head = nil;
nlist->tail = &nlist->head;
- pooltop = prog->threads + prog->nthr;
+ pool = p->threads;
avail = nil;
- pri = matchpri = gen = match = 0;
+ gen = matchgen = 0;
rsp = str;
rep = nil;
endr = L'\0';
@@ -50,26 +51,40 @@
*sem->rep = '\0';
}
}
- last = 1;
- for(; last != L'\0'; rsp++) {
+ for(r = 1; r != L'\0'; rsp++) {
+ r = *rsp;
gen++;
- last = *rsp;
- first = 1;
+ if(matchgen == 0) {
+ if(avail == nil) {
+ assert(pool < p->threads + p->nthr);
+ t = pool++;
+ } else {
+ t = avail;
+ avail = avail->next;
+ }
+ t->i = p->startinst;
+ if(msize > 0)
+ memset(t->sem, 0, sizeof(Resub)*msize);
+ t->next = nil;
+ t->gen = gen;
+ *clist->tail = t;
+ clist->tail = &t->next;
+ }
t = clist->head;
if(t == nil)
- goto Start;
- curinst = t->pc;
+ break;
+ ci = t->i;
Again:
- if(curinst->gen == gen)
+ if(ci->gen == gen || matchgen && t->gen > matchgen)
goto Done;
- curinst->gen = gen;
- switch(curinst->op) {
+ ci->gen = gen;
+ switch(ci->op) {
case ORUNE:
- if(*rsp != curinst->r)
+ if(r != ci->r)
goto Done;
case OANY: /* fallthrough */
next = t->next;
- t->pc = curinst + 1;
+ t->i = ci + 1;
t->next = nil;
*nlist->tail = t;
nlist->tail = &t->next;
@@ -76,18 +91,18 @@
if(next == nil)
break;
t = next;
- curinst = t->pc;
+ ci = t->i;
goto Again;
case OCLASS:
Class:
- if(*rsp < curinst->r)
+ if(r < ci->r)
goto Done;
- if(*rsp > curinst->r1) {
- curinst++;
+ if(r > ci->r1) {
+ ci++;
goto Class;
}
next = t->next;
- t->pc = curinst->a;
+ t->i = ci->a;
t->next = nil;
*nlist->tail = t;
nlist->tail = &t->next;
@@ -94,56 +109,53 @@
if(next == nil)
break;
t = next;
- curinst = t->pc;
+ ci = t->i;
goto Again;
case ONOTNL:
- if(*rsp != L'\n') {
- curinst++;
+ if(r != L'\n') {
+ ci++;
goto Again;
}
goto Done;
case OBOL:
if(rsp == str || rsp[-1] == L'\n') {
- curinst++;
+ ci++;
goto Again;
}
goto Done;
case OEOL:
- if(*rsp == '\n' || *rsp == L'\0' && rep == nil) {
- curinst++;
+ if(r == L'\n' || r == L'\0' && rep == nil) {
+ ci++;
goto Again;
}
goto Done;
case OJMP:
- curinst = curinst->a;
+ ci = ci->a;
goto Again;
case OSPLIT:
- if(avail == nil)
- next = --pooltop;
- else {
+ if(avail == nil) {
+ assert(pool < p->threads + p->nthr);
+ next = pool++;
+ } else {
next = avail;
avail = avail->next;
}
- next->pc = curinst->b;
+ next->i = ci->b;
if(msize > 0)
memcpy(next->sem, t->sem, sizeof(Resub)*msize);
- next->pri = t->pri;
next->next = t->next;
+ next->gen = t->gen;
t->next = next;
- curinst = curinst->a;
+ ci = ci->a;
goto Again;
case OSAVE:
- if(curinst->sub < msize)
- t->sem[curinst->sub].rsp = rsp;
- curinst++;
+ if(ci->sub < msize)
+ t->sem[ci->sub].rsp = rsp;
+ ci++;
goto Again;
case OUNSAVE:
- if(curinst->sub == 0) {
- /* "Highest" priority is the left-most longest. */
- if (t->pri > matchpri)
- goto Done;
- match = 1;
- matchpri = t->pri;
+ if(ci->sub == 0) {
+ matchgen = t->gen;
if(sem != nil && msize > 0) {
memcpy(sem, t->sem, sizeof(Resub)*msize);
sem->rep = rsp;
@@ -150,9 +162,9 @@
}
goto Done;
}
- if(curinst->sub < msize)
- t->sem[curinst->sub].rep = rsp;
- curinst++;
+ if(ci->sub < msize)
+ t->sem[ci->sub].rep = rsp;
+ ci++;
goto Again;
Done:
next = t->next;
@@ -161,30 +173,9 @@
if(next == nil)
break;
t = next;
- curinst = t->pc;
+ ci = t->i;
goto Again;
}
-Start:
- /* Start again once if we haven't found anything. */
- if(first == 1 && match == 0) {
- first = 0;
- if(avail == nil)
- t = --pooltop;
- else {
- t = avail;
- avail = avail->next;
- }
- if(msize > 0)
- memset(t->sem, 0, sizeof(Resub)*msize);
- /* "Lower" priority thread */
- t->pri = matchpri = pri++;
- t->next = nil;
- curinst = prog->startinst;
- goto Again;
- }
- /* If we have a match and no extant threads, we are done. */
- if(match == 1 && nlist->head == nil)
- break;
tmp = clist;
clist = nlist;
nlist = tmp;
@@ -193,5 +184,5 @@
}
if(rep != nil)
*rep = endr;
- return match;
+ return matchgen > 0 ? 1 : 0;
}