ref: bfe569921d63fdbb29fe06c8e19ac402e009b960
parent: 01d85a49949e513d82882239a028ca2ba0790b36
author: Tor Andersson <[email protected]>
date: Wed Feb 26 15:22:53 EST 2014
Improve Resub API. Hold the subexpression count and array of matches inside a struct.
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -29,9 +29,9 @@
void js_RegExp_prototype_exec(js_State *J, js_Regexp *re, const char *text)
{
- Resub m[REG_MAXSUB];
unsigned int i;
int opts;
+ Resub m;
opts = 0;
if (re->flags & JS_REGEXP_G) {
@@ -46,14 +46,14 @@
}
}
- if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
+ if (!js_regexec(re->prog, text, &m, opts)) {
js_newarray(J);
- for (i = 0; i < nelem(m) && m[i].sp; ++i) {
- js_pushlstring(J, m[i].sp, m[i].ep - m[i].sp);
+ for (i = 0; i < m.nsub; ++i) {
+ js_pushlstring(J, m.sub[i].sp, m.sub[i].ep - m.sub[i].sp);
js_setindex(J, -2, i);
}
if (re->flags & JS_REGEXP_G)
- re->last = re->last + (m[0].ep - text);
+ re->last = re->last + (m.sub[0].ep - text);
return;
}
@@ -67,8 +67,8 @@
{
js_Regexp *re;
const char *text;
- Resub m[REG_MAXSUB];
int opts;
+ Resub m;
re = js_toregexp(J, 0);
text = js_tostring(J, 1);
@@ -86,9 +86,9 @@
}
}
- if (!js_regexec(re->prog, text, nelem(m), m, opts)) {
+ if (!js_regexec(re->prog, text, &m, opts)) {
if (re->flags & JS_REGEXP_G)
- re->last = re->last + (m[0].ep - text);
+ re->last = re->last + (m.sub[0].ep - text);
js_pushboolean(J, 1);
return;
}
--- a/jsstring.c
+++ b/jsstring.c
@@ -307,10 +307,10 @@
static void Sp_match(js_State *J, unsigned int argc)
{
js_Regexp *re;
- Resub m[REG_MAXSUB];
const char *text;
unsigned int len;
const char *a, *b, *c, *e;
+ Resub m;
text = js_tostring(J, 0);
@@ -335,11 +335,11 @@
a = text;
e = text + strlen(text);
while (a <= e) {
- if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
+ if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
break;
- b = m[0].sp;
- c = m[0].ep;
+ b = m.sub[0].sp;
+ c = m.sub[0].ep;
js_pushlstring(J, b, c - b);
js_setindex(J, -2, len++);
@@ -353,8 +353,8 @@
static void Sp_search(js_State *J, unsigned int argc)
{
js_Regexp *re;
- Resub m[REG_MAXSUB];
const char *text;
+ Resub m;
text = js_tostring(J, 0);
@@ -367,8 +367,8 @@
re = js_toregexp(J, -1);
- if (!js_regexec(re->prog, text, nelem(m), m, 0))
- js_pushnumber(J, js_utfptrtoidx(text, m[0].sp));
+ if (!js_regexec(re->prog, text, &m, 0))
+ js_pushnumber(J, js_utfptrtoidx(text, m.sub[0].sp));
else
js_pushnumber(J, -1);
}
@@ -376,15 +376,15 @@
static void Sp_replace_regexp(js_State *J, unsigned int argc)
{
js_Regexp *re;
- Resub m[REG_MAXSUB];
const char *source, *s, *r;
js_Buffer *sb = NULL;
- int n, x;
+ unsigned int n, x;
+ Resub m;
source = js_tostring(J, 0);
re = js_toregexp(J, 1);
- if (js_regexec(re->prog, source, nelem(m), m, 0)) {
+ if (js_regexec(re->prog, source, &m, 0)) {
js_copy(J, 0);
return;
}
@@ -392,14 +392,14 @@
re->last = 0;
loop:
- s = m[0].sp;
- n = m[0].ep - m[0].sp;
+ s = m.sub[0].sp;
+ n = m.sub[0].ep - m.sub[0].sp;
if (js_iscallable(J, 2)) {
js_copy(J, 2);
js_pushglobal(J);
- for (x = 0; m[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
- js_pushlstring(J, m[x].sp, m[x].ep - m[x].sp);
+ for (x = 0; m.sub[x].sp; ++x) /* arg 0..x: substring and subexps that matched */
+ js_pushlstring(J, m.sub[x].sp, m.sub[x].ep - m.sub[x].sp);
js_pushnumber(J, s - source); /* arg x+2: offset within search string */
js_copy(J, 0); /* arg x+3: search string */
js_call(J, 2 + x);
@@ -425,8 +425,8 @@
if (r[1] >= '0' && r[1] <= '9')
x = x * 10 + *(++r) - '0';
// TODO: use prog->nsub somehow
- if (x > 0 && x < REG_MAXSUB && m[x].sp) {
- sb_putm(&sb, m[x].sp, m[x].ep);
+ if (x > 0 && x < m.nsub) {
+ sb_putm(&sb, m.sub[x].sp, m.sub[x].ep);
} else {
sb_putc(&sb, '$');
if (x > 10) {
@@ -450,7 +450,7 @@
}
if (re->flags & JS_REGEXP_G) {
- source = m[0].ep;
+ source = m.sub[0].ep;
if (n == 0) {
if (*source)
sb_putc(&sb, *source++);
@@ -457,7 +457,7 @@
else
goto end;
}
- if (!js_regexec(re->prog, source, nelem(m), m, REG_NOTBOL))
+ if (!js_regexec(re->prog, source, &m, REG_NOTBOL))
goto loop;
}
@@ -544,10 +544,10 @@
static void Sp_split_regexp(js_State *J, unsigned int argc)
{
js_Regexp *re;
- Resub m[REG_MAXSUB];
const char *text;
unsigned int limit, len, k;
const char *p, *a, *b, *c, *e;
+ Resub m;
text = js_tostring(J, 0);
re = js_toregexp(J, 1);
@@ -560,7 +560,7 @@
/* splitting the empty string */
if (e == 0) {
- if (js_regexec(re->prog, text, nelem(m), m, 0)) {
+ if (js_regexec(re->prog, text, &m, 0)) {
if (len == limit) return;
js_pushliteral(J, "");
js_setindex(J, -2, 0);
@@ -570,11 +570,11 @@
p = a = text;
while (a < e) {
- if (js_regexec(re->prog, a, nelem(m), m, a > text ? REG_NOTBOL : 0))
+ if (js_regexec(re->prog, a, &m, a > text ? REG_NOTBOL : 0))
break; /* no match */
- b = m[0].sp;
- c = m[0].ep;
+ b = m.sub[0].sp;
+ c = m.sub[0].ep;
/* empty string at end of last match */
if (b == p) {
@@ -586,9 +586,9 @@
js_pushlstring(J, p, b - p);
js_setindex(J, -2, len++);
- for (k = 1; k < nelem(m) && m[k].sp; ++k) {
+ for (k = 1; k < m.nsub; ++k) {
if (len == limit) return;
- js_pushlstring(J, m[k].sp, m[k].ep - m[k].sp);
+ js_pushlstring(J, m.sub[k].sp, m.sub[k].ep - m.sub[k].sp);
js_setindex(J, -2, len++);
}
--- a/regex.c
+++ b/regex.c
@@ -30,7 +30,7 @@
struct Reprog {
Reinst *start, *end;
int flags;
- unsigned int ncap;
+ unsigned int nsub;
Reclass cclass[16];
};
@@ -40,8 +40,8 @@
const char *source;
unsigned int ncclass;
- unsigned int ncap;
- Renode *cap[MAXSUB];
+ unsigned int nsub;
+ Renode *sub[MAXSUB];
int lookahead;
Rune yychar;
@@ -77,7 +77,7 @@
L_NLA, /* "(?!" negative lookahead */
L_WORD, /* "\b" word boundary */
L_NWORD, /* "\B" non-word boundary */
- L_REF, /* "\0" back-reference */
+ L_REF, /* "\1" back-reference */
L_COUNT, /* {M,N} */
};
@@ -459,10 +459,10 @@
}
if (g->lookahead == L_REF) {
atom = newnode(g, P_REF);
- if (g->yychar == 0 || g->yychar > g->ncap || !g->cap[g->yychar])
+ if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
die(g, "invalid back-reference");
atom->n = g->yychar;
- atom->x = g->cap[g->yychar];
+ atom->x = g->sub[g->yychar];
next(g);
return atom;
}
@@ -470,12 +470,11 @@
return newnode(g, P_ANY);
if (accept(g, '(')) {
atom = newnode(g, P_PAR);
- if (++g->ncap == MAXSUB)
+ if (g->nsub == MAXSUB)
die(g, "too many captures");
- atom->n = g->ncap;
- g->cap[atom->n] = NULL;
+ atom->n = g->nsub++;
atom->x = parsealt(g);
- g->cap[atom->n] = atom;
+ g->sub[atom->n] = atom;
if (!accept(g, ')'))
die(g, "unmatched '('");
return atom;
@@ -805,9 +804,9 @@
g.source = pattern;
g.ncclass = 0;
- g.ncap = 0;
+ g.nsub = 1;
for (i = 0; i < MAXSUB; ++i)
- g.cap[i] = 0;
+ g.sub[i] = 0;
g.prog->flags = cflags;
@@ -818,7 +817,7 @@
if (g.lookahead != 0)
die(&g, "syntax error");
- g.prog->ncap = g.ncap;
+ g.prog->nsub = g.nsub;
g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
split = emit(g.prog, I_SPLIT);
@@ -905,7 +904,7 @@
struct Rethread {
Reinst *pc;
const char *sp;
- Resub sub[MAXSUB];
+ Resub sub;
};
static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
@@ -912,14 +911,14 @@
{
t->pc = pc;
t->sp = sp;
- memcpy(t->sub, sub, sizeof t->sub);
+ memcpy(&t->sub, sub, sizeof t->sub);
}
static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
{
Rethread ready[MAXTHREAD];
- Resub scrap[MAXSUB];
- Resub sub[MAXSUB];
+ Resub scratch;
+ Resub sub;
Rune c;
unsigned int nready;
int i;
@@ -933,13 +932,13 @@
--nready;
pc = ready[nready].pc;
sp = ready[nready].sp;
- memcpy(sub, ready[nready].sub, sizeof sub);
+ memcpy(&sub, &ready[nready].sub, sizeof sub);
for (;;) {
switch (pc->opcode) {
case I_END:
for (i = 0; i < MAXSUB; ++i) {
- out[i].sp = sub[i].sp;
- out[i].ep = sub[i].ep;
+ out->sub[i].sp = sub.sub[i].sp;
+ out->sub[i].ep = sub.sub[i].ep;
}
return 1;
case I_JUMP:
@@ -950,18 +949,18 @@
fprintf(stderr, "regexec: backtrack overflow!\n");
return 0;
}
- spawn(&ready[nready++], pc->y, sp, sub);
+ spawn(&ready[nready++], pc->y, sp, &sub);
pc = pc->x;
continue;
case I_PLA:
- if (!match(pc->x, sp, bol, flags, sub))
+ if (!match(pc->x, sp, bol, flags, &sub))
goto dead;
pc = pc->y;
continue;
case I_NLA:
- memcpy(scrap, sub, sizeof scrap);
- if (match(pc->x, sp, bol, flags, scrap))
+ memcpy(&scratch, &sub, sizeof scratch);
+ if (match(pc->x, sp, bol, flags, &scratch))
goto dead;
pc = pc->y;
continue;
@@ -1012,12 +1011,12 @@
}
break;
case I_REF:
- i = sub[pc->n].ep - sub[pc->n].sp;
+ i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
if (flags & REG_ICASE) {
- if (strncmpcanon(sp, sub[pc->n].sp, i))
+ if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
goto dead;
} else {
- if (strncmp(sp, sub[pc->n].sp, i))
+ if (strncmp(sp, sub.sub[pc->n].sp, i))
goto dead;
}
if (i > 0)
@@ -1052,10 +1051,10 @@
goto dead;
case I_LPAR:
- sub[pc->n].sp = sp;
+ sub.sub[pc->n].sp = sp;
break;
case I_RPAR:
- sub[pc->n].ep = sp;
+ sub.sub[pc->n].ep = sp;
break;
default:
goto dead;
@@ -1067,17 +1066,19 @@
return 0;
}
-int regexec(Reprog *prog, const char *sp, int n, Resub *m, int eflags)
+int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
{
- Resub gm[MAXSUB];
- unsigned int i;
+ Resub scratch;
+ int i;
- m = m ? m : gm;
+ if (!sub)
+ sub = &scratch;
+ sub->nsub = prog->nsub;
for (i = 0; i < MAXSUB; ++i)
- m[i].sp = m[i].ep = i <= prog->ncap ? sp : NULL;
+ sub->sub[i].sp = sub->sub[i].ep = NULL;
- return !match(prog->start, sp, sp, prog->flags | eflags, m);
+ return !match(prog->start, sp, sp, prog->flags | eflags, sub);
}
#ifdef TEST
@@ -1086,8 +1087,8 @@
const char *error;
const char *s;
Reprog *p;
- Resub m[MAXSUB];
- int i;
+ Resub m;
+ unsigned int i;
if (argc > 1) {
p = regcomp(argv[1], 0, &error);
@@ -1098,13 +1099,12 @@
if (argc > 2) {
s = argv[2];
- printf("ncap = %d\n", p->ncap);
- if (!regexec(p, s, MAXSUB, m, 0)) {
- for (i = 0; i < MAXSUB; ++i)
- if (m[i].sp) {
- int n = m[i].ep - m[i].sp;
- printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);
- }
+ printf("nsub = %d\n", p->nsub);
+ if (!regexec(p, s, &m, 0)) {
+ for (i = 0; i < m.nsub; ++i) {
+ int n = m.sub[i].ep - m.sub[i].sp;
+ printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
+ }
} else {
printf("no match\n");
}
--- a/regex.h
+++ b/regex.h
@@ -7,13 +7,9 @@
typedef struct Reprog Reprog;
typedef struct Resub Resub;
-struct Resub {
- const char *sp;
- const char *ep;
-};
Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
-int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);
+int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
void regfree(Reprog *prog);
enum {
@@ -26,6 +22,14 @@
/* limits */
REG_MAXSUB = 16
+};
+
+struct Resub {
+ unsigned int nsub;
+ struct {
+ const char *sp;
+ const char *ep;
+ } sub[REG_MAXSUB];
};
#endif