ref: a3975e7b0a4a2b694ee635cb9d27b413234c846a
dir: /regexp.c/
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <setjmp.h> #include <limits.h> #include "regexp.h" #include "utf.h" #define emit regemit #define next regnext #define accept regaccept #define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) #define REPINF 255 #ifndef REG_MAXPROG #define REG_MAXPROG (32 << 10) #endif #ifndef REG_MAXREC #define REG_MAXREC 1024 #endif #ifndef REG_MAXSPAN #define REG_MAXSPAN 64 #endif #ifndef REG_MAXCLASS #define REG_MAXCLASS 128 #endif typedef struct Reclass Reclass; typedef struct Renode Renode; typedef struct Reinst Reinst; typedef struct Rethread Rethread; struct Reclass { Rune *end; Rune spans[REG_MAXSPAN]; }; struct Reprog { Reinst *start, *end; Reclass *cclass; int flags; int nsub; }; struct cstate { Reprog *prog; Renode *pstart, *pend; const char *source; int ncclass; int nsub; Renode *sub[REG_MAXSUB]; int lookahead; Rune yychar; Reclass *yycc; int yymin, yymax; const char *error; jmp_buf kaboom; Reclass cclass[REG_MAXCLASS]; }; static void die(struct cstate *g, const char *message) { g->error = message; longjmp(g->kaboom, 1); } static int canon(Rune c) { Rune u = toupperrune(c); if (c >= 128 && u < 128) return c; return u; } /* Scan */ enum { L_CHAR = 256, L_CCLASS, /* character class */ L_NCCLASS, /* negative character class */ L_NC, /* "(?:" no capture */ L_PLA, /* "(?=" positive lookahead */ L_NLA, /* "(?!" negative lookahead */ L_WORD, /* "\b" word boundary */ L_NWORD, /* "\B" non-word boundary */ L_REF, /* "\1" back-reference */ L_COUNT, /* {M,N} */ }; static int hex(struct cstate *g, int c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; die(g, "invalid escape sequence"); return 0; } static int dec(struct cstate *g, int c) { if (c >= '0' && c <= '9') return c - '0'; die(g, "invalid quantifier"); return 0; } #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789" static int isunicodeletter(int c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c); } static int nextrune(struct cstate *g) { if (!*g->source) { g->yychar = EOF; return 0; } g->source += chartorune(&g->yychar, g->source); if (g->yychar == '\\') { if (!*g->source) die(g, "unterminated escape sequence"); g->source += chartorune(&g->yychar, g->source); switch (g->yychar) { case 'f': g->yychar = '\f'; return 0; case 'n': g->yychar = '\n'; return 0; case 'r': g->yychar = '\r'; return 0; case 't': g->yychar = '\t'; return 0; case 'v': g->yychar = '\v'; return 0; case 'c': if (!g->source[0]) die(g, "unterminated escape sequence"); g->yychar = (*g->source++) & 31; return 0; case 'x': if (!g->source[0] || !g->source[1]) die(g, "unterminated escape sequence"); g->yychar = hex(g, *g->source++) << 4; g->yychar += hex(g, *g->source++); if (g->yychar == 0) { g->yychar = '0'; return 1; } return 0; case 'u': if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3]) die(g, "unterminated escape sequence"); g->yychar = hex(g, *g->source++) << 12; g->yychar += hex(g, *g->source++) << 8; g->yychar += hex(g, *g->source++) << 4; g->yychar += hex(g, *g->source++); if (g->yychar == 0) { g->yychar = '0'; return 1; } return 0; case 0: g->yychar = '0'; return 1; } if (strchr(ESCAPES, g->yychar)) return 1; if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */ die(g, "invalid escape character"); return 0; } return 0; } static int lexcount(struct cstate *g) { g->yychar = *g->source++; g->yymin = dec(g, g->yychar); g->yychar = *g->source++; while (g->yychar != ',' && g->yychar != '}') { g->yymin = g->yymin * 10 + dec(g, g->yychar); g->yychar = *g->source++; if (g->yymin >= REPINF) die(g, "numeric overflow"); } if (g->yychar == ',') { g->yychar = *g->source++; if (g->yychar == '}') { g->yymax = REPINF; } else { g->yymax = dec(g, g->yychar); g->yychar = *g->source++; while (g->yychar != '}') { g->yymax = g->yymax * 10 + dec(g, g->yychar); g->yychar = *g->source++; if (g->yymax >= REPINF) die(g, "numeric overflow"); } } } else { g->yymax = g->yymin; } return L_COUNT; } static void newcclass(struct cstate *g) { if (g->ncclass >= REG_MAXCLASS) die(g, "too many character classes"); g->yycc = g->cclass + g->ncclass++; g->yycc->end = g->yycc->spans; } static void addrange(struct cstate *g, Rune a, Rune b) { Reclass *cc = g->yycc; Rune *p; if (a > b) die(g, "invalid character class range"); /* extend existing ranges if they overlap */ for (p = cc->spans; p < cc->end; p += 2) { /* completely inside old range */ if (a >= p[0] && b <= p[1]) return; /* completely swallows old range */ if (a < p[0] && b >= p[1]) { p[0] = a; p[1] = b; return; } /* extend at start */ if (b >= p[0] - 1 && b <= p[1] && a < p[0]) { p[0] = a; return; } /* extend at end */ if (a >= p[0] && a <= p[1] + 1 && b > p[1]) { p[1] = b; return; } } if (cc->end + 2 >= cc->spans + nelem(cc->spans)) die(g, "too many character class ranges"); *cc->end++ = a; *cc->end++ = b; } static void addranges_d(struct cstate *g) { addrange(g, '0', '9'); } static void addranges_D(struct cstate *g) { addrange(g, 0, '0'-1); addrange(g, '9'+1, 0xFFFF); } static void addranges_s(struct cstate *g) { addrange(g, 0x9, 0xD); addrange(g, 0x20, 0x20); addrange(g, 0xA0, 0xA0); addrange(g, 0x2028, 0x2029); addrange(g, 0xFEFF, 0xFEFF); } static void addranges_S(struct cstate *g) { addrange(g, 0, 0x9-1); addrange(g, 0xD+1, 0x20-1); addrange(g, 0x20+1, 0xA0-1); addrange(g, 0xA0+1, 0x2028-1); addrange(g, 0x2029+1, 0xFEFF-1); addrange(g, 0xFEFF+1, 0xFFFF); } static void addranges_w(struct cstate *g) { addrange(g, '0', '9'); addrange(g, 'A', 'Z'); addrange(g, '_', '_'); addrange(g, 'a', 'z'); } static void addranges_W(struct cstate *g) { addrange(g, 0, '0'-1); addrange(g, '9'+1, 'A'-1); addrange(g, 'Z'+1, '_'-1); addrange(g, '_'+1, 'a'-1); addrange(g, 'z'+1, 0xFFFF); } static int lexclass(struct cstate *g) { int type = L_CCLASS; int quoted, havesave, havedash; Rune save = 0; newcclass(g); quoted = nextrune(g); if (!quoted && g->yychar == '^') { type = L_NCCLASS; quoted = nextrune(g); } havesave = havedash = 0; for (;;) { if (g->yychar == EOF) die(g, "unterminated character class"); if (!quoted && g->yychar == ']') break; if (!quoted && g->yychar == '-') { if (havesave) { if (havedash) { addrange(g, save, '-'); havesave = havedash = 0; } else { havedash = 1; } } else { save = '-'; havesave = 1; } } else if (quoted && strchr("DSWdsw", g->yychar)) { if (havesave) { addrange(g, save, save); if (havedash) addrange(g, '-', '-'); } switch (g->yychar) { case 'd': addranges_d(g); break; case 's': addranges_s(g); break; case 'w': addranges_w(g); break; case 'D': addranges_D(g); break; case 'S': addranges_S(g); break; case 'W': addranges_W(g); break; } havesave = havedash = 0; } else { if (quoted) { if (g->yychar == 'b') g->yychar = '\b'; else if (g->yychar == '0') g->yychar = 0; /* else identity escape */ } if (havesave) { if (havedash) { addrange(g, save, g->yychar); havesave = havedash = 0; } else { addrange(g, save, save); save = g->yychar; } } else { save = g->yychar; havesave = 1; } } quoted = nextrune(g); } if (havesave) { addrange(g, save, save); if (havedash) addrange(g, '-', '-'); } return type; } static int lex(struct cstate *g) { int quoted = nextrune(g); if (quoted) { switch (g->yychar) { case 'b': return L_WORD; case 'B': return L_NWORD; case 'd': newcclass(g); addranges_d(g); return L_CCLASS; case 's': newcclass(g); addranges_s(g); return L_CCLASS; case 'w': newcclass(g); addranges_w(g); return L_CCLASS; case 'D': newcclass(g); addranges_d(g); return L_NCCLASS; case 'S': newcclass(g); addranges_s(g); return L_NCCLASS; case 'W': newcclass(g); addranges_w(g); return L_NCCLASS; case '0': g->yychar = 0; return L_CHAR; } if (g->yychar >= '0' && g->yychar <= '9') { g->yychar -= '0'; if (*g->source >= '0' && *g->source <= '9') g->yychar = g->yychar * 10 + *g->source++ - '0'; return L_REF; } return L_CHAR; } switch (g->yychar) { case EOF: case '$': case ')': case '*': case '+': case '.': case '?': case '^': case '|': return g->yychar; } if (g->yychar == '{') return lexcount(g); if (g->yychar == '[') return lexclass(g); if (g->yychar == '(') { if (g->source[0] == '?') { if (g->source[1] == ':') { g->source += 2; return L_NC; } if (g->source[1] == '=') { g->source += 2; return L_PLA; } if (g->source[1] == '!') { g->source += 2; return L_NLA; } } return '('; } return L_CHAR; } /* Parse */ enum { P_CAT, P_ALT, P_REP, P_BOL, P_EOL, P_WORD, P_NWORD, P_PAR, P_PLA, P_NLA, P_ANY, P_CHAR, P_CCLASS, P_NCCLASS, P_REF, }; struct Renode { unsigned char type; unsigned char ng, m, n; Rune c; int cc; Renode *x; Renode *y; }; static Renode *newnode(struct cstate *g, int type) { Renode *node = g->pend++; node->type = type; node->cc = -1; node->c = 0; node->ng = 0; node->m = 0; node->n = 0; node->x = node->y = NULL; return node; } static int empty(Renode *node) { if (!node) return 1; switch (node->type) { default: return 1; case P_CAT: return empty(node->x) && empty(node->y); case P_ALT: return empty(node->x) || empty(node->y); case P_REP: return empty(node->x) || node->m == 0; case P_PAR: return empty(node->x); case P_REF: return empty(node->x); case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0; } } static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max) { Renode *rep = newnode(g, P_REP); if (max == REPINF && empty(atom)) die(g, "infinite loop matching the empty string"); rep->ng = ng; rep->m = min; rep->n = max; rep->x = atom; return rep; } static void next(struct cstate *g) { g->lookahead = lex(g); } static int accept(struct cstate *g, int t) { if (g->lookahead == t) { next(g); return 1; } return 0; } static Renode *parsealt(struct cstate *g); static Renode *parseatom(struct cstate *g) { Renode *atom; if (g->lookahead == L_CHAR) { atom = newnode(g, P_CHAR); atom->c = g->yychar; next(g); return atom; } if (g->lookahead == L_CCLASS) { atom = newnode(g, P_CCLASS); atom->cc = g->yycc - g->cclass; next(g); return atom; } if (g->lookahead == L_NCCLASS) { atom = newnode(g, P_NCCLASS); atom->cc = g->yycc - g->cclass; next(g); return atom; } if (g->lookahead == L_REF) { atom = newnode(g, P_REF); if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar]) die(g, "invalid back-reference"); atom->n = g->yychar; atom->x = g->sub[g->yychar]; next(g); return atom; } if (accept(g, '.')) return newnode(g, P_ANY); if (accept(g, '(')) { atom = newnode(g, P_PAR); if (g->nsub == REG_MAXSUB) die(g, "too many captures"); atom->n = g->nsub++; atom->x = parsealt(g); g->sub[atom->n] = atom; if (!accept(g, ')')) die(g, "unmatched '('"); return atom; } if (accept(g, L_NC)) { atom = parsealt(g); if (!accept(g, ')')) die(g, "unmatched '('"); return atom; } if (accept(g, L_PLA)) { atom = newnode(g, P_PLA); atom->x = parsealt(g); if (!accept(g, ')')) die(g, "unmatched '('"); return atom; } if (accept(g, L_NLA)) { atom = newnode(g, P_NLA); atom->x = parsealt(g); if (!accept(g, ')')) die(g, "unmatched '('"); return atom; } die(g, "syntax error"); return NULL; } static Renode *parserep(struct cstate *g) { Renode *atom; if (accept(g, '^')) return newnode(g, P_BOL); if (accept(g, '$')) return newnode(g, P_EOL); if (accept(g, L_WORD)) return newnode(g, P_WORD); if (accept(g, L_NWORD)) return newnode(g, P_NWORD); atom = parseatom(g); if (g->lookahead == L_COUNT) { int min = g->yymin, max = g->yymax; next(g); if (max < min) die(g, "invalid quantifier"); return newrep(g, atom, accept(g, '?'), min, max); } if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF); if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF); if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1); return atom; } static Renode *parsecat(struct cstate *g) { Renode *cat, *head, **tail; if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { /* Build a right-leaning tree by splicing in new 'cat' at the tail. */ head = parserep(g); tail = &head; while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { cat = newnode(g, P_CAT); cat->x = *tail; cat->y = parserep(g); *tail = cat; tail = &cat->y; } return head; } return NULL; } static Renode *parsealt(struct cstate *g) { Renode *alt, *x; alt = parsecat(g); while (accept(g, '|')) { x = alt; alt = newnode(g, P_ALT); alt->x = x; alt->y = parsecat(g); } return alt; } /* Compile */ enum { I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA, I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF, I_BOL, I_EOL, I_WORD, I_NWORD, I_LPAR, I_RPAR }; struct Reinst { unsigned char opcode; unsigned char n; Rune c; Reclass *cc; Reinst *x; Reinst *y; }; static int count(struct cstate *g, Renode *node, int depth) { int min, max, n; if (!node) return 0; if (++depth > REG_MAXREC) die(g, "stack overflow"); switch (node->type) { default: return 1; case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth); case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2; case P_REP: min = node->m; max = node->n; if (min == max) n = count(g, node->x, depth) * min; else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min); else n = count(g, node->x, depth) * (min + 1) + 2; if (n < 0 || n > REG_MAXPROG) die(g, "program too large"); return n; case P_PAR: return count(g, node->x, depth) + 2; case P_PLA: return count(g, node->x, depth) + 2; case P_NLA: return count(g, node->x, depth) + 2; } } static Reinst *emit(Reprog *prog, int opcode) { Reinst *inst = prog->end++; inst->opcode = opcode; inst->n = 0; inst->c = 0; inst->cc = NULL; inst->x = inst->y = NULL; return inst; } static void compile(Reprog *prog, Renode *node) { Reinst *inst, *split, *jump; int i; loop: if (!node) return; switch (node->type) { case P_CAT: compile(prog, node->x); node = node->y; goto loop; case P_ALT: split = emit(prog, I_SPLIT); compile(prog, node->x); jump = emit(prog, I_JUMP); compile(prog, node->y); split->x = split + 1; split->y = jump + 1; jump->x = prog->end; break; case P_REP: inst = NULL; /* silence compiler warning. assert(node->m > 0). */ for (i = 0; i < node->m; ++i) { inst = prog->end; compile(prog, node->x); } if (node->m == node->n) break; if (node->n < REPINF) { for (i = node->m; i < node->n; ++i) { split = emit(prog, I_SPLIT); compile(prog, node->x); if (node->ng) { split->y = split + 1; split->x = prog->end; } else { split->x = split + 1; split->y = prog->end; } } } else if (node->m == 0) { split = emit(prog, I_SPLIT); compile(prog, node->x); jump = emit(prog, I_JUMP); if (node->ng) { split->y = split + 1; split->x = prog->end; } else { split->x = split + 1; split->y = prog->end; } jump->x = split; } else { split = emit(prog, I_SPLIT); if (node->ng) { split->y = inst; split->x = prog->end; } else { split->x = inst; split->y = prog->end; } } break; case P_BOL: emit(prog, I_BOL); break; case P_EOL: emit(prog, I_EOL); break; case P_WORD: emit(prog, I_WORD); break; case P_NWORD: emit(prog, I_NWORD); break; case P_PAR: inst = emit(prog, I_LPAR); inst->n = node->n; compile(prog, node->x); inst = emit(prog, I_RPAR); inst->n = node->n; break; case P_PLA: split = emit(prog, I_PLA); compile(prog, node->x); emit(prog, I_END); split->x = split + 1; split->y = prog->end; break; case P_NLA: split = emit(prog, I_NLA); compile(prog, node->x); emit(prog, I_END); split->x = split + 1; split->y = prog->end; break; case P_ANY: emit(prog, I_ANY); break; case P_CHAR: inst = emit(prog, I_CHAR); inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; break; case P_CCLASS: inst = emit(prog, I_CCLASS); inst->cc = prog->cclass + node->cc; break; case P_NCCLASS: inst = emit(prog, I_NCCLASS); inst->cc = prog->cclass + node->cc; break; case P_REF: inst = emit(prog, I_REF); inst->n = node->n; break; } } #ifdef TEST static void dumpnode(struct cstate *g, Renode *node) { Rune *p; Reclass *cc; if (!node) { printf("Empty"); return; } switch (node->type) { case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; case P_REP: printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n); dumpnode(g, node->x); printf(")"); break; case P_BOL: printf("Bol"); break; case P_EOL: printf("Eol"); break; case P_WORD: printf("Word"); break; case P_NWORD: printf("NotWord"); break; case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break; case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break; case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break; case P_ANY: printf("Any"); break; case P_CHAR: printf("Char(%c)", node->c); break; case P_CCLASS: printf("Class("); cc = g->cclass + node->cc; for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); printf(")"); break; case P_NCCLASS: printf("NotClass("); cc = g->cclass + node->cc; for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); printf(")"); break; case P_REF: printf("Ref(%d)", node->n); break; } } static void dumpcclass(Reclass *cc) { Rune *p; for (p = cc->spans; p < cc->end; p += 2) { if (p[0] > 32 && p[0] < 127) printf(" %c", p[0]); else printf(" \\x%02x", p[0]); if (p[1] > 32 && p[1] < 127) printf("-%c", p[1]); else printf("-\\x%02x", p[1]); } putchar('\n'); } static void dumpprog(Reprog *prog) { Reinst *inst; int i; for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { printf("% 5d: ", i); switch (inst->opcode) { case I_END: puts("end"); break; case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break; case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; case I_ANY: puts("any"); break; case I_ANYNL: puts("anynl"); break; case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break; case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break; case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break; case I_REF: printf("ref %d\n", inst->n); break; case I_BOL: puts("bol"); break; case I_EOL: puts("eol"); break; case I_WORD: puts("word"); break; case I_NWORD: puts("nword"); break; case I_LPAR: printf("lpar %d\n", inst->n); break; case I_RPAR: printf("rpar %d\n", inst->n); break; } } } #endif Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx, const char *pattern, int cflags, const char **errorp) { struct cstate g; Renode *node; Reinst *split, *jump; int i, n; g.pstart = NULL; g.prog = NULL; if (setjmp(g.kaboom)) { if (errorp) *errorp = g.error; alloc(ctx, g.pstart, 0); if (g.prog) { alloc(ctx, g.prog->cclass, 0); alloc(ctx, g.prog->start, 0); alloc(ctx, g.prog, 0); } return NULL; } g.prog = alloc(ctx, NULL, sizeof (Reprog)); if (!g.prog) die(&g, "cannot allocate regular expression"); g.prog->start = NULL; g.prog->cclass = NULL; n = strlen(pattern) * 2; if (n > REG_MAXPROG) die(&g, "program too large"); if (n > 0) { g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n); if (!g.pstart) die(&g, "cannot allocate regular expression parse list"); } g.source = pattern; g.ncclass = 0; g.nsub = 1; for (i = 0; i < REG_MAXSUB; ++i) g.sub[i] = 0; g.prog->flags = cflags; next(&g); node = parsealt(&g); if (g.lookahead == ')') die(&g, "unmatched ')'"); if (g.lookahead != EOF) die(&g, "syntax error"); #ifdef TEST dumpnode(&g, node); putchar('\n'); #endif n = 6 + count(&g, node, 0); if (n < 0 || n > REG_MAXPROG) die(&g, "program too large"); g.prog->nsub = g.nsub; g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst)); if (!g.prog->start) die(&g, "cannot allocate regular expression instruction list"); if (g.ncclass > 0) { g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass)); if (!g.prog->cclass) die(&g, "cannot allocate regular expression character class list"); memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass)); for (i = 0; i < g.ncclass; ++i) g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans); } split = emit(g.prog, I_SPLIT); split->x = split + 3; split->y = split + 1; emit(g.prog, I_ANYNL); jump = emit(g.prog, I_JUMP); jump->x = split; emit(g.prog, I_LPAR); compile(g.prog, node); emit(g.prog, I_RPAR); emit(g.prog, I_END); #ifdef TEST dumpprog(g.prog); #endif alloc(ctx, g.pstart, 0); if (errorp) *errorp = NULL; return g.prog; } void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog) { if (prog) { if (prog->cclass) alloc(ctx, prog->cclass, 0); alloc(ctx, prog->start, 0); alloc(ctx, prog, 0); } } static void *default_alloc(void *ctx, void *p, int n) { if (n == 0) { free(p); return NULL; } return realloc(p, (size_t)n); } Reprog *regcomp(const char *pattern, int cflags, const char **errorp) { return regcompx(default_alloc, NULL, pattern, cflags, errorp); } void regfree(Reprog *prog) { regfreex(default_alloc, NULL, prog); } /* Match */ static int isnewline(int c) { return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; } static int iswordchar(int c) { return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9'); } static int incclass(Reclass *cc, Rune c) { Rune *p; for (p = cc->spans; p < cc->end; p += 2) if (p[0] <= c && c <= p[1]) return 1; return 0; } static int incclasscanon(Reclass *cc, Rune c) { Rune *p, r; for (p = cc->spans; p < cc->end; p += 2) for (r = p[0]; r <= p[1]; ++r) if (c == canon(r)) return 1; return 0; } static int strncmpcanon(const char *a, const char *b, int n) { Rune ra, rb; int c; while (n--) { if (!*a) return -1; if (!*b) return 1; a += chartorune(&ra, a); b += chartorune(&rb, b); c = canon(ra) - canon(rb); if (c) return c; } return 0; } static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth) { Resub scratch; int result; int i; Rune c; /* stack overflow */ if (depth > REG_MAXREC) return -1; for (;;) { switch (pc->opcode) { case I_END: return 0; case I_JUMP: pc = pc->x; break; case I_SPLIT: scratch = *out; result = match(pc->x, sp, bol, flags, &scratch, depth+1); if (result == -1) return -1; if (result == 0) { *out = scratch; return 0; } pc = pc->y; break; case I_PLA: result = match(pc->x, sp, bol, flags, out, depth+1); if (result == -1) return -1; if (result == 1) return 1; pc = pc->y; break; case I_NLA: scratch = *out; result = match(pc->x, sp, bol, flags, &scratch, depth+1); if (result == -1) return -1; if (result == 0) return 1; pc = pc->y; break; case I_ANYNL: if (!*sp) return 1; sp += chartorune(&c, sp); pc = pc + 1; break; case I_ANY: if (!*sp) return 1; sp += chartorune(&c, sp); if (isnewline(c)) return 1; pc = pc + 1; break; case I_CHAR: if (!*sp) return 1; sp += chartorune(&c, sp); if (flags & REG_ICASE) c = canon(c); if (c != pc->c) return 1; pc = pc + 1; break; case I_CCLASS: if (!*sp) return 1; sp += chartorune(&c, sp); if (flags & REG_ICASE) { if (!incclasscanon(pc->cc, canon(c))) return 1; } else { if (!incclass(pc->cc, c)) return 1; } pc = pc + 1; break; case I_NCCLASS: if (!*sp) return 1; sp += chartorune(&c, sp); if (flags & REG_ICASE) { if (incclasscanon(pc->cc, canon(c))) return 1; } else { if (incclass(pc->cc, c)) return 1; } pc = pc + 1; break; case I_REF: i = out->sub[pc->n].ep - out->sub[pc->n].sp; if (flags & REG_ICASE) { if (strncmpcanon(sp, out->sub[pc->n].sp, i)) return 1; } else { if (strncmp(sp, out->sub[pc->n].sp, i)) return 1; } if (i > 0) sp += i; pc = pc + 1; break; case I_BOL: if (sp == bol && !(flags & REG_NOTBOL)) { pc = pc + 1; break; } if (flags & REG_NEWLINE) { if (sp > bol && isnewline(sp[-1])) { pc = pc + 1; break; } } return 1; case I_EOL: if (*sp == 0) { pc = pc + 1; break; } if (flags & REG_NEWLINE) { if (isnewline(*sp)) { pc = pc + 1; break; } } return 1; case I_WORD: i = sp > bol && iswordchar(sp[-1]); i ^= iswordchar(sp[0]); if (!i) return 1; pc = pc + 1; break; case I_NWORD: i = sp > bol && iswordchar(sp[-1]); i ^= iswordchar(sp[0]); if (i) return 1; pc = pc + 1; break; case I_LPAR: out->sub[pc->n].sp = sp; pc = pc + 1; break; case I_RPAR: out->sub[pc->n].ep = sp; pc = pc + 1; break; default: return 1; } } } int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) { Resub scratch; int i; if (!sub) sub = &scratch; sub->nsub = prog->nsub; for (i = 0; i < REG_MAXSUB; ++i) sub->sub[i].sp = sub->sub[i].ep = NULL; return match(prog->start, sp, sp, prog->flags | eflags, sub, 0); } #ifdef TEST int main(int argc, char **argv) { const char *error; const char *s; Reprog *p; Resub m; int i; if (argc > 1) { p = regcomp(argv[1], 0, &error); if (!p) { fprintf(stderr, "regcomp: %s\n", error); return 1; } if (argc > 2) { s = argv[2]; printf("nsub = %d\n", p->nsub); if (!regexec(p, s, &m, 0)) { for (i = 0; i < m.nsub; ++i) { int n = m.sub[i].ep - m.sub[i].sp; if (n > 0) printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); else printf("match %d: n=0 ''\n", i); } } else { printf("no match\n"); } } } return 0; } #endif