ref: 1bfde841484fadb1c41f41a6279e7e070f61a31b
dir: /sys/src/libregexp/regcomp.c/
#include <u.h> #include <libc.h> #include <regexp.h> #include "regimpl.h" int instrcnt[] = { [TANY] 2, [TBOL] 1, [TCAT] 0, [TCLASS] 1, [TEOL] 1, [TNOTNL] 1, [TOR] 2, [TPLUS] 1, [TQUES] 1, [TRUNE] 1, [TSTAR] 2, [TSUB] 2 }; static Renode* node(Parselex *plex, int op, Renode *l, Renode *r) { Renode *n; plex->instrs += instrcnt[op]; n = plex->next++; n->op = op; n->left = l; n->right = r; return n; } static Renode* e3(Parselex *plex) { char error[128]; Renode *n; switch(lex(plex)) { case LANY: return node(plex, TANY, nil, nil); case LBOL: return node(plex, TBOL, nil, nil); case LEOL: return node(plex, TEOL, nil, nil); case LRUNE: n = node(plex, TRUNE, nil, nil); n->r = plex->rune; return n; case LCLASS: if(plex->nc) return buildclassn(plex); return buildclass(plex); case LLPAR: n = e0(plex); n = node(plex, TSUB, n, nil); if(lex(plex) != LRPAR) { snprint(error, sizeof(error), "regexp %s: no matching parenthesis", plex->orig); regerror(error); longjmp(plex->exitenv, 1); } return n; default: if(plex->rune) snprint(error, sizeof(error), "regexp %s: syntax error: %C", plex->orig, plex->rune); else snprint(error, sizeof(error), "regexp %s: parsing error", plex->orig); regerror(error); longjmp(plex->exitenv, 1); } return nil; } static Renode* e2(Parselex *plex) { Renode *n; n = e3(plex); while(lex(plex) == LREP) { switch(plex->rune) { case L'*': n = node(plex, TSTAR, n, nil); break; case L'+': n = node(plex, TPLUS, n, nil); break; case L'?': n = node(plex, TQUES, n, nil); break; } } plex->peek = 1; return n; } static Renode* invert(Renode *n) { Renode *n1; if(n->op != TCAT) return n; while(n->left->op == TCAT) { n1 = n->left; n->left = n1->right; n1->right = n; n = n1; } return n; } static Renode* e1(Parselex *plex) { Renode *n; int sym; n = e2(plex); for(;;) { sym = lex(plex); if(sym == LEND || sym == LOR || sym == LRPAR) break; plex->peek = 1; n = node(plex, TCAT, n, e2(plex)); } plex->peek = 1; return invert(n); } static Renode* e0(Parselex *plex) { Renode *n; n = e1(plex); for(;;) { if(lex(plex) != LOR) break; n = node(plex, TOR, n, e1(plex)); } plex->peek = 1; return n; } static Parselex* initplex(Parselex *plex, char *regstr, int lit) { plex->getnextr = lit ? getnextrlit : getnextr; plex->rawexp = plex->orig = regstr; plex->sub = 0; plex->instrs = 0; plex->peek = 0; plex->done = 0; return plex; } static Reprog* regcomp1(char *regstr, int nl, int lit) { Reprog *reprog; Parselex plex; Renode *parsetr; int regstrlen, maxthr; regstrlen = utflen(regstr); initplex(&plex, regstr, lit); plex.nodes = calloc(sizeof(*plex.nodes), regstrlen*2); if(plex.nodes == nil) return nil; plex.next = plex.nodes; if(setjmp(plex.exitenv) != 0) { free(plex.nodes); return nil; } maxthr = regstrlen + 1; parsetr = node(&plex, TSUB, e0(&plex), nil); // prtree(parsetr, 0, 1); reprog = malloc(sizeof(Reprog) + sizeof(Reinst) * plex.instrs + sizeof(Rethread) * maxthr); reprog->len = plex.instrs; reprog->nthr = maxthr; reprog->startinst = compile(parsetr, reprog, nl); reprog->threads = (Rethread*)(reprog->startinst + reprog->len); reprog->regstr = regstr; free(plex.nodes); return reprog; } Reprog* regcomp(char *str) { return regcomp1(str, 0, 0); } Reprog* regcomplit(char *str) { return regcomp1(str, 0, 1); } Reprog* regcompnl(char *str) { return regcomp1(str, 1, 0); } static Reinst* compile1(Renode *renode, Reinst *reinst, int *sub, int nl) { Reinst *i; int s; Tailcall: if(renode == nil) return reinst; switch(renode->op) { case TCLASS: reinst->op = OCLASS; reinst->r = renode->r; reinst->r1 = renode->r1; reinst->a = reinst + 1 + renode->nclass; renode = renode->left; reinst++; goto Tailcall; case TCAT: reinst = compile1(renode->left, reinst, sub, nl); renode = renode->right; goto Tailcall; case TOR: reinst->op = OSPLIT; reinst->a = reinst + 1; i = compile1(renode->left, reinst->a, sub, nl); reinst->b = i + 1; i->op = OJMP; i->a = compile1(renode->right, reinst->b, sub, nl); return i->a; case TSTAR: reinst->op = OSPLIT; reinst->a = reinst + 1; i = compile1(renode->left, reinst->a, sub, nl); reinst->b = i + 1; i->op = OJMP; i->a = reinst; return reinst->b; case TPLUS: i = reinst; reinst = compile1(renode->left, reinst, sub, nl); reinst->op = OSPLIT; reinst->a = i; reinst->b = reinst + 1; return reinst->b; case TQUES: reinst->op = OSPLIT; reinst->a = reinst + 1; reinst->b = compile1(renode->left, reinst->a, sub, nl); return reinst->b; case TSUB: reinst->op = OSAVE; reinst->sub = s = (*sub)++; reinst = compile1(renode->left, reinst+1, sub, nl); reinst->op = OUNSAVE; reinst->sub = s; return reinst + 1; case TANY: if(nl == 0) reinst++->op = ONOTNL; reinst->op = OANY; return reinst + 1; case TRUNE: reinst->op = ORUNE; reinst->r = renode->r; return reinst + 1; case TNOTNL: reinst->op = ONOTNL; return reinst + 1; case TEOL: reinst->op = OEOL; return reinst + 1; case TBOL: reinst->op = OBOL; return reinst + 1; } return nil; } static Reinst* compile(Renode *parsetr, Reprog *reprog, int nl) { Reinst *reinst, *end; int sub; sub = 0; reinst = (Reinst*)(reprog+1); end = compile1(parsetr, reinst, &sub, nl); assert(end <= reinst + reprog->len); return reinst; } static void getnextr(Parselex *l) { l->literal = 0; if(l->done) { l->rune = L'\0'; return; } l->rawexp += chartorune(&l->rune, l->rawexp); if(*l->rawexp == 0) l->done = 1; if(l->rune == L'\\') getnextrlit(l); } static void getnextrlit(Parselex *l) { l->literal = 1; if(l->done) { l->literal = 0; l->rune = L'\0'; return; } l->rawexp += chartorune(&l->rune, l->rawexp); if(*l->rawexp == 0) l->done = 1; } static int lex(Parselex *l) { if(l->peek) { l->peek = 0; return l->peeklex; } l->getnextr(l); if(l->literal) return l->peeklex = LRUNE; switch(l->rune){ case L'\0': return l->peeklex = LEND; case L'*': case L'?': case L'+': return l->peeklex = LREP; case L'|': return l->peeklex = LOR; case L'.': return l->peeklex = LANY; case L'(': return l->peeklex = LLPAR; case L')': return l->peeklex = LRPAR; case L'^': return l->peeklex = LBOL; case L'$': return l->peeklex = LEOL; case L'[': getclass(l); return l->peeklex = LCLASS; } return l->peeklex = LRUNE; } static int pcmp(void *va, void *vb) { Rune *a, *b; a = va; b = vb; if(a[0] < b[0]) return 1; if(a[0] > b[0]) return -1; if(a[1] < b[1]) return 1; if(a[1] > b[1]) return -1; return 0; } static void getclass(Parselex *l) { Rune *p, *q, t; l->nc = 0; getnextrlit(l); if(l->rune == L'^') { l->nc = 1; getnextrlit(l); } p = l->cpairs; for(;;) { *p = l->rune; if(l->rune == L']') break; if(l->rune == L'-') { regerror("Malformed class"); longjmp(l->exitenv, 1); } if(l->rune == '\\') { getnextrlit(l); *p = l->rune; } if(l->rune == 0) { regerror("No closing ] for class"); longjmp(l->exitenv, 1); } getnextrlit(l); if(l->rune == L'-') { getnextrlit(l); if(l->rune == L']') { regerror("Malformed class"); longjmp(l->exitenv, 1); } if(l->rune == L'-') { regerror("Malformed class"); longjmp(l->exitenv, 1); } if(l->rune == L'\\') getnextrlit(l); p[1] = l->rune; if(p[0] > p[1]) { t = p[0]; p[0] = p[1]; p[1] = t; } getnextrlit(l); } else p[1] = p[0]; if(p >= l->cpairs + nelem(l->cpairs) - 2) { regerror("Class too big\n"); longjmp(l->exitenv, 1); } p += 2; } *p = L'\0'; qsort(l->cpairs, (p - l->cpairs)/2, 2*sizeof(*l->cpairs), pcmp); q = l->cpairs; for(p = l->cpairs+2; *p != 0; p += 2) { if(p[1] < q[0] - 1) { q[2] = p[0]; q[3] = p[1]; q += 2; continue; } q[0] = p[0]; if(p[1] > q[1]) q[1] = p[1]; } q[2] = 0; } /* classes are in descending order see pcmp */ static Renode* buildclassn(Parselex *l) { Renode *n; Rune *p; int i; i = 0; p = l->cpairs; n = node(l, TCLASS, nil, nil); n->r = p[1] + 1; n->r1 = Runemax; n->nclass = i++; for(; *p != 0; p += 2) { n = node(l, TCLASS, n, nil); n->r = p[3] + 1; n->r1 = p[0] - 1; n->nclass = i++; } n->r = 0; return node(l, TCAT, node(l, TNOTNL, nil, nil), n); } static Renode* buildclass(Parselex *l) { Renode *n; Rune *p; int i; i = 0; n = node(l, TCLASS, nil, nil); n->r = Runemax + 1; n->nclass = i++; for(p = l->cpairs; *p != 0; p += 2) { n = node(l, TCLASS, n, nil); n->r = p[0]; n->r1 = p[1]; n->nclass = i++; } return n; } static void prtree(Renode *tree, int d, int f) { int i; if(tree == nil) return; if(f) for(i = 0; i < d; i++) print("\t"); switch(tree->op) { case TCAT: prtree(tree->left, d, 0); prtree(tree->right, d, 1); break; case TOR: print("TOR\n"); prtree(tree->left, d+1, 1); for(i = 0; i < d; i++) print("\t"); print("|\n"); prtree(tree->right, d+1, 1); break; case TSTAR: print("*\n"); prtree(tree->left, d+1, 1); break; case TPLUS: print("+\n"); prtree(tree->left, d+1, 1); break; case TQUES: print("?\n"); prtree(tree->left, d+1, 1); break; case TANY: print(".\n"); prtree(tree->left, d+1, 1); break; case TBOL: print("^\n"); break; case TEOL: print("$\n"); break; case TSUB: print("TSUB\n"); prtree(tree->left, d+1, 1); break; case TRUNE: print("TRUNE: %C\n", tree->r); break; case TNOTNL: print("TNOTNL: !\\n\n"); break; case TCLASS: print("CLASS: %C-%C\n", tree->r, tree->r1); prtree(tree->left, d, 1); break; } }