ref: 37b86df09ff381bcc4f60802d43e57bd9bfcac73
dir: /sys/src/cmd/awk/re.c/
/**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that the copyright notice and this permission notice and warranty disclaimer appear in supporting documentation, and that the name Lucent Technologies or any of its entities not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission. LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ #include <u.h> #include <libc.h> #include <ctype.h> #include <bio.h> #include <regexp.h> #include "awk.h" #include "y.tab.h" /* This file provides the interface between the main body of * awk and the pattern matching package. It preprocesses * patterns prior to compilation to provide awk-like semantics * to character sequences not supported by the pattern package. * The following conversions are performed: * * "()" -> "[]" * "[-" -> "[\-" * "[^-" -> "[^\-" * "-]" -> "\-]" * "[]" -> "[]*" * "\xdddd" -> "\z" where 'z' is the UTF sequence * for the hex value * "\ddd" -> "\o" where 'o' is a char octal value * "\b" -> "\B" where 'B' is backspace * "\t" -> "\T" where 'T' is tab * "\f" -> "\F" where 'F' is form feed * "\n" -> "\N" where 'N' is newline * "\r" -> "\r" where 'C' is cr */ #define MAXRE 512 static char re[MAXRE]; /* copy buffer */ char *patbeg; int patlen; /* number of chars in pattern */ #define NPATS 20 /* number of slots in pattern cache */ static struct pat_list /* dynamic pattern cache */ { char *re; int use; Reprog *program; } pattern[NPATS]; static int npats; /* cache fill level */ /* Compile a pattern */ void *compre(char *pat) { int i, j, inclass; char c, *p, *s; Reprog *program; if (!compile_time) { /* search cache for dynamic pattern */ for (i = 0; i < npats; i++) if (!strcmp(pat, pattern[i].re)) { pattern[i].use++; return((void *) pattern[i].program); } } /* Preprocess Pattern for compilation */ p = re; s = pat; inclass = 0; while (c = *s++) { if (c == '\\') { quoted(&s, &p, re+MAXRE); continue; } else if (!inclass && c == '(' && *s == ')') { if (p < re+MAXRE-2) { /* '()' -> '[]*' */ *p++ = '['; *p++ = ']'; c = '*'; s++; } else overflow(); } else if (c == '['){ /* '[-' -> '[\-' */ inclass = 1; if (*s == '-') { if (p < re+MAXRE-2) { *p++ = '['; *p++ = '\\'; c = *s++; } else overflow(); } /* '[^-' -> '[^\-'*/ else if (*s == '^' && s[1] == '-'){ if (p < re+MAXRE-3) { *p++ = '['; *p++ = *s++; *p++ = '\\'; c = *s++; } else overflow(); } else if (*s == '['){ /* skip '[[' */ if (p < re+MAXRE-1) *p++ = c; else overflow(); c = *s++; } else if (*s == '^' && s[1] == '[') { /* skip '[^['*/ if (p < re+MAXRE-2) { *p++ = c; *p++ = *s++; c = *s++; } else overflow(); } else if (*s == ']') { /* '[]' -> '[]*' */ if (p < re+MAXRE-2) { *p++ = c; *p++ = *s++; c = '*'; inclass = 0; } else overflow(); } } else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */ if (p < re+MAXRE-1) *p++ = '\\'; else overflow(); } else if (c == ']') inclass = 0; if (p < re+MAXRE-1) *p++ = c; else overflow(); } *p = 0; program = regcomp(re); /* compile pattern */ if (!compile_time) { if (npats < NPATS) /* Room in cache */ i = npats++; else { /* Throw out least used */ int use = pattern[0].use; i = 0; for (j = 1; j < NPATS; j++) { if (pattern[j].use < use) { use = pattern[j].use; i = j; } } xfree(pattern[i].program); xfree(pattern[i].re); } pattern[i].re = tostring(pat); pattern[i].program = program; pattern[i].use = 1; } return((void *) program); } /* T/F match indication - matched string not exported */ int match(void *p, char *s, char *) { return regexec((Reprog *) p, (char *) s, 0, 0); } /* match and delimit the matched string */ int pmatch(void *p, char *s, char *start) { Resub m; m.sp = start; m.ep = 0; if (regexec((Reprog *) p, (char *) s, &m, 1)) { patbeg = m.sp; patlen = m.ep-m.sp; return 1; } patlen = -1; patbeg = start; return 0; } /* perform a non-empty match */ int nematch(void *p, char *s, char *start) { if (pmatch(p, s, start) == 1 && patlen > 0) return 1; patlen = -1; patbeg = start; return 0; } /* in the parsing of regular expressions, metacharacters like . have */ /* to be seen literally; \056 is not a metacharacter. */ hexstr(char **pp) /* find and eval hex string at pp, return new p */ { char c; int n = 0; int i; for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) { if (isdigit(c)) n = 16 * n + c - '0'; else if ('a' <= c && c <= 'f') n = 16 * n + c - 'a' + 10; else if ('A' <= c && c <= 'F') n = 16 * n + c - 'A' + 10; } *pp += i; return n; } /* look for awk-specific escape sequences */ #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ void quoted(char **s, char **to, char *end) /* handle escaped sequence */ { char *p = *s; char *t = *to; Rune c; switch(c = *p++) { case 't': c = '\t'; break; case 'n': c = '\n'; break; case 'f': c = '\f'; break; case 'r': c = '\r'; break; case 'b': c = '\b'; break; default: if (t < end-1) /* all else must be escaped */ *t++ = '\\'; if (c == 'x') { /* hexadecimal goo follows */ c = hexstr(&p); if (t < end-UTFmax) t += runelen(c); else overflow(); *to = t; *s = p; return; } else if (isoctdigit(c)) { /* \d \dd \ddd */ c -= '0'; if (isoctdigit(*p)) { c = 8 * c + *p++ - '0'; if (isoctdigit(*p)) c = 8 * c + *p++ - '0'; } } break; } if (t < end-1) *t++ = c; *s = p; *to = t; } /* pattern package error handler */ void regerror(char *s) { FATAL("%s", s); } void overflow(void) { FATAL("%s", "regular expression too big"); }