ref: 64136bfd1670e1b3bae08684c6c4bb9dfc710043
dir: /sys/src/cmd/dict/dict.c/
#include <u.h> #include <libc.h> #include <bio.h> #include <regexp.h> #include <ctype.h> #include "dict.h" /* * Assumed index file structure: lines of form * [^\t]+\t[0-9]+ * First field is key, second is byte offset into dictionary. * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2 */ typedef struct Addr Addr; struct Addr { int n; /* number of offsets */ int cur; /* current position within doff array */ int maxn; /* actual current size of doff array */ ulong doff[1]; /* doff[maxn], with 0..n-1 significant */ }; Biobuf binbuf; Biobuf boutbuf; Biobuf *bin = &binbuf; /* user cmd input */ Biobuf *bout = &boutbuf; /* output */ Biobuf *bdict; /* dictionary */ Biobuf *bindex; /* index file */ long indextop; /* index offset at end of file */ int lastcmd; /* last executed command */ Addr *dot; /* "current" address */ Dict *dict; /* current dictionary */ int linelen; int breaklen = 60; int outinhibit; int debug; void execcmd(int); int getpref(char*, Rune*); Entry getentry(int); int getfield(Rune*); long locate(Rune*); int parseaddr(char*, char**); int parsecmd(char*); int search(char*, int); long seeknextline(Biobuf*, long); void setdotnext(void); void setdotprev(void); void sortaddr(Addr*); void usage(void); enum { Plen=300, /* max length of a search pattern */ Fieldlen=200, /* max length of an index field */ Aslots=10, /* initial number of slots in an address */ }; void main(int argc, char **argv) { int i, cmd, kflag; char *line, *p; Binit(&binbuf, 0, OREAD); Binit(&boutbuf, 1, OWRITE); kflag = 0; line = 0; dict = 0; for(i=0; dicts[i].name; i++){ if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){ dict = &dicts[i]; break; } } ARGBEGIN { case 'd': p = ARGF(); dict = 0; if(p) { for(i=0; dicts[i].name; i++) if(strcmp(p, dicts[i].name)==0) { dict = &dicts[i]; break; } } if(!dict) usage(); break; case 'c': line = ARGF(); if(!line) usage(); break; case 'k': kflag++; break; case 'D': debug++; break; default: usage(); ARGEND } if(dict == 0){ err("no dictionaries present on this system"); exits("nodict"); } if(kflag) { (*dict->printkey)(); exits(0); } if(argc > 1) usage(); else if(argc == 1) { if(line) usage(); p = argv[0]; line = malloc(strlen(p)+5); sprint(line, "/%s/P\n", p); } bdict = Bopen(dict->path, OREAD); if(!bdict) { err("can't open dictionary %s", dict->path); exits("nodict"); } bindex = Bopen(dict->indexpath, OREAD); if(!bindex) { err("can't open index %s", dict->indexpath); exits("noindex"); } indextop = Bseek(bindex, 0L, 2); dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong)); dot->n = 0; dot->cur = 0; dot->maxn = Aslots; lastcmd = 0; if(line) { cmd = parsecmd(line); if(cmd) execcmd(cmd); } else { for(;;) { Bprint(bout, "*"); Bflush(bout); line = Brdline(bin, '\n'); linelen = 0; if(!line) break; cmd = parsecmd(line); if(cmd) { execcmd(cmd); lastcmd = cmd; } } } exits(0); } void usage(void) { int i; char *a, *b; Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0); Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n"); for(i = 0; dicts[i].name; i++){ a = b = ""; if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){ a = "["; b = "]"; } Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b); } exits("usage"); } int parsecmd(char *line) { char *e; int cmd, ans; if(parseaddr(line, &e) >= 0) line = e; else return 0; cmd = *line; ans = cmd; if(isupper(cmd)) cmd = tolower(cmd); if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' || cmd == '\n')) { err("unknown command %c", cmd); return 0; } if(cmd == '\n') switch(lastcmd) { case 0: ans = 'H'; break; case 'H': ans = 'p'; break; default : ans = lastcmd; break; } else if(line[1] != '\n' && line[1] != 0) err("extra stuff after command %c ignored", cmd); return ans; } void execcmd(int cmd) { Entry e; int cur, doall; if(isupper(cmd)) { doall = 1; cmd = tolower(cmd); cur = 0; } else { doall = 0; cur = dot->cur; } if(debug && doall && cmd == 'a') Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1); for(;;){ if(cur >= dot->n) break; if(doall) { Bprint(bout, "%d\t", cur+1); linelen += 4 + (cur >= 10); } switch(cmd) { case 'a': Bprint(bout, "#%lud\n", dot->doff[cur]); break; case 'h': case 'p': case 'r': e = getentry(cur); (*dict->printentry)(e, cmd); break; } cur++; if(doall) { if(cmd == 'p' || cmd == 'r') { Bputc(bout, '\n'); linelen = 0; } } else break; } if(cur >= dot->n) cur = 0; dot->cur = cur; } /* * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')* * Answer goes in dot. * Return -1 if address starts, but get error. * Return 0 if no address. */ int parseaddr(char *line, char **eptr) { int delim, plen; ulong v; char *e; char pat[Plen]; if(*line == '/' || *line == '!') { /* anchored regular expression match; '!' means no folding */ if(*line == '/') { delim = '/'; e = strpbrk(line+1, "/\n"); } else { delim = '!'; e = strpbrk(line+1, "!\n"); } plen = e-line-1; if(plen >= Plen-3) { err("pattern too big"); return -1; } pat[0] = '^'; memcpy(pat+1, line+1, plen); pat[plen+1] = '$'; pat[plen+2] = 0; if(*e == '\n') line = e; else line = e+1; if(!search(pat, delim == '/')) { err("pattern not found"); return -1; } } else if(*line == '#') { /* absolute byte offset into dictionary */ line++; if(!isdigit(*line)) return -1; v = strtoul(line, &e, 10); line = e; dot->doff[0] = v; dot->n = 1; dot->cur = 0; } else if(isdigit(*line)) { v = strtoul(line, &e, 10); line = e; if(v < 1 || v > dot->n) err(".%d not in range [1,%d], ignored", v, dot->n); else dot->cur = v-1; } else if(*line == '.') { line++; } else { *eptr = line; return 0; } while(*line == '+' || *line == '-') { if(*line == '+') setdotnext(); else setdotprev(); line++; } *eptr = line; return 1; } /* * Index file is sorted by folded field1. * Method: find pre, a folded prefix of r.e. pat, * and then low = offset to beginning of * line in index file where first match of prefix occurs. * Then go through index until prefix no longer matches, * adding each line that matches real pattern to dot. * Finally, sort dot offsets (uniquing). * We know pat len < Plen, and that it is surrounded by ^..$ */ int search(char *pat, int dofold) { int needre, prelen, match, n; Reprog *re; long ioff, v; Rune pre[Plen]; Rune lit[Plen]; Rune entry[Fieldlen]; char fpat[Plen]; prelen = getpref(pat+1, pre); if(pat[prelen+1] == 0 || pat[prelen+1] == '$') { runescpy(lit, pre); if(dofold) fold(lit); needre = 0; SET(re); } else { needre = 1; if(dofold) { foldre(fpat, pat); re = regcomp(fpat); } else re = regcomp(pat); } fold(pre); ioff = locate(pre); if(ioff < 0) return 0; dot->n = 0; Bseek(bindex, ioff, 0); for(;;) { if(!getfield(entry)) break; if(dofold) fold(entry); if(needre) match = rregexec(re, entry, 0, 0); else match = (acomp(lit, entry) == 0); if(match) { if(!getfield(entry)) break; v = runetol(entry); if(dot->n >= dot->maxn) { n = 2*dot->maxn; dot = realloc(dot, sizeof(Addr)+(n-1)*sizeof(long)); if(!dot) { err("out of memory"); exits("nomem"); } dot->maxn = n; } dot->doff[dot->n++] = v; } else { if(!dofold) fold(entry); if(*pre) { n = acomp(pre, entry); if(n < -1 || (!needre && n < 0)) break; } /* get to next index entry */ if(!getfield(entry)) break; } } sortaddr(dot); dot->cur = 0; return dot->n; } /* * Return offset in index file of first line whose folded * first field has pre as a prefix. -1 if none found. */ long locate(Rune *pre) { long top, bot, mid; Rune entry[Fieldlen]; if(*pre == 0) return 0; bot = 0; top = indextop; if(debug>1) fprint(2, "locate looking for prefix %S\n", pre); for(;;) { /* * Loop invariant: foldkey(bot) < pre <= foldkey(top) * and bot < top, and bot,top point at beginning of lines */ mid = (top+bot) / 2; mid = seeknextline(bindex, mid); if(debug > 1) fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n", bot, (top+bot) / 2, mid, top); if(mid == top || !getfield(entry)) break; if(debug > 1) fprint(2, "key=%S\n", entry); /* * here mid is strictly between bot and top */ fold(entry); if(acomp(pre, entry) <= 0) top = mid; else bot = mid; } /* * bot < top, but they don't necessarily point at successive lines * Use linear search from bot to find first line that pre is a * prefix of */ while((bot = seeknextline(bindex, bot)) <= top) { if(!getfield(entry)) return -1; if(debug > 1) fprint(2, "key=%S\n", entry); fold(entry); switch(acomp(pre, entry)) { case -2: return -1; case -1: case 0: return bot; case 1: case 2: continue; } } return -1; } /* * Get prefix of non re-metacharacters, runified, into pre, * and return length */ int getpref(char *pat, Rune *pre) { int n, r; char *p; p = pat; while(*p) { n = chartorune(pre, p); r = *pre; switch(r) { case L'.': case L'*': case L'+': case L'?': case L'[': case L']': case L'(': case ')': case L'|': case L'^': case L'$': *pre = 0; return p-pat; case L'\\': p += n; p += chartorune(++pre, p); pre++; break; default: p += n; pre++; } } return p-pat; } long seeknextline(Biobuf *b, long off) { long c; Bseek(b, off, 0); do { c = Bgetrune(b); } while(c>=0 && c!='\n'); return Boffset(b); } /* * Get next field out of index file (either tab- or nl- terminated) * Answer in *rp, assumed to be Fieldlen long. * Return 0 if read error first. */ int getfield(Rune *rp) { long c; int n; for(n=Fieldlen; n-- > 0; ) { if ((c = Bgetrune(bindex)) < 0) return 0; if(c == '\t' || c == '\n') { *rp = L'\0'; return 1; } *rp++ = c; } err("word too long"); return 0; } /* * A compare longs function suitable for qsort */ static int longcmp(void *av, void *bv) { long v; long *a, *b; a = av; b = bv; v = *a - *b; if(v < 0) return -1; else if(v == 0) return 0; else return 1; } void sortaddr(Addr *a) { int i, j; long v; if(a->n <= 1) return; qsort(a->doff, a->n, sizeof(long), longcmp); /* remove duplicates */ for(i=0, j=0; j < a->n; j++) { v = a->doff[j]; if(i > 0 && v == a->doff[i-1]) continue; a->doff[i++] = v; } a->n = i; } Entry getentry(int i) { long b, e, n; static Entry ans; static int anslen = 0; b = dot->doff[i]; e = (*dict->nextoff)(b+1); ans.doff = b; if(e < 0) { err("couldn't seek to entry"); ans.start = 0; ans.end = 0; } else { n = e-b; if(n+1 > anslen) { ans.start = realloc(ans.start, n+1); if(!ans.start) { err("out of memory"); exits("nomem"); } anslen = n+1; } Bseek(bdict, b, 0); n = Bread(bdict, ans.start, n); ans.end = ans.start + n; *ans.end = 0; } return ans; } void setdotnext(void) { long b; b = (*dict->nextoff)(dot->doff[dot->cur]+1); if(b < 0) { err("couldn't find a next entry"); return; } dot->doff[0] = b; dot->n = 1; dot->cur = 0; } void setdotprev(void) { int tryback; long here, last, p; if(dot->cur < 0 || dot->cur >= dot->n) return; tryback = 2000; here = dot->doff[dot->cur]; last = 0; while(last == 0) { p = here - tryback; if(p < 0) p = 0; for(;;) { p = (*dict->nextoff)(p+1); if(p < 0) return; /* shouldn't happen */ if(p >= here) break; last = p; } if(!last) { if(here - tryback < 0) { err("can't find a previous entry"); return; } tryback = 2*tryback; } } dot->doff[0] = last; dot->n = 1; dot->cur = 0; }