shithub: riscv

ref: f899b2fe3c091ccab33673f9fea9d1d99c0c2940
dir: /sys/src/cmd/dict/dict.c/

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <regexp.h>
#include <ctype.h>
#include "dict.h"

/*
 * Assumed index file structure: lines of form
 * 	[^\t]+\t[0-9]+
 * First field is key, second is byte offset into dictionary.
 * Should be sorted with args -u -t'	' +0f -1 +0 -1 +1n -2
 */
typedef struct Addr Addr;

struct Addr {
	int	n;		/* number of offsets */
	int	cur;		/* current position within doff array */
	int	maxn;		/* actual current size of doff array */
	ulong	doff[1];	/* doff[maxn], with 0..n-1 significant */
};

Biobuf	binbuf;
Biobuf	boutbuf;
Biobuf	*bin = &binbuf;		/* user cmd input */
Biobuf	*bout = &boutbuf;	/* output */
Biobuf	*bdict;			/* dictionary */
Biobuf	*bindex;		/* index file */
long	indextop;		/* index offset at end of file */
int	lastcmd;		/* last executed command */
Addr	*dot;			/* "current" address */
Dict	*dict;			/* current dictionary */
int	linelen;
int	breaklen = 60;
int	outinhibit;
int	debug;

void	execcmd(int);
int	getpref(char*, Rune*);
Entry	getentry(int);
int	getfield(Rune*);
long	locate(Rune*);
int	parseaddr(char*, char**);
int	parsecmd(char*);
int	search(char*, int);
long	seeknextline(Biobuf*, long);
void	setdotnext(void);
void	setdotprev(void);
void	sortaddr(Addr*);
void	usage(void);

enum {
	Plen=300,	/* max length of a search pattern */
	Fieldlen=200,	/* max length of an index field */
	Aslots=10,	/* initial number of slots in an address */
};

void
main(int argc, char **argv)
{
	int i, cmd, kflag;
	char *line, *p;

	Binit(&binbuf, 0, OREAD);
	Binit(&boutbuf, 1, OWRITE);
	kflag = 0;
	line = 0;
	dict = 0;
	for(i=0; dicts[i].name; i++){
		if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
			dict = &dicts[i];
			break;
		}
	}
	ARGBEGIN {
		case 'd':
			p = ARGF();
			dict = 0;
			if(p) {
				for(i=0; dicts[i].name; i++)
					if(strcmp(p, dicts[i].name)==0) {
						dict = &dicts[i];
						break;
					}
			}
			if(!dict)
				usage();
			break;
		case 'c':
			line = ARGF();
			if(!line)
				usage();
			break;
		case 'k':
			kflag++;
			break;
		case 'D':
			debug++;
			break;
		default:
			usage();
	ARGEND }
	if(dict == 0){
		err("no dictionaries present on this system");
		exits("nodict");
	}

	if(kflag) {
		(*dict->printkey)();
		exits(0);
	}
	if(argc > 1)
		usage();
	else if(argc == 1) {
		if(line)
			usage();
		p = argv[0];
		line = malloc(strlen(p)+5);
		sprint(line, "/%s/P\n", p);
	}
	bdict = Bopen(dict->path, OREAD);
	if(!bdict) {
		err("can't open dictionary %s", dict->path);
		exits("nodict");
	}
	bindex = Bopen(dict->indexpath, OREAD);
	if(!bindex) {
		err("can't open index %s", dict->indexpath);
		exits("noindex");
	}
	indextop = Bseek(bindex, 0L, 2);

	dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
	dot->n = 0;
	dot->cur = 0;
	dot->maxn = Aslots;
	lastcmd = 0;

	if(line) {
		cmd = parsecmd(line);
		if(cmd)
			execcmd(cmd);
	} else {
		for(;;) {
			Bprint(bout, "*");
			Bflush(bout);
			line = Brdline(bin, '\n');
			linelen = 0;
			if(!line)
				break;
			cmd = parsecmd(line);
			if(cmd) {
				execcmd(cmd);
				lastcmd = cmd;
			}
		}
	}
	exits(0);
}

void
usage(void)
{
	int i;
	char *a, *b;

	Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
	Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
	for(i = 0; dicts[i].name; i++){
		a = b = "";
		if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
			a = "[";
			b = "]";
		}
		Bprint(bout, "   %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
	}
	exits("usage");
}

int
parsecmd(char *line)
{
	char *e;
	int cmd, ans;

	if(parseaddr(line, &e) >= 0)
		line = e;
	else
		return 0;
	cmd = *line;
	ans = cmd;
	if(isupper(cmd))
		cmd = tolower(cmd);
	if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
	     cmd == '\n')) {
		err("unknown command %c", cmd);
		return 0;
	}
	if(cmd == '\n')
		switch(lastcmd) {
			case 0:	ans = 'H'; break;
			case 'H':	ans = 'p'; break;
			default :	ans = lastcmd; break;
		}
	else if(line[1] != '\n' && line[1] != 0)
		err("extra stuff after command %c ignored", cmd);
	return ans;
}

void
execcmd(int cmd)
{
	Entry e;
	int cur, doall;

	if(isupper(cmd)) {
		doall = 1;
		cmd = tolower(cmd);
		cur = 0;
	} else {
		doall = 0;
		cur = dot->cur;
	}

	if(debug && doall && cmd == 'a')
		Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
	for(;;){
		if(cur >= dot->n)
			break;
		if(doall) {
			Bprint(bout, "%d\t", cur+1);
			linelen += 4 + (cur >= 10);
		}
		switch(cmd) {
		case 'a':
			Bprint(bout, "#%lud\n", dot->doff[cur]);
			break;
		case 'h':
		case 'p':
		case 'r':
			e = getentry(cur);
			(*dict->printentry)(e, cmd);
			break;
		}
		cur++;
		if(doall) {
			if(cmd == 'p' || cmd == 'r') {
				Bputc(bout, '\n');
				linelen = 0;
			}
		} else
			break;
	}
	if(cur >= dot->n)
		cur = 0;
	dot->cur = cur;
}

/*
 * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
 * Answer goes in dot.
 * Return -1 if address starts, but get error.
 * Return 0 if no address.
 */
int
parseaddr(char *line, char **eptr)
{
	int delim, plen;
	ulong v;
	char *e;
	char pat[Plen];

	if(*line == '/' || *line == '!') {
		/* anchored regular expression match; '!' means no folding */
		if(*line == '/') {
			delim = '/';
			e = strpbrk(line+1, "/\n");
		} else {
			delim = '!';
			e = strpbrk(line+1, "!\n");
		}
		plen = e-line-1;
		if(plen >= Plen-3) {
			err("pattern too big");
			return -1;
		}
		pat[0] = '^';
		memcpy(pat+1, line+1, plen);
		pat[plen+1] = '$';
		pat[plen+2] = 0;
		if(*e == '\n')
			line = e;
		else
			line = e+1;
		if(!search(pat, delim == '/')) {
			err("pattern not found");
			return -1;
		}
	} else if(*line == '#') {
		/* absolute byte offset into dictionary */
		line++;
		if(!isdigit(*line))
			return -1;
		v = strtoul(line, &e, 10);
		line = e;
		dot->doff[0] = v;
		dot->n = 1;
		dot->cur = 0;
	} else if(isdigit(*line)) {
		v = strtoul(line, &e, 10);
		line = e;
		if(v < 1 || v > dot->n)
			err(".%d not in range [1,%d], ignored",
				v, dot->n);
		else
			dot->cur = v-1;
	} else if(*line == '.') {
		line++;
	} else {
		*eptr = line;
		return 0;
	}
	while(*line == '+' || *line == '-') {
		if(*line == '+')
			setdotnext();
		else
			setdotprev();
		line++;
	}
	*eptr = line;
	return 1;
}

/*
 * Index file is sorted by folded field1.
 * Method: find pre, a folded prefix of r.e. pat,
 * and then low = offset to beginning of
 * line in index file where first match of prefix occurs.
 * Then go through index until prefix no longer matches,
 * adding each line that matches real pattern to dot.
 * Finally, sort dot offsets (uniquing).
 * We know pat len < Plen, and that it is surrounded by ^..$
 */
int
search(char *pat, int dofold)
{
	int needre, prelen, match, n;
	Reprog *re;
	long ioff, v;
	Rune pre[Plen];
	Rune lit[Plen];
	Rune entry[Fieldlen];
	char fpat[Plen];

	prelen = getpref(pat+1, pre);
	if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
		runescpy(lit, pre);
		if(dofold)
			fold(lit);
		needre = 0;
		SET(re);
	} else {
		needre = 1;
		if(dofold) {
			foldre(fpat, pat);
			re = regcomp(fpat);
		} else
			re = regcomp(pat);
	}
	fold(pre);
	ioff = locate(pre);
	if(ioff < 0)
		return 0;
	dot->n = 0;
	Bseek(bindex, ioff, 0);
	for(;;) {
		if(!getfield(entry))
			break;
		if(dofold)
			fold(entry);
		if(needre)
			match = rregexec(re, entry, 0, 0);
		else
			match = (acomp(lit, entry) == 0);
		if(match) {
			if(!getfield(entry))
				break;
			v = runetol(entry);
			if(dot->n >= dot->maxn) {
				n = 2*dot->maxn;
				dot = realloc(dot,
					sizeof(Addr)+(n-1)*sizeof(long));
				if(!dot) {
					err("out of memory");
					exits("nomem");
				}
				dot->maxn = n;
			}
			dot->doff[dot->n++] = v;
		} else {
			if(!dofold)
				fold(entry);
			if(*pre) {
				n = acomp(pre, entry);
				if(n < -1 || (!needre && n < 0))
					break;
			}
			/* get to next index entry */
			if(!getfield(entry))
				break;
		}
	}
	sortaddr(dot);
	dot->cur = 0;
	return dot->n;
}

/*
 * Return offset in index file of first line whose folded
 * first field has pre as a prefix.  -1 if none found.
 */
long
locate(Rune *pre)
{
	long top, bot, mid;
	Rune entry[Fieldlen];

	if(*pre == 0)
		return 0;
	bot = 0;
	top = indextop;
	if(debug>1)
		fprint(2, "locate looking for prefix %S\n", pre);
	for(;;) {
		/*
		 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
		 * and bot < top, and bot,top point at beginning of lines
		 */
		mid = (top+bot) / 2;
		mid = seeknextline(bindex, mid);
		if(debug > 1)
			fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
				bot, (top+bot) / 2, mid, top);
		if(mid == top || !getfield(entry))
			break;
		if(debug > 1)
			fprint(2, "key=%S\n", entry);
		/*
		 * here mid is strictly between bot and top
		 */
		fold(entry);
		if(acomp(pre, entry) <= 0)
			top = mid;
		else
			bot = mid;
	}
	/*
	 * bot < top, but they don't necessarily point at successive lines
	 * Use linear search from bot to find first line that pre is a
	 * prefix of
	 */
	while((bot = seeknextline(bindex, bot)) <= top) {
		if(!getfield(entry))
			return -1;
		if(debug > 1)
			fprint(2, "key=%S\n", entry);
		fold(entry);
		switch(acomp(pre, entry)) {
		case -2:
			return -1;
		case -1:
		case 0:
			return bot;
		case 1:
		case 2:
			continue;
		}
	}
	return -1;

}

/*
 * Get prefix of non re-metacharacters, runified, into pre,
 * and return length
 */
int
getpref(char *pat, Rune *pre)
{
	int n, r;
	char *p;

	p = pat;
	while(*p) {
		n = chartorune(pre, p);
		r = *pre;
		switch(r) {
		case L'.': case L'*': case L'+': case L'?':
		case L'[': case L']': case L'(': case ')':
		case L'|': case L'^': case L'$':
			*pre = 0;
			return p-pat;
		case L'\\':
			p += n;
			p += chartorune(++pre, p);
			pre++;
			break;
		default:
			p += n;
			pre++;
		}
	}
	return p-pat;
}

long
seeknextline(Biobuf *b, long off)
{
	long c;

	Bseek(b, off, 0);
	do {
		c = Bgetrune(b);
	} while(c>=0 && c!='\n');
	return Boffset(b);
}

/*
 * Get next field out of index file (either tab- or nl- terminated)
 * Answer in *rp, assumed to be Fieldlen long.
 * Return 0 if read error first.
 */
int
getfield(Rune *rp)
{
	long c;
	int n;

	for(n=Fieldlen; n-- > 0; ) {
		if ((c = Bgetrune(bindex)) < 0)
			return 0;
		if(c == '\t' || c == '\n') {
			*rp = L'\0';
			return 1;
		}
		*rp++ = c;
	}
	err("word too long");
	return 0;
}

/*
 * A compare longs function suitable for qsort
 */
static int
longcmp(void *av, void *bv)
{
	long v;
	long *a, *b;

	a = av;
	b = bv;

	v = *a - *b;
	if(v < 0)
		return -1;
	else if(v == 0)
		return 0;
	else
		return 1;
}

void
sortaddr(Addr *a)
{
	int i, j;
	long v;

	if(a->n <= 1)
		return;

	qsort(a->doff, a->n, sizeof(long), longcmp);

	/* remove duplicates */
	for(i=0, j=0; j < a->n; j++) {
		v = a->doff[j];
		if(i > 0 && v == a->doff[i-1])
			continue;
		a->doff[i++] = v;
	}
	a->n = i;
}

Entry
getentry(int i)
{
	long b, e, n;
	static Entry ans;
	static int anslen = 0;

	b = dot->doff[i];
	e = (*dict->nextoff)(b+1);
	ans.doff = b;
	if(e < 0) {
		err("couldn't seek to entry");
		ans.start = 0;
		ans.end = 0;
	} else {
		n = e-b;
		if(n+1 > anslen) {
			ans.start = realloc(ans.start, n+1);
			if(!ans.start) {
				err("out of memory");
				exits("nomem");
			}
			anslen = n+1;
		}
		Bseek(bdict, b, 0);
		n = Bread(bdict, ans.start, n);
		ans.end = ans.start + n;
		*ans.end = 0;
	}
	return ans;
}

void
setdotnext(void)
{
	long b;

	b = (*dict->nextoff)(dot->doff[dot->cur]+1);
	if(b < 0) {
		err("couldn't find a next entry");
		return;
	}
	dot->doff[0] = b;
	dot->n = 1;
	dot->cur = 0;
}

void
setdotprev(void)
{
	int tryback;
	long here, last, p;

	if(dot->cur < 0 || dot->cur >= dot->n)
		return;
	tryback = 2000;
	here = dot->doff[dot->cur];
	last = 0;
	while(last == 0) {
		p = here - tryback;
		if(p < 0)
			p = 0;
		for(;;) {
			p = (*dict->nextoff)(p+1);
			if(p < 0)
				return; /* shouldn't happen */
			if(p >= here)
				break;
			last = p;
		}
		if(!last) {
			if(here - tryback < 0) {
				err("can't find a previous entry");
				return;
			}
			tryback = 2*tryback;
		}
	}
	dot->doff[0] = last;
	dot->n = 1;
	dot->cur = 0;
}