shithub: riscv

ref: 980d64826e1d64c992313ae3f020298dd3dcfe37
dir: /sys/src/cmd/grep/comp.c/

View raw version
#include	"grep.h"

/*
 * incremental compiler.
 * add the branch c to the
 * state s.
 */
void
increment(State *s, int c)
{
	int i;
	State *t, **tt;
	Re *re1, *re2;

	nfollow = 0;
	gen++;
	matched = 0;
	for(i=0; i<s->count; i++)
		fol1(s->re[i], c);
	qsort(follow, nfollow, sizeof(*follow), fcmp);
	for(tt=&state0; t = *tt;) {
		if(t->count > nfollow) {
			tt = &t->linkleft;
			goto cont;
		}
		if(t->count < nfollow) {
			tt = &t->linkright;
			goto cont;
		}
		for(i=0; i<nfollow; i++) {
			re1 = t->re[i];
			re2 = follow[i];
			if(re1 > re2) {
				tt = &t->linkleft;
				goto cont;
			}
			if(re1 < re2) {
				tt = &t->linkright;
				goto cont;
			}
		}
		if(!!matched && !t->match) {
			tt = &t->linkleft;
			goto cont;
		}
		if(!matched && !!t->match) {
			tt = &t->linkright;
			goto cont;
		}
		s->next[c] = t;
		return;
	cont:;
	}

	t = sal(nfollow);
	*tt = t;
	for(i=0; i<nfollow; i++) {
		re1 = follow[i];
		t->re[i] = re1;
	}
	s->next[c] = t;
	t->match = matched;
}

int
fcmp(void *va, void *vb)
{
	Re **aa, **bb;
	Re *a, *b;

	aa = va;
	bb = vb;
	a = *aa;
	b = *bb;
	if(a > b)
		return 1;
	if(a < b)
		return -1;
	return 0;
}

void
fol1(Re *r, int c)
{
	Re *r1;

loop:
	if(r->gen == gen)
		return;
	if(nfollow >= maxfollow)
		error("nfollow");
	r->gen = gen;
	switch(r->type) {
	default:
		error("fol1");

	case Tcase:
		if(c >= 0 && c < 256)
		if(r1 = r->cases[c])
			follow[nfollow++] = r1;
		if(r = r->next)
			goto loop;
		break;

	case Talt:
	case Tor:
		fol1(r->alt, c);
		r = r->next;
		goto loop;

	case Tbegin:
		if(c == '\n' || c == Cbegin)
			follow[nfollow++] = r->next;
		break;

	case Tend:
		if(c == '\n') {
			if(r->next == 0) {
				matched = 1;
				break;
			}
			r = r->next;
			goto loop;
		}
		break;

	case Tclass:
		if(c >= r->lo && c <= r->hi)
			follow[nfollow++] = r->next;
		break;
	}
}

int	tab1[] =
{
	0x007f,
	0x07ff,
	0xffff,
};
int	tab2[] =
{
	0x003f,
	0x0fff,
	0x3ffff,
};

Re2
rclass(Rune p0, Rune p1)
{
	char xc0[6], xc1[6];
	int i, n, m;
	Re2 x;

	if(p0 > p1)
		return re2char(0xff, 0xff);	// no match

	/*
	 * bust range into same length
	 * character sequences
	 */
	for(i=0; i<nelem(tab1); i++) {
		m = tab1[i];
		if(p0 <= m && p1 > m)
			return re2or(rclass(p0, m), rclass(m+1, p1));
	}

	/*
	 * bust range into part of a single page
	 * or into full pages
	 */
	for(i=0; i<nelem(tab2); i++) {
		m = tab2[i];
		if((p0 & ~m) != (p1 & ~m)) {
			if((p0 & m) != 0)
				return re2or(rclass(p0, p0|m), rclass((p0|m)+1, p1));
			if((p1 & m) != m)
				return re2or(rclass(p0, (p1&~m)-1), rclass(p1&~m, p1));
		}
	}

	n = runetochar(xc0, &p0);
	i = runetochar(xc1, &p1);
	if(i != n)
		error("length");

	x = re2char(xc0[0], xc1[0]);
	for(i=1; i<n; i++)
		x = re2cat(x, re2char(xc0[i], xc1[i]));
	return x;
}

int
pcmp(void *va, void *vb)
{
	int n;
	Rune *a, *b;

	a = va;
	b = vb;

	n = a[0] - b[0];
	if(n)
		return n;
	return a[1] - b[1];
}

/*
 * convert character chass into
 * run-pair ranges of matches.
 * this is 10646/utf specific and
 * needs to be changed for some
 * other input character set.
 * this is the key to a fast
 * regular search of characters
 * by looking at sequential bytes.
 */
Re2
re2class(char *s)
{
	Rune pairs[200+2], *p, *q, ov;
	int nc;
	Re2 x;

	nc = 0;
	if(*s == '^') {
		nc = 1;
		s++;
	}

	p = pairs;
	s += chartorune(p, s);
	for(;;) {
		if(*p == '\\')
			s += chartorune(p, s);
		if(*p == 0)
			break;
		p[1] = *p;
		p += 2;
		if(p >= pairs + nelem(pairs) - 2)
			error("class too big");
		s += chartorune(p, s);
		if(*p != '-')
			continue;
		s += chartorune(p, s);
		if(*p == '\\')
			s += chartorune(p, s);
		if(*p == 0)
			break;
		p[-1] = *p;
		s += chartorune(p, s);
	}
	*p = 0;
	qsort(pairs, (p-pairs)/2, 2*sizeof(*pairs), pcmp);

	q = pairs;
	for(p=pairs+2; *p; p+=2) {
		if(p[0] > p[1])
			continue;
		if(p[0] > q[1] || p[1] < q[0]) {
			q[2] = p[0];
			q[3] = p[1];
			q += 2;
			continue;
		}
		if(p[0] < q[0])
			q[0] = p[0];
		if(p[1] > q[1])
			q[1] = p[1];
	}
	q[2] = 0;

	p = pairs;
	if(nc) {
		x = rclass(0, p[0]-1);
		ov = p[1]+1;
		for(p+=2; *p; p+=2) {
			x = re2or(x, rclass(ov, p[0]-1));
			ov = p[1]+1;
		}
		x = re2or(x, rclass(ov, Runemax));
	} else {
		x = rclass(p[0], p[1]);
		for(p+=2; *p; p+=2)
			x = re2or(x, rclass(p[0], p[1]));
	}
	return x;
}