ref: 613fd6b4294ad3277a91c9a2e58b2b2d4f22ab1c
dir: /sys/src/cmd/troff/n8.c/
#include "tdef.h" #include "fns.h" #include "ext.h" #include <assert.h> #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */ /* this value is used (as a literal) in suftab.c */ /* to encode possible hyphenation points in suffixes. */ /* it could be changed, by widening the tables */ /* to be shorts instead of chars. */ /* * troff8.c * * hyphenation */ int hexsize = 0; /* hyphenation exception list size */ char *hbufp = NULL; /* base of list */ char *nexth = NULL; /* first free slot in list */ Tchar *hyend; #define THRESH 160 /* digram goodness threshold */ int thresh = THRESH; int texhyphen(void); static int alpha(Tchar); void hyphen(Tchar *wp) { int j; Tchar *i; i = wp; while (punct((*i++))) ; if (!alpha(*--i)) return; wdstart = i++; while (alpha(*i++)) ; hyend = wdend = --i - 1; while (punct((*i++))) ; if (*--i) return; if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */ return; hyp = hyptr; *hyp = 0; hyoff = 2; /* for now, try exceptions first, then tex (if hyphalg is non-zero), then suffix and digram if tex didn't hyphenate it at all. */ if (!exword() && !texhyphen() && !suffix()) digram(); /* this appears to sort hyphenation points into increasing order */ *hyp++ = 0; if (*hyptr) for (j = 1; j; ) { j = 0; for (hyp = hyptr + 1; *hyp != 0; hyp++) { if (*(hyp - 1) > *hyp) { j++; i = *hyp; *hyp = *(hyp - 1); *(hyp - 1) = i; } } } } static alpha(Tchar i) /* non-zero if really alphabetic */ { if (ismot(i)) return 0; else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */ return 0; /* no good way to make sure i is in range for */ else /* the call of isalpha */ return isalpha(cbits(i)); } punct(Tchar i) { if (!i || alpha(i)) return(0); else return(1); } void caseha(void) /* set hyphenation algorithm */ { hyphalg = HYPHALG; if (skip()) return; noscale++; hyphalg = atoi0(); noscale = 0; } void caseht(void) /* set hyphenation threshold; not in manual! */ { thresh = THRESH; if (skip()) return; noscale++; thresh = atoi0(); noscale = 0; } char *growh(char *where) { char *new; hexsize += NHEX; if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL) return NULL; if (new == hbufp) { return where; } else { int diff; diff = where - hbufp; hbufp = new; return new + diff; } } void casehw(void) { int i, k; char *j; Tchar t; if (nexth == NULL) { if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) { ERROR "No space for exception word list." WARN; return; } hexsize = NHEX; } k = 0; while (!skip()) { if ((j = nexth) >= hbufp + hexsize - 2) if ((j = nexth = growh(j)) == NULL) goto full; for (;;) { if (ismot(t = getch())) continue; i = cbits(t); if (i == ' ' || i == '\n') { *j++ = 0; nexth = j; *j = 0; if (i == ' ') break; else return; } if (i == '-') { k = HY_BIT; continue; } *j++ = maplow(i) | k; k = 0; if (j >= hbufp + hexsize - 2) if ((j = growh(j)) == NULL) goto full; } } return; full: ERROR "Cannot grow exception word list." WARN; *nexth = 0; } int exword(void) { Tchar *w; char *e, *save; e = hbufp; while (1) { save = e; if (e == NULL || *e == 0) return(0); w = wdstart; while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) { e++; w++; } if (!*e) { if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) { w = wdstart; for (e = save; *e; e++) { if (*e & HY_BIT) *hyp++ = w; if (hyp > hyptr + NHYP - 1) hyp = hyptr + NHYP - 1; w++; } return(1); } else { e++; continue; } } else while (*e++) ; } } suffix(void) { Tchar *w; char *s, *s0; Tchar i; extern char *suftab[]; again: i = cbits(*hyend); if (!alpha(i)) return(0); if (i < 'a') i -= 'A' - 'a'; if ((s0 = suftab[i-'a']) == 0) return(0); for (;;) { if ((i = *s0 & 017) == 0) return(0); s = s0 + i - 1; w = hyend - 1; while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) { s--; w--; } if (s == s0) break; s0 += i; } s = s0 + i - 1; w = hyend; if (*s0 & HY_BIT) goto mark; while (s > s0) { w--; if (*s-- & HY_BIT) { mark: hyend = w - 1; if (*s0 & 0100) /* 0100 used in suftab to encode something too */ continue; if (!chkvow(w)) return(0); *hyp++ = w; } } if (*s0 & 040) return(0); if (exword()) return(1); goto again; } maplow(int i) { if (isupper(i)) i = tolower(i); return(i); } vowel(int i) { switch (i) { case 'a': case 'A': case 'e': case 'E': case 'i': case 'I': case 'o': case 'O': case 'u': case 'U': case 'y': case 'Y': return(1); default: return(0); } } Tchar *chkvow(Tchar *w) { while (--w >= wdstart) if (vowel(cbits(*w))) return(w); return(0); } void digram(void) { Tchar *w; int val; Tchar *nhyend, *maxw; int maxval; extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13]; again: if (!(w = chkvow(hyend + 1))) return; hyend = w; if (!(w = chkvow(hyend))) return; nhyend = w; maxval = 0; w--; while (++w < hyend && w < wdend - 1) { val = 1; if (w == wdstart) val *= dilook('a', cbits(*w), bxh); else if (w == wdstart + 1) val *= dilook(cbits(*(w-1)), cbits(*w), bxxh); else val *= dilook(cbits(*(w-1)), cbits(*w), xxh); val *= dilook(cbits(*w), cbits(*(w+1)), xhx); val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx); if (val > maxval) { maxval = val; maxw = w + 1; } } hyend = nhyend; if (maxval > thresh) *hyp++ = maxw; goto again; } dilook(int a, int b, char t[26][13]) { int i, j; i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2]; if (!(j & 01)) i >>= 4; return(i & 017); } /* here beginneth the tex hyphenation code, as interpreted freely */ /* the main difference is that there is no attempt to squeeze space */ /* as tightly at tex does. */ static int texit(Tchar *, Tchar *); static int readpats(void); static void install(char *); static void fixup(void); static int trieindex(int, int); static char pats[50000]; /* size ought to be computed dynamically */ static char *nextpat = pats; static char *trie[27*27]; /* english-specific sizes */ int texhyphen(void) { static int loaded = 0; /* -1: couldn't find tex file */ if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */ return 0; if (loaded == 0) { if (readpats()) loaded = 1; else loaded = -1; } return texit(wdstart, wdend); } static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */ { int nw, i, k, equal, cnt[500]; char w[500+1], *np, *pp, *wp, *xpp, *xwp; w[0] = '.'; for (nw = 1; start <= end && nw < 500-1; nw++, start++) w[nw] = maplow(tolower(cbits(*start))); start -= (nw - 1); w[nw++] = '.'; w[nw] = 0; /* * printf("try %s\n", w); */ for (i = 0; i <= nw; i++) cnt[i] = '0'; for (wp = w; wp+1 < w+nw; wp++) { for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) { if (pp == 0 /* no trie entry */ || *pp != *wp /* no match on 1st letter */ || *(pp+1) != *(wp+1)) /* no match on 2nd letter */ break; /* so move to next letter of word */ equal = 1; for (xpp = pp+2, xwp = wp+2; *xpp; ) if (*xpp++ != *xwp++) { equal = 0; break; } if (equal) { np = xpp+1; /* numpat */ for (k = wp-w; *np; k++, np++) if (*np > cnt[k]) cnt[k] = *np; /* * printf("match: %s %s\n", pp, xpp+1); */ } pp += *(pp-1); /* skip over pattern and numbers to next */ } } /* * for (i = 0; i < nw; i++) printf("%c", w[i]); * printf(" "); * for (i = 0; i <= nw; i++) printf("%c", cnt[i]); * printf("\n"); */ /* * for (i = 1; i < nw - 1; i++) { * if (i > 2 && i < nw - 3 && cnt[i] % 2) * printf("-"); * if (cbits(start[i-1]) != '.') * printf("%c", cbits(start[i-1])); * } * printf("\n"); */ for (i = 1; i < nw -1; i++) if (i > 2 && i < nw - 3 && cnt[i] % 2) *hyp++ = start + i - 1; return hyp - hyptr; /* non-zero if a hyphen was found */ } /* This code assumes that hyphen.tex looks like % some comments \patterns{ % more comments pat5ter4ns, 1 per line, SORTED, nothing else } more goo \hyphenation{ % more comments ex-cep-tions, one per line; i ignore this part for now } this code is NOT robust against variations. unfortunately, it looks like every local language version of this file has a different format. i have also made no provision for weird characters. sigh. */ static int readpats(void) { FILE *fp; char buf[200], buf1[200]; if ((fp = fopen(TEXHYPHENS, "r")) == NULL && (fp = fopen(DWBalthyphens, "r")) == NULL) { ERROR "warning: can't find hyphen.tex" WARN; return 0; } while (fgets(buf, sizeof buf, fp) != NULL) { sscanf(buf, "%s", buf1); if (strcmp(buf1, "\\patterns{") == 0) break; } while (fgets(buf, sizeof buf, fp) != NULL) { if (buf[0] == '}') break; install(buf); } fclose(fp); fixup(); return 1; } static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */ { int npat, lastpat; char num[500], *onextpat = nextpat; num[0] = '0'; *nextpat++ = ' '; /* fill in with count later */ for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) { if (isdigit(*s)) { num[npat] = *s; lastpat = npat; } else { *nextpat++ = *s; npat++; num[npat] = '0'; } } *nextpat++ = 0; if (nextpat > pats + sizeof(pats)-20) { ERROR "tex hyphenation table overflow, tail end ignored" WARN; nextpat = onextpat; } num[lastpat+1] = 0; strcat(nextpat, num); nextpat += strlen(nextpat) + 1; } static void fixup(void) /* build indexes of where . a b c ... start */ { char *p, *lastc; int n; for (lastc = pats, p = pats+1; p < nextpat; p++) if (*p == ' ') { *lastc = p - lastc; lastc = p; } *lastc = p - lastc; for (p = pats+1; p < nextpat; ) { n = trieindex(p[0], p[1]); if (trie[n] == 0) trie[n] = p; p += p[-1]; } /* printf("pats = %d\n", nextpat - pats); */ } static int trieindex(int d1, int d2) { int i; i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1); assert(0 <= i && i < 27*27); return i; }