ref: 128ea44a89c7905612ad2fa5a61a9325ddfb5e1e
dir: /sys/src/cmd/python/Doc/tools/buildindex.py/
#! /usr/bin/env python __version__ = '$Revision: 36356 $' import os.path import re import string import sys from xml.sax.saxutils import quoteattr bang_join = "!".join null_join = "".join REPLACEMENTS = [ # Hackish way to deal with macros replaced with simple text (re.compile(r"\\ABC\b"), "ABC"), (re.compile(r"\\ASCII\b"), "ASCII"), (re.compile(r"\\Cpp\b"), "C++"), (re.compile(r"\\EOF\b"), "EOF"), (re.compile(r"\\NULL\b"), "NULL"), (re.compile(r"\\POSIX\b"), "POSIX"), (re.compile(r"\\UNIX\b"), "Unix"), # deal with turds left over from LaTeX2HTML (re.compile(r"<#\d+#>"), ""), ] class Node: continuation = 0 def __init__(self, link, str, seqno): self.links = [link] self.seqno = seqno for pattern, replacement in REPLACEMENTS: str = pattern.sub(replacement, str) # build up the text self.text = split_entry_text(str) self.key = split_entry_key(str) def __cmp__(self, other): """Comparison operator includes sequence number, for use with list.sort().""" return self.cmp_entry(other) or cmp(self.seqno, other.seqno) def cmp_entry(self, other): """Comparison 'operator' that ignores sequence number.""" c = 0 for i in range(min(len(self.key), len(other.key))): c = (cmp_part(self.key[i], other.key[i]) or cmp_part(self.text[i], other.text[i])) if c: break return c or cmp(self.key, other.key) or cmp(self.text, other.text) def __repr__(self): return "<Node for %s (%s)>" % (bang_join(self.text), self.seqno) def __str__(self): return bang_join(self.key) def dump(self): return "%s\1%s###%s\n" \ % ("\1".join(self.links), bang_join(self.text), self.seqno) def cmp_part(s1, s2): result = cmp(s1, s2) if result == 0: return 0 l1 = s1.lower() l2 = s2.lower() minlen = min(len(s1), len(s2)) if len(s1) < len(s2) and l1 == l2[:len(s1)]: result = -1 elif len(s2) < len(s1) and l2 == l1[:len(s2)]: result = 1 else: result = cmp(l1, l2) or cmp(s1, s2) return result def split_entry(str, which): stuff = [] parts = str.split('!') parts = [part.split('@') for part in parts] for entry in parts: if len(entry) != 1: key = entry[which] else: key = entry[0] stuff.append(key) return stuff _rmtt = re.compile(r"""(.*)<tt(?: class=['"][a-z0-9]+["'])?>(.*)</tt>(.*)$""", re.IGNORECASE) _rmparens = re.compile(r"\(\)") def split_entry_key(str): parts = split_entry(str, 1) for i in range(len(parts)): m = _rmtt.match(parts[i]) if m: parts[i] = null_join(m.group(1, 2, 3)) else: parts[i] = parts[i].lower() # remove '()' from the key: parts[i] = _rmparens.sub('', parts[i]) return map(trim_ignored_letters, parts) def split_entry_text(str): if '<' in str: m = _rmtt.match(str) if m: str = null_join(m.group(1, 2, 3)) return split_entry(str, 1) def load(fp): nodes = [] rx = re.compile("(.*)\1(.*)###(.*)$") while 1: line = fp.readline() if not line: break m = rx.match(line) if m: link, str, seqno = m.group(1, 2, 3) nodes.append(Node(link, str, seqno)) return nodes def trim_ignored_letters(s): # ignore $ to keep environment variables with the # leading letter from the name if s.startswith("$"): return s[1:].lower() else: return s.lower() def get_first_letter(s): if s.startswith("<tex2html_percent_mark>"): return "%" else: return trim_ignored_letters(s)[0] def split_letters(nodes): letter_groups = [] if nodes: group = [] append = group.append letter = get_first_letter(nodes[0].text[0]) letter_groups.append((letter, group)) for node in nodes: nletter = get_first_letter(node.text[0]) if letter != nletter: letter = nletter group = [] letter_groups.append((letter, group)) append = group.append append(node) return letter_groups def group_symbols(groups): entries = [] ident_letters = string.ascii_letters + "_" while groups[0][0] not in ident_letters: entries += groups[0][1] del groups[0] if entries: groups.insert(0, ("Symbols", entries)) # need a function to separate the nodes into columns... def split_columns(nodes, columns=1): if columns <= 1: return [nodes] # This is a rough height; we may have to increase to avoid breaks before # a subitem. colheight = int(len(nodes) / columns) numlong = int(len(nodes) % columns) if numlong: colheight = colheight + 1 else: numlong = columns cols = [] for i in range(numlong): start = i * colheight end = start + colheight cols.append(nodes[start:end]) del nodes[:end] colheight = colheight - 1 try: numshort = int(len(nodes) / colheight) except ZeroDivisionError: cols = cols + (columns - len(cols)) * [[]] else: for i in range(numshort): start = i * colheight end = start + colheight cols.append(nodes[start:end]) # # If items continue across columns, make sure they are marked # as continuations so the user knows to look at the previous column. # for i in range(len(cols) - 1): try: prev = cols[i][-1] next = cols[i + 1][0] except IndexError: return cols else: n = min(len(prev.key), len(next.key)) for j in range(n): if prev.key[j] != next.key[j]: break next.continuation = j + 1 return cols DL_LEVEL_INDENT = " " def format_column(nodes): strings = ["<dl compact='compact'>"] append = strings.append level = 0 previous = [] for node in nodes: current = node.text count = 0 for i in range(min(len(current), len(previous))): if previous[i] != current[i]: break count = i + 1 if count > level: append("<dl compact='compact'>" * (count - level) + "\n") level = count elif level > count: append("\n") append(level * DL_LEVEL_INDENT) append("</dl>" * (level - count)) level = count # else: level == count for i in range(count, len(current) - 1): term = node.text[i] level = level + 1 if node.continuation > i: extra = " (continued)" else: extra = "" append("\n<dt>%s%s\n<dd>\n%s<dl compact='compact'>" % (term, extra, level * DL_LEVEL_INDENT)) append("\n%s<dt>%s%s</a>" % (level * DL_LEVEL_INDENT, node.links[0], node.text[-1])) for link in node.links[1:]: append(",\n%s %s[Link]</a>" % (level * DL_LEVEL_INDENT, link)) previous = current append("\n") append("</dl>" * (level + 1)) return null_join(strings) def format_nodes(nodes, columns=1): strings = [] append = strings.append if columns > 1: colnos = range(columns) colheight = int(len(nodes) / columns) if len(nodes) % columns: colheight = colheight + 1 colwidth = int(100 / columns) append('<table width="100%"><tr valign="top">') for col in split_columns(nodes, columns): append('<td width="%d%%">\n' % colwidth) append(format_column(col)) append("\n</td>") append("\n</tr></table>") else: append(format_column(nodes)) return null_join(strings) def format_letter(letter): if letter == '.': lettername = ". (dot)" elif letter == '_': lettername = "_ (underscore)" else: lettername = letter.capitalize() return "\n<hr />\n<h2 id=%s>%s</h2>\n\n" \ % (quoteattr("letter-" + letter), lettername) def format_html_letters(nodes, columns, group_symbol_nodes): letter_groups = split_letters(nodes) if group_symbol_nodes: group_symbols(letter_groups) items = [] for letter, nodes in letter_groups: s = "<b><a href=\"#letter-%s\">%s</a></b>" % (letter, letter) items.append(s) s = ["<hr /><center>\n%s</center>\n" % " |\n".join(items)] for letter, nodes in letter_groups: s.append(format_letter(letter)) s.append(format_nodes(nodes, columns)) return null_join(s) def format_html(nodes, columns): return format_nodes(nodes, columns) def collapse(nodes): """Collapse sequences of nodes with matching keys into a single node. Destructive.""" if len(nodes) < 2: return prev = nodes[0] i = 1 while i < len(nodes): node = nodes[i] if not node.cmp_entry(prev): prev.links.append(node.links[0]) del nodes[i] else: i = i + 1 prev = node def dump(nodes, fp): for node in nodes: fp.write(node.dump()) def process_nodes(nodes, columns, letters=0, group_symbol_nodes=0): nodes.sort() collapse(nodes) if letters: return format_html_letters(nodes, columns, group_symbol_nodes) else: return format_html(nodes, columns) def main(): import getopt ifn = "-" ofn = "-" columns = 1 letters = 0 group_symbol_nodes = 1 opts, args = getopt.getopt(sys.argv[1:], "c:lo:", ["columns=", "dont-group-symbols", "group-symbols", "letters", "output="]) for opt, val in opts: if opt in ("-o", "--output"): ofn = val elif opt in ("-c", "--columns"): columns = int(val, 10) elif opt in ("-l", "--letters"): letters = 1 elif opt == "--group-symbols": group_symbol_nodes = 1 elif opt == "--dont-group-symbols": group_symbol_nodes = 0 if not args: args = [ifn] nodes = [] for fn in args: nodes = nodes + load(open(fn)) num_nodes = len(nodes) html = process_nodes(nodes, columns, letters, group_symbol_nodes) program = os.path.basename(sys.argv[0]) if ofn == "-": sys.stdout.write(html) sys.stderr.write("\n%s: %d index nodes" % (program, num_nodes)) else: open(ofn, "w").write(html) print print "%s: %d index nodes" % (program, num_nodes) if __name__ == "__main__": main()