diff options
Diffstat (limited to 'doc/tools/sgmlconv/docfixer.py')
-rwxr-xr-x | doc/tools/sgmlconv/docfixer.py | 1033 |
1 files changed, 1033 insertions, 0 deletions
# diff --git a/doc/tools/sgmlconv/docfixer.py b/doc/tools/sgmlconv/docfixer.py
# new file mode 100755 index 0000000..463276b
#! /usr/bin/env python

"""Perform massive transformations on a document tree created from the LaTeX
of the Python documentation, and dump the ESIS data for the transformed tree.
"""


import errno
import esistools
import os
import re
import string
import sys
import xml.dom
import xml.dom.minidom

ELEMENT = xml.dom.Node.ELEMENT_NODE
ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE
TEXT = xml.dom.Node.TEXT_NODE


class ConversionError(Exception):
    """Raised when the tree does not have the structure a fixer expects."""
    pass


ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # os.name is the attribute that reads "posix" on Unix; sys.platform
    # holds values such as "linux2", so testing it disabled bold output
    # everywhere.
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    import commands
except ImportError:
    bwrite = ewrite
else:
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write s to stderr wrapped in the terminal's bold on/off codes."""
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))


PARA_ELEMENT = "para"

DEBUG_PARA_FIXER = 0

if DEBUG_PARA_FIXER:
    def para_msg(s):
        """Write a paragraph-fixer debugging message to stderr."""
        ewrite("*** %s\n" % s)
else:
    def para_msg(s):
        """Discard the message; paragraph-fixer debugging is disabled."""
        pass


def get_first_element(doc, gi):
    """Return the first direct child of doc named gi, or None."""
    for n in doc.childNodes:
        if n.nodeName == gi:
            return n
    return None

def extract_first_element(doc, gi):
    """Remove and return the first direct child of doc named gi, or None."""
    node = get_first_element(doc, gi)
    if node is not None:
        doc.removeChild(node)
    return node


def get_documentElement(node):
    """Return the last element-type direct child of node, or None.

    The loop does not stop at the first match, so when several element
    children exist the final one is returned.
    """
    result = None
    for child in node.childNodes:
        if child.nodeType == ELEMENT:
            result = child
    return result


def set_tagName(elem, gi):
    """Rename elem in place by setting both nodeName and tagName to gi."""
    elem.nodeName = elem.tagName = gi


def find_all_elements(doc, gi):
    """Return a list of doc (if it is named gi) and all descendants named gi."""
    nodes = []
    if doc.nodeName == gi:
        nodes.append(doc)
    for child in doc.childNodes:
        if child.nodeType == ELEMENT:
            if child.tagName == gi:
                nodes.append(child)
            for node in child.getElementsByTagName(gi):
                nodes.append(node)
    return nodes

def find_all_child_elements(doc, gi):
    """Return a list of the direct children of doc named gi."""
    nodes = []
    for child in doc.childNodes:
        if child.nodeName == gi:
            nodes.append(child)
    return nodes


def find_all_elements_from_set(doc, gi_set):
    """Return doc and all element descendants whose name is in gi_set."""
    return __find_all_elements_from_set(doc, gi_set, [])

def __find_all_elements_from_set(doc, gi_set, nodes):
    """Recursive worker for find_all_elements_from_set(); fills nodes."""
    if doc.nodeName in gi_set:
        nodes.append(doc)
    for child in doc.childNodes:
        if child.nodeType == ELEMENT:
            __find_all_elements_from_set(child, gi_set, nodes)
    return nodes


def simplify(doc, fragment):
    """Move top-level <documentclass>, <title> and <input> into the root.

    The root <document> element is renamed to the document class, the
    extracted <title>/<input> nodes are inserted at the start of the
    document element, and leading text nodes of the fragment are dropped.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    node = extract_first_element(fragment, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    node = extract_first_element(fragment, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(fragment, "document")
    if node is not None:
        set_tagName(node, documentclass)
    while 1:
        node = extract_first_element(fragment, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = get_documentElement(fragment)
        # Each node is inserted at the front, so walk the list backwards to
        # keep the original order.
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    while fragment.firstChild and fragment.firstChild.nodeType == TEXT:
        fragment.removeChild(fragment.firstChild)


def cleanup_root_text(doc):
    """Remove direct text children of doc, except one right after a COMMENT."""
    discards = []
    skip = 0
    for n in doc.childNodes:
        prevskip = skip
        skip = 0
        if n.nodeType == TEXT and not prevskip:
            discards.append(n)
        elif n.nodeName == "COMMENT":
            skip = 1
    for node in discards:
        doc.removeChild(node)


DESCRIPTOR_ELEMENTS = (
    "cfuncdesc", "cvardesc", "ctypedesc",
    "classdesc", "memberdesc", "memberdescni", "methoddesc", "methoddescni",
    "excdesc", "funcdesc", "funcdescni", "opcodedesc",
    "datadesc", "datadescni",
    )

def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found in every <section>."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        find_and_fix_descriptors(doc, section)


def find_and_fix_descriptors(doc, container):
    """Rewrite descriptor children of container, recursing into subsections."""
    children = container.childNodes
    for child in children:
        if child.nodeType == ELEMENT:
            tagName = child.tagName
            if tagName in DESCRIPTOR_ELEMENTS:
                rewrite_descriptor(doc, child)
            elif tagName == "subsection":
                find_and_fix_descriptors(doc, child)


def rewrite_descriptor(doc, descriptor):
    """Restructure one descriptor element into signatures + description."""
    #
    # Do these things:
    #   1. Add an "index='no'" attribute to the element if the tagName
    #      ends in 'ni', removing the 'ni' from the name.
    #   2. Create a <signature> from the name attribute
    #   2a.Create an <args> if it appears to be available.
    #   3. Create additional <signature>s from <*line{,ni}> elements,
    #      if found.
    #   4. If a <versionadded> is found, move it to an attribute on the
    #      descriptor.
    #   5. Move remaining child nodes to a <description> element.
    #   6. Put it back together.
    #
    # 1.
    descname = descriptor.tagName
    index = 1
    if descname[-2:] == "ni":
        descname = descname[:-2]
        descriptor.setAttribute("index", "no")
        set_tagName(descriptor, descname)
        index = 0
    desctype = descname[:-4] # remove 'desc'
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.hasAttribute("var"):
        if descname != "opcodedesc":
            raise RuntimeError(
                "got 'var' attribute on descriptor other than opcodedesc")
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    newchildren = [signature]
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n "))
    # 3, 4.
    pos = skip_leading_nodes(children, pos)
    while pos < len(children) \
          and children[pos].nodeName in (linename, "versionadded"):
        if children[pos].tagName == linename:
            # this is really a supplemental signature, create <signature>
            oldchild = children[pos].cloneNode(1)
            try:
                sig = methodline_to_signature(doc, children[pos])
            except KeyError:
                # Diagnostics go to stderr like everywhere else in this
                # module; stdout may be carrying the ESIS output.
                ewrite(oldchild.toxml() + "\n")
                raise
            newchildren.append(sig)
        else:
            # <versionadded added=...>
            descriptor.setAttribute(
                "added", children[pos].getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = string.rstrip(last.data) + "\n "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))


def methodline_to_signature(doc, methodline):
    """Build a <signature> from a <*line> element.

    The "name" attribute becomes a <name> child; any children of
    methodline are moved into an <args> child.
    """
    signature = doc.createElement("signature")
    signature.appendChild(doc.createTextNode("\n "))
    name = doc.createElement("name")
    name.appendChild(doc.createTextNode(methodline.getAttribute("name")))
    methodline.removeAttribute("name")
    signature.appendChild(name)
    if len(methodline.childNodes):
        args = doc.createElement("args")
        signature.appendChild(doc.createTextNode("\n "))
        signature.appendChild(args)
        move_children(methodline, args)
    signature.appendChild(doc.createTextNode("\n "))
    return signature


def move_children(origin, dest, start=0):
    """Move the children of origin from index start onward to the end of dest."""
    children = origin.childNodes
    # The child list shrinks as nodes are removed, so index `start` always
    # refers to the next node to move.
    while start < len(children):
        node = children[start]
        origin.removeChild(node)
        dest.appendChild(node)


def handle_appendix(doc, fragment):
    """Move everything after the <appendix> marker into a <back-matter>."""
    # must be called after simplfy() if document is multi-rooted to begin with
    docelem = get_documentElement(fragment)
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        while nodes and nodes[0].nodeType == TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))


def handle_labels(doc, fragment):
    """Turn each <label id=...> into an "id" attribute on its container.

    A label inside a <title> sets the id on the title's parent instead.
    Labels without an id are left alone.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        parentTagName = parent.tagName
        if parentTagName == "title":
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if parentTagName == "title":
            parent.normalize()
            children = parent.childNodes
            # The title may have held nothing but the label.
            if children and children[-1].nodeType == TEXT:
                children[-1].data = string.rstrip(children[-1].data)


def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the whitespace that ends the elements named in wsmap.

    wsmap maps an element name to the whitespace string that should
    replace any trailing whitespace of the element's last text child.
    The tree is walked breadth-first starting at doc.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if wsmap.has_key(node.nodeName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            # Only the last child matters; an element may have no children.
            if children and children[-1].nodeType == TEXT:
                children[-1].data = string.rstrip(children[-1].data) + ws
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == ELEMENT:
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)


def normalize(doc):
    """Call normalize() on every element that is a direct child of doc."""
    for node in doc.childNodes:
        if node.nodeType == ELEMENT:
            node.normalize()


def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    Only elements whose single child is a text node are changed.  The
    walk does not descend into elements listed in element_names.
    """
    d = {}
    for gi in element_names:
        d[gi] = gi
    rewrite_element = d.has_key
    queue = []
    for node in doc.childNodes:
        if node.nodeType == ELEMENT:
            queue.append(node)
    while queue:
        node = queue[0]
        del queue[0]
        if rewrite_element(node.tagName):
            children = node.childNodes
            if len(children) == 1 \
               and children[0].nodeType == TEXT:
                data = children[0].data
                if data[-2:] == "()":
                    children[0].data = data[:-2]
        else:
            for child in node.childNodes:
                if child.nodeType == ELEMENT:
                    queue.append(child)


def contents_match(left, right):
    """Return 1 if left and right have matching child trees, else 0.

    Elements are compared by tag name (attributes are ignored) and text
    nodes by data; any other node type counts as a mismatch.
    """
    left_children = left.childNodes
    right_children = right.childNodes
    if len(left_children) != len(right_children):
        return 0
    # The lengths are equal here, so pairing with zip() loses nothing.
    for l, r in zip(left_children, right_children):
        nodeType = l.nodeType
        if nodeType != r.nodeType:
            return 0
        if nodeType == ELEMENT:
            if l.tagName != r.tagName:
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(l, r):
                return 0
        elif nodeType == TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1


def create_module_info(doc, section):
    """Collect a section's module metadata into one <moduleinfo> element.

    Does nothing unless the section has a <modulesynopsis> child.  The
    <moduleinfo> itself is only built when section is a <section>.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    set_tagName(node, "synopsis")
    # The synopsis may be empty, so check before looking at its last child.
    if node.childNodes:
        lastchild = node.childNodes[-1]
        if lastchild.nodeType == TEXT \
           and lastchild.data[-1:] == ".":
            lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        set_tagName(modauthor, "author")
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.tagName == "section":
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            modtype = moddecl.attributes.get("type")
            if modtype:
                modtype = modtype.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(modtype))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                set_tagName(title, "short-synopsis")
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
            # Guard i + 1 in case <moduleinfo> ends up as the last child.
            if node.nodeName == "moduleinfo" and i + 1 < len(children):
                nextnode = children[i+1]
                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    if len(string.lstrip(data)) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + string.lstrip(data)


def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> in fragment."""
    for node in find_all_elements(fragment, "section"):
        create_module_info(doc, node)


def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> in fragment."""
    for table in find_all_elements(fragment, "table"):
        fixup_table(doc, table)


def fixup_table(doc, table):
    """Rebuild table as <tgroup> holding a <thead> and a <tbody>.

    Direct <entry> children become the header row, <row> children become
    the body, and an <hline> sets rowsep="1" on the row before it.

    Raises ConversionError if anything other than whitespace text and
    <hline> elements is left over.
    """
    # create the table head
    thead = doc.createElement("thead")
    row = doc.createElement("row")
    move_elements_by_name(doc, table, row, "entry")
    thead.appendChild(doc.createTextNode("\n "))
    thead.appendChild(row)
    thead.appendChild(doc.createTextNode("\n "))
    # create the table body
    tbody = doc.createElement("tbody")
    prev_row = None
    children = table.childNodes
    for child in children:
        if child.nodeType == ELEMENT:
            tagName = child.tagName
            if tagName == "hline" and prev_row is not None:
                prev_row.setAttribute("rowsep", "1")
            elif tagName == "row":
                prev_row = child
    # save the rows:
    tbody.appendChild(doc.createTextNode("\n "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n ")
    # and toss the rest:
    while children:
        child = children[0]
        nodeType = child.nodeType
        if nodeType == TEXT:
            if string.strip(child.data):
                raise ConversionError("unexpected free data in <%s>: %r"
                                      % (table.tagName, child.data))
            table.removeChild(child)
            continue
        if nodeType == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
            table.removeChild(child)
            continue
        raise ConversionError(
            "unexpected %s node in table" % child.__class__.__name__)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    tgroup.appendChild(doc.createTextNode("\n "))
    tgroup.appendChild(thead)
    tgroup.appendChild(doc.createTextNode("\n "))
    tgroup.appendChild(tbody)
    tgroup.appendChild(doc.createTextNode("\n "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)


def fixup_row(doc, row):
    """Insert a whitespace text node before every child of row but the first."""
    # Copy the children first; inserting while iterating the live child
    # list would shift the nodes being visited.
    entries = list(row.childNodes[1:])
    for entry in entries:
        row.insertBefore(doc.createTextNode("\n "), entry)
#    row.appendChild(doc.createTextNode("\n "))


def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move direct children of source named name to the end of dest.

    When sep is given (and non-empty), a text node containing sep is
    appended to dest after each moved node.
    """
    nodes = []
    for child in source.childNodes:
        if child.nodeName == name:
            nodes.append(child)
    for node in nodes:
        source.removeChild(node)
        dest.appendChild(node)
        if sep:
            dest.appendChild(doc.createTextNode(sep))


RECURSE_INTO_PARA_CONTAINERS = (
    "chapter", "abstract", "enumerate",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph", "back-matter",
    "howto", "manual",
    "item", "itemize", "fulllineitems", "enumeration", "descriptionlist",
    "definitionlist", "definition",
    )

PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    # "memberdescni" was misspelled "membderdescni"; DESCRIPTOR_ELEMENTS
    # has the correct spelling.
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )

PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem", "author",
    "stindex", "obindex", "COMMENT", "label", "input", "title",
    "versionadded", "versionchanged", "declaremodule", "modulesynopsis",
    "moduleauthor", "indexterm", "leader",
    )


def fixup_paras(doc, fragment):
    """Build <para> elements in the fragment's containers and <description>s."""
    for child in fragment.childNodes:
        if child.nodeName in RECURSE_INTO_PARA_CONTAINERS:
            fixup_paras_helper(doc, child)
    descriptions = find_all_elements(fragment, "description")
    for description in descriptions:
        fixup_paras_helper(doc, description)


def fixup_paras_helper(doc, container, depth=0):
    """Wrap the paragraph material of container in <para> elements.

    Children named in RECURSE_INTO_PARA_CONTAINERS are processed
    recursively rather than wrapped.
    """
    # document is already normalized
    children = container.childNodes
    start = skip_leading_nodes(children)
    while len(children) > start:
        if children[start].nodeName in RECURSE_INTO_PARA_CONTAINERS:
            # Something to recurse into:
            fixup_paras_helper(doc, children[start])
        else:
            # Paragraph material:
            build_para(doc, container, start, len(children))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        start = skip_leading_nodes(children, start + 1)


def build_para(doc, parent, start, i):
    """Wrap a run of parent's children, beginning at start, in a <para>.

    The run ends at a blank line ("\\n\\n") inside a text node, at an
    element named in PARA_LEVEL_ELEMENTS or RECURSE_INTO_PARA_CONTAINERS,
    or at index i.  Returns an index into parent's children past the new
    paragraph.

    Raises ConversionError if the run would be empty.
    """
    children = parent.childNodes
    after = start + 1
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                child.splitText(pos)
                break
    else:
        # No break: the paragraph runs through the last child.
        have_last = 1
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    para = doc.createElement(PARA_ELEMENT)
    prev = None
    indexes = range(start, after)
    indexes.reverse()
    # Move nodes from last to first, each inserted before the one moved
    # previously, so their order is preserved inside <para>.
    for j in indexes:
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    else:
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1


def skip_leading_nodes(children, start=0):
    """Return index into children of a node at which paragraph building should
    begin or a recursive call to fixup_paras_helper() should be made (for
    subsections, etc.).

    When the return value >= len(children), we've built all the paras we can
    from this list of children.
    """
    skippable = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    i = len(children)
    while i > start:
        # skip over leading comments and whitespace:
        child = children[start]
        nodeType = child.nodeType
        if nodeType == TEXT:
            data = child.data
            shortened = string.lstrip(data)
            if shortened:
                if data != shortened:
                    # break into two nodes: whitespace and non-whitespace
                    child.splitText(len(data) - len(shortened))
                    return start + 1
                return start
            # all whitespace, just skip
        elif nodeType == ELEMENT:
            tagName = child.tagName
            if tagName in RECURSE_INTO_PARA_CONTAINERS:
                return start
            if tagName not in skippable:
                return start
        start = start + 1
    return start


def fixup_rfc_references(doc, fragment):
    """Append the text "RFC <num>" to every <rfc> element."""
    for rfcnode in find_all_elements(fragment, "rfc"):
        rfcnode.appendChild(doc.createTextNode(
            "RFC " + rfcnode.getAttribute("num")))


def fixup_signatures(doc, fragment):
    """Expand <optional> markup in all <args> and <constructor-args>."""
    for child in fragment.childNodes:
        if child.nodeType == ELEMENT:
            args = child.getElementsByTagName("args")
            for arg in args:
                fixup_args(doc, arg)
                arg.normalize()
            args = child.getElementsByTagName("constructor-args")
            for arg in args:
                fixup_args(doc, arg)
                arg.normalize()


def fixup_args(doc, arglist):
    """Replace each <optional> child of arglist with "[" contents "]".

    One <optional> is unwrapped per call; the function then calls itself
    to handle any that remain, including ones that were nested.
    """
    for child in arglist.childNodes:
        if child.nodeName == "optional":
            # found it; fix and return
            arglist.insertBefore(doc.createTextNode("["), child)
            optkids = child.childNodes
            while optkids:
                k = optkids[0]
                child.removeChild(k)
                arglist.insertBefore(k, child)
            arglist.insertBefore(doc.createTextNode("]"), child)
            arglist.removeChild(child)
            return fixup_args(doc, arglist)


def fixup_sectionauthors(doc, fragment):
    """Turn each <sectionauthor name=...> into an <author> near the title.

    The name attribute becomes the element's text.  The element is
    re-inserted before the section's third child, or before the first
    child when the second child is not a <title>.
    """
    for sectauth in find_all_elements(fragment, "sectionauthor"):
        section = sectauth.parentNode
        section.removeChild(sectauth)
        set_tagName(sectauth, "author")
        sectauth.appendChild(doc.createTextNode(
            sectauth.getAttribute("name")))
        sectauth.removeAttribute("name")
        after = section.childNodes[2]
        title = section.childNodes[1]
        if title.nodeName != "title":
            after = section.childNodes[0]
        section.insertBefore(doc.createTextNode("\n "), after)
        section.insertBefore(sectauth, after)


def fixup_verbatims(doc):
    """Rename <verbatim> elements that start with ">>>" to
    <interactive-session>."""
    for verbatim in find_all_elements(doc, "verbatim"):
        if not verbatim.childNodes:
            # An empty <verbatim> has no text to inspect.
            continue
        child = verbatim.childNodes[0]
        if child.nodeType == TEXT \
           and string.lstrip(child.data)[:3] == ">>>":
            set_tagName(verbatim, "interactive-session")


def add_node_ids(fragment, counter=0):
    """Set a node_id attribute on fragment and every node below it.

    Returns the counter value to use for the next node.
    """
    fragment.node_id = counter
    for node in fragment.childNodes:
        counter = counter + 1
        if node.nodeType == ELEMENT:
            counter = add_node_ids(node, counter)
        else:
            node.node_id = counter
    return counter + 1


REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex',
                        'refexmodindex', 'refstmodindex')

def fixup_refmodindexes(fragment):
    """Fold <ref*modindex> elements into their sibling <module> elements."""
    # Locate <ref*modindex>...</> co-located with <module>...</>, and
    # remove the <ref*modindex>, replacing it with index=index on the
    # <module> element.
    nodes = find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS)
    d = {}
    # Key on node_id so that each parent is only processed once; this
    # relies on add_node_ids() having been run on the fragment.
    for node in nodes:
        parent = node.parentNode
        d[parent.node_id] = parent
    del nodes
    for parent in d.values():
        fixup_refmodindexes_chunk(parent)


def fixup_refmodindexes_chunk(container):
    """Handle the <ref*modindex> elements found within one container.

    An entry is removed when a <module> in the container has the same
    name; each matching <module> gets index="yes".
    """
    # node is probably a <para>; let's see how often it isn't:
    if container.tagName != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        children = entry.childNodes
        if len(children) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.tagName)
            ewrite(entry.toxml() + "\n")
            continue
        found = 0
        module_name = entry.getAttribute("module")
        for node in module_entries:
            if len(node.childNodes) != 1:
                continue
            this_name = node.childNodes[0].data
            if this_name == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for node in removes:
        container.removeChild(node)


def fixup_bifuncindexes(fragment):
    """Fold <bifuncindex> elements into their sibling <function> elements."""
    nodes = find_all_elements(fragment, 'bifuncindex')
    d = {}
    # make sure that each parent is only processed once:
    for node in nodes:
        parent = node.parentNode
        d[parent.node_id] = parent
    del nodes
    for parent in d.values():
        fixup_bifuncindexes_chunk(parent)


def fixup_bifuncindexes_chunk(container):
    """Handle the <bifuncindex> children of one container.

    A sibling <function> whose text is the entry's name followed by "()"
    gets index="yes" and module="__builtin__"; an entry with at least one
    such match is removed.
    """
    removes = []
    entries = find_all_child_elements(container, "bifuncindex")
    function_entries = find_all_child_elements(container, "function")
    for entry in entries:
        function_name = entry.getAttribute("name")
        found = 0
        for func_entry in function_entries:
            t2 = func_entry.childNodes[0].data
            if t2[-2:] != "()":
                continue
            t2 = t2[:-2]
            if t2 == function_name:
                func_entry.setAttribute("index", "yes")
                func_entry.setAttribute("module", "__builtin__")
                if not found:
                    found = 1
                    removes.append(entry)
    for entry in removes:
        container.removeChild(entry)


def join_adjacent_elements(container, gi):
    """Merge each run of adjacent sibling elements named gi into the first.

    The children of the later sibling are appended to the earlier one and
    the emptied sibling is removed.  All element descendants of container
    are visited.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        i = 0
        children = parent.childNodes
        # len(children) is re-read on every pass because merging removes
        # nodes from this live list; a cached length would index past the
        # end after a merge.
        while i < len(children):
            child = children[i]
            if child.nodeName == gi \
               and i + 1 < len(children) \
               and children[i+1].nodeName == gi:
                ewrite("--- merging two <%s/> elements\n" % gi)
                nextchild = children[i+1]
                nextchildren = nextchild.childNodes
                while len(nextchildren):
                    node = nextchildren[0]
                    nextchild.removeChild(node)
                    child.appendChild(node)
                parent.removeChild(nextchild)
                # Look at the same child again; its new neighbour may
                # need merging too.
                continue
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1


_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS event lines.

    knownempty is a callable that takes an element name and returns true
    when the element is declared empty.

    Raises ValueError if a declared-empty element has children and
    RuntimeError for node types other than element, text and entity
    reference.
    """
    for node in doc.childNodes:
        nodeType = node.nodeType
        if nodeType == ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                ofp.write("e\n")
            for k, value in node.attributes.items():
                if _token_rx.match(value):
                    dtype = "TOKEN"
                else:
                    dtype = "CDATA"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        elif nodeType == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        elif nodeType == ENTITY_REFERENCE:
            ofp.write("&%s\n" % node.nodeName)
        else:
            raise RuntimeError("unsupported node type: %s" % nodeType)


def convert(ifp, ofp):
    """Parse ESIS data from ifp, run every fixer, and write ESIS to ofp.

    An EPIPE error while writing is ignored; other IOErrors propagate.
    """
    events = esistools.parse(ifp)
    toktype, doc = events.getEvent()
    fragment = doc.createDocumentFragment()
    events.expandNode(fragment)

    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    #
    d = {}
    for gi in events.parser.get_empties():
        d[gi] = gi
    # <author> and <rfc> have been given text content by the fixers above,
    # so they can no longer be written as empty elements.
    if d.has_key("author"):
        del d["author"]
    if d.has_key("rfc"):
        del d["rfc"]
    knownempty = d.has_key
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported.
        # Read .errno rather than unpacking the arguments, which fails
        # when the exception does not carry exactly two of them.
        err = sys.exc_info()[1]
        if err.errno != errno.EPIPE:
            raise


def usage():
    """Write a one-line usage summary to stderr."""
    ewrite("usage: %s [input-file [output-file]]\n" % sys.argv[0])


def main():
    """Command-line entry point: docfixer.py [input-file [output-file]].

    With no arguments, reads stdin and writes stdout.  When an output
    file is named, the result is buffered in memory and the file is only
    written after the conversion has completed.
    """
    if len(sys.argv) == 1:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(sys.argv) == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif len(sys.argv) == 3:
        ifp = open(sys.argv[1])
        import StringIO
        ofp = StringIO.StringIO()
    else:
        usage()
        sys.exit(2)
    convert(ifp, ofp)
    if ifp is not sys.stdin:
        ifp.close()
    if len(sys.argv) == 3:
        fp = open(sys.argv[2], "w")
        fp.write(ofp.getvalue())
        fp.close()
        ofp.close()


if __name__ == "__main__":
    main()