summaryrefslogtreecommitdiff
path: root/doc/tools/sgmlconv/docfixer.py
diff options
context:
space:
mode:
Diffstat (limited to 'doc/tools/sgmlconv/docfixer.py')
-rwxr-xr-xdoc/tools/sgmlconv/docfixer.py1033
1 files changed, 1033 insertions, 0 deletions
diff --git a/doc/tools/sgmlconv/docfixer.py b/doc/tools/sgmlconv/docfixer.py
new file mode 100755
index 0000000..463276b
--- /dev/null
+++ b/doc/tools/sgmlconv/docfixer.py
@@ -0,0 +1,1033 @@
+#! /usr/bin/env python
+
+"""Perform massive transformations on a document tree created from the LaTeX
+of the Python documentation, and dump the ESIS data for the transformed tree.
+"""
+
+
+import errno
+import esistools
+import re
+import string
+import sys
+import xml.dom
+import xml.dom.minidom
+
# Short aliases for the DOM node-type codes that the functions in this
# module compare against node.nodeType.
ELEMENT = xml.dom.Node.ELEMENT_NODE
ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE
TEXT = xml.dom.Node.TEXT_NODE
+
+
class ConversionError(Exception):
    """Raised when the document tree does not have the expected shape."""
+
+
# Writers for diagnostics: ewrite() sends text to stderr unchanged, and
# bwrite() does the same but in bold when the terminal allows it.
ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # os.name is the attribute that reads "posix" on Unix-like systems;
    # sys.platform holds names such as "linux2" and never equals "posix",
    # so testing it meant the bold variant could never be selected.
    import os
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    import commands
except ImportError:
    # Also taken when the commands module cannot be imported.
    bwrite = ewrite
else:
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write s to stderr wrapped in the terminal's bold on/off codes.

        The two tput outputs are captured once, when the function is
        defined, through the default argument values.
        """
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
+
+
# Tag name used for the paragraph elements this module creates.
PARA_ELEMENT = "para"

# Debugging switch for the paragraph fixer; it selects which para_msg()
# gets defined below.
DEBUG_PARA_FIXER = 0

if not DEBUG_PARA_FIXER:
    def para_msg(s):
        """Ignore s; paragraph-fixer messages are switched off."""
        return None
else:
    def para_msg(s):
        """Write s to stderr as a '*** ' prefixed line."""
        ewrite("*** %s\n" % s)
+
+
def get_first_element(doc, gi):
    """Return the first direct child of doc whose nodeName equals gi.

    Returns None when no child has that name.
    """
    matches = [child for child in doc.childNodes if child.nodeName == gi]
    if matches:
        return matches[0]
    return None
+
def extract_first_element(doc, gi):
    """Detach and return the first direct child of doc named gi.

    When there is no such child, doc is left untouched and None is
    returned.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
+
+
def get_documentElement(node):
    """Return the last element-type direct child of node, or None.

    NOTE(review): if node has several element children the final one is
    returned; callers presumably expect exactly one -- confirm.
    """
    elements = [kid for kid in node.childNodes if kid.nodeType == ELEMENT]
    if not elements:
        return None
    return elements[-1]
+
+
def set_tagName(elem, gi):
    """Rename elem in place: store gi as both its nodeName and tagName."""
    elem.nodeName = gi
    elem.tagName = gi
+
+
def find_all_elements(doc, gi):
    """Collect doc itself (when named gi) and all descendant elements named gi.

    Each element child of doc is checked first and then searched with
    getElementsByTagName(); the matches are returned as one list in the
    order they were visited.
    """
    found = []
    if doc.nodeName == gi:
        found.append(doc)
    for kid in doc.childNodes:
        if kid.nodeType != ELEMENT:
            continue
        if kid.tagName == gi:
            found.append(kid)
        found.extend(kid.getElementsByTagName(gi))
    return found
+
def find_all_child_elements(doc, gi):
    """Return the direct children of doc named gi, in child order."""
    return [child for child in doc.childNodes if child.nodeName == gi]
+
+
def find_all_elements_from_set(doc, gi_set):
    """Return doc and/or its descendant elements whose name is in gi_set."""
    return __find_all_elements_from_set(doc, gi_set, [])


def __find_all_elements_from_set(doc, gi_set, nodes):
    """Append the matches under doc to nodes (pre-order) and return nodes.

    doc itself is tested by nodeName; below it only element children are
    followed.
    """
    pending = [doc]
    while pending:
        current = pending.pop()
        if current.nodeName in gi_set:
            nodes.append(current)
        kids = [k for k in current.childNodes if k.nodeType == ELEMENT]
        # Push in reverse so that the first child is handled next.
        kids.reverse()
        pending.extend(kids)
    return nodes
+
+
def simplify(doc, fragment):
    """Fold the loose top-level nodes of fragment into its root element.

    doc is used only to create new text nodes; fragment is modified in
    place.  Nothing is returned.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    # A top-level <documentclass> is removed; its classname attribute
    # becomes the new name of the <document> element further down.
    node = extract_first_element(fragment, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    # A top-level <title> is moved into the root together with the <input>s.
    node = extract_first_element(fragment, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(fragment, "document")
    if node is not None:
        set_tagName(node, documentclass)
    # Pull out every top-level <input>, keeping their order.
    while 1:
        node = extract_first_element(fragment, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = get_documentElement(fragment)
        # Each node is inserted at the front, so walk the list backwards to
        # end up with the original order; every node is followed by "\n".
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # Drop any text nodes left at the very start of the fragment.
    while fragment.firstChild and fragment.firstChild.nodeType == TEXT:
        fragment.removeChild(fragment.firstChild)
+
+
def cleanup_root_text(doc):
    """Remove the text nodes that are direct children of doc.

    A text node that immediately follows a node named "COMMENT" is kept.
    """
    doomed = []
    protected = 0
    for n in doc.childNodes:
        was_protected, protected = protected, 0
        if n.nodeType == TEXT and not was_protected:
            doomed.append(n)
            continue
        if n.nodeName == "COMMENT":
            # The node right after a comment survives.
            protected = 1
    # Remove afterwards so the child list is not changed while walking it.
    for n in doomed:
        doc.removeChild(n)
+
+
# Tag names that find_and_fix_descriptors() hands to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc", "cvardesc", "ctypedesc",
    "classdesc", "memberdesc", "memberdescni", "methoddesc", "methoddescni",
    "excdesc", "funcdesc", "funcdescni", "opcodedesc",
    "datadesc", "datadescni",
    )


def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found in each <section> of fragment."""
    for section in find_all_elements(fragment, "section"):
        find_and_fix_descriptors(doc, section)
+
+
def find_and_fix_descriptors(doc, container):
    """Rewrite the descriptor children of container.

    Child <subsection> elements are processed recursively; all other
    children are left alone.
    """
    for child in container.childNodes:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi in DESCRIPTOR_ELEMENTS:
            rewrite_descriptor(doc, child)
        elif gi == "subsection":
            find_and_fix_descriptors(doc, child)
+
+
def rewrite_descriptor(doc, descriptor):
    """Restructure one descriptor element into signatures + description.

    doc is used to create the new nodes; descriptor is modified in place
    and nothing is returned.  Raises RuntimeError when a 'var' attribute
    is found on anything other than an <opcodedesc>.
    """
    #
    # Do these things:
    #   1. Add an "index='no'" attribute to the element if the tagName
    #      ends in 'ni', removing the 'ni' from the name.
    #   2. Create a <signature> from the name attribute
    #   2a.Create an <args> if it appears to be available.
    #   3. Create additional <signature>s from <*line{,ni}> elements,
    #      if found.
    #   4. If a <versionadded> is found, move it to an attribute on the
    #      descriptor.
    #   5. Move remaining child nodes to a <description> element.
    #   6. Put it back together.
    #
    # 1.
    descname = descriptor.tagName
    index = 1
    if descname[-2:] == "ni":
        descname = descname[:-2]
        descriptor.setAttribute("index", "no")
        set_tagName(descriptor, descname)
        index = 0
    desctype = descname[:-4] # remove 'desc'
    # Name of the supplemental-signature elements that belong to this kind
    # of descriptor, e.g. "funcline" or "funclineni".
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.hasAttribute("var"):
        if descname != "opcodedesc":
            raise RuntimeError, \
                  "got 'var' attribute on descriptor other than opcodedesc"
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    newchildren = [signature]
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n "))
    # 3, 4.
    pos = skip_leading_nodes(children, pos)
    while pos < len(children) \
          and children[pos].nodeName in (linename, "versionadded"):
        if children[pos].tagName == linename:
            # this is really a supplemental signature, create <signature>
            # Keep an untouched copy so it can be shown if conversion fails.
            oldchild = children[pos].cloneNode(1)
            try:
                sig = methodline_to_signature(doc, children[pos])
            except KeyError:
                print oldchild.toxml()
                raise
            newchildren.append(sig)
        else:
            # <versionadded version=...>: copy the value to the
            # descriptor's "added" attribute
            descriptor.setAttribute(
                "added", children[pos].getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    # Trailing whitespace of the description is replaced by "\n ".
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = string.rstrip(last.data) + "\n "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))
+
+
def methodline_to_signature(doc, methodline):
    """Build and return a <signature> element from a method-line element.

    The line's name attribute is removed and becomes the text of a <name>
    child.  If the line has children they are moved into an <args> child;
    otherwise no <args> is created.
    """
    sig = doc.createElement("signature")
    sig.appendChild(doc.createTextNode("\n "))
    name_elem = doc.createElement("name")
    name_text = doc.createTextNode(methodline.getAttribute("name"))
    name_elem.appendChild(name_text)
    methodline.removeAttribute("name")
    sig.appendChild(name_elem)
    if methodline.childNodes:
        args_elem = doc.createElement("args")
        sig.appendChild(doc.createTextNode("\n "))
        sig.appendChild(args_elem)
        move_children(methodline, args_elem)
    sig.appendChild(doc.createTextNode("\n "))
    return sig
+
+
def move_children(origin, dest, start=0):
    """Move origin's children from index start onward to the end of dest.

    Children before start stay where they are; order is preserved.
    """
    remaining = origin.childNodes
    # Every removal shifts the next child into position start.
    while len(remaining) > start:
        moving = remaining[start]
        origin.removeChild(moving)
        dest.appendChild(moving)
+
+
def handle_appendix(doc, fragment):
    """Move everything after the <appendix> marker into <back-matter>.

    The children of the document element are scanned for the first one
    that contains an <appendix> element.  That marker is removed, and
    every following sibling is re-parented under a new <back-matter>
    element appended to the document element.  Nothing is changed beyond
    removing the marker when no siblings follow it.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = get_documentElement(fragment)
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # Explicit loops instead of map(): the calls are wanted only for
        # their side effects, which a lazy map() would never perform.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Leading whitespace-only text is not carried into the back matter.
        while nodes and nodes[0].nodeType == TEXT \
              and not nodes[0].data.strip():
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
+
+
def handle_labels(doc, fragment):
    """Turn each <label id="..."/> into an id attribute on its container.

    A label inside a <title> sets the id on the title's parent instead,
    after which the title's trailing whitespace is trimmed.  Labels with
    an empty id are left in place.  doc is not used.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        in_title = parent.tagName == "title"
        if in_title:
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if in_title:
            parent.normalize()
            last = parent.lastChild
            # The title may have contained nothing but the label, in which
            # case there is no last child to trim.
            if last is not None and last.nodeType == TEXT:
                last.data = last.data.rstrip()
+
+
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the whitespace that ends selected elements.

    doc is walked breadth-first.  For every element whose name is a key
    of wsmap, the trailing whitespace of its last child (when that is a
    text node) is replaced by the mapped string.  In addition, a "\\n "
    text node is put in front of a <title>'s first sibling when that
    sibling is an element.  doc is also used to create that text node.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        ws = wsmap.get(node.nodeName)
        if ws is not None:
            # lastChild is None for an empty element, and reading it does
            # not disturb the element's child list.
            last = node.lastChild
            if last is not None and last.nodeType == TEXT:
                last.data = last.data.rstrip() + ws
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == ELEMENT:
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)
+
+
def normalize(doc):
    """Call normalize() on every element that is a direct child of doc."""
    elements = [kid for kid in doc.childNodes if kid.nodeType == ELEMENT]
    for element in elements:
        element.normalize()
+
+
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    Only elements whose single child is a text node are changed.  The
    search does not continue inside an element that has one of the given
    names.
    """
    wanted = {}
    for gi in element_names:
        wanted[gi] = 1
    pending = [kid for kid in doc.childNodes if kid.nodeType == ELEMENT]
    while pending:
        node = pending.pop(0)
        if not wanted.get(node.tagName):
            # Not a target: keep looking further down.
            for kid in node.childNodes:
                if kid.nodeType == ELEMENT:
                    pending.append(kid)
            continue
        kids = node.childNodes
        if len(kids) == 1 and kids[0].nodeType == TEXT:
            text = kids[0].data
            if text[-2:] == "()":
                kids[0].data = text[:-2]
+
+
def contents_match(left, right):
    """Return 1 if left and right have equivalent child content, else 0.

    Children are compared pairwise: elements must share a tag name and
    match recursively, and text nodes must hold equal data.  Attributes
    are not compared.  Any other node type counts as a mismatch.
    """
    left_kids = left.childNodes
    right_kids = right.childNodes
    if len(left_kids) != len(right_kids):
        return 0
    for l, r in zip(left_kids, right_kids):
        kind = l.nodeType
        if kind != r.nodeType:
            return 0
        if kind == ELEMENT:
            # should check attributes, but that's not a problem here
            if l.tagName != r.tagName or not contents_match(l, r):
                return 0
        elif kind == TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
+
+
def create_module_info(doc, section):
    """Gather a section's module metadata into a new <moduleinfo> element.

    Does nothing unless section has a <modulesynopsis> child.  Otherwise
    the synopsis, author, platform, module declaration, version and
    (possibly) title are pulled out of section and, for a <section>,
    reassembled inside a <moduleinfo> that is inserted back into it.
    doc is used to create the new nodes.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    set_tagName(node, "synopsis")
    # Drop a final "." from the synopsis text.
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        # <moduleauthor name="X"/> becomes <author>X</author>
        set_tagName(modauthor, "author")
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    # NOTE(review): nothing below runs unless the container really is a
    # <section>; the nodes extracted above are then simply dropped.
    if section.tagName == "section":
        # Index in section.childNodes at which <moduleinfo> is inserted.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                set_tagName(title, "short-synopsis")
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                # The title leaves the section, along with the section's
                # (new) first child and the leading <module> of the title.
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but it
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
            if node.nodeName == "moduleinfo":
                # NOTE(review): assumes <moduleinfo> is never the last
                # child; children[i+1] would raise IndexError otherwise.
                nextnode = children[i+1]
                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    # More than four leading whitespace characters are
                    # replaced by exactly three newlines.
                    if len(string.lstrip(data)) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + string.lstrip(data)
+
+
def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> found in fragment."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        create_module_info(doc, section)
+
+
def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> found in fragment."""
    tables = find_all_elements(fragment, "table")
    for table in tables:
        fixup_table(doc, table)
+
+
def fixup_table(doc, table):
    """Rebuild table as <tgroup> holding a <thead> and a <tbody>.

    The table's <entry> children become the single header row and its
    <row> children the body.  A row that is followed by an <hline> gets
    rowsep="1".  Raises ConversionError if anything other than <hline>
    elements and whitespace is left over.
    """
    # create the table head
    head = doc.createElement("thead")
    head_row = doc.createElement("row")
    move_elements_by_name(doc, table, head_row, "entry")
    head.appendChild(doc.createTextNode("\n "))
    head.appendChild(head_row)
    head.appendChild(doc.createTextNode("\n "))
    # create the table body
    body = doc.createElement("tbody")
    children = table.childNodes
    latest_row = None
    for child in children:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi == "hline" and latest_row is not None:
            latest_row.setAttribute("rowsep", "1")
        elif gi == "row":
            latest_row = child
    # save the rows:
    body.appendChild(doc.createTextNode("\n "))
    move_elements_by_name(doc, table, body, "row", sep="\n ")
    # and toss the rest:
    while children:
        child = children[0]
        kind = child.nodeType
        if kind == TEXT:
            if child.data.strip():
                raise ConversionError("unexpected free data in <%s>: %r"
                                      % (table.tagName, child.data))
        elif kind == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
        else:
            raise ConversionError(
                "unexpected %s node in table" % child.__class__.__name__)
        table.removeChild(child)
    # nothing left in the <table>; add the <thead> and <tbody>
    group = doc.createElement("tgroup")
    for part in (head, body):
        group.appendChild(doc.createTextNode("\n "))
        group.appendChild(part)
    group.appendChild(doc.createTextNode("\n "))
    table.appendChild(group)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
+
+
def fixup_row(doc, row):
    """Insert a "\\n " text node before every child of row but the first."""
    # Take a copy: the live child list changes as text nodes are inserted.
    # Built directly rather than through map(), whose calls would never
    # happen where map() is lazy.
    entries = list(row.childNodes[1:])
    for entry in entries:
        row.insertBefore(doc.createTextNode("\n "), entry)
+
+
def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move source's direct children called name to the end of dest.

    When sep is given (and not empty), a text node containing sep is
    appended to dest after each moved child.
    """
    # Select first, then move, so the child list is stable while scanning.
    selected = [kid for kid in source.childNodes if kid.nodeName == name]
    for kid in selected:
        source.removeChild(kid)
        dest.appendChild(kid)
        if not sep:
            continue
        dest.appendChild(doc.createTextNode(sep))
+
+
# Elements whose content is itself scanned for paragraph material; the
# paragraph fixer recurses into these instead of wrapping them.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter", "abstract", "enumerate",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph", "back-matter",
    "howto", "manual",
    "item", "itemize", "fulllineitems", "enumeration", "descriptionlist",
    "definitionlist", "definition",
    )

# Elements that sit at the same level as a paragraph: they end the
# paragraph being collected and are skipped when looking for the start of
# the next one.  ("memberdescni" was previously misspelled
# "membderdescni", which could never match an element.)
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )

# Elements that are skipped over, like the ones above, when looking for
# the start of a paragraph.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem", "author",
    "stindex", "obindex", "COMMENT", "label", "input", "title",
    "versionadded", "versionchanged", "declaremodule", "modulesynopsis",
    "moduleauthor", "indexterm", "leader",
    )
+
+
def fixup_paras(doc, fragment):
    """Create paragraph elements throughout fragment.

    The paragraph fixer is run on each top-level child that is a
    paragraph container, and afterwards on every <description> element.
    """
    for top in fragment.childNodes:
        if top.nodeName not in RECURSE_INTO_PARA_CONTAINERS:
            continue
        fixup_paras_helper(doc, top)
    for description in find_all_elements(fragment, "description"):
        fixup_paras_helper(doc, description)
+
+
def fixup_paras_helper(doc, container, depth=0):
    """Wrap the loose content of container in paragraph elements.

    Children that are paragraph containers are processed recursively;
    anything else found at a paragraph start is handed to build_para().
    """
    # document is already normalized
    kids = container.childNodes
    pos = skip_leading_nodes(kids)
    while pos < len(kids):
        current = kids[pos]
        if current.nodeName not in RECURSE_INTO_PARA_CONTAINERS:
            # Paragraph material:
            build_para(doc, container, pos, len(kids))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        else:
            # Something to recurse into:
            fixup_paras_helper(doc, current)
        pos = skip_leading_nodes(kids, pos + 1)
+
+
def build_para(doc, parent, start, i):
    """Wrap a run of parent's children in a new paragraph element.

    The run begins at index start.  It ends before the first element
    whose tag is in BREAK_ELEMENTS, or at the first "\\n\\n" inside a text
    node (which is split there), or at index i if neither is found.
    Trailing whitespace of the run is left outside the paragraph.

    Returns len(parent.childNodes) when the paragraph was appended at the
    end of parent, otherwise one more than the index of the node the
    paragraph was inserted before.  Raises ConversionError when no
    content could be collected.
    """
    children = parent.childNodes
    # Index one past the last child that goes into the paragraph.
    after = start + 1
    # True when the run extends to the end, with no whitespace split off.
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                # The break comes first, so this node stays outside.
                after = j
                break
            if pos >= 1:
                # Keep the text before the break; the rest is split off
                # into a new following node.
                child.splitText(pos)
                break
    else:
        # Ran through index i - 1 without finding a break.
        have_last = 1
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    para = doc.createElement(PARA_ELEMENT)
    prev = None
    # Move the run into the paragraph back to front, inserting each node
    # ahead of the one moved before it, which preserves the order.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    else:
        # The node that now sits where the run began.
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            # Make sure the text following the paragraph opens with "\n".
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1
+
+
def skip_leading_nodes(children, start=0):
    """Find where the next paragraph (or recursion) begins in children.

    Starting at index start, whitespace-only text and elements listed in
    PARA_LEVEL_ELEMENTS or PARA_LEVEL_PRECEEDERS are skipped.  The index
    returned is that of the first node at which paragraph building
    should begin or at which fixup_paras_helper() should recurse (for
    subsections, etc.).  A text node with leading whitespace is split so
    that the returned index points at the non-whitespace part.

    When the return value >= len(children), we've built all the paras we
    can from this list of children.
    """
    limit = len(children)
    while start < limit:
        # skip over leading comments and whitespace:
        node = children[start]
        kind = node.nodeType
        if kind == TEXT:
            text = node.data
            stripped = text.lstrip()
            if stripped:
                if stripped == text:
                    return start
                # break into two nodes: whitespace and non-whitespace
                node.splitText(len(text) - len(stripped))
                return start + 1
            # all whitespace, just skip
        elif kind == ELEMENT:
            gi = node.tagName
            if gi in RECURSE_INTO_PARA_CONTAINERS:
                return start
            if gi not in PARA_LEVEL_ELEMENTS \
               and gi not in PARA_LEVEL_PRECEEDERS:
                return start
        start = start + 1
    return start
+
+
def fixup_rfc_references(doc, fragment):
    """Append the text "RFC <num>" to each <rfc> element in fragment."""
    for rfc in find_all_elements(fragment, "rfc"):
        label = "RFC " + rfc.getAttribute("num")
        rfc.appendChild(doc.createTextNode(label))
+
+
def fixup_signatures(doc, fragment):
    """Flatten <optional> markup inside argument lists.

    For each element child of fragment, every <args> and then every
    <constructor-args> descendant is run through fixup_args() and then
    normalized.
    """
    for top in fragment.childNodes:
        if top.nodeType != ELEMENT:
            continue
        for gi in ("args", "constructor-args"):
            for arglist in top.getElementsByTagName(gi):
                fixup_args(doc, arglist)
                arglist.normalize()
+
+
def fixup_args(doc, arglist):
    """Replace <optional> children of arglist by bracketed content.

    Each <optional> child is replaced by "[", its former children and
    "]".  Nested <optional> elements surface as direct children and are
    treated the same way, until none is left.
    """
    while 1:
        optional = None
        for kid in arglist.childNodes:
            if kid.nodeName == "optional":
                optional = kid
                break
        if optional is None:
            return None
        # found it; unwrap it in place and look again
        arglist.insertBefore(doc.createTextNode("["), optional)
        while optional.childNodes:
            inner = optional.childNodes[0]
            optional.removeChild(inner)
            arglist.insertBefore(inner, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)
+
+
def fixup_sectionauthors(doc, fragment):
    """Turn <sectionauthor name="X"/> into <author>X</author>.

    The renamed element is re-inserted into its parent, preceded by a
    "\\n " text node: before the parent's third child when the second
    child is a <title>, otherwise before the first child.
    """
    for author in find_all_elements(fragment, "sectionauthor"):
        section = author.parentNode
        section.removeChild(author)
        set_tagName(author, "author")
        name_text = doc.createTextNode(author.getAttribute("name"))
        author.appendChild(name_text)
        author.removeAttribute("name")
        kids = section.childNodes
        # NOTE(review): assumes at least three children remain -- confirm.
        anchor = kids[2]
        if kids[1].nodeName != "title":
            anchor = kids[0]
        section.insertBefore(doc.createTextNode("\n "), anchor)
        section.insertBefore(author, anchor)
+
+
def fixup_verbatims(doc):
    """Rename <verbatim> elements that hold an interactive session.

    A <verbatim> whose first child is text starting (after leading
    whitespace) with ">>>" becomes <interactive-session>.
    """
    for verbatim in find_all_elements(doc, "verbatim"):
        # firstChild is None for an empty <verbatim>, which is left alone.
        child = verbatim.firstChild
        if child is not None and child.nodeType == TEXT \
           and child.data.lstrip()[:3] == ">>>":
            set_tagName(verbatim, "interactive-session")
+
+
def add_node_ids(fragment, counter=0):
    """Store an integer node_id on fragment and on every node below it.

    fragment itself gets counter; the numbers handed out below it are
    larger.  Returns a value greater than every id assigned by the call.
    """
    fragment.node_id = counter
    for kid in fragment.childNodes:
        counter = counter + 1
        if kid.nodeType != ELEMENT:
            kid.node_id = counter
        else:
            counter = add_node_ids(kid, counter)
    return counter + 1
+
+
# Names of the module index-entry elements handled below.
REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex',
                        'refexmodindex', 'refstmodindex')


def fixup_refmodindexes(fragment):
    """Fold <ref*modindex> entries into nearby <module> elements.

    Requires the node_id attributes set by add_node_ids().
    """
    # Locate <ref*modindex>...</> co-located with <module>...</>, and
    # remove the <ref*modindex>, replacing it with index=index on the
    # <module> element.
    nodes = find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS)
    # Key by node_id so that each parent is only processed once.
    d = {}
    for node in nodes:
        parent = node.parentNode
        d[parent.node_id] = parent
    del nodes
    # A plain loop: the calls are made for their side effects only, which
    # a lazy map() would never carry out.
    for parent in d.values():
        fixup_refmodindexes_chunk(parent)
+
+
def fixup_refmodindexes_chunk(container):
    """Match index entries in container against its <module> elements.

    Every <module> whose only child is text equal to an entry's module
    attribute gets index="yes"; an entry with at least one such match is
    removed from the tree.  Entries that have children are reported on
    stderr and skipped.
    """
    # node is probably a <para>; let's see how often it isn't:
    if container.tagName != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        if len(entry.childNodes) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.tagName)
            ewrite(entry.toxml() + "\n")
            continue
        found = 0
        module_name = entry.getAttribute("module")
        for node in module_entries:
            if len(node.childNodes) != 1:
                continue
            only = node.childNodes[0]
            if only.nodeType != TEXT:
                # No character data to compare with.
                continue
            if only.data == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for node in removes:
        # The entries were collected from all descendants, so an entry may
        # be nested deeper than container; detach it from its real parent.
        node.parentNode.removeChild(node)
+
+
def fixup_bifuncindexes(fragment):
    """Fold <bifuncindex> entries into sibling <function> elements.

    Requires the node_id attributes set by add_node_ids().
    """
    nodes = find_all_elements(fragment, 'bifuncindex')
    d = {}
    # make sure that each parent is only processed once:
    for node in nodes:
        parent = node.parentNode
        d[parent.node_id] = parent
    del nodes
    # A plain loop: the calls are made for their side effects only, which
    # a lazy map() would never carry out.
    for parent in d.values():
        fixup_bifuncindexes_chunk(parent)
+
+
def fixup_bifuncindexes_chunk(container):
    """Match <bifuncindex> children of container with <function> children.

    A <function> whose text is the entry's name followed by "()" gets
    index="yes" and module="__builtin__".  Entries with at least one
    match are removed from container.
    """
    index_entries = find_all_child_elements(container, "bifuncindex")
    functions = find_all_child_elements(container, "function")
    matched = []
    for entry in index_entries:
        wanted = entry.getAttribute("name")
        seen = 0
        for func in functions:
            text = func.childNodes[0].data
            if text[-2:] == "()" and text[:-2] == wanted:
                func.setAttribute("index", "yes")
                func.setAttribute("module", "__builtin__")
                if not seen:
                    seen = 1
                    matched.append(entry)
    for entry in matched:
        container.removeChild(entry)
+
+
def join_adjacent_elements(container, gi):
    """Merge runs of adjacent sibling elements named gi, at every depth.

    The children of each later element of a run are appended to the
    first one, and the emptied element is removed.  Each merge is
    reported on stderr.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        children = parent.childNodes
        i = 0
        # The length is re-read on every pass because merging shortens the
        # list, and the final child is visited too so that its own
        # children are searched.
        while i < len(children):
            child = children[i]
            if child.nodeName == gi:
                while i + 1 < len(children) \
                      and children[i + 1].nodeName == gi:
                    ewrite("--- merging two <%s/> elements\n" % gi)
                    nextchild = children[i + 1]
                    while nextchild.childNodes:
                        node = nextchild.childNodes[0]
                        nextchild.removeChild(node)
                        child.appendChild(node)
                    parent.removeChild(nextchild)
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1
+
+
# Attribute values matching this pattern are written with type TOKEN.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")


def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS event lines, recursively.

    knownempty is called with an element's tag name and says whether the
    element is declared empty.  Raises ValueError when such an element
    has children, and RuntimeError for a node that is not an element,
    text or entity reference.
    """
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        elif kind == ENTITY_REFERENCE:
            ofp.write("&%s\n" % node.nodeName)
        elif kind != ELEMENT:
            raise RuntimeError("unsupported node type: %s" % kind)
        else:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                ofp.write("e\n")
            for k, value in node.attributes.items():
                if not _token_rx.match(value):
                    dtype = "CDATA"
                else:
                    dtype = "TOKEN"
                encoded = esistools.encode(value)
                ofp.write("A%s %s %s\n" % (k, dtype, encoded))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
+
+
def convert(ifp, ofp):
    """Read a document from ifp, restructure it, and write ESIS to ofp.

    The input is parsed with esistools.parse(), expanded into a document
    fragment and passed through the fix-up functions of this module in
    the order below.  An IOError with errno EPIPE during output is
    ignored; any other IOError propagates.
    """
    events = esistools.parse(ifp)
    toktype, doc = events.getEvent()
    fragment = doc.createDocumentFragment()
    events.expandNode(fragment)

    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    # NOTE(review): this walks doc, whereas the content was expanded into
    # fragment above -- confirm that doc is the intended tree here.
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    # NOTE(review): same question as above -- doc rather than fragment.
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    # The two index fix-ups below read the node_id values assigned here.
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    #
    # Build the "declared empty" test from the parser's list of empty
    # elements, minus <author> and <rfc>, which were given text content
    # above.
    d = {}
    for gi in events.parser.get_empties():
        d[gi] = gi
    if d.has_key("author"):
        del d["author"]
    if d.has_key("rfc"):
        del d["rfc"]
    knownempty = d.has_key
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError, (err, msg):
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported.
        if err != errno.EPIPE:
            raise
+
+
def main():
    """Command-line entry point: docfixer.py [infile [outfile]].

    Without arguments the document is read from stdin and written to
    stdout.  With an output file name the result is collected in memory
    first and the file is written only after the conversion has
    finished.  Any other number of arguments prints a usage line to
    stderr and exits with status 2.
    """
    argc = len(sys.argv)
    if argc == 1:
        ifp = sys.stdin
        ofp = sys.stdout
    elif argc == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif argc == 3:
        ifp = open(sys.argv[1])
        import StringIO
        ofp = StringIO.StringIO()
    else:
        # There is no usage() function in this module; print the message
        # directly.
        sys.stderr.write("usage: %s [infile [outfile]]\n" % sys.argv[0])
        sys.exit(2)
    try:
        convert(ifp, ofp)
    finally:
        # Close the input file if we opened one.
        if ifp is not sys.stdin:
            ifp.close()
    if argc == 3:
        fp = open(sys.argv[2], "w")
        try:
            fp.write(ofp.getvalue())
        finally:
            fp.close()
        ofp.close()


if __name__ == "__main__":
    main()