diff options
Diffstat (limited to 'doc/tools/sgmlconv/latex2esis.py')
-rwxr-xr-x | doc/tools/sgmlconv/latex2esis.py | 555 |
1 files changed, 555 insertions, 0 deletions
#! /usr/bin/env python

"""Generate ESIS events based on a LaTeX source document and
configuration data.

The conversion is not strong enough to work with arbitrary LaTeX
documents; it has only been designed to work with the highly stylized
markup used in the standard Python documentation.  A lot of
information about specific markup is encoded in the control table
passed to the convert() function; changing this table can allow this
tool to support additional LaTeX markups.

The format of the table is largely undocumented; the table is read by
main() from the file 'conversion.xml' located in the script's directory
(sys.path[0]) and parsed by the TableParser class.  There is no
command-line provision to load an alternate table from another file.
"""

import errno
import getopt
import os
import re
import string
import sys
import UserList
import xml.sax.saxutils

from types import ListType, StringType, TupleType

try:
    from xml.parsers.xmllib import XMLParser
except ImportError:
    from xmllib import XMLParser


from esistools import encode


# Debug level; incremented once for every -D/--debug option in main().
DEBUG = 0


class LaTeXFormatError(Exception):
    """Raised when the input cannot be handled by the converter."""
    pass


class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open stack.

    The offending environment name is kept in 'found' and a copy of the
    stack at the time of the error in 'stack'.
    """

    def __init__(self, found, stack):
        msg = "environment close for %s doesn't match;\n stack = %s" \
              % (found, stack)
        self.found = found
        self.stack = stack[:]
        LaTeXFormatError.__init__(self, msg)


_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
_text_rx = re.compile(r"[^]~%\\{}]+")
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters which are emitted literally when preceded by a backslash.
ESCAPED_CHARS = "$%#^ {}&~"


def dbgmsg(msg):
    """Write 'msg' and a newline to stderr when DEBUG is enabled."""
    if DEBUG:
        sys.stderr.write(msg + "\n")

def pushing(name, point, depth):
    """Debugging aid: report that element 'name' is opened at 'point'."""
    dbgmsg("pushing <%s> at %s" % (name, point))

def popping(name, point, depth):
    """Debugging aid: report that element 'name' is closed at 'point'."""
    dbgmsg("popping </%s> at %s" % (name, point))


class _Stack(UserList.UserList):
    """List used as the element stack when DEBUG is enabled.

    Unlike a plain list, it refuses to store anything but strings.
    """

    def append(self, entry):
        """Push 'entry'; raise LaTeXFormatError if it is not a string."""
        if not isinstance(entry, StringType):
            raise LaTeXFormatError("cannot push non-string on stack: "
                                   + repr(entry))
        #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
        self.data.append(entry)

    def pop(self, index=-1):
        """Remove and return the entry at 'index' (default: the top)."""
        entry = self.data[index]
        del self.data[index]
        #dbgmsg("%s</%s>" % (" "*len(self.data), entry))
        # Callers format the popped name into the output, so it has to be
        # returned just like list.pop() does.
        return entry

    def __delitem__(self, index):
        """Remove the entry at 'index' without returning it."""
        entry = self.data[index]
        del self.data[index]
        #dbgmsg("%s</%s>" % (" "*len(self.data), entry))


def new_stack():
    """Return an empty element stack: a _Stack if DEBUG, else a list."""
    if DEBUG:
        return _Stack()
    return []


class Conversion:
    """Convert one LaTeX document to a stream of ESIS-style event lines.

    The output lines written by this class start with a one-character
    code: "(" and ")" open and close an element, "-" carries character
    data, "A" carries an attribute, "&" an entity reference and "e"
    precedes an element whose table entry is flagged as empty.
    """

    def __init__(self, ifp, ofp, table):
        """Read all of 'ifp' and prepare to write events to 'ofp'.

        'table' maps macro/environment names to TableEntry objects; it
        is extended in place with default entries for unknown names.
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is stripped from every input line before
        # the lines are re-joined into one string.
        self.line = string.join(map(string.rstrip, ifp.readlines()), "\n")
        self.preamble = 1

    def convert(self):
        """Convert the complete input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert self.line until 'endchar' or the end of the input.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        When 'endchar' is found, self.line is set to the remaining input
        (starting with 'endchar') and that string is returned.  When the
        input runs out, an empty string is returned.

        Raises LaTeXFormatError (or the LaTeXStackError subclass) when
        the markup cannot be processed.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its side effects: it registers a default
                # entry for unknown environments and rejects names that
                # the table defines as macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    self.combining_char("c", line[endpos])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end{}
                    # is passed through as character data.
                    endmarker = "\\end{%s}" % macroname
                    pos = string.find(line, endmarker)
                    if pos < 0:
                        # Slicing with -1 would silently drop or repeat
                        # input, so report the missing terminator.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % endmarker)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endmarker):]
                    continue
                # Implicitly close open elements listed in 'closes'.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname:
                    if entry.empty:
                        self.write("e\n")
                #
                params, optional, empty, environ = self.start_macro(macroname)
                # rip off the macroname
                if params:
                    line = line[m.end(1):]
                elif empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %s"
                                    % (pentry.name, macroname,
                                       repr(line[:100])))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Drop the character that follows any leading
                            # whitespace (the opening brace) and convert
                            # the child's content recursively.
                            self.line = skip_white(line)[1:]
                            line = self.subconvert(
                                "}", len(stack) + depth + 1)[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            if line[0] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        #dbgmsg("--- text: %s" % repr(pentry.text))
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    # Nothing is open; report it as a format problem
                    # rather than failing with an IndexError.
                    raise LaTeXFormatError(
                        "unmatched closing brace: " + repr(line[:100]))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # A bare group is tracked with an empty name.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        # Out of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + string.join(stack, ", "))
        # otherwise we just ran out of input here...
        # Return a string (not None) so that callers slicing the result
        # keep working when the input ends inside a group.
        return ""

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character reference for accent 'prefix' on 'char'.

        Raises LaTeXFormatError if the pair is not in _combinations.
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty, environment) for 'name'.

        'optional' reflects the 'optional' flag of the first parameter
        when there are parameters, and is the empty parameter list
        otherwise.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty, conversion.environment

    def get_entry(self, name):
        """Return the table entry for 'name', creating a default one.

        The default entry is a macro with a single "content" parameter;
        it is stored in the table so later lookups return the same entry.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%s) failing; building default entry!"
                   % repr(name))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for environment 'name'.

        An unknown name gets a default environment entry with implied
        content.  Raises LaTeXFormatError if 'name' is in the table but
        is not flagged as an environment.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute line for 'pentry' unless name/value is empty.

        The declared type is TOKEN when 'value' matches _token_rx and
        CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))


def convert(ifp, ofp, table):
    """Convert the LaTeX read from 'ifp' to events written to 'ofp'.

    An IOError whose errno is EPIPE is swallowed; any other IOError is
    re-raised.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Use the errno attribute instead of unpacking the exception, so
        # an IOError without exactly two arguments is still re-raised
        # intact rather than failing during unpacking.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise


def skip_white(line):
    """Return 'line' without leading whitespace and '%' characters."""
    while line and line[0] in " %\n\t\r":
        line = string.lstrip(line[1:])
    return line



class TableEntry:
    """Conversion information for one macro or environment."""

    def __init__(self, name, environment=0):
        self.name = name
        # Element name written to the output; defaults to the LaTeX name.
        self.outputname = name
        self.environment = environment
        # Macros start out flagged as empty; environments do not.
        self.empty = not environment
        self.has_content = 0
        self.verbatim = 0
        self.auto_close = 0
        # Parameter objects, in the order they are processed.
        self.parameters = []
        # Names of open elements closed implicitly when this one starts.
        self.closes = []
        # Names of open elements closed implicitly when this one ends.
        self.endcloses = []

class Parameter:
    """One step in the processing of a macro or environment.

    'type' is one of the strings tested by Conversion.subconvert():
    "attribute", "child", "content", "text" or "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type = type
        self.name = name
        self.optional = optional
        self.text = ''
        self.implied = 0


class TableParser(XMLParser):
    """Build the conversion table from its XML description.

    The start_*/end_* methods handle the elements named after them;
    each <environment> or <macro> element produces one TableEntry.
    """

    def __init__(self, table=None):
        """Set up the parser; entries are added to 'table' if given."""
        if table is None:
            table = {}
        self.__table = table
        self.__current = None
        self.__buffer = ''
        XMLParser.__init__(self)

    def get_table(self):
        """Return the table, giving content-less environments implied content."""
        for entry in self.__table.values():
            if entry.environment and not entry.has_content:
                p = Parameter("content")
                p.implied = 1
                entry.parameters.append(p)
                entry.has_content = 1
        return self.__table

    def start_environment(self, attrs):
        """Begin a new environment entry described by 'attrs'."""
        name = attrs["name"]
        self.__current = TableEntry(name, environment=1)
        self.__current.verbatim = attrs.get("verbatim") == "yes"
        if attrs.has_key("outputname"):
            self.__current.outputname = attrs.get("outputname")
        self.__current.endcloses = string.split(attrs.get("endcloses", ""))
    def end_environment(self):
        """Store the finished environment entry in the table."""
        self.end_macro()

    def start_macro(self, attrs):
        """Begin a new macro entry described by 'attrs'."""
        name = attrs["name"]
        self.__current = TableEntry(name)
        self.__current.closes = string.split(attrs.get("closes", ""))
        if attrs.has_key("outputname"):
            self.__current.outputname = attrs.get("outputname")
    def end_macro(self):
        """Store the finished entry in the table under its name."""
        self.__table[self.__current.name] = self.__current
        self.__current = None

    def start_attribute(self, attrs):
        """Add an "attribute" parameter to the current entry."""
        name = attrs.get("name")
        optional = attrs.get("optional") == "yes"
        if name:
            p = Parameter("attribute", name, optional=optional)
        else:
            p = Parameter("attribute", optional=optional)
        self.__current.parameters.append(p)
        self.__buffer = ''
    def end_attribute(self):
        """Use the element's character data as the attribute's fixed text."""
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        """Add an "entityref" parameter to the current entry."""
        name = attrs["name"]
        p = Parameter("entityref", name)
        self.__current.parameters.append(p)

    def start_child(self, attrs):
        """Add a "child" parameter; the entry is no longer empty."""
        name = attrs["name"]
        p = Parameter("child", name, attrs.get("optional") == "yes")
        self.__current.parameters.append(p)
        self.__current.empty = 0

    def start_content(self, attrs):
        """Add a "content" parameter; always implied for environments."""
        p = Parameter("content")
        p.implied = attrs.get("implied") == "yes"
        if self.__current.environment:
            p.implied = 1
        self.__current.parameters.append(p)
        self.__current.has_content = 1
        self.__current.empty = 0

    def start_text(self, attrs):
        """Start collecting literal text for the current entry."""
        self.__current.empty = 0
        self.__buffer = ''
    def end_text(self):
        """Add a "text" parameter holding the collected character data."""
        p = Parameter("text")
        p.text = self.__buffer
        self.__current.parameters.append(p)

    def handle_data(self, data):
        """Accumulate character data for end_attribute()/end_text()."""
        self.__buffer = self.__buffer + data


def load_table(fp, table=None):
    """Parse the XML table read from 'fp' and return the resulting table.

    If 'table' is given, the parsed entries are added to it.
    """
    parser = TableParser(table=table)
    parser.feed(fp.read())
    parser.close()
    return parser.get_table()


def usage():
    """Write a one-line command summary to standard error."""
    program = os.path.basename(sys.argv[0])
    sys.stderr.write("usage: %s [-D | --debug] [infile [outfile]]\n"
                     % program)


def main():
    """Command-line entry point.

    With no arguments, converts stdin to stdout; with one, converts the
    named file to stdout; with two, converts the first file into the
    second.  Exits with status 2 on a usage error.
    """
    global DEBUG
    #
    try:
        opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    except getopt.error:
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG = DEBUG + 1
    if len(args) > 2:
        usage()
        sys.exit(2)

    # Load the table before touching the output file so that a broken
    # table does not leave a truncated output file behind.
    tablefp = open(os.path.join(sys.path[0], 'conversion.xml'))
    try:
        table = load_table(tablefp)
    finally:
        tablefp.close()

    ifp = sys.stdin
    ofp = sys.stdout
    try:
        if args:
            ifp = open(args[0])
        if len(args) == 2:
            ofp = open(args[1], "w")
        convert(ifp, ofp, table)
    finally:
        # Only close the files opened here, never the standard streams.
        if ifp is not sys.stdin:
            ifp.close()
        if ofp is not sys.stdout:
            ofp.close()


if __name__ == "__main__":
    main()