4 files changed, 328 insertions, 0 deletions
diff --git a/tools/dev/README.txt b/tools/dev/README.txt
new file mode 100644
index 000000000..ca9e99ee8
--- /dev/null
+++ b/tools/dev/README.txt
@@ -0,0 +1 @@
+Tools for developers.
diff --git a/tools/dev/create_unimap.py b/tools/dev/create_unimap.py
new file mode 100755
index 000000000..1d1a2f8a0
--- /dev/null
+++ b/tools/dev/create_unimap.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Author: Felix Wiemann
+# Contact: Felix_Wiemann@ososo.de
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This file has been placed in the public domain.
+
+# Call: create_unimap.py < unicode.xml > unicode_latex.py
+#
+# Get unicode.xml from
+# <http://www.w3.org/2003/entities/xml/unicode.xml>.
+
+from xml.dom import minidom
+import sys
+import pprint
+
+def w(s):  # Helper: write `s` to stdout (not called elsewhere in this script).
+    if isinstance(s, unicode):
+        s = s.encode('utf8')  # Unicode input goes out as UTF-8 bytes
+    sys.stdout.write(s)
+
+text_map = {}
+math_map = {}
+
+class Visitor:
+    
+    """Node visitor for contents of unicode.xml."""
+
+    def visit_character(self, node):  # fills module-level text_map / math_map
+        for n in node.childNodes:
+            if n.nodeName == 'latex':
+                code = node.attributes['dec'].value
+                if '-' in code:
+                    # Hyphenated "dec" values are skipped; presumably they
+                    # denote multi-code-point sequences -- TODO confirm.
+                    continue
+                if int(code) < 128:
+                    # Wrong (maps "-" to "$-$", which is too wide) and
+                    # unnecessary (maps "a" to "{a}").
+                    continue
+                latex_code = n.childNodes[0].nodeValue.encode('ascii').strip()
+                if node.attributes['mode'].value == 'math':
+                    math_map[unichr(int(code))] = '$%s$' % latex_code
+                else:
+                    text_map[unichr(int(code))] = '{%s}' % latex_code
+
+def call_visitor(node, visitor=Visitor()):
+    if isinstance(node, minidom.Text):
+        name = 'Text'
+    else:
+        name = node.nodeName.replace('#', '_')
+    if hasattr(visitor, 'visit_' + name):
+        getattr(visitor, 'visit_' + name)(node)
+    for child in node.childNodes:
+        call_visitor(child)
+    if hasattr(visitor, 'depart_' + name):
+        getattr(visitor, 'depart_' + name)(node)
+
+document = minidom.parse(sys.stdin)
+call_visitor(document)
+
+unicode_map = math_map  # alias, not a copy: math_map itself is updated below
+unicode_map.update(text_map)
+# Now unicode_map contains the text entries plus dollar-enclosed math
+# entries for those chars for which no text entry exists.
+
+print '# Author: Felix Wiemann'
+print '# Contact: Felix_Wiemann@ososo.de'
+print '# Revision: $%s$' % 'Revision'  # NOTE(review): '%' likely dodges keyword expansion
+print '# Date: $%s$' % 'Date'
+print '# Copyright: This file has been placed in the public domain.'
+print
+print '# This is a mapping of Unicode characters to LaTeX equivalents.'
+print '# The information has been extracted from'
+print '# <http://www.w3.org/2003/entities/xml/unicode.xml>, written by'
+print '# David Carlisle and Sebastian Rahtz.'
+print '#'
+print '# The extraction has been done by the "create_unimap.py" script'
+print '# located at <http://docutils.sf.net/tools/dev/create_unimap.py>.'
+print
+print 'unicode_map = %s' % pprint.pformat(unicode_map, indent=0)
diff --git a/tools/dev/profile_docutils.py b/tools/dev/profile_docutils.py
new file mode 100755
index 000000000..1f79c655e
--- /dev/null
+++ b/tools/dev/profile_docutils.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python -i
+
+# Author: Felix Wiemann
+# Contact: Felix_Wiemann@ososo.de
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This script has been placed in the public domain.
+
+import os.path
+import docutils.core
+import hotshot.stats
+
+print 'Profiler started.'
+
+os.chdir(os.path.join(os.path.dirname(docutils.__file__), '..'))  # dir above the package
+
+print 'Profiling...'
+
+prof = hotshot.Profile('docutils.prof')
+prof.runcall(docutils.core.publish_file, source_path='HISTORY.txt',
+             destination_path='prof.HISTORY.html', writer_name='html')
+prof.close()
+
+print 'Loading statistics...'
+
+print """
+stats = hotshot.stats.load('docutils.prof')
+stats.strip_dirs()
+stats.sort_stats('time')  # 'cumulative'; 'calls'
+stats.print_stats(40)
+"""
+
+stats = hotshot.stats.load('docutils.prof')  # same commands as echoed above
+stats.strip_dirs()
+stats.sort_stats('time')
+stats.print_stats(40)
+
+try:
+    execfile(os.environ['PYTHONSTARTUP'])  # KeyError if the variable is unset
+except:  # best effort: every failure is ignored (NOTE(review): bare except)
+    pass
diff --git a/tools/dev/unicode2rstsubs.py b/tools/dev/unicode2rstsubs.py
new file mode 100755
index 000000000..abc85e48b
--- /dev/null
+++ b/tools/dev/unicode2rstsubs.py
@@ -0,0 +1,204 @@
+#! /usr/bin/env python
+
+# Author: David Goodger
+# Contact: goodger@python.org
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This program has been placed in the public domain.
+
+"""
+unicode2rstsubs.py -- produce character entity files (reStructuredText
+substitutions) from the W3C master unicode.xml file.
+
+This program extracts character entity and entity set information from a
+unicode.xml file and produces multiple reStructuredText files (in the current
+directory) containing substitutions.  Entity sets are from ISO 8879 & ISO
+9573-13 (combined), MathML, and HTML4.  One or two files are produced for each
+entity set; a second file with a "-wide.txt" suffix is produced if there are
+wide-Unicode characters in the set.
+
+The input file, unicode.xml, is maintained as part of the MathML 2
+Recommendation XML source, and is available from
+<http://www.w3.org/2003/entities/xml/>.
+"""
+
+import sys
+import os
+import optparse
+import re
+from xml.parsers.expat import ParserCreate
+
+
+usage_msg = """Usage: %s [unicode.xml]"""
+
+def usage(prog, status=0, msg=None):  # usage (+ msg) to stderr, then exit
+    print >>sys.stderr, usage_msg % prog
+    if msg:
+        print >>sys.stderr, msg
+    sys.exit(status)  # never returns to the caller
+
+def main(argv=None):  # argv defaults to sys.argv; argv[1] is the input path
+    if argv is None:
+        argv = sys.argv
+    if len(argv) == 2:
+        inpath = argv[1]
+    elif len(argv) > 2:  # usage() exits, so inpath is never left unset
+        usage(argv[0], 2,
+              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
+    else:
+        inpath = 'unicode.xml'  # default: file in the current directory
+    if not os.path.isfile(inpath):
+        usage(argv[0], 1, 'No such file: "%s".' % inpath)
+    infile = open(inpath)  # NOTE(review): never closed explicitly
+    process(infile)
+
+def process(infile):  # parse `infile`, then write the substitution files
+    grouper = CharacterEntitySetExtractor(infile)
+    grouper.group()  # runs the XML parser over infile
+    grouper.write_sets()  # writes into the current directory
+
+
+class CharacterEntitySetExtractor:
+
+    """
+    Extracts character entity information from unicode.xml file, groups it by
+    entity set, and writes out reStructuredText substitution files.
+    """
+
+    unwanted_entity_sets = ['stix',     # unknown, buggy set
+                            'predefined']
+
+    header = """\
+.. This data file has been placed in the public domain.
+.. Derived from the Unicode character mappings available from
+   <http://www.w3.org/2003/entities/xml/>.
+   Processed by unicode2rstsubs.py, part of Docutils:
+   <http://docutils.sourceforge.net>.
+"""
+
+    def __init__(self, infile):
+        self.infile = infile
+        """Input unicode.xml file."""
+
+        self.parser = self.setup_parser()
+        """XML parser."""
+
+        self.elements = []
+        """Stack of element names.  Last is current element."""
+
+        self.sets = {}
+        """Mapping of charent set name to set dict."""
+
+        self.charid = None
+        """Current character's "id" attribute value."""
+
+        self.descriptions = {}
+        """Mapping of character ID to description."""
+
+    def setup_parser(self):
+        parser = ParserCreate()
+        parser.StartElementHandler = self.StartElementHandler
+        parser.EndElementHandler = self.EndElementHandler
+        parser.CharacterDataHandler = self.CharacterDataHandler
+        return parser
+
+    def group(self):
+        self.parser.ParseFile(self.infile)
+
+    def StartElementHandler(self, name, attributes):
+        self.elements.append(name)
+        handler = name + '_start'
+        if hasattr(self, handler):
+            getattr(self, handler)(name, attributes)
+
+    def EndElementHandler(self, name):
+        assert self.elements[-1] == name, \
+               'unknown end-tag %r (%r)' % (name, self.element)
+        self.elements.pop()
+        handler = name + '_end'
+        if hasattr(self, handler):
+            getattr(self, handler)(name)
+
+    def CharacterDataHandler(self, data):
+        handler = self.elements[-1] + '_data'
+        if hasattr(self, handler):
+            getattr(self, handler)(data)
+
+    def character_start(self, name, attributes):
+        self.charid = attributes['id']
+
+    def entity_start(self, name, attributes):
+        set = self.entity_set_name(attributes['set'])
+        if not set:
+            return
+        if not self.sets.has_key(set):
+            print 'bad set: %r' % set
+            return
+        entity = attributes['id']
+        assert (not self.sets[set].has_key(entity)
+                or self.sets[set][entity] == self.charid), \
+                ('sets[%r][%r] == %r (!= %r)'
+                 % (set, entity, self.sets[set][entity], self.charid))
+        self.sets[set][entity] = self.charid
+
+    def description_data(self, data):
+        self.descriptions.setdefault(self.charid, '')
+        self.descriptions[self.charid] += data
+
+    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
+    """Pattern to strip ISO numbers off the beginning of set names."""
+
+    def entity_set_name(self, name):
+        """
+        Return lowcased and standard-number-free entity set name.
+        Return ``None`` for unwanted entity sets.
+        """
+        match = self.entity_set_name_pat.match(name)
+        name = match.group(1).lower()
+        if name in self.unwanted_entity_sets:
+            return None
+        self.sets.setdefault(name, {})
+        return name
+
+    def write_sets(self):
+        sets = self.sets.keys()
+        sets.sort()
+        for set_name in sets:
+            self.write_set(set_name)
+
+    def write_set(self, set_name, wide=None):
+        if wide:
+            outname = set_name + '-wide.txt'
+        else:
+            outname = set_name + '.txt'
+        outfile = open(outname, 'w')
+        print 'writing file "%s"' % outname
+        print >>outfile, self.header
+        set = self.sets[set_name]
+        entities = [(e.lower(), e) for e in set.keys()]
+        entities.sort()
+        longest = 0
+        for _, entity_name in entities:
+            longest = max(longest, len(entity_name))
+        has_wide = None
+        for _, entity_name in entities:
+            has_wide = self.write_entity(
+                set, set_name, entity_name, outfile, longest, wide) or has_wide
+        if has_wide and not wide:
+            self.write_set(set_name, 1)
+
+    def write_entity(self, set, set_name, entity_name, outfile, longest,
+                     wide=None):
+        charid = set[entity_name]
+        if not wide:
+            for code in charid[1:].split('-'):
+                if int(code, 16) > 0xFFFF:
+                    return 1            # wide-Unicode character
+        codes = ' '.join(['U+%s' % code for code in charid[1:].split('-')])
+        print >>outfile, ('.. %-*s unicode:: %s .. %s'
+                          % (longest + 2, '|' + entity_name + '|',
+                             codes, self.descriptions[charid]))
+
+
+if __name__ == '__main__':
+    sys.exit(main())