diff options
Diffstat (limited to 'tools/dev')
-rw-r--r-- | tools/dev/README.txt | 1 | ||||
-rwxr-xr-x | tools/dev/create_unimap.py | 82 | ||||
-rwxr-xr-x | tools/dev/profile_docutils.py | 41 | ||||
-rwxr-xr-x | tools/dev/unicode2rstsubs.py | 204 |
4 files changed, 328 insertions, 0 deletions
diff --git a/tools/dev/README.txt b/tools/dev/README.txt
new file mode 100644
index 000000000..ca9e99ee8
--- /dev/null
+++ b/tools/dev/README.txt
@@ -0,0 +1 @@
+Tools for developers.
diff --git a/tools/dev/create_unimap.py b/tools/dev/create_unimap.py
new file mode 100755
index 000000000..1d1a2f8a0
--- /dev/null
+++ b/tools/dev/create_unimap.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Author: Felix Wiemann
+# Contact: Felix_Wiemann@ososo.de
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This file has been placed in the public domain.
+
+# Call: create_unimap.py < unicode.xml > unicode_latex.py
+#
+# Get unicode.xml from
+# <http://www.w3.org/2003/entities/xml/unicode.xml>.
+
+from xml.dom import minidom
+import sys
+import pprint
+
+def w(s):  # write `s` to stdout, UTF-8 encoding unicode (helper, unused here)
+    if isinstance(s, unicode):
+        s = s.encode('utf8')
+    sys.stdout.write(s)
+
+text_map = {}  # character -> '{...}' for <latex> entries not in math mode
+math_map = {}  # character -> '$...$' for <latex> entries in math mode
+
+class Visitor:
+
+    """Node visitor for contents of unicode.xml."""
+
+    def visit_character(self, node):
+        for n in node.childNodes:
+            if n.nodeName == 'latex':
+                code = node.attributes['dec'].value
+                if '-' in code:
+                    # "dec" with "-": presumably a code sequence; skip.
+                    continue
+                if int(code) < 128:
+                    # Wrong (maps "-" to "$-$", which is too wide) and
+                    # unnecessary (maps "a" to "{a}").
+                    continue
+                latex_code = n.childNodes[0].nodeValue.encode('ascii').strip()
+                if node.attributes['mode'].value == 'math':
+                    math_map[unichr(int(code))] = '$%s$' % latex_code
+                else:
+                    text_map[unichr(int(code))] = '{%s}' % latex_code
+
+def call_visitor(node, visitor=Visitor()):
+    """Walk `node` depth-first, calling visitor.visit_NAME/depart_NAME."""
+    if isinstance(node, minidom.Text):
+        name = 'Text'
+    else:
+        name = node.nodeName.replace('#', '_')
+    if hasattr(visitor, 'visit_' + name):
+        getattr(visitor, 'visit_' + name)(node)
+    for child in node.childNodes:
+        call_visitor(child, visitor)  # keep using the caller's visitor
+    if hasattr(visitor, 'depart_' + name):
+        getattr(visitor, 'depart_' + name)(node)
+
+document = minidom.parse(sys.stdin)
+call_visitor(document)
+
+unicode_map = math_map
+unicode_map.update(text_map)
+# Now unicode_map contains the text entries plus dollar-enclosed math
+# entries for those chars for which no text entry exists.
+
+print '# Author: Felix Wiemann'
+print '# Contact: Felix_Wiemann@ososo.de'
+print '# Revision: $%s$' % 'Revision'
+print '# Date: $%s$' % 'Date'
+print '# Copyright: This file has been placed in the public domain.'
+print
+print '# This is a mapping of Unicode characters to LaTeX equivalents.'
+print '# The information has been extracted from'
+print '# <http://www.w3.org/2003/entities/xml/unicode.xml>, written by'
+print '# David Carlisle and Sebastian Rahtz.'
+print '#'
+print '# The extraction has been done by the "create_unimap.py" script'
+print '# located at <http://docutils.sf.net/tools/dev/create_unimap.py>.'
+print
+print 'unicode_map = %s' % pprint.pformat(unicode_map, indent=0)
diff --git a/tools/dev/profile_docutils.py b/tools/dev/profile_docutils.py
new file mode 100755
index 000000000..1f79c655e
--- /dev/null
+++ b/tools/dev/profile_docutils.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python -i
+
+# Author: Felix Wiemann
+# Contact: Felix_Wiemann@ososo.de
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This script has been placed in the public domain.
+
+import os.path
+import docutils.core
+import hotshot.stats
+
+print 'Profiler started.'
+
+os.chdir(os.path.join(os.path.dirname(docutils.__file__), '..'))  # parent of the docutils package dir
+
+print 'Profiling...'
+
+prof = hotshot.Profile('docutils.prof')
+prof.runcall(docutils.core.publish_file, source_path='HISTORY.txt',
+             destination_path='prof.HISTORY.html', writer_name='html')
+prof.close()
+
+print 'Loading statistics...'
+
+print """
+stats = hotshot.stats.load('docutils.prof')
+stats.strip_dirs()
+stats.sort_stats('time') # 'cumulative'; 'calls'
+stats.print_stats(40)
+"""
+
+stats = hotshot.stats.load('docutils.prof')
+stats.strip_dirs()
+stats.sort_stats('time')
+stats.print_stats(40)
+
+try:
+    execfile(os.environ['PYTHONSTARTUP'])
+except Exception:  # best effort: the startup file is optional
+    pass
diff --git a/tools/dev/unicode2rstsubs.py b/tools/dev/unicode2rstsubs.py
new file mode 100755
index 000000000..abc85e48b
--- /dev/null
+++ b/tools/dev/unicode2rstsubs.py
@@ -0,0 +1,204 @@
+#! /usr/bin/env python
+
+# Author: David Goodger
+# Contact: goodger@python.org
+# Revision: $Revision$
+# Date: $Date$
+# Copyright: This program has been placed in the public domain.
+
+"""
+unicode2rstsubs.py -- produce character entity files (reStructuredText
+substitutions) from the W3C master unicode.xml file.
+
+This program extracts character entity and entity set information from a
+unicode.xml file and produces multiple reStructuredText files (in the current
+directory) containing substitutions. Entity sets are from ISO 8879 & ISO
+9573-13 (combined), MathML, and HTML4. One or two files are produced for each
+entity set; a second file with a "-wide.txt" suffix is produced if there are
+wide-Unicode characters in the set.
+
+The input file, unicode.xml, is maintained as part of the MathML 2
+Recommendation XML source, and is available from
+<http://www.w3.org/2003/entities/xml/>.
+"""
+
+import sys
+import os
+import optparse
+import re
+from xml.parsers.expat import ParserCreate
+
+
+usage_msg = """Usage: %s [unicode.xml]"""
+
+def usage(prog, status=0, msg=None):
+    print >>sys.stderr, usage_msg % prog
+    if msg:
+        print >>sys.stderr, msg
+    sys.exit(status)
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+    if len(argv) == 2:
+        inpath = argv[1]
+    elif len(argv) > 2:
+        usage(argv[0], 2,
+              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
+    else:
+        inpath = 'unicode.xml'
+    if not os.path.isfile(inpath):
+        usage(argv[0], 1, 'No such file: "%s".' % inpath)
+    infile = open(inpath)
+    process(infile)
+
+def process(infile):
+    grouper = CharacterEntitySetExtractor(infile)
+    grouper.group()
+    grouper.write_sets()
+
+
+class CharacterEntitySetExtractor:
+
+    """
+    Extracts character entity information from unicode.xml file, groups it by
+    entity set, and writes out reStructuredText substitution files.
+    """
+
+    unwanted_entity_sets = ['stix',         # unknown, buggy set
+                            'predefined']
+
+    header = """\
+.. This data file has been placed in the public domain.
+.. Derived from the Unicode character mappings available from
+   <http://www.w3.org/2003/entities/xml/>.
+   Processed by unicode2rstsubs.py, part of Docutils:
+   <http://docutils.sourceforge.net>.
+"""
+
+    def __init__(self, infile):
+        self.infile = infile
+        """Input unicode.xml file."""
+
+        self.parser = self.setup_parser()
+        """XML parser."""
+
+        self.elements = []
+        """Stack of element names. Last is current element."""
+
+        self.sets = {}
+        """Mapping of charent set name to set dict."""
+
+        self.charid = None
+        """Current character's "id" attribute value."""
+
+        self.descriptions = {}
+        """Mapping of character ID to description."""
+
+    def setup_parser(self):
+        parser = ParserCreate()
+        parser.StartElementHandler = self.StartElementHandler
+        parser.EndElementHandler = self.EndElementHandler
+        parser.CharacterDataHandler = self.CharacterDataHandler
+        return parser
+
+    def group(self):
+        self.parser.ParseFile(self.infile)
+
+    def StartElementHandler(self, name, attributes):
+        self.elements.append(name)
+        handler = name + '_start'       # e.g. "entity_start"
+        if hasattr(self, handler):
+            getattr(self, handler)(name, attributes)
+
+    def EndElementHandler(self, name):
+        assert self.elements[-1] == name, \
+               'unknown end-tag %r (%r)' % (name, self.elements)
+        self.elements.pop()
+        handler = name + '_end'
+        if hasattr(self, handler):
+            getattr(self, handler)(name)
+
+    def CharacterDataHandler(self, data):
+        handler = self.elements[-1] + '_data'   # e.g. "description_data"
+        if hasattr(self, handler):
+            getattr(self, handler)(data)
+
+    def character_start(self, name, attributes):
+        self.charid = attributes['id']
+
+    def entity_start(self, name, attributes):
+        set = self.entity_set_name(attributes['set'])
+        if not set:
+            return
+        if set not in self.sets:  # defensive; entity_set_name() registers it
+            print 'bad set: %r' % set
+            return
+        entity = attributes['id']
+        assert (entity not in self.sets[set]
+                or self.sets[set][entity] == self.charid), \
+               ('sets[%r][%r] == %r (!= %r)'
+                % (set, entity, self.sets[set][entity], self.charid))
+        self.sets[set][entity] = self.charid
+
+    def description_data(self, data):
+        self.descriptions.setdefault(self.charid, '')
+        self.descriptions[self.charid] += data
+
+    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
+    """Pattern to strip ISO numbers off the beginning of set names."""
+
+    def entity_set_name(self, name):
+        """
+        Return lowcased and standard-number-free entity set name.
+        Return ``None`` for unwanted entity sets.
+        """
+        match = self.entity_set_name_pat.match(name)
+        name = match.group(1).lower()
+        if name in self.unwanted_entity_sets:
+            return None
+        self.sets.setdefault(name, {})
+        return name
+
+    def write_sets(self):
+        sets = self.sets.keys()
+        sets.sort()
+        for set_name in sets:
+            self.write_set(set_name)
+
+    def write_set(self, set_name, wide=None):
+        """Write the file for `set_name`; recurse once if wide chars exist."""
+        if wide:
+            outname = set_name + '-wide.txt'
+        else:
+            outname = set_name + '.txt'
+        outfile = open(outname, 'w')
+        print 'writing file "%s"' % outname
+        print >>outfile, self.header
+        set = self.sets[set_name]
+        entities = [(e.lower(), e) for e in set.keys()]
+        entities.sort()
+        longest = max([0] + [len(entity_name) for _, entity_name in entities])
+        has_wide = None
+        for _, entity_name in entities:
+            has_wide = self.write_entity(
+                set, set_name, entity_name, outfile, longest, wide) or has_wide
+        outfile.close()                 # done with this file before recursing
+        if has_wide and not wide:
+            self.write_set(set_name, 1)
+
+    def write_entity(self, set, set_name, entity_name, outfile, longest,
+                     wide=None):
+        charid = set[entity_name]  # one prefix char + "-"-separated hex codes
+        if not wide:
+            for code in charid[1:].split('-'):
+                if int(code, 16) > 0xFFFF:
+                    return 1                # wide-Unicode character
+        codes = ' '.join(['U+%s' % code for code in charid[1:].split('-')])
+        print >>outfile, ('.. %-*s unicode:: %s .. %s'
+                          % (longest + 2, '|' + entity_name + '|',
+                             codes, self.descriptions[charid]))
+
+
+if __name__ == '__main__':
+    sys.exit(main())