"Outsourcing" of development code from utils.punctuation_chars.

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@8014 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2017-01-05 09:49:26 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2017-01-05 09:49:26 +0000
commit: fdbf5ead02485b160e3e5df872e621060eb807cb (patch)
tree: a5307e320f013b1d86e539ea40e5c3aeb5d165f8 /docutils/tools/dev/generate_punctuation_chars.py
parent: 3cb432be835b564aee0f262ba39d0783b6a0a69f (diff)
download: docutils-fdbf5ead02485b160e3e5df872e621060eb807cb.tar.gz
1 files changed, 334 insertions, 0 deletions
diff --git a/docutils/tools/dev/generate_punctuation_chars.py b/docutils/tools/dev/generate_punctuation_chars.py
new file mode 100644
index 000000000..eb1c73372
--- /dev/null
+++ b/docutils/tools/dev/generate_punctuation_chars.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# :Copyright: © 2011, 2016 Günter Milde.
+# :License: Released under the terms of the `2-Clause BSD license`_, in short:
+#
+#    Copying and distribution of this file, with or without modification,
+#    are permitted in any medium without royalty provided the copyright
+#    notice and this notice are preserved.
+#    This file is offered as-is, without any warranty.
+#
+# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
+
+# :Id: $Id$
+#
+# ::
+
+import sys, re
+import unicodedata
+
+# import the punctuation_chars module from the source or Py3k build
+# path for local Python modules
+if sys.version_info < (3,):
+    sys.path.insert(0, '../../docutils')
+else:
+    sys.path.insert(0, '../../build/lib')
+    unichr = chr
+
+from docutils.utils.punctuation_chars import (openers, closers, delimiters,
+                                              closing_delimiters)
+
+# (re)generate the utils.punctuation_chars module
+# ===============================================
+#
+# The category of some characters may change with the development of the
+# Unicode standard. This tool checks the patterns in `utils.punctuation_chars`
+# against a re-calculation based on the "unicodedata" stdlib module
+# which may give different results for different Python versions.
+#
+# Updating the patterns with a new (Python|Unicode standard) version is an API
+# change (may render valid rST documents invalid). It should only be done for
+# "feature releases" and also requires updating the specification of `inline
+# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt.
+#
+# Generation of the character category patterns
+# ----------------------------------------------
+#
+#
+# Unicode punctuation character categories
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# For details about Unicode categories, see
+# http://www.unicode.org/Public/5.1.0/ucd/UCD.html#General_Category_Values
+# ::
+
+unicode_punctuation_categories = {  # category code -> descriptive name
+    # 'Pc': 'Connector', # not used in Docutils inline markup recognition
+    'Pd': 'Dash',  # becomes part of the delimiters
+    'Ps': 'Open',  # becomes part of the openers
+    'Pe': 'Close',  # becomes part of the closers
+    'Pi': 'Initial quote', # may behave like Ps or Pe depending on usage
+    'Pf': 'Final quote', # may behave like Ps or Pe depending on usage
+    'Po': 'Other'  # becomes part of the delimiters
+    }
+"""Unicode character categories for punctuation"""
+
+
+# generate character pattern strings
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ::
+
+def unicode_charlists(categories, cp_min=0, cp_max=None):
+    """Return dictionary of Unicode character lists.
+
+    For each of the `catagories`, an item contains a list with all Unicode
+    characters with `cp_min` <= code-point <= `cp_max` that belong to
+    the category.
+
+    The default values check every code-point supported by Python
+    (`sys.maxint` is 0x10FFFF in a "wide" build and 0xFFFF in a "narrow"
+    build, i.e. ucs4 and ucs2 respectively).
+    """
+    # Determine highest code point with one of the given categories
+    # (may shorten the search time considerably if there are many
+    # categories with not too high characters):
+    if cp_max is None:
+        cp_max = max(x for x in range(sys.maxunicode+1)
+                    if unicodedata.category(unichr(x)) in categories)
+        # print(cp_max) # => 74867 for unicode_punctuation_categories
+    charlists = {}
+    for cat in categories:
+        charlists[cat] = [unichr(x) for x in range(cp_min, cp_max+1)
+                            if unicodedata.category(unichr(x)) == cat]
+    return charlists
+
+
+# Character categories in Docutils
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ::
+
+def character_category_patterns():
+
+    """Docutils character category patterns.
+
+    Return list of pattern strings for the categories "Open", "Close",
+    "Delimiters" and "Closing-Delimiters" used in the `inline markup
+    recognition rules`_.
+    """
+
+    cp_min = 160 # ASCII chars have special rules for backwards compatibility
+    ucharlists = unicode_charlists(unicode_punctuation_categories, cp_min)
+    """Strings of characters in Unicode punctuation character categories"""
+
+    # match opening/closing characters
+    # --------------------------------
+    # Rearange the lists to ensure matching characters at the same
+    # index position.
+
+    # low quotation marks are also used as closers (e.g. in Greek)
+    # move them to category Pi:
+    ucharlists['Ps'].remove(u'‚') # 201A  SINGLE LOW-9 QUOTATION MARK
+    ucharlists['Ps'].remove(u'„') # 201E  DOUBLE LOW-9 QUOTATION MARK
+    ucharlists['Pi'] += [u'‚', u'„']
+
+    ucharlists['Pi'].remove(u'‛') # 201B  SINGLE HIGH-REVERSED-9 QUOTATION MARK
+    ucharlists['Pi'].remove(u'‟') # 201F  DOUBLE HIGH-REVERSED-9 QUOTATION MARK
+    ucharlists['Pf'] += [u'‛', u'‟']
+
+    # 301F  LOW DOUBLE PRIME QUOTATION MARK misses the opening pendant:
+    ucharlists['Ps'].insert(ucharlists['Pe'].index(u'\u301f'), u'\u301d')
+
+    # print(u''.join(ucharlists['Ps']).encode('utf8')
+    # print(u''.join(ucharlists['Pe']).encode('utf8')
+    # print(u''.join(ucharlists['Pi']).encode('utf8')
+    # print(u''.join(ucharlists['Pf']).encode('utf8')
+
+    # The Docutils character categories
+    # ---------------------------------
+    #
+    # The categorization of ASCII chars is non-standard to reduce
+    # both false positives and need for escaping. (see `inline markup
+    # recognition rules`_)
+
+    # allowed before markup if there is a matching closer
+    openers = [u'"\'(<\\[{']
+    for category in ('Ps', 'Pi', 'Pf'):
+        openers.extend(ucharlists[category])
+
+    # allowed after markup if there is a matching opener
+    closers = [u'"\')>\\]}']
+    for category in ('Pe', 'Pf', 'Pi'):
+        closers.extend(ucharlists[category])
+
+    # non-matching, allowed on both sides
+    delimiters = [u'\\-/:']
+    for category in ('Pd', 'Po'):
+        delimiters.extend(ucharlists[category])
+
+    # non-matching, after markup
+    closing_delimiters = [r'\\.,;!?']
+
+    # # Test open/close matching:
+    # for i in range(min(len(openers),len(closers))):
+    #     print('%4d    %s    %s' % (i, openers[i].encode('utf8'),
+    #                                closers[i].encode('utf8'))
+
+    return [u''.join(chars) for chars in (openers, closers, delimiters,
+                                            closing_delimiters)]
+
+def separate_wide_chars(s):
+    """Return (s1,s2) with characters above 0xFFFF in s2"""
+    maxunicode_narrow = 0xFFFF
+    l1 = [ch for ch in s if ord(ch) <= maxunicode_narrow]
+    l2 = [ch for ch in s if ord(ch) > maxunicode_narrow]
+    return ''.join(l1), ''.join(l2)
+
+def mark_intervals(s):
+    """Return s with shortcut notation for runs of consecutive characters
+
+    Sort string and replace 'cdef' by 'c-f' and similar.
+    """
+    l =[]
+    s = [ord(ch) for ch in s]
+    s.sort()
+    for n in s:
+        try:
+            if l[-1][-1]+1 == n:
+                l[-1].append(n)
+            else:
+                l.append([n])
+        except IndexError:
+            l.append([n])
+
+    l2 = []
+    for i in l:
+        i = [unichr(n) for n in i]
+        if len(i) > 2:
+            i = i[0], u'-', i[-1]
+        l2.extend(i)
+
+    return ''.join(l2)
+
+def wrap_string(s, startstring= "(u'",
+                    endstring = "')", wrap=65):
+    """Line-wrap a unicode string literal definition."""
+    c = len(startstring)
+    contstring = "'\n" + ' ' * (len(startstring)-2) + "u'"
+    l = [startstring]
+    for ch in s.replace("'", r"\'"):
+        c += 1
+        if ch == '\\' and c > wrap:
+            c = len(startstring)
+            ch = contstring + ch
+        l.append(ch)
+    l.append(endstring)
+    return ''.join(l)
+
+
+def print_differences(old, new, name):
+    """List characters missing in old/new."""
+    if old != new:
+        print('new %s:' % name)
+        for c in new:
+            if c not in old:
+                print('  %04x'%ord(c), unicodedata.name(c))
+        print('removed %s:' % name)
+        for c in old:
+            if c not in new:
+                print('  %04x'%ord(c), unicodedata.name(c))
+
+
+# Output
+# ------
+#
+# ::
+
+if __name__ == '__main__':
+
+# (Re)create and compare character patterns
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# ::
+
+    (o, c, d, cd) = character_category_patterns()
+    o, o_wide = separate_wide_chars(o)  # *_wide: characters above 0xFFFF
+    c, c_wide = separate_wide_chars(c)
+    d, d_wide = separate_wide_chars(d)
+    d = d[:5] + mark_intervals(d[5:])  # first 5 chars stay in place
+    d_wide = mark_intervals(d_wide)
+
+    print_differences(openers, o, 'openers')  # imported vs. regenerated
+    if o_wide:  # NOTE(review): '%s' % bytes gives "b'...'" in Python 3
+        print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8'))
+    print_differences(closers, c, 'closers')
+    if c_wide:  # NOTE(review): same bytes formatting as for o_wide
+        print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8'))
+
+    print_differences(delimiters, d + d_wide, 'delimiters')
+    print_differences(closing_delimiters, cd, 'closing_delimiters')
+
+# Print literal code to define the character sets
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# This code can be copied to punctuation_chars.py if an update is wanted.
+
+# Unicode version::
+
+    print('# based on Unicode version %s' % unicodedata.unidata_version)
+
+# `openers` and `closers` must be verbose and keep order because they are
+# also used in `match_chars()`::
+
+    print(wrap_string(o.encode('unicode-escape').decode(),
+                      startstring="openers = (u'"))
+    print(wrap_string(c.encode('unicode-escape').decode(),
+                      startstring="closers = (u'"))
+
+# delimiters: sort and use shortcut for intervals (saves ~150 characters)::
+
+    print(wrap_string(d.encode('unicode-escape').decode(),
+                      startstring="delimiters = (u'"))
+
+# add characters in the upper plane only in a "wide" build::
+
+    print('if sys.maxunicode >= 0x10FFFF: # "wide" build')
+    print(wrap_string(d_wide.encode('unicode-escape').decode(),
+                      startstring="    delimiters += (u'"))
+
+# additional closing delimiters::
+
+    print(wrap_string(cd.encode('unicode-escape').decode(),
+                      startstring="closing_delimiters = (u'"))
+
+
+# test prints
+# ~~~~~~~~~~~
+#
+# For interactive use in development you may uncomment the following
+# definitions::
+
+    # print "wide" Unicode characters:
+    # ucharlists = unicode_charlists(unicode_punctuation_categories)
+    # for key in ucharlists:
+    #     if key.endswith('wide'):
+    #         print key, ucharlists[key]
+
+    # print 'openers = ', repr(openers)
+    # print 'closers = ', repr(closers)
+    # print 'delimiters = ', repr(delimiters)
+    # print 'closing_delimiters = ', repr(closing_delimiters)
+
+    # ucharlists = unicode_charlists(unicode_punctuation_categories)
+    # for cat, chars in ucharlists.items():
+    #     # print cat, chars
+    #     # compact output (visible with a comprehensive font):
+    #     print (u":%s: %s" % (cat, u''.join(chars))).encode('utf8')
+
+# verbose print
+#
+# ::
+
+    # print 'openers:'
+    # for ch in openers:
+    #     print ch.encode('utf8'), unicodedata.name(ch)
+    # print 'closers:'
+    # for ch in closers:
+    #     print ch.encode('utf8'), unicodedata.name(ch)
+    # print 'delimiters:'
+    # for ch in delimiters:
+    #     print ch.encode('utf8'), unicodedata.name(ch)
+    # print 'closing_delimiters:'
+    # for ch in closing_delimiters:
+    #     print ch.encode('utf8'), unicodedata.name(ch)
author	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2017-01-05 09:49:26 +0000
committer	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2017-01-05 09:49:26 +0000
commit	fdbf5ead02485b160e3e5df872e621060eb807cb (patch)
tree	a5307e320f013b1d86e539ea40e5c3aeb5d165f8 /docutils/tools/dev/generate_punctuation_chars.py
parent	3cb432be835b564aee0f262ba39d0783b6a0a69f (diff)
download	docutils-fdbf5ead02485b160e3e5df872e621060eb807cb.tar.gz