Use a simpler algorithm to generate char lists

The current algorithm checks each codepoint at least once, and when ``cp_max`` is ``None``, it checks all used characters twice. This change combines the two loops into one and stores the result of each call to ``unicodedata.category``. It also removes the text about "wide" builds of Python: since PEP 393 (Python 3.3), all builds of Python support the full Unicode code space.

git-svn-id: https://svn.code.sf.net/p/docutils/code/trunk@9254 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: aa-turner <aa-turner@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2022-11-16 20:18:29 +0000
committer: aa-turner <aa-turner@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2022-11-16 20:18:29 +0000
commit: cb271c98d069029b3678628b43d2aec5a9414cd0 (patch)
tree: 509d7de6cdd162c07e00d1a622800be931530b94 /docutils/tools/dev/generate_punctuation_chars.py
parent: 924d4f7f374709f545d42a99ef7c32e26b58ad29 (diff)
download: docutils-cb271c98d069029b3678628b43d2aec5a9414cd0.tar.gz
1 files changed, 8 insertions, 17 deletions
diff --git a/docutils/tools/dev/generate_punctuation_chars.py b/docutils/tools/dev/generate_punctuation_chars.py
index 5a7bf9842..da5fd9e87 100755
--- a/docutils/tools/dev/generate_punctuation_chars.py
+++ b/docutils/tools/dev/generate_punctuation_chars.py
@@ -152,29 +152,20 @@ unicode_punctuation_categories = {
 #
 # ::
 
-def unicode_charlists(categories, cp_min=0, cp_max=None):
+def unicode_charlists(categories, cp_min=0, cp_max=sys.maxunicode):
     """Return dictionary of Unicode character lists.
 
     For each of the `catagories`, an item contains a list with all Unicode
     characters with `cp_min` <= code-point <= `cp_max` that belong to
     the category.
-
-    The default values check every code-point supported by Python
-    (`sys.maxint` is 0x10FFFF in a "wide" build and 0xFFFF in a "narrow"
-    build, i.e. ucs4 and ucs2 respectively).
     """
-    # Determine highest code point with one of the given categories
-    # (may shorten the search time considerably if there are many
-    # categories with not too high characters):
-    if cp_max is None:
-        cp_max = max(x for x in range(sys.maxunicode+1)
-                     if unicodedata.category(chr(x)) in categories)
-        # print(cp_max) # => 74867 for unicode_punctuation_categories
-    charlists = {}
-    for cat in categories:
-        charlists[cat] = [chr(x) for x in range(cp_min, cp_max+1)
-                          if unicodedata.category(chr(x)) == cat]
-    return charlists
+    char_lists = {cat: [] for cat in categories}
+    for i in range(cp_min, cp_max+1):
+        chr_i = chr(i)
+        cat_i = unicodedata.category(chr_i)
+        if cat_i in char_lists:
+            char_lists[cat_i].append(chr_i)
+    return char_lists
 
 
 # Character categories in Docutils
author	aa-turner <aa-turner@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2022-11-16 20:18:29 +0000
committer	aa-turner <aa-turner@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2022-11-16 20:18:29 +0000
commit	cb271c98d069029b3678628b43d2aec5a9414cd0 (patch)
tree	509d7de6cdd162c07e00d1a622800be931530b94 /docutils/tools/dev/generate_punctuation_chars.py
parent	924d4f7f374709f545d42a99ef7c32e26b58ad29 (diff)
download	docutils-cb271c98d069029b3678628b43d2aec5a9414cd0.tar.gz