summary | refs | log | tree | commit | diff
path: root/Tools
diff options
context:
space:
mode:
author Marc-André Lemburg <mal@egenix.com> 2005-10-21 13:45:17 +0000
committer Marc-André Lemburg <mal@egenix.com> 2005-10-21 13:45:17 +0000
commit e3f22717bbefb3ad5d2899f03592b771f762d7f9 (patch)
tree c79c9db19258aa505c4393dbb34e56e4e4a0a149 /Tools
parent 5b3f03fa96421b4eaf4330f491ba16a1c0c2bff8 (diff)
download cpython-e3f22717bbefb3ad5d2899f03592b771f762d7f9.tar.gz
Moved gencodec.py to the Tools/unicode/ directory.
Added new support for decoding tables. Cleaned up the implementation a bit.
Diffstat (limited to 'Tools')
-rw-r--r-- Tools/unicode/gencodec.py (renamed from Tools/scripts/gencodec.py) 267
1 files changed, 179 insertions, 88 deletions
diff --git a/Tools/scripts/gencodec.py b/Tools/unicode/gencodec.py
index 75337d6dbb..7bce3d5f7f 100644
--- a/Tools/scripts/gencodec.py
+++ b/Tools/unicode/gencodec.py
@@ -15,17 +15,22 @@ lowercase with hyphens replaced by underscores.
The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).
-Written by Marc-Andre Lemburg (mal@lemburg.com).
+Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
+Unicode table maps for decoding.
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.
+(c) Copyright Marc-Andre Lemburg, 2005.
"""#"
-import re,os,time,marshal
+import re, os, time, marshal, codecs
-# Create numeric tables or character based ones ?
-numeric = 1
+# Maximum allowed size of charmap tables
+MAX_TABLE_SIZE = 8192
+
+# Standard undefined Unicode code point
+UNI_UNDEFINED = unichr(0xFFFE)
mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
'\s+'
@@ -69,8 +74,15 @@ def readmap(filename):
enc2uni = {}
identity = []
unmapped = range(256)
- for i in range(256):
- unmapped[i] = i
+
+ # UTC mapping tables per convention don't include the identity
+ # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
+ # explicitly mapped to different characters or undefined
+ for i in range(32) + [127]:
+ identity.append(i)
+ unmapped.remove(i)
+ enc2uni[i] = (i, 'CONTROL CHARACTER')
+
for line in lines:
line = line.strip()
if not line or line[0] == '#':
@@ -82,22 +94,23 @@ def readmap(filename):
enc,uni,comment = m.groups()
enc = parsecodes(enc)
uni = parsecodes(uni)
- if not comment:
+ if comment is None:
comment = ''
else:
- comment = comment[1:]
+ comment = comment[1:].strip()
if enc < 256:
- unmapped.remove(enc)
+ if enc in unmapped:
+ unmapped.remove(enc)
if enc == uni:
identity.append(enc)
- else:
- enc2uni[enc] = (uni,comment)
+ enc2uni[enc] = (uni,comment)
else:
enc2uni[enc] = (uni,comment)
+
# If there are more identity-mapped entries than unmapped entries,
# it pays to generate an identity dictionary first, and add explicit
# mappings to None for the rest
- if len(identity)>=len(unmapped):
+ if len(identity) >= len(unmapped):
for enc in unmapped:
enc2uni[enc] = (None, "")
enc2uni['IDENTITY'] = 256
@@ -112,44 +125,146 @@ def hexrepr(t):
len(t)
except:
return '0x%04x' % t
- return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
+ try:
+ return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
+ except TypeError, why:
+ print '* failed to convert %r: %s' % (t, why)
+ raise
-def unicoderepr(t):
+def python_mapdef_code(varname, map, comments=1):
- if t is None:
- return 'None'
- if numeric:
- return hexrepr(t)
+ l = []
+ append = l.append
+ if map.has_key("IDENTITY"):
+ append("%s = codecs.make_identity_dict(range(%d))" %
+ (varname, map["IDENTITY"]))
+ append("%s.update({" % varname)
+ splits = 1
+ del map["IDENTITY"]
+ identity = 1
else:
- try:
- len(t)
- except:
- return repr(unichr(t))
- return repr(''.join(map(unichr, t)))
-
-def keyrepr(t):
+ append("%s = {" % varname)
+ splits = 0
+ identity = 0
- if t is None:
- return 'None'
- if numeric:
- return hexrepr(t)
+ mappings = map.items()
+ mappings.sort()
+ i = 0
+ for mapkey, mapvalue in mappings:
+ mapcomment = ''
+ if isinstance(mapkey, tuple):
+ (mapkey, mapcomment) = mapkey
+ if isinstance(mapvalue, tuple):
+ (mapvalue, mapcomment) = mapvalue
+ if mapkey is None:
+ continue
+ if (identity and
+ mapkey == mapvalue and
+ mapkey < 256):
+ # No need to include identity mappings, since these
+ # are already set for the first 256 code points.
+ continue
+ key = hexrepr(mapkey)
+ value = hexrepr(mapvalue)
+ if mapcomment and comments:
+ append(' %s: %s,\t# %s' % (key, value, mapcomment))
+ else:
+ append(' %s: %s,' % (key, value))
+ i += 1
+ if i == 4096:
+ # Split the definition into parts to that the Python
+ # parser doesn't dump core
+ if splits == 0:
+ append('}')
+ else:
+ append('})')
+ append('%s.update({' % varname)
+ i = 0
+ splits = splits + 1
+ if splits == 0:
+ append('}')
else:
- try:
- len(t)
- except:
- if t < 256:
- return repr(chr(t))
+ append('})')
+
+ return l
+
+def python_tabledef_code(varname, map, comments=1):
+
+ l = []
+ append = l.append
+ append('%s = (' % varname)
+
+ # Analyze map and create table dict
+ mappings = map.items()
+ mappings.sort()
+ table = {}
+ maxkey = 0
+ if map.has_key('IDENTITY'):
+ for key in range(256):
+ table[key] = (key, '')
+ maxkey = 255
+ del map['IDENTITY']
+ for mapkey, mapvalue in mappings:
+ mapcomment = ''
+ if isinstance(mapkey, tuple):
+ (mapkey, mapcomment) = mapkey
+ if isinstance(mapvalue, tuple):
+ (mapvalue, mapcomment) = mapvalue
+ if mapkey is None:
+ continue
+ table[mapkey] = (mapvalue, mapcomment)
+ if mapkey > maxkey:
+ maxkey = mapkey
+ if maxkey > MAX_TABLE_SIZE:
+ # Table too large
+ return None
+
+ # Create table code
+ for key in range(maxkey + 1):
+ if key not in table:
+ mapvalue = None
+ mapcomment = 'UNDEFINED'
+ else:
+ mapvalue, mapcomment = table[key]
+ if mapvalue is None:
+ mapchar = UNI_UNDEFINED
+ else:
+ if isinstance(mapvalue, tuple):
+ # 1-n mappings not supported
+ return None
else:
- return repr(unichr(t))
- return repr(''.join(map(chr, t)))
+ mapchar = unichr(mapvalue)
+ if mapcomment and comments:
+ append(' %r\t# %s -> %s' % (mapchar,
+ hexrepr(key),
+ mapcomment))
+ else:
+ append(' %r' % mapchar)
-def codegen(name,map,comments=1):
+ append(')')
+ return l
+
+def codegen(name, map, comments=1):
""" Returns Python source for the given map.
Comments are included in the source, if comments is true (default).
"""
+ # Generate code
+ decoding_map_code = python_mapdef_code(
+ 'decoding_map',
+ map,
+ comments=comments)
+ decoding_table_code = python_tabledef_code(
+ 'decoding_table',
+ map,
+ comments=comments)
+ encoding_map_code = python_mapdef_code(
+ 'encoding_map',
+ codecs.make_encoding_map(map),
+ comments=comments)
+
l = [
'''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.
@@ -167,9 +282,16 @@ class Codec(codecs.Codec):
return codecs.charmap_encode(input,errors,encoding_map)
def decode(self,input,errors='strict'):
-
- return codecs.charmap_decode(input,errors,decoding_map)
-
+''' % name
+ ]
+ if decoding_table_code:
+ l.append('''\
+ return codecs.charmap_decode(input,errors,decoding_table)''')
+ else:
+ l.append('''\
+ return codecs.charmap_decode(input,errors,decoding_map)''')
+
+ l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
pass
@@ -183,54 +305,21 @@ def getregentry():
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
### Decoding Map
-''' % name,
- ]
+''')
+ l.extend(decoding_map_code)
- if map.has_key("IDENTITY"):
- l.append("decoding_map = codecs.make_identity_dict(range(%d))"
- % map["IDENTITY"])
- l.append("decoding_map.update({")
- splits = 1
- del map["IDENTITY"]
- else:
- l.append("decoding_map = {")
- splits = 0
+ # Add optional decoding table
+ if decoding_table_code:
+ l.append('''
+### Decoding Table
+''')
+ l.extend(decoding_table_code)
- mappings = map.items()
- mappings.sort()
- append = l.append
- i = 0
- for e,value in mappings:
- try:
- (u,c) = value
- except TypeError:
- u = value
- c = ''
- key = keyrepr(e)
- if c and comments:
- append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
- else:
- append('\t%s: %s,' % (key,unicoderepr(u)))
- i += 1
- if i == 4096:
- # Split the definition into parts to that the Python
- # parser doesn't dump core
- if splits == 0:
- append('}')
- else:
- append('})')
- append('decoding_map.update({')
- i = 0
- splits = splits + 1
- if splits == 0:
- append('}')
- else:
- append('})')
- append('''
+ l.append('''
### Encoding Map
-
-encoding_map = codecs.make_encoding_map(decoding_map)
''')
+ l.extend(encoding_map_code)
+
return '\n'.join(l)
def pymap(name,map,pyfile,comments=1):
@@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):
mapnames = os.listdir(dir)
for mapname in mapnames:
+ mappathname = os.path.join(dir, mapname)
name = os.path.split(mapname)[1]
name = name.replace('-','_')
name = name.split('.')[0]
@@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
if not map:
print '* map is empty; skipping'
else:
- pymap(mapname, map, prefix + codefile,comments)
- marshalmap(mapname, map, prefix + marshalfile)
- except ValueError:
- print '* conversion failed'
+ pymap(mappathname, map, prefix + codefile,comments)
+ marshalmap(mappathname, map, prefix + marshalfile)
+ except ValueError, why:
+ print '* conversion failed: %s' % why
+ raise
def rewritepythondir(dir,prefix='',comments=1):