sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

#!/usr/bin/env python3
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# input-encoding-tests.py:
# ========================
from __future__ import print_function

import codecs
import locale
import os
from pprint import pprint
import sys

# additional codecs from https://codeberg.org/milde/inspecting-codecs
import inspecting_codecs

if sys.version_info < (3,):
    sys.path.append('/usr/lib/python3/dist-packages/')

import docutils
from docutils.io import FileInput
if sys.version_info < (3,):
    from docutils.utils.error_reporting import locale_encoding
else:
    from docutils.io import _locale_encoding as locale_encoding


# Sample text, keyed by the name of the codec it gets encoded with below.
samples = {
    'utf-8': u'Grüße',
    'utf-8-sig': u'Grüße',
    'utf-16': u'Grüße',
    'utf-16-le': u'Grüße',
    'utf-16-be': u'Grüße',
    'latin1': u'Grüße',
    'latin2': u'cześć',
    'latin4': u'škoda',
    'latin10': u'škoda',
    'cp775': u'cześć',
}

# The same samples as byte strings: every text encoded with "its" codec.
samples_encoded = {
    codec: sample.encode(codec) for codec, sample in samples.items()
}

pprint(samples_encoded)


# for k,v in samples_encoded.items():
#     print(samples[k].encode('utf-8'), '(encoding: %s)'%k)
#     for encoding in samples:
#         if encoding == k:
#             continue
#         try:
#             text = v.decode(encoding)
#             print('  decoded with', encoding, '->', text)
#         except UnicodeError as err:
#             print('  decoded with', encoding, '-> fail')

# write sample files:

# `open(..., mode='w')` does not create missing parent directories,
# so make sure the output directory exists before writing into it.
os.makedirs('samples', exist_ok=True)

for encoding, data in samples_encoded.items():
    text = data.decode(encoding)

    # Plain sample: just the text, written through the codec under test.
    with open('samples/sample-'+encoding, mode='w', encoding=encoding) as f:
        f.write(text)

    # Self-declaring sample: the text is preceded by either a BOM or an
    # ``.. encoding: <name>`` declaration line.
    with open('samples/self-declaring-'+encoding,
              mode='w', encoding=encoding) as f:
        if encoding in ('utf-16-be', 'utf-16-le'):
            # The endian-specific UTF-16 codecs do not emit a BOM on
            # their own, so write U+FEFF (ZERO WIDTH NO-BREAK SPACE)
            # explicitly.
            f.write('\ufeff')
        else:
            f.write('.. encoding: %s\n' % encoding)
        f.write(text)


# read sample files:

# Baseline: the built-in `open` in text mode, without an `encoding` argument.
print('\nreading with standard `open`')
for encoding in sorted(samples):
    path = 'samples/self-declaring-' + encoding
    with open(path) as infile:
        try:
            text = infile.read()
            print(encoding, repr(text), len(text))
        except UnicodeError:
            # report this sample as failed instead of aborting the run
            print(encoding, 'fail')


# Plain sample files, decoded with the "utf-sig" codec
# (presumably registered by the `inspecting_codecs` import -- verify).
print('\nreading with codec "utf_sig".')
for encoding in sorted(samples):
    path = 'samples/sample-' + encoding
    with open(path, encoding='utf-sig') as infile:
        try:
            text = infile.read()
            print(encoding, repr(text), len(text))
        except UnicodeError:
            # report this sample as failed instead of aborting the run
            print(encoding, 'fail')

# Self-declaring sample files, decoded with the "declared" codec
# (presumably registered by the `inspecting_codecs` import -- verify).
print('\nreading self-declaring files with codec "declared".')
for encoding in sorted(samples):
    path = 'samples/self-declaring-' + encoding
    with open(path, encoding='declared') as infile:
        try:
            text = infile.read()
            print(encoding, repr(text), len(text))
        except UnicodeError:
            # report this sample as failed instead of aborting the run
            print(encoding, 'fail')


# Plain sample files, read through Docutils' own input class.
print('\nreading with `docutils.io.FileInput`')
for encoding in sorted(samples):
    f = FileInput(source_path='samples/sample-'+encoding)
    try:
        text = f.read()
        # length > 5 points to spurious bytes in the data
        length = len(text)
        if sys.version_info < (3,):
            # Python 2 only: encode the text to UTF-8 before printing it
            text = text.encode('utf8')
        print(encoding, text, length)
    except UnicodeError:
        print(encoding, 'fail')

# Self-declaring sample files, read through Docutils' own input class.
print('\nreading self-declaring file with `docutils.io.FileInput`')
for encoding in sorted(samples):
    f = FileInput(source_path='samples/self-declaring-'+encoding)
    try:
        text = f.read()
        # Measure the last whitespace-separated word (the sample follows
        # the declaration line, if there is one).
        # length > 5 points to spurious bytes in the data;
        # 0 means the text was empty or contained only whitespace.
        words = text.split()
        length = len(words[-1]) if words else 0
        print(encoding, repr(text), length)
    except UnicodeError as err:
        print(encoding, 'fail', err)


# Summary of the environment the tests above were run in.
python_version = sys.version.split()[0]
print('\nDocutils', docutils.__version__, ' Python', python_version)
print('preferred encoding:', locale.getpreferredencoding())
print('locale encoding:', locale_encoding)