diff options
author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2022-12-02 23:23:45 +0000 |
---|---|---|
committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2022-12-02 23:23:45 +0000 |
commit | 7e837bfddea8289d055dd376ec187b0b7cc2750e (patch) | |
tree | fb1ac6a846afa7c024f510d86ea5390b08b100e9 | |
parent | 3e13d7f352d386507f11cbacae27e27c29da9aac (diff) | |
download | docutils-7e837bfddea8289d055dd376ec187b0b7cc2750e.tar.gz |
Test input-encoding enhancement proposal with newly defined codecs.
Test whether the codecs from https://codeberg.org/milde/inspecting-codecs
can replace the built-in auto-detection of the input encoding.
cf. https://sourceforge.net/p/docutils/patches/194/
git-svn-id: https://svn.code.sf.net/p/docutils/code/trunk@9304 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
-rw-r--r-- | sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py | 29 |
1 files changed, 25 insertions, 4 deletions
diff --git a/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py b/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py index bdef54b17..e7d2a53f1 100644 --- a/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py +++ b/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# encoding: utf-8 # :License: Released under the terms of the `2-Clause BSD license`_, in short: # # Copying and distribution of this file, with or without modification, @@ -13,10 +12,14 @@ # ======================== from __future__ import print_function +import codecs import locale from pprint import pprint import sys +# additional codecs from https://codeberg.org/milde/inspecting-codecs +import inspecting_codecs + if sys.version_info < (3,): sys.path.append('/usr/lib/python3/dist-packages/') @@ -83,6 +86,26 @@ for encoding in sorted(samples): except UnicodeError: print(encoding, 'fail') + +print('\nreading with codec "utf_sig".') +for encoding in sorted(samples): + with open('samples/sample-'+encoding, encoding='utf-sig') as f: + try: + text = f.read() + print(encoding, repr(text), len(text)) + except UnicodeError: + print(encoding, 'fail') + +print('\nreading self-declaring files with codec "declared".') +for encoding in sorted(samples): + with open('samples/self-declaring-'+encoding, encoding='declared') as f: + try: + text = f.read() + print(encoding, repr(text), len(text)) + except UnicodeError: + print(encoding, 'fail') + + print('\nreading with `docutils.io.FileInput`') for encoding in sorted(samples): f = FileInput(source_path='samples/sample-'+encoding) @@ -102,9 +125,7 @@ for encoding in sorted(samples): try: text = f.read() # l > 5 points to spurious bytes in the data - l = len(text.split()[-1]) - if sys.version_info < (3,): - text = text.encode('utf8') + l = len(text.split()[-1]) print(encoding, repr(text), l) except UnicodeError as err: print(encoding, 'fail', err) |