summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2022-12-02 23:23:45 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2022-12-02 23:23:45 +0000
commit: 7e837bfddea8289d055dd376ec187b0b7cc2750e (patch)
tree: fb1ac6a846afa7c024f510d86ea5390b08b100e9
parent: 3e13d7f352d386507f11cbacae27e27c29da9aac (diff)
download: docutils-7e837bfddea8289d055dd376ec187b0b7cc2750e.tar.gz
Test input-encoding enhancement proposal with newly defined codecs.
Test whether the codecs from https://codeberg.org/milde/inspecting-codecs can replace the built-in auto-detection of the input encoding.
Cf. https://sourceforge.net/p/docutils/patches/194/
git-svn-id: https://svn.code.sf.net/p/docutils/code/trunk@9304 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
-rw-r--r-- sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py 29
1 files changed, 25 insertions, 4 deletions
diff --git a/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py b/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py
index bdef54b17..e7d2a53f1 100644
--- a/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py
+++ b/sandbox/enhancement-proposals/input-encoding/input-encoding-tests.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-# encoding: utf-8
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
# Copying and distribution of this file, with or without modification,
@@ -13,10 +12,14 @@
# ========================
from __future__ import print_function
+import codecs
import locale
from pprint import pprint
import sys
+# additional codecs from https://codeberg.org/milde/inspecting-codecs
+import inspecting_codecs
+
if sys.version_info < (3,):
sys.path.append('/usr/lib/python3/dist-packages/')
@@ -83,6 +86,26 @@ for encoding in sorted(samples):
except UnicodeError:
print(encoding, 'fail')
+
+print('\nreading with codec "utf_sig".')
+for encoding in sorted(samples):
+ with open('samples/sample-'+encoding, encoding='utf-sig') as f:
+ try:
+ text = f.read()
+ print(encoding, repr(text), len(text))
+ except UnicodeError:
+ print(encoding, 'fail')
+
+print('\nreading self-declaring files with codec "declared".')
+for encoding in sorted(samples):
+ with open('samples/self-declaring-'+encoding, encoding='declared') as f:
+ try:
+ text = f.read()
+ print(encoding, repr(text), len(text))
+ except UnicodeError:
+ print(encoding, 'fail')
+
+
print('\nreading with `docutils.io.FileInput`')
for encoding in sorted(samples):
f = FileInput(source_path='samples/sample-'+encoding)
@@ -102,9 +125,7 @@ for encoding in sorted(samples):
try:
text = f.read()
# l > 5 points to spurious bytes in the data
- l = len(text.split()[-1])
- if sys.version_info < (3,):
- text = text.encode('utf8')
+ l = len(text.split()[-1])
print(encoding, repr(text), l)
except UnicodeError as err:
print(encoding, 'fail', err)