summaryrefslogtreecommitdiff
path: root/Lib/test/test_codecs.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r--Lib/test/test_codecs.py143
1 files changed, 133 insertions, 10 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index b93e0ab0e2..4740b682aa 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -27,6 +27,7 @@ def coding_checker(self, coder):
self.assertEqual(coder(input), (expect, len(input)))
return check
+
class Queue(object):
"""
queue: write bytes at one end, read bytes from the other end
@@ -47,6 +48,7 @@ class Queue(object):
self._buffer = self._buffer[size:]
return s
+
class MixInCheckStateHandling:
def check_state_handling_decode(self, encoding, u, s):
for i in range(len(s)+1):
@@ -80,6 +82,7 @@ class MixInCheckStateHandling:
part2 = d.encode(u[i:], True)
self.assertEqual(s, part1+part2)
+
class ReadTest(MixInCheckStateHandling):
def check_partial(self, input, partialresults):
# get a StreamReader for the encoding and feed the bytestring version
@@ -358,6 +361,12 @@ class ReadTest(MixInCheckStateHandling):
self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
"[?]".encode(self.encoding))
+ # sequential surrogate characters
+ self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
+ "[]".encode(self.encoding))
+ self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
+ "[??]".encode(self.encoding))
+
bom = "".encode(self.encoding)
for before, after in [("\U00010fff", "A"), ("[", "]"),
("A", "\U00010fff")]:
@@ -383,6 +392,7 @@ class ReadTest(MixInCheckStateHandling):
self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
before + backslashreplace + after)
+
class UTF32Test(ReadTest, unittest.TestCase):
encoding = "utf-32"
if sys.byteorder == 'little':
@@ -478,6 +488,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
self.assertEqual('\U00010000' * 1024,
codecs.utf_32_decode(encoded_be)[0])
+
class UTF32LETest(ReadTest, unittest.TestCase):
encoding = "utf-32-le"
ill_formed_sequence = b"\x80\xdc\x00\x00"
@@ -523,6 +534,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
self.assertEqual('\U00010000' * 1024,
codecs.utf_32_le_decode(encoded)[0])
+
class UTF32BETest(ReadTest, unittest.TestCase):
encoding = "utf-32-be"
ill_formed_sequence = b"\x00\x00\xdc\x80"
@@ -747,6 +759,7 @@ class UTF8Test(ReadTest, unittest.TestCase):
encoding = "utf-8"
ill_formed_sequence = b"\xed\xb2\x80"
ill_formed_sequence_replace = "\ufffd" * 3
+ BOM = b''
def test_partial(self):
self.check_partial(
@@ -775,27 +788,49 @@ class UTF8Test(ReadTest, unittest.TestCase):
self.check_state_handling_decode(self.encoding,
u, u.encode(self.encoding))
+ def test_decode_error(self):
+ for data, error_handler, expected in (
+ (b'[\x80\xff]', 'ignore', '[]'),
+ (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+ (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+ (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.decode(self.encoding, error_handler),
+ expected)
+
def test_lone_surrogates(self):
super().test_lone_surrogates()
# not sure if this is making sense for
# UTF-16 and UTF-32
- self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
- b'[\x80]')
+ self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
+ self.BOM + b'[\x80]')
+
+ with self.assertRaises(UnicodeEncodeError) as cm:
+ "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
+ exc = cm.exception
+ self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')
def test_surrogatepass_handler(self):
- self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
- b"abc\xed\xa0\x80def")
- self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"),
+ self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
+ self.BOM + b"abc\xed\xa0\x80def")
+ self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
+ self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
+ self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
+ self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')
+
+ self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
"abc\ud800def")
- self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"),
- b"\xf0\x90\xbf\xbf\xed\xa0\x80")
- self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"),
+ self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
"\U00010fff\uD800")
+
self.assertTrue(codecs.lookup_error("surrogatepass"))
with self.assertRaises(UnicodeDecodeError):
- b"abc\xed\xa0".decode("utf-8", "surrogatepass")
+ b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
with self.assertRaises(UnicodeDecodeError):
- b"abc\xed\xa0z".decode("utf-8", "surrogatepass")
+ b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
+
@unittest.skipUnless(sys.platform == 'win32',
'cp65001 is a Windows-only codec')
@@ -1059,6 +1094,7 @@ class ReadBufferTest(unittest.TestCase):
class UTF8SigTest(UTF8Test, unittest.TestCase):
encoding = "utf-8-sig"
+ BOM = codecs.BOM_UTF8
def test_partial(self):
self.check_partial(
@@ -1194,6 +1230,7 @@ class EscapeDecodeTest(unittest.TestCase):
self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
+
class RecodingTest(unittest.TestCase):
def test_recoding(self):
f = io.BytesIO()
@@ -1313,6 +1350,7 @@ for i in punycode_testcases:
if len(i)!=2:
print(repr(i))
+
class PunycodeTest(unittest.TestCase):
def test_encode(self):
for uni, puny in punycode_testcases:
@@ -1332,6 +1370,7 @@ class PunycodeTest(unittest.TestCase):
puny = puny.decode("ascii").encode("ascii")
self.assertEqual(uni, puny.decode("punycode"))
+
class UnicodeInternalTest(unittest.TestCase):
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_bug1251300(self):
@@ -1586,6 +1625,7 @@ class NameprepTest(unittest.TestCase):
except Exception as e:
raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
+
class IDNACodecTest(unittest.TestCase):
def test_builtin_decode(self):
self.assertEqual(str(b"python.org", "idna"), "python.org")
@@ -1672,6 +1712,7 @@ class IDNACodecTest(unittest.TestCase):
self.assertRaises(Exception,
b"python.org".decode, "idna", errors)
+
class CodecsModuleTest(unittest.TestCase):
def test_decode(self):
@@ -1780,6 +1821,7 @@ class CodecsModuleTest(unittest.TestCase):
self.assertRaises(UnicodeError,
codecs.decode, b'abc', 'undefined', errors)
+
class StreamReaderTest(unittest.TestCase):
def setUp(self):
@@ -1790,6 +1832,7 @@ class StreamReaderTest(unittest.TestCase):
f = self.reader(self.stream)
self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00'])
+
class EncodedFileTest(unittest.TestCase):
def test_basic(self):
@@ -1920,6 +1963,7 @@ broken_unicode_with_stateful = [
"unicode_internal"
]
+
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
def test_basics(self):
s = "abc123" # all codecs should be able to encode these
@@ -2082,6 +2126,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
self.check_state_handling_decode(encoding, u, u.encode(encoding))
self.check_state_handling_encode(encoding, u, u.encode(encoding))
+
class CharmapTest(unittest.TestCase):
def test_decode_with_string_map(self):
self.assertEqual(
@@ -2332,6 +2377,7 @@ class WithStmtTest(unittest.TestCase):
info.streamwriter, 'strict') as srw:
self.assertEqual(srw.read(), "\xfc")
+
class TypesTest(unittest.TestCase):
def test_decode_unicode(self):
# Most decoders don't accept unicode input
@@ -2622,6 +2668,7 @@ else:
bytes_transform_encodings.append("bz2_codec")
transform_aliases["bz2_codec"] = ["bz2"]
+
class TransformCodecTest(unittest.TestCase):
def test_basics(self):
@@ -3099,5 +3146,81 @@ class CodePageTest(unittest.TestCase):
self.assertEqual(decoded, ('abc', 3))
+class ASCIITest(unittest.TestCase):
+ def test_encode(self):
+ self.assertEqual('abc123'.encode('ascii'), b'abc123')
+
+ def test_encode_error(self):
+ for data, error_handler, expected in (
+ ('[\x80\xff\u20ac]', 'ignore', b'[]'),
+ ('[\x80\xff\u20ac]', 'replace', b'[???]'),
+ ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[€ÿ€]'),
+ ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
+ b'[\\x80\\xff\\u20ac\\U000abcde]'),
+ ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.encode('ascii', error_handler),
+ expected)
+
+ def test_encode_surrogateescape_error(self):
+ with self.assertRaises(UnicodeEncodeError):
+ # the first character can be decoded, but not the second
+ '\udc80\xff'.encode('ascii', 'surrogateescape')
+
+ def test_decode(self):
+ self.assertEqual(b'abc'.decode('ascii'), 'abc')
+
+ def test_decode_error(self):
+ for data, error_handler, expected in (
+ (b'[\x80\xff]', 'ignore', '[]'),
+ (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
+ (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
+ (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.decode('ascii', error_handler),
+ expected)
+
+
+class Latin1Test(unittest.TestCase):
+ def test_encode(self):
+ for data, expected in (
+ ('abc', b'abc'),
+ ('\x80\xe9\xff', b'\x80\xe9\xff'),
+ ):
+ with self.subTest(data=data, expected=expected):
+ self.assertEqual(data.encode('latin1'), expected)
+
+ def test_encode_errors(self):
+ for data, error_handler, expected in (
+ ('[\u20ac\udc80]', 'ignore', b'[]'),
+ ('[\u20ac\udc80]', 'replace', b'[??]'),
+ ('[\u20ac\U000abcde]', 'backslashreplace',
+ b'[\\u20ac\\U000abcde]'),
+ ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[€�]'),
+ ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.encode('latin1', error_handler),
+ expected)
+
+ def test_encode_surrogateescape_error(self):
+ with self.assertRaises(UnicodeEncodeError):
+ # the first character can be decoded, but not the second
+ '\udc80\u20ac'.encode('latin1', 'surrogateescape')
+
+ def test_decode(self):
+ for data, expected in (
+ (b'abc', 'abc'),
+ (b'[\x80\xff]', '[\x80\xff]'),
+ ):
+ with self.subTest(data=data, expected=expected):
+ self.assertEqual(data.decode('latin1'), expected)
+
+
if __name__ == "__main__":
unittest.main()