summary | refs | log | tree | commit | diff
path: root/Lib/test/test_codecs.py
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2014-02-08 14:06:33 +0200
committer Serhiy Storchaka <storchaka@gmail.com> 2014-02-08 14:06:33 +0200
commit a76f61a821a0cd3aae6689ffc0eb4f7006d0b4fc (patch)
tree f93766290e312b2b76b0dc8d30a6b84338abea1c /Lib/test/test_codecs.py
parent 57e23753f4f9e7c8ec2334610e70d674995f6604 (diff)
parent c91f4b46f8e9034d0acc62dbd16635e3532fb3e0 (diff)
download cpython-a76f61a821a0cd3aae6689ffc0eb4f7006d0b4fc.tar.gz
Issue #20538: UTF-7 incremental decoder produced inconsistent string when
input was truncated in BASE64 section.
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r-- Lib/test/test_codecs.py 391
1 file changed, 358 insertions, 33 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index cb618ece8c..570539fa53 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,9 +1,11 @@
import codecs
+import contextlib
import io
import locale
import sys
import unittest
import warnings
+import encodings
from test import support
@@ -341,8 +343,46 @@ class ReadTest(MixInCheckStateHandling):
self.assertEqual(reader.readline(), s5)
self.assertEqual(reader.readline(), "")
+ ill_formed_sequence_replace = "\ufffd"
+
+ def test_lone_surrogates(self):
+ self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
+ self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
+ "[\\udc80]".encode(self.encoding))
+ self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
+ "[&#56448;]".encode(self.encoding))
+ self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
+ "[]".encode(self.encoding))
+ self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
+ "[?]".encode(self.encoding))
+
+ bom = "".encode(self.encoding)
+ for before, after in [("\U00010fff", "A"), ("[", "]"),
+ ("A", "\U00010fff")]:
+ before_sequence = before.encode(self.encoding)[len(bom):]
+ after_sequence = after.encode(self.encoding)[len(bom):]
+ test_string = before + "\uDC80" + after
+ test_sequence = (bom + before_sequence +
+ self.ill_formed_sequence + after_sequence)
+ self.assertRaises(UnicodeDecodeError, test_sequence.decode,
+ self.encoding)
+ self.assertEqual(test_string.encode(self.encoding,
+ "surrogatepass"),
+ test_sequence)
+ self.assertEqual(test_sequence.decode(self.encoding,
+ "surrogatepass"),
+ test_string)
+ self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
+ before + after)
+ self.assertEqual(test_sequence.decode(self.encoding, "replace"),
+ before + self.ill_formed_sequence_replace + after)
+
class UTF32Test(ReadTest, unittest.TestCase):
encoding = "utf-32"
+ if sys.byteorder == 'little':
+ ill_formed_sequence = b"\x80\xdc\x00\x00"
+ else:
+ ill_formed_sequence = b"\x00\x00\xdc\x80"
spamle = (b'\xff\xfe\x00\x00'
b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
@@ -434,6 +474,7 @@ class UTF32Test(ReadTest, unittest.TestCase):
class UTF32LETest(ReadTest, unittest.TestCase):
encoding = "utf-32-le"
+ ill_formed_sequence = b"\x80\xdc\x00\x00"
def test_partial(self):
self.check_partial(
@@ -478,6 +519,7 @@ class UTF32LETest(ReadTest, unittest.TestCase):
class UTF32BETest(ReadTest, unittest.TestCase):
encoding = "utf-32-be"
+ ill_formed_sequence = b"\x00\x00\xdc\x80"
def test_partial(self):
self.check_partial(
@@ -523,6 +565,10 @@ class UTF32BETest(ReadTest, unittest.TestCase):
class UTF16Test(ReadTest, unittest.TestCase):
encoding = "utf-16"
+ if sys.byteorder == 'little':
+ ill_formed_sequence = b"\x80\xdc"
+ else:
+ ill_formed_sequence = b"\xdc\x80"
spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@@ -598,11 +644,14 @@ class UTF16Test(ReadTest, unittest.TestCase):
self.addCleanup(support.unlink, support.TESTFN)
with open(support.TESTFN, 'wb') as fp:
fp.write(s)
- with codecs.open(support.TESTFN, 'U', encoding=self.encoding) as reader:
+ with support.check_warnings(('', DeprecationWarning)):
+ reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
+ with reader:
self.assertEqual(reader.read(), s1)
class UTF16LETest(ReadTest, unittest.TestCase):
encoding = "utf-16-le"
+ ill_formed_sequence = b"\x80\xdc"
def test_partial(self):
self.check_partial(
@@ -646,6 +695,7 @@ class UTF16LETest(ReadTest, unittest.TestCase):
class UTF16BETest(ReadTest, unittest.TestCase):
encoding = "utf-16-be"
+ ill_formed_sequence = b"\xdc\x80"
def test_partial(self):
self.check_partial(
@@ -689,6 +739,8 @@ class UTF16BETest(ReadTest, unittest.TestCase):
class UTF8Test(ReadTest, unittest.TestCase):
encoding = "utf-8"
+ ill_formed_sequence = b"\xed\xb2\x80"
+ ill_formed_sequence_replace = "\ufffd" * 3
def test_partial(self):
self.check_partial(
@@ -718,18 +770,11 @@ class UTF8Test(ReadTest, unittest.TestCase):
u, u.encode(self.encoding))
def test_lone_surrogates(self):
- self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
- self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
- self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
- b'[\\udc80]')
- self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
- b'[&#56448;]')
- self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
+ super().test_lone_surrogates()
+ # not sure whether this makes sense for
+ # UTF-16 and UTF-32
+ self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"),
b'[\x80]')
- self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
- b'[]')
- self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
- b'[?]')
def test_surrogatepass_handler(self):
self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"),
@@ -909,15 +954,19 @@ class UTF7Test(ReadTest, unittest.TestCase):
(b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
]
for raw, expected in tests:
- self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
- raw, 'strict', True)
- self.assertEqual(raw.decode('utf-7', 'replace'), expected)
+ with self.subTest(raw=raw):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
+ raw, 'strict', True)
+ self.assertEqual(raw.decode('utf-7', 'replace'), expected)
def test_nonbmp(self):
self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
+ test_lone_surrogates = None
+
+
class UTF16ExTest(unittest.TestCase):
def test_errors(self):
@@ -942,7 +991,7 @@ class ReadBufferTest(unittest.TestCase):
self.assertRaises(TypeError, codecs.readbuffer_encode)
self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
-class UTF8SigTest(ReadTest, unittest.TestCase):
+class UTF8SigTest(UTF8Test, unittest.TestCase):
encoding = "utf-8-sig"
def test_partial(self):
@@ -1624,6 +1673,7 @@ all_unicode_encodings = [
"cp037",
"cp1006",
"cp1026",
+ "cp1125",
"cp1140",
"cp1250",
"cp1251",
@@ -2366,44 +2416,319 @@ bytes_transform_encodings = [
"quopri_codec",
"hex_codec",
]
+
+transform_aliases = {
+ "base64_codec": ["base64", "base_64"],
+ "uu_codec": ["uu"],
+ "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
+ "hex_codec": ["hex"],
+ "rot_13": ["rot13"],
+}
+
try:
import zlib
except ImportError:
- pass
+ zlib = None
else:
bytes_transform_encodings.append("zlib_codec")
+ transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
import bz2
except ImportError:
pass
else:
bytes_transform_encodings.append("bz2_codec")
+ transform_aliases["bz2_codec"] = ["bz2"]
class TransformCodecTest(unittest.TestCase):
def test_basics(self):
binput = bytes(range(256))
for encoding in bytes_transform_encodings:
- # generic codecs interface
- (o, size) = codecs.getencoder(encoding)(binput)
- self.assertEqual(size, len(binput))
- (i, size) = codecs.getdecoder(encoding)(o)
- self.assertEqual(size, len(o))
- self.assertEqual(i, binput)
+ with self.subTest(encoding=encoding):
+ # generic codecs interface
+ (o, size) = codecs.getencoder(encoding)(binput)
+ self.assertEqual(size, len(binput))
+ (i, size) = codecs.getdecoder(encoding)(o)
+ self.assertEqual(size, len(o))
+ self.assertEqual(i, binput)
def test_read(self):
for encoding in bytes_transform_encodings:
- sin = codecs.encode(b"\x80", encoding)
- reader = codecs.getreader(encoding)(io.BytesIO(sin))
- sout = reader.read()
- self.assertEqual(sout, b"\x80")
+ with self.subTest(encoding=encoding):
+ sin = codecs.encode(b"\x80", encoding)
+ reader = codecs.getreader(encoding)(io.BytesIO(sin))
+ sout = reader.read()
+ self.assertEqual(sout, b"\x80")
def test_readline(self):
for encoding in bytes_transform_encodings:
- sin = codecs.encode(b"\x80", encoding)
- reader = codecs.getreader(encoding)(io.BytesIO(sin))
- sout = reader.readline()
- self.assertEqual(sout, b"\x80")
+ with self.subTest(encoding=encoding):
+ sin = codecs.encode(b"\x80", encoding)
+ reader = codecs.getreader(encoding)(io.BytesIO(sin))
+ sout = reader.readline()
+ self.assertEqual(sout, b"\x80")
+
+ def test_buffer_api_usage(self):
+ # We check all the transform codecs accept memoryview input
+ # for encoding and decoding
+ # and also that they roundtrip correctly
+ original = b"12345\x80"
+ for encoding in bytes_transform_encodings:
+ with self.subTest(encoding=encoding):
+ data = original
+ view = memoryview(data)
+ data = codecs.encode(data, encoding)
+ view_encoded = codecs.encode(view, encoding)
+ self.assertEqual(view_encoded, data)
+ view = memoryview(data)
+ data = codecs.decode(data, encoding)
+ self.assertEqual(data, original)
+ view_decoded = codecs.decode(view, encoding)
+ self.assertEqual(view_decoded, data)
+
+ def test_text_to_binary_blacklists_binary_transforms(self):
+ # Check binary -> binary codecs give a good error for str input
+ bad_input = "bad input type"
+ for encoding in bytes_transform_encodings:
+ with self.subTest(encoding=encoding):
+ fmt = ( "{!r} is not a text encoding; "
+ "use codecs.encode\(\) to handle arbitrary codecs")
+ msg = fmt.format(encoding)
+ with self.assertRaisesRegex(LookupError, msg) as failure:
+ bad_input.encode(encoding)
+ self.assertIsNone(failure.exception.__cause__)
+
+ def test_text_to_binary_blacklists_text_transforms(self):
+ # Check str.encode gives a good error message for str -> str codecs
+ msg = (r"^'rot_13' is not a text encoding; "
+ "use codecs.encode\(\) to handle arbitrary codecs")
+ with self.assertRaisesRegex(LookupError, msg):
+ "just an example message".encode("rot_13")
+
+ def test_binary_to_text_blacklists_binary_transforms(self):
+ # Check bytes.decode and bytearray.decode give a good error
+ # message for binary -> binary codecs
+ data = b"encode first to ensure we meet any format restrictions"
+ for encoding in bytes_transform_encodings:
+ with self.subTest(encoding=encoding):
+ encoded_data = codecs.encode(data, encoding)
+ fmt = (r"{!r} is not a text encoding; "
+ "use codecs.decode\(\) to handle arbitrary codecs")
+ msg = fmt.format(encoding)
+ with self.assertRaisesRegex(LookupError, msg):
+ encoded_data.decode(encoding)
+ with self.assertRaisesRegex(LookupError, msg):
+ bytearray(encoded_data).decode(encoding)
+
+ def test_binary_to_text_blacklists_text_transforms(self):
+ # Check str -> str codec gives a good error for binary input
+ for bad_input in (b"immutable", bytearray(b"mutable")):
+ with self.subTest(bad_input=bad_input):
+ msg = (r"^'rot_13' is not a text encoding; "
+ "use codecs.decode\(\) to handle arbitrary codecs")
+ with self.assertRaisesRegex(LookupError, msg) as failure:
+ bad_input.decode("rot_13")
+ self.assertIsNone(failure.exception.__cause__)
+
+ @unittest.skipUnless(zlib, "Requires zlib support")
+ def test_custom_zlib_error_is_wrapped(self):
+ # Check zlib codec gives a good error for malformed input
+ msg = "^decoding with 'zlib_codec' codec failed"
+ with self.assertRaisesRegex(Exception, msg) as failure:
+ codecs.decode(b"hello", "zlib_codec")
+ self.assertIsInstance(failure.exception.__cause__,
+ type(failure.exception))
+
+ def test_custom_hex_error_is_wrapped(self):
+ # Check hex codec gives a good error for malformed input
+ msg = "^decoding with 'hex_codec' codec failed"
+ with self.assertRaisesRegex(Exception, msg) as failure:
+ codecs.decode(b"hello", "hex_codec")
+ self.assertIsInstance(failure.exception.__cause__,
+ type(failure.exception))
+
+ # Unfortunately, the bz2 module throws OSError, which the codec
+ # machinery currently can't wrap :(
+
+ # Ensure codec aliases from http://bugs.python.org/issue7475 work
+ def test_aliases(self):
+ for codec_name, aliases in transform_aliases.items():
+ expected_name = codecs.lookup(codec_name).name
+ for alias in aliases:
+ with self.subTest(alias=alias):
+ info = codecs.lookup(alias)
+ self.assertEqual(info.name, expected_name)
+
+
+# The codec system tries to wrap exceptions in order to ensure the error
+# mentions the operation being performed and the codec involved. We
+# currently *only* want this to happen for relatively stateless
+# exceptions, where the only significant information they contain is their
+# type and a single str argument.
+
+# Use a local codec registry to avoid appearing to leak objects when
+# registering multiple search functions
+_TEST_CODECS = {}
+
+def _get_test_codec(codec_name):
+ return _TEST_CODECS.get(codec_name)
+codecs.register(_get_test_codec) # Returns None, not usable as a decorator
+
+class ExceptionChainingTest(unittest.TestCase):
+
+ def setUp(self):
+ # There's no way to unregister a codec search function, so we just
+ # ensure we render this one fairly harmless after the test
+ # case finishes by using the test case repr as the codec name
+ # The codecs module normalizes codec names, although this doesn't
+ # appear to be formally documented...
+ # We also make sure we use a truly unique id for the custom codec
+ # to avoid issues with the codec cache when running these tests
+ # multiple times (e.g. when hunting for refleaks)
+ unique_id = repr(self) + str(id(self))
+ self.codec_name = encodings.normalize_encoding(unique_id).lower()
+
+ # We store the object to raise on the instance because of a bad
+ # interaction between the codec caching (which means we can't
+ # recreate the codec entry) and regrtest refleak hunting (which
+ # runs the same test instance multiple times). This means we
+ # need to ensure the codecs call back in to the instance to find
+ # out which exception to raise rather than binding them in a
+ # closure to an object that may change on the next run
+ self.obj_to_raise = RuntimeError
+
+ def tearDown(self):
+ _TEST_CODECS.pop(self.codec_name, None)
+
+ def set_codec(self, encode, decode):
+ codec_info = codecs.CodecInfo(encode, decode,
+ name=self.codec_name)
+ _TEST_CODECS[self.codec_name] = codec_info
+
+ @contextlib.contextmanager
+ def assertWrapped(self, operation, exc_type, msg):
+ full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
+ operation, self.codec_name, exc_type.__name__, msg)
+ with self.assertRaisesRegex(exc_type, full_msg) as caught:
+ yield caught
+ self.assertIsInstance(caught.exception.__cause__, exc_type)
+ self.assertIsNotNone(caught.exception.__cause__.__traceback__)
+
+ def raise_obj(self, *args, **kwds):
+ # Helper to dynamically change the object raised by a test codec
+ raise self.obj_to_raise
+
+ def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
+ self.obj_to_raise = obj_to_raise
+ self.set_codec(self.raise_obj, self.raise_obj)
+ with self.assertWrapped("encoding", exc_type, msg):
+ "str_input".encode(self.codec_name)
+ with self.assertWrapped("encoding", exc_type, msg):
+ codecs.encode("str_input", self.codec_name)
+ with self.assertWrapped("decoding", exc_type, msg):
+ b"bytes input".decode(self.codec_name)
+ with self.assertWrapped("decoding", exc_type, msg):
+ codecs.decode(b"bytes input", self.codec_name)
+
+ def test_raise_by_type(self):
+ self.check_wrapped(RuntimeError, "")
+
+ def test_raise_by_value(self):
+ msg = "This should be wrapped"
+ self.check_wrapped(RuntimeError(msg), msg)
+
+ def test_raise_grandchild_subclass_exact_size(self):
+ msg = "This should be wrapped"
+ class MyRuntimeError(RuntimeError):
+ __slots__ = ()
+ self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
+
+ def test_raise_subclass_with_weakref_support(self):
+ msg = "This should be wrapped"
+ class MyRuntimeError(RuntimeError):
+ pass
+ self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)
+
+ def check_not_wrapped(self, obj_to_raise, msg):
+ def raise_obj(*args, **kwds):
+ raise obj_to_raise
+ self.set_codec(raise_obj, raise_obj)
+ with self.assertRaisesRegex(RuntimeError, msg):
+ "str input".encode(self.codec_name)
+ with self.assertRaisesRegex(RuntimeError, msg):
+ codecs.encode("str input", self.codec_name)
+ with self.assertRaisesRegex(RuntimeError, msg):
+ b"bytes input".decode(self.codec_name)
+ with self.assertRaisesRegex(RuntimeError, msg):
+ codecs.decode(b"bytes input", self.codec_name)
+
+ def test_init_override_is_not_wrapped(self):
+ class CustomInit(RuntimeError):
+ def __init__(self):
+ pass
+ self.check_not_wrapped(CustomInit, "")
+
+ def test_new_override_is_not_wrapped(self):
+ class CustomNew(RuntimeError):
+ def __new__(cls):
+ return super().__new__(cls)
+ self.check_not_wrapped(CustomNew, "")
+
+ def test_instance_attribute_is_not_wrapped(self):
+ msg = "This should NOT be wrapped"
+ exc = RuntimeError(msg)
+ exc.attr = 1
+ self.check_not_wrapped(exc, "^{}$".format(msg))
+
+ def test_non_str_arg_is_not_wrapped(self):
+ self.check_not_wrapped(RuntimeError(1), "1")
+
+ def test_multiple_args_is_not_wrapped(self):
+ msg_re = r"^\('a', 'b', 'c'\)$"
+ self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)
+
+ # http://bugs.python.org/issue19609
+ def test_codec_lookup_failure_not_wrapped(self):
+ msg = "^unknown encoding: {}$".format(self.codec_name)
+ # The initial codec lookup should not be wrapped
+ with self.assertRaisesRegex(LookupError, msg):
+ "str input".encode(self.codec_name)
+ with self.assertRaisesRegex(LookupError, msg):
+ codecs.encode("str input", self.codec_name)
+ with self.assertRaisesRegex(LookupError, msg):
+ b"bytes input".decode(self.codec_name)
+ with self.assertRaisesRegex(LookupError, msg):
+ codecs.decode(b"bytes input", self.codec_name)
+
+ def test_unflagged_non_text_codec_handling(self):
+ # The stdlib non-text codecs are now marked so they're
+ # pre-emptively skipped by the text model related methods
+ # However, third party codecs won't be flagged, so we still make
+ # sure the case where an inappropriate output type is produced is
+ # handled appropriately
+ def encode_to_str(*args, **kwds):
+ return "not bytes!", 0
+ def decode_to_bytes(*args, **kwds):
+ return b"not str!", 0
+ self.set_codec(encode_to_str, decode_to_bytes)
+ # No input or output type checks on the codecs module functions
+ encoded = codecs.encode(None, self.codec_name)
+ self.assertEqual(encoded, "not bytes!")
+ decoded = codecs.decode(None, self.codec_name)
+ self.assertEqual(decoded, b"not str!")
+ # Text model methods should complain
+ fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
+ "use codecs.encode\(\) to encode to arbitrary types$")
+ msg = fmt.format(self.codec_name)
+ with self.assertRaisesRegex(TypeError, msg):
+ "str_input".encode(self.codec_name)
+ fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
+ "use codecs.decode\(\) to decode to arbitrary types$")
+ msg = fmt.format(self.codec_name)
+ with self.assertRaisesRegex(TypeError, msg):
+ b"bytes input".decode(self.codec_name)
+
@unittest.skipUnless(sys.platform == 'win32',
@@ -2415,8 +2740,8 @@ class CodePageTest(unittest.TestCase):
def test_invalid_code_page(self):
self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
- self.assertRaises(WindowsError, codecs.code_page_encode, 123, 'a')
- self.assertRaises(WindowsError, codecs.code_page_decode, 123, b'a')
+ self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
+ self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')
def test_code_page_name(self):
self.assertRaisesRegex(UnicodeEncodeError, 'cp932',