diff options
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r-- | Lib/test/test_codecs.py | 255 |
1 files changed, 193 insertions, 62 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 0479542745..4d91a07868 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -4,16 +4,10 @@ import io import locale import sys import unittest -import warnings import encodings from test import support -if sys.platform == 'win32': - VISTA_OR_LATER = (sys.getwindowsversion().major >= 6) -else: - VISTA_OR_LATER = False - try: import ctypes except ImportError: @@ -27,6 +21,7 @@ def coding_checker(self, coder): self.assertEqual(coder(input), (expect, len(input))) return check + class Queue(object): """ queue: write bytes at one end, read bytes from the other end @@ -47,6 +42,7 @@ class Queue(object): self._buffer = self._buffer[size:] return s + class MixInCheckStateHandling: def check_state_handling_decode(self, encoding, u, s): for i in range(len(s)+1): @@ -80,6 +76,7 @@ class MixInCheckStateHandling: part2 = d.encode(u[i:], True) self.assertEqual(s, part1+part2) + class ReadTest(MixInCheckStateHandling): def check_partial(self, input, partialresults): # get a StreamReader for the encoding and feed the bytestring version @@ -358,6 +355,12 @@ class ReadTest(MixInCheckStateHandling): self.assertEqual("[\uDC80]".encode(self.encoding, "replace"), "[?]".encode(self.encoding)) + # sequential surrogate characters + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"), + "[]".encode(self.encoding)) + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"), + "[??]".encode(self.encoding)) + bom = "".encode(self.encoding) for before, after in [("\U00010fff", "A"), ("[", "]"), ("A", "\U00010fff")]: @@ -383,6 +386,7 @@ class ReadTest(MixInCheckStateHandling): self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), before + backslashreplace + after) + class UTF32Test(ReadTest, unittest.TestCase): encoding = "utf-32" if sys.byteorder == 'little': @@ -478,6 +482,7 @@ class UTF32Test(ReadTest, unittest.TestCase): self.assertEqual('\U00010000' * 1024, codecs.utf_32_decode(encoded_be)[0]) + class UTF32LETest(ReadTest, unittest.TestCase): encoding = "utf-32-le" ill_formed_sequence = b"\x80\xdc\x00\x00" @@ -523,6 +528,7 @@ class UTF32LETest(ReadTest, unittest.TestCase): self.assertEqual('\U00010000' * 1024, codecs.utf_32_le_decode(encoded)[0]) + class UTF32BETest(ReadTest, unittest.TestCase): encoding = "utf-32-be" ill_formed_sequence = b"\x00\x00\xdc\x80" @@ -747,6 +753,7 @@ class UTF8Test(ReadTest, unittest.TestCase): encoding = "utf-8" ill_formed_sequence = b"\xed\xb2\x80" ill_formed_sequence_replace = "\ufffd" * 3 + BOM = b'' def test_partial(self): self.check_partial( @@ -775,27 +782,49 @@ class UTF8Test(ReadTest, unittest.TestCase): self.check_state_handling_decode(self.encoding, u, u.encode(self.encoding)) + def test_decode_error(self): + for data, error_handler, expected in ( + (b'[\x80\xff]', 'ignore', '[]'), + (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), + (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), + (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.decode(self.encoding, error_handler), + expected) + def test_lone_surrogates(self): super().test_lone_surrogates() # not sure if this is making sense for # UTF-16 and UTF-32 - self.assertEqual("[\uDC80]".encode('utf-8', "surrogateescape"), - b'[\x80]') + self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"), + self.BOM + b'[\x80]') + + with self.assertRaises(UnicodeEncodeError) as cm: + "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape") + exc = cm.exception + self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF') def test_surrogatepass_handler(self): - self.assertEqual("abc\ud800def".encode("utf-8", "surrogatepass"), - b"abc\xed\xa0\x80def") - self.assertEqual(b"abc\xed\xa0\x80def".decode("utf-8", "surrogatepass"), + self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"), + self.BOM + b"abc\xed\xa0\x80def") + self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"), + self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80") + self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"), + self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]') + + self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"), "abc\ud800def") - self.assertEqual("\U00010fff\uD800".encode("utf-8", "surrogatepass"), - b"\xf0\x90\xbf\xbf\xed\xa0\x80") - self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("utf-8", "surrogatepass"), + self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"), "\U00010fff\uD800") + self.assertTrue(codecs.lookup_error("surrogatepass")) with self.assertRaises(UnicodeDecodeError): - b"abc\xed\xa0".decode("utf-8", "surrogatepass") + b"abc\xed\xa0".decode(self.encoding, "surrogatepass") with self.assertRaises(UnicodeDecodeError): - b"abc\xed\xa0z".decode("utf-8", "surrogatepass") + b"abc\xed\xa0z".decode(self.encoding, "surrogatepass") + @unittest.skipUnless(sys.platform == 'win32', 'cp65001 is a Windows-only codec') @@ -807,18 +836,13 @@ class CP65001Test(ReadTest, unittest.TestCase): ('abc', 'strict', b'abc'), ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'), ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'), + ('\udc80', 'strict', None), + ('\udc80', 'ignore', b''), + ('\udc80', 'replace', b'?'), + ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'namereplace', b'\\udc80'), + ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), ] - if VISTA_OR_LATER: - tests.extend(( - ('\udc80', 'strict', None), - ('\udc80', 'ignore', b''), - ('\udc80', 'replace', b'?'), - ('\udc80', 'backslashreplace', b'\\udc80'), - ('\udc80', 'namereplace', b'\\udc80'), - ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), - )) - else: - tests.append(('\udc80', 'strict', b'\xed\xb2\x80')) for text, errors, expected in tests: if expected is not None: try: @@ -845,17 +869,10 @@ class CP65001Test(ReadTest, unittest.TestCase): (b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'replace', '[\ufffd]'), (b'[\xff]', 'surrogateescape', '[\udcff]'), + (b'[\xed\xb2\x80]', 'strict', None), + (b'[\xed\xb2\x80]', 'ignore', '[]'), + (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), ] - if VISTA_OR_LATER: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', None), - (b'[\xed\xb2\x80]', 'ignore', '[]'), - (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'), - )) - else: - tests.extend(( - (b'[\xed\xb2\x80]', 'strict', '[\udc80]'), - )) for raw, errors, expected in tests: if expected is not None: try: @@ -870,7 +887,6 @@ class CP65001Test(ReadTest, unittest.TestCase): self.assertRaises(UnicodeDecodeError, raw.decode, 'cp65001', errors) - @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') def test_lone_surrogates(self): self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001") self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") @@ -887,7 +903,6 @@ class CP65001Test(ReadTest, unittest.TestCase): self.assertEqual("[\uDC80]".encode("cp65001", "replace"), b'[?]') - @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later') def test_surrogatepass_handler(self): self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"), b"abc\xed\xa0\x80def") @@ -1059,6 +1074,7 @@ class ReadBufferTest(unittest.TestCase): class UTF8SigTest(UTF8Test, unittest.TestCase): encoding = "utf-8-sig" + BOM = codecs.BOM_UTF8 def test_partial(self): self.check_partial( @@ -1159,7 +1175,7 @@ class EscapeDecodeTest(unittest.TestCase): check(b"[\\\n]", b"[]") check(br'[\"]', b'["]') check(br"[\']", b"[']") - check(br"[\\]", br"[\]") + check(br"[\\]", b"[\\]") check(br"[\a]", b"[\x07]") check(br"[\b]", b"[\x08]") check(br"[\t]", b"[\x09]") @@ -1168,7 +1184,6 @@ class EscapeDecodeTest(unittest.TestCase): check(br"[\f]", b"[\x0c]") check(br"[\r]", b"[\x0d]") check(br"[\7]", b"[\x07]") - check(br"[\8]", br"[\8]") check(br"[\78]", b"[\x078]") check(br"[\41]", b"[!]") check(br"[\418]", b"[!8]") @@ -1176,12 +1191,18 @@ class EscapeDecodeTest(unittest.TestCase): check(br"[\1010]", b"[A0]") check(br"[\501]", b"[A]") check(br"[\x41]", b"[A]") - check(br"[\X41]", br"[\X41]") check(br"[\x410]", b"[A0]") - for b in range(256): - if b not in b'\n"\'\\abtnvfr01234567x': - b = bytes([b]) - check(b'\\' + b, b'\\' + b) + for i in range(97, 123): + b = bytes([i]) + if b not in b'abfnrtvx': + with self.assertWarns(DeprecationWarning): + check(b"\\" + b, b"\\" + b) + with self.assertWarns(DeprecationWarning): + check(b"\\" + b.upper(), b"\\" + b.upper()) + with self.assertWarns(DeprecationWarning): + check(br"\8", b"\\8") + with self.assertWarns(DeprecationWarning): + check(br"\9", b"\\9") def test_errors(self): decode = codecs.escape_decode @@ -1194,6 +1215,7 @@ class EscapeDecodeTest(unittest.TestCase): self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8)) self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8)) + class RecodingTest(unittest.TestCase): def test_recoding(self): f = io.BytesIO() @@ -1313,6 +1335,7 @@ for i in punycode_testcases: if len(i)!=2: print(repr(i)) + class PunycodeTest(unittest.TestCase): def test_encode(self): for uni, puny in punycode_testcases: @@ -1332,6 +1355,7 @@ class PunycodeTest(unittest.TestCase): puny = puny.decode("ascii").encode("ascii") self.assertEqual(uni, puny.decode("punycode")) + class UnicodeInternalTest(unittest.TestCase): @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t') def test_bug1251300(self): @@ -1586,6 +1610,7 @@ class NameprepTest(unittest.TestCase): except Exception as e: raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e))) + class IDNACodecTest(unittest.TestCase): def test_builtin_decode(self): self.assertEqual(str(b"python.org", "idna"), "python.org") @@ -1672,6 +1697,7 @@ class IDNACodecTest(unittest.TestCase): self.assertRaises(Exception, b"python.org".decode, "idna", errors) + class CodecsModuleTest(unittest.TestCase): def test_decode(self): @@ -1780,6 +1806,7 @@ class CodecsModuleTest(unittest.TestCase): self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined', errors) + class StreamReaderTest(unittest.TestCase): def setUp(self): @@ -1790,6 +1817,7 @@ class StreamReaderTest(unittest.TestCase): f = self.reader(self.stream) self.assertEqual(f.readlines(), ['\ud55c\n', '\uae00']) + class EncodedFileTest(unittest.TestCase): def test_basic(self): @@ -1909,6 +1937,8 @@ all_unicode_encodings = [ if hasattr(codecs, "mbcs_encode"): all_unicode_encodings.append("mbcs") +if hasattr(codecs, "oem_encode"): + all_unicode_encodings.append("oem") # The following encoding is not tested, because it's not supposed # to work: @@ -1920,6 +1950,7 @@ broken_unicode_with_stateful = [ "unicode_internal" ] + class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): def test_basics(self): s = "abc123" # all codecs should be able to encode these @@ -2082,6 +2113,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling): self.check_state_handling_decode(encoding, u, u.encode(encoding)) self.check_state_handling_encode(encoding, u, u.encode(encoding)) + class CharmapTest(unittest.TestCase): def test_decode_with_string_map(self): self.assertEqual( @@ -2332,6 +2364,7 @@ class WithStmtTest(unittest.TestCase): info.streamwriter, 'strict') as srw: self.assertEqual(srw.read(), "\xfc") + class TypesTest(unittest.TestCase): def test_decode_unicode(self): # Most decoders don't accept unicode input @@ -2420,7 +2453,6 @@ class UnicodeEscapeTest(unittest.TestCase): check(br"[\f]", "[\x0c]") check(br"[\r]", "[\x0d]") check(br"[\7]", "[\x07]") - check(br"[\8]", r"[\8]") check(br"[\78]", "[\x078]") check(br"[\41]", "[!]") check(br"[\418]", "[!8]") @@ -2430,9 +2462,18 @@ class UnicodeEscapeTest(unittest.TestCase): check(br"[\x410]", "[A0]") check(br"\u20ac", "\u20ac") check(br"\U0001d120", "\U0001d120") - for b in range(256): - if b not in b'\n"\'\\abtnvfr01234567xuUN': - check(b'\\' + bytes([b]), '\\' + chr(b)) + for i in range(97, 123): + b = bytes([i]) + if b not in b'abfnrtuvx': + with self.assertWarns(DeprecationWarning): + check(b"\\" + b, "\\" + chr(i)) + if b.upper() not in b'UN': + with self.assertWarns(DeprecationWarning): + check(b"\\" + b.upper(), "\\" + chr(i-32)) + with self.assertWarns(DeprecationWarning): + check(br"\8", "\\8") + with self.assertWarns(DeprecationWarning): + check(br"\9", "\\9") def test_decode_errors(self): decode = codecs.unicode_escape_decode @@ -2622,6 +2663,7 @@ else: bytes_transform_encodings.append("bz2_codec") transform_aliases["bz2_codec"] = ["bz2"] + class TransformCodecTest(unittest.TestCase): def test_basics(self): @@ -2674,8 +2716,8 @@ class TransformCodecTest(unittest.TestCase): bad_input = "bad input type" for encoding in bytes_transform_encodings: with self.subTest(encoding=encoding): - fmt = ( "{!r} is not a text encoding; " - "use codecs.encode\(\) to handle arbitrary codecs") + fmt = (r"{!r} is not a text encoding; " + r"use codecs.encode\(\) to handle arbitrary codecs") msg = fmt.format(encoding) with self.assertRaisesRegex(LookupError, msg) as failure: bad_input.encode(encoding) @@ -2684,7 +2726,7 @@ class TransformCodecTest(unittest.TestCase): def test_text_to_binary_blacklists_text_transforms(self): # Check str.encode gives a good error message for str -> str codecs msg = (r"^'rot_13' is not a text encoding; " - "use codecs.encode\(\) to handle arbitrary codecs") + r"use codecs.encode\(\) to handle arbitrary codecs") with self.assertRaisesRegex(LookupError, msg): "just an example message".encode("rot_13") @@ -2696,7 +2738,7 @@ class TransformCodecTest(unittest.TestCase): with self.subTest(encoding=encoding): encoded_data = codecs.encode(data, encoding) fmt = (r"{!r} is not a text encoding; " - "use codecs.decode\(\) to handle arbitrary codecs") + r"use codecs.decode\(\) to handle arbitrary codecs") msg = fmt.format(encoding) with self.assertRaisesRegex(LookupError, msg): encoded_data.decode(encoding) @@ -2708,7 +2750,7 @@ class TransformCodecTest(unittest.TestCase): for bad_input in (b"immutable", bytearray(b"mutable")): with self.subTest(bad_input=bad_input): msg = (r"^'rot_13' is not a text encoding; " - "use codecs.decode\(\) to handle arbitrary codecs") + r"use codecs.decode\(\) to handle arbitrary codecs") with self.assertRaisesRegex(LookupError, msg) as failure: bad_input.decode("rot_13") self.assertIsNone(failure.exception.__cause__) @@ -2927,12 +2969,12 @@ class ExceptionChainingTest(unittest.TestCase): self.assertEqual(decoded, b"not str!") # Text model methods should complain fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; " - "use codecs.encode\(\) to encode to arbitrary types$") + r"use codecs.encode\(\) to encode to arbitrary types$") msg = fmt.format(self.codec_name) with self.assertRaisesRegex(TypeError, msg): "str_input".encode(self.codec_name) fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; " - "use codecs.decode\(\) to decode to arbitrary types$") + r"use codecs.decode\(\) to decode to arbitrary types$") msg = fmt.format(self.codec_name) with self.assertRaisesRegex(TypeError, msg): b"bytes input".decode(self.codec_name) @@ -3073,11 +3115,10 @@ class CodePageTest(unittest.TestCase): (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), )) - if VISTA_OR_LATER: - self.check_encode(self.CP_UTF8, ( - ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), - ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), - )) + self.check_encode(self.CP_UTF8, ( + ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'), + ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'), + )) def test_incremental(self): decoded = codecs.code_page_decode(932, b'\x82', 'strict', False) @@ -3098,6 +3139,96 @@ class CodePageTest(unittest.TestCase): False) self.assertEqual(decoded, ('abc', 3)) + def test_mbcs_alias(self): + # Check that looking up our 'default' codepage will return + # mbcs when we don't have a more specific one available + import _bootlocale + def _get_fake_codepage(*a): + return 'cp123' + old_getpreferredencoding = _bootlocale.getpreferredencoding + _bootlocale.getpreferredencoding = _get_fake_codepage + try: + codec = codecs.lookup('cp123') + self.assertEqual(codec.name, 'mbcs') + finally: + _bootlocale.getpreferredencoding = old_getpreferredencoding + + +class ASCIITest(unittest.TestCase): + def test_encode(self): + self.assertEqual('abc123'.encode('ascii'), b'abc123') + + def test_encode_error(self): + for data, error_handler, expected in ( + ('[\x80\xff\u20ac]', 'ignore', b'[]'), + ('[\x80\xff\u20ac]', 'replace', b'[???]'), + ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[€ÿ€]'), + ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace', + b'[\\x80\\xff\\u20ac\\U000abcde]'), + ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.encode('ascii', error_handler), + expected) + + def test_encode_surrogateescape_error(self): + with self.assertRaises(UnicodeEncodeError): + # the first character can be decoded, but not the second + '\udc80\xff'.encode('ascii', 'surrogateescape') + + def test_decode(self): + self.assertEqual(b'abc'.decode('ascii'), 'abc') + + def test_decode_error(self): + for data, error_handler, expected in ( + (b'[\x80\xff]', 'ignore', '[]'), + (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'), + (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'), + (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.decode('ascii', error_handler), + expected) + + +class Latin1Test(unittest.TestCase): + def test_encode(self): + for data, expected in ( + ('abc', b'abc'), + ('\x80\xe9\xff', b'\x80\xe9\xff'), + ): + with self.subTest(data=data, expected=expected): + self.assertEqual(data.encode('latin1'), expected) + + def test_encode_errors(self): + for data, error_handler, expected in ( + ('[\u20ac\udc80]', 'ignore', b'[]'), + ('[\u20ac\udc80]', 'replace', b'[??]'), + ('[\u20ac\U000abcde]', 'backslashreplace', + b'[\\u20ac\\U000abcde]'), + ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[€�]'), + ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), + ): + with self.subTest(data=data, error_handler=error_handler, + expected=expected): + self.assertEqual(data.encode('latin1', error_handler), + expected) + + def test_encode_surrogateescape_error(self): + with self.assertRaises(UnicodeEncodeError): + # the first character can be decoded, but not the second + '\udc80\u20ac'.encode('latin1', 'surrogateescape') + + def test_decode(self): + for data, expected in ( + (b'abc', 'abc'), + (b'[\x80\xff]', '[\x80\xff]'), + ): + with self.subTest(data=data, expected=expected): + self.assertEqual(data.decode('latin1'), expected) + if __name__ == "__main__": unittest.main() |