diff options
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r-- | Lib/test/test_codecs.py | 98 |
1 files changed, 90 insertions, 8 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 8fe21fb920..b93e0ab0e2 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -349,6 +349,8 @@ class ReadTest(MixInCheckStateHandling): self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding) self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"), "[\\udc80]".encode(self.encoding)) + self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"), + "[\\udc80]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"), "[�]".encode(self.encoding)) self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"), @@ -376,6 +378,10 @@ class ReadTest(MixInCheckStateHandling): before + after) self.assertEqual(test_sequence.decode(self.encoding, "replace"), before + self.ill_formed_sequence_replace + after) + backslashreplace = ''.join('\\x%02x' % b + for b in self.ill_formed_sequence) + self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"), + before + backslashreplace + after) class UTF32Test(ReadTest, unittest.TestCase): encoding = "utf-32" @@ -808,6 +814,7 @@ class CP65001Test(ReadTest, unittest.TestCase): ('\udc80', 'ignore', b''), ('\udc80', 'replace', b'?'), ('\udc80', 'backslashreplace', b'\\udc80'), + ('\udc80', 'namereplace', b'\\udc80'), ('\udc80', 'surrogatepass', b'\xed\xb2\x80'), )) else: @@ -869,6 +876,8 @@ class CP65001Test(ReadTest, unittest.TestCase): self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001") self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"), b'[\\udc80]') + self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"), + b'[\\udc80]') self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"), b'[�]') self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"), @@ -890,10 +899,6 @@ class CP65001Test(ReadTest, unittest.TestCase): "\U00010fff\uD800") self.assertTrue(codecs.lookup_error("surrogatepass")) - def test_readline(self): - self.skipTest("issue #20571: code page 65001 codec does not " - "support partial decoder yet") - class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" @@ -1139,6 +1144,7 @@ class UTF8SigTest(UTF8Test, unittest.TestCase): class EscapeDecodeTest(unittest.TestCase): def test_empty(self): self.assertEqual(codecs.escape_decode(b""), (b"", 0)) + self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0)) def test_raw(self): decode = codecs.escape_decode @@ -1357,14 +1363,19 @@ class UnicodeInternalTest(unittest.TestCase): "unicode_internal") if sys.byteorder == "little": invalid = b"\x00\x00\x11\x00" + invalid_backslashreplace = r"\x00\x00\x11\x00" else: invalid = b"\x00\x11\x00\x00" + invalid_backslashreplace = r"\x00\x11\x00\x00" with support.check_warnings(): self.assertRaises(UnicodeDecodeError, invalid.decode, "unicode_internal") with support.check_warnings(): self.assertEqual(invalid.decode("unicode_internal", "replace"), '\ufffd') + with support.check_warnings(): + self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"), + invalid_backslashreplace) @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t') def test_decode_error_attributes(self): @@ -1670,6 +1681,12 @@ class CodecsModuleTest(unittest.TestCase): self.assertEqual(codecs.decode(b'abc'), 'abc') self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii') + # test keywords + self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'), + '\xe4\xf6\xfc') + self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'), + '[]') + def test_encode(self): self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'), b'\xe4\xf6\xfc') @@ -1678,6 +1695,12 @@ class CodecsModuleTest(unittest.TestCase): self.assertEqual(codecs.encode('abc'), b'abc') self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii') + # test keywords + self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'), + b'\xe4\xf6\xfc') + self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'), + b'[]') + def test_register(self): self.assertRaises(TypeError, codecs.register) self.assertRaises(TypeError, codecs.register, 42) @@ -1726,6 +1749,7 @@ class CodecsModuleTest(unittest.TestCase): "register_error", "lookup_error", "strict_errors", "replace_errors", "ignore_errors", "xmlcharrefreplace_errors", "backslashreplace_errors", + "namereplace_errors", "open", "EncodedFile", "iterencode", "iterdecode", "BOM", "BOM_BE", "BOM_LE", @@ -1856,7 +1880,9 @@ all_unicode_encodings = [ "iso8859_9", "johab", "koi8_r", + "koi8_t", "koi8_u", + "kz1048", "latin_1", "mac_cyrillic", "mac_greek", @@ -2087,6 +2113,16 @@ class CharmapTest(unittest.TestCase): ) self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"), + ("ab\\x02", 3) + ) + + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"), + ("ab\\x02", 3) + ) + + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"), ("ab", 3) ) @@ -2163,6 +2199,25 @@ class CharmapTest(unittest.TestCase): ) self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", + {0: 'a', 1: 'b'}), + ("ab\\x02", 3) + ) + + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", + {0: 'a', 1: 'b', 2: None}), + ("ab\\x02", 3) + ) + + # Issue #14850 + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", + {0: 'a', 1: 'b', 2: '\ufffe'}), + ("ab\\x02", 3) + ) + + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "ignore", {0: 'a', 1: 'b'}), ("ab", 3) @@ -2239,6 +2294,18 @@ class CharmapTest(unittest.TestCase): ) self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", + {0: a, 1: b}), + ("ab\\x02", 3) + ) + + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", + {0: a, 1: b, 2: 0xFFFE}), + ("ab\\x02", 3) + ) + + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "ignore", {0: a, 1: b}), ("ab", 3) @@ -2297,9 +2364,13 @@ class TypesTest(unittest.TestCase): self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000") self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) + self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"), + (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000") self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) + self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"), + (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10)) class UnicodeEscapeTest(unittest.TestCase): @@ -2884,15 +2955,15 @@ class CodePageTest(unittest.TestCase): self.assertRaisesRegex(UnicodeEncodeError, 'cp932', codecs.code_page_encode, 932, '\xff') self.assertRaisesRegex(UnicodeDecodeError, 'cp932', - codecs.code_page_decode, 932, b'\x81\x00') + codecs.code_page_decode, 932, b'\x81\x00', 'strict', True) self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', - codecs.code_page_decode, self.CP_UTF8, b'\xff') + codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) def check_decode(self, cp, tests): for raw, errors, expected in tests: if expected is not None: try: - decoded = codecs.code_page_decode(cp, raw, errors) + decoded = codecs.code_page_decode(cp, raw, errors, True) except UnicodeDecodeError as err: self.fail('Unable to decode %a from "cp%s" with ' 'errors=%r: %s' % (raw, cp, errors, err)) @@ -2904,7 +2975,7 @@ class CodePageTest(unittest.TestCase): self.assertLessEqual(decoded[1], len(raw)) else: self.assertRaises(UnicodeDecodeError, - codecs.code_page_decode, cp, raw, errors) + codecs.code_page_decode, cp, raw, errors, True) def check_encode(self, cp, tests): for text, errors, expected in tests: @@ -2932,7 +3003,12 @@ class CodePageTest(unittest.TestCase): ('[\xff]', 'replace', b'[y]'), ('[\u20ac]', 'replace', b'[?]'), ('[\xff]', 'backslashreplace', b'[\\xff]'), + ('[\xff]', 'namereplace', + b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'), ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), + ('\udcff', 'strict', None), + ('[\udcff]', 'surrogateescape', b'[\xff]'), + ('[\udcff]', 'surrogatepass', None), )) self.check_decode(932, ( (b'abc', 'strict', 'abc'), @@ -2941,10 +3017,13 @@ class CodePageTest(unittest.TestCase): (b'[\xff]', 'strict', None), (b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'replace', '[\ufffd]'), + (b'[\xff]', 'backslashreplace', '[\\xff]'), (b'[\xff]', 'surrogateescape', '[\udcff]'), + (b'[\xff]', 'surrogatepass', None), (b'\x81\x00abc', 'strict', None), (b'\x81\x00abc', 'ignore', '\x00abc'), (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), + (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'), )) def test_cp1252(self): @@ -2952,9 +3031,12 @@ class CodePageTest(unittest.TestCase): ('abc', 'strict', b'abc'), ('\xe9\u20ac', 'strict', b'\xe9\x80'), ('\xff', 'strict', b'\xff'), + # test error handlers ('\u0141', 'strict', None), ('\u0141', 'ignore', b''), ('\u0141', 'replace', b'L'), + ('\udc98', 'surrogateescape', b'\x98'), + ('\udc98', 'surrogatepass', None), )) self.check_decode(1252, ( (b'abc', 'strict', 'abc'), |