summary refs log tree commit diff
path: root/Lib/test/test_codecs.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/test/test_codecs.py')
-rw-r--r-- Lib/test/test_codecs.py 98
1 files changed, 90 insertions, 8 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 8fe21fb920..b93e0ab0e2 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -349,6 +349,8 @@ class ReadTest(MixInCheckStateHandling):
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
"[\\udc80]".encode(self.encoding))
+ self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
+ "[\\udc80]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
"[&#56448;]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
@@ -376,6 +378,10 @@ class ReadTest(MixInCheckStateHandling):
before + after)
self.assertEqual(test_sequence.decode(self.encoding, "replace"),
before + self.ill_formed_sequence_replace + after)
+ backslashreplace = ''.join('\\x%02x' % b
+ for b in self.ill_formed_sequence)
+ self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
+ before + backslashreplace + after)
class UTF32Test(ReadTest, unittest.TestCase):
encoding = "utf-32"
@@ -808,6 +814,7 @@ class CP65001Test(ReadTest, unittest.TestCase):
('\udc80', 'ignore', b''),
('\udc80', 'replace', b'?'),
('\udc80', 'backslashreplace', b'\\udc80'),
+ ('\udc80', 'namereplace', b'\\udc80'),
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
))
else:
@@ -869,6 +876,8 @@ class CP65001Test(ReadTest, unittest.TestCase):
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
b'[\\udc80]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
+ b'[\\udc80]')
self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
b'[&#56448;]')
self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
@@ -890,10 +899,6 @@ class CP65001Test(ReadTest, unittest.TestCase):
"\U00010fff\uD800")
self.assertTrue(codecs.lookup_error("surrogatepass"))
- def test_readline(self):
- self.skipTest("issue #20571: code page 65001 codec does not "
- "support partial decoder yet")
-
class UTF7Test(ReadTest, unittest.TestCase):
encoding = "utf-7"
@@ -1139,6 +1144,7 @@ class UTF8SigTest(UTF8Test, unittest.TestCase):
class EscapeDecodeTest(unittest.TestCase):
def test_empty(self):
self.assertEqual(codecs.escape_decode(b""), (b"", 0))
+ self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))
def test_raw(self):
decode = codecs.escape_decode
@@ -1357,14 +1363,19 @@ class UnicodeInternalTest(unittest.TestCase):
"unicode_internal")
if sys.byteorder == "little":
invalid = b"\x00\x00\x11\x00"
+ invalid_backslashreplace = r"\x00\x00\x11\x00"
else:
invalid = b"\x00\x11\x00\x00"
+ invalid_backslashreplace = r"\x00\x11\x00\x00"
with support.check_warnings():
self.assertRaises(UnicodeDecodeError,
invalid.decode, "unicode_internal")
with support.check_warnings():
self.assertEqual(invalid.decode("unicode_internal", "replace"),
'\ufffd')
+ with support.check_warnings():
+ self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
+ invalid_backslashreplace)
@unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
def test_decode_error_attributes(self):
@@ -1670,6 +1681,12 @@ class CodecsModuleTest(unittest.TestCase):
self.assertEqual(codecs.decode(b'abc'), 'abc')
self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
+ # test keywords
+ self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
+ '\xe4\xf6\xfc')
+ self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
+ '[]')
+
def test_encode(self):
self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
b'\xe4\xf6\xfc')
@@ -1678,6 +1695,12 @@ class CodecsModuleTest(unittest.TestCase):
self.assertEqual(codecs.encode('abc'), b'abc')
self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
+ # test keywords
+ self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
+ b'\xe4\xf6\xfc')
+ self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
+ b'[]')
+
def test_register(self):
self.assertRaises(TypeError, codecs.register)
self.assertRaises(TypeError, codecs.register, 42)
@@ -1726,6 +1749,7 @@ class CodecsModuleTest(unittest.TestCase):
"register_error", "lookup_error",
"strict_errors", "replace_errors", "ignore_errors",
"xmlcharrefreplace_errors", "backslashreplace_errors",
+ "namereplace_errors",
"open", "EncodedFile",
"iterencode", "iterdecode",
"BOM", "BOM_BE", "BOM_LE",
@@ -1856,7 +1880,9 @@ all_unicode_encodings = [
"iso8859_9",
"johab",
"koi8_r",
+ "koi8_t",
"koi8_u",
+ "kz1048",
"latin_1",
"mac_cyrillic",
"mac_greek",
@@ -2087,6 +2113,16 @@ class CharmapTest(unittest.TestCase):
)
self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"),
+ ("ab\\x02", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"),
+ ("ab\\x02", 3)
+ )
+
+ self.assertEqual(
codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
("ab", 3)
)
@@ -2163,6 +2199,25 @@ class CharmapTest(unittest.TestCase):
)
self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
+ {0: 'a', 1: 'b'}),
+ ("ab\\x02", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
+ {0: 'a', 1: 'b', 2: None}),
+ ("ab\\x02", 3)
+ )
+
+ # Issue #14850
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
+ {0: 'a', 1: 'b', 2: '\ufffe'}),
+ ("ab\\x02", 3)
+ )
+
+ self.assertEqual(
codecs.charmap_decode(b"\x00\x01\x02", "ignore",
{0: 'a', 1: 'b'}),
("ab", 3)
@@ -2239,6 +2294,18 @@ class CharmapTest(unittest.TestCase):
)
self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
+ {0: a, 1: b}),
+ ("ab\\x02", 3)
+ )
+
+ self.assertEqual(
+ codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
+ {0: a, 1: b, 2: 0xFFFE}),
+ ("ab\\x02", 3)
+ )
+
+ self.assertEqual(
codecs.charmap_decode(b"\x00\x01\x02", "ignore",
{0: a, 1: b}),
("ab", 3)
@@ -2297,9 +2364,13 @@ class TypesTest(unittest.TestCase):
self.assertRaises(UnicodeDecodeError, codecs.unicode_escape_decode, br"\U00110000")
self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+ self.assertEqual(codecs.unicode_escape_decode(r"\U00110000", "backslashreplace"),
+ (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000")
self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10))
+ self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "backslashreplace"),
+ (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
class UnicodeEscapeTest(unittest.TestCase):
@@ -2884,15 +2955,15 @@ class CodePageTest(unittest.TestCase):
self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
codecs.code_page_encode, 932, '\xff')
self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
- codecs.code_page_decode, 932, b'\x81\x00')
+ codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
- codecs.code_page_decode, self.CP_UTF8, b'\xff')
+ codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
def check_decode(self, cp, tests):
for raw, errors, expected in tests:
if expected is not None:
try:
- decoded = codecs.code_page_decode(cp, raw, errors)
+ decoded = codecs.code_page_decode(cp, raw, errors, True)
except UnicodeDecodeError as err:
self.fail('Unable to decode %a from "cp%s" with '
'errors=%r: %s' % (raw, cp, errors, err))
@@ -2904,7 +2975,7 @@ class CodePageTest(unittest.TestCase):
self.assertLessEqual(decoded[1], len(raw))
else:
self.assertRaises(UnicodeDecodeError,
- codecs.code_page_decode, cp, raw, errors)
+ codecs.code_page_decode, cp, raw, errors, True)
def check_encode(self, cp, tests):
for text, errors, expected in tests:
@@ -2932,7 +3003,12 @@ class CodePageTest(unittest.TestCase):
('[\xff]', 'replace', b'[y]'),
('[\u20ac]', 'replace', b'[?]'),
('[\xff]', 'backslashreplace', b'[\\xff]'),
+ ('[\xff]', 'namereplace',
+ b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
+ ('\udcff', 'strict', None),
+ ('[\udcff]', 'surrogateescape', b'[\xff]'),
+ ('[\udcff]', 'surrogatepass', None),
))
self.check_decode(932, (
(b'abc', 'strict', 'abc'),
@@ -2941,10 +3017,13 @@ class CodePageTest(unittest.TestCase):
(b'[\xff]', 'strict', None),
(b'[\xff]', 'ignore', '[]'),
(b'[\xff]', 'replace', '[\ufffd]'),
+ (b'[\xff]', 'backslashreplace', '[\\xff]'),
(b'[\xff]', 'surrogateescape', '[\udcff]'),
+ (b'[\xff]', 'surrogatepass', None),
(b'\x81\x00abc', 'strict', None),
(b'\x81\x00abc', 'ignore', '\x00abc'),
(b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
+ (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
))
def test_cp1252(self):
@@ -2952,9 +3031,12 @@ class CodePageTest(unittest.TestCase):
('abc', 'strict', b'abc'),
('\xe9\u20ac', 'strict', b'\xe9\x80'),
('\xff', 'strict', b'\xff'),
+ # test error handlers
('\u0141', 'strict', None),
('\u0141', 'ignore', b''),
('\u0141', 'replace', b'L'),
+ ('\udc98', 'surrogateescape', b'\x98'),
+ ('\udc98', 'surrogatepass', None),
))
self.check_decode(1252, (
(b'abc', 'strict', 'abc'),