merge 3.3

author: Benjamin Peterson <benjamin@python.org> 2014-03-15 12:21:47 -0500
committer: Benjamin Peterson <benjamin@python.org> 2014-03-15 12:21:47 -0500
commit: afe58888980f88d8cbcfdd6f97bf849850dcf17e (patch)
tree: 9fe2ca3bc2937f5232740a402b4dbde31855607f /Lib/email/utils.py
parent: 54ac1ddd428350cbad5f4300bf94dedb75488ec8 (diff)
parent: c4e5497ec6fe913bd00e318a828486c2efb20798 (diff)
download: cpython-afe58888980f88d8cbcfdd6f97bf849850dcf17e.tar.gz
1 files changed, 17 insertions, 7 deletions
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index f76c21eb1b..95855d81bd 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -54,17 +54,27 @@ TICK = "'"
 specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[\\"]')
 
-# How to figure out if we are processing strings that come from a byte
-# source with undecodable characters.
-_has_surrogates = re.compile(
-    '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+def _has_surrogates(s):
+    """Return True if s contains surrogate-escaped binary data."""
+    # This check is based on the fact that unless there are surrogates, utf8
+    # (Python's default encoding) can encode any string.  This is the fastest
+    # way to check for surrogates, see issue 11454 for timings.
+    try:
+        s.encode()
+        return False
+    except UnicodeEncodeError:
+        return True
 
 # How to deal with a string containing bytes before handing it to the
 # application through the 'normal' interface.
 def _sanitize(string):
-    # Turn any escaped bytes into unicode 'unknown' char.
-    original_bytes = string.encode('ascii', 'surrogateescape')
-    return original_bytes.decode('ascii', 'replace')
+    # Turn any escaped bytes into unicode 'unknown' char.  If the escaped
+    # bytes happen to be utf-8 they will instead get decoded, even if they
+    # were invalid in the charset the source was supposed to be in.  This
+    # seems like it is not a bad thing; a defect was still registered.
+    original_bytes = string.encode('utf-8', 'surrogateescape')
+    return original_bytes.decode('utf-8', 'replace')
+
 
 
 # Helpers
author	Benjamin Peterson <benjamin@python.org>	2014-03-15 12:21:47 -0500
committer	Benjamin Peterson <benjamin@python.org>	2014-03-15 12:21:47 -0500
commit	afe58888980f88d8cbcfdd6f97bf849850dcf17e (patch)
tree	9fe2ca3bc2937f5232740a402b4dbde31855607f /Lib/email/utils.py
parent	54ac1ddd428350cbad5f4300bf94dedb75488ec8 (diff)
parent	c4e5497ec6fe913bd00e318a828486c2efb20798 (diff)
download	cpython-afe58888980f88d8cbcfdd6f97bf849850dcf17e.tar.gz