diff options
author | Benjamin Peterson <benjamin@python.org> | 2014-03-15 12:21:47 -0500 |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2014-03-15 12:21:47 -0500 |
commit | afe58888980f88d8cbcfdd6f97bf849850dcf17e (patch) | |
tree | 9fe2ca3bc2937f5232740a402b4dbde31855607f /Lib/email/utils.py | |
parent | 54ac1ddd428350cbad5f4300bf94dedb75488ec8 (diff) | |
parent | c4e5497ec6fe913bd00e318a828486c2efb20798 (diff) | |
download | cpython-afe58888980f88d8cbcfdd6f97bf849850dcf17e.tar.gz |
merge 3.3
Diffstat (limited to 'Lib/email/utils.py')
-rw-r--r-- | Lib/email/utils.py | 24 |
1 file changed, 17 insertions, 7 deletions
```diff
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index f76c21eb1b..95855d81bd 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -54,17 +54,27 @@ TICK = "'"
 specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[\\"]')
 
-# How to figure out if we are processing strings that come from a byte
-# source with undecodable characters.
-_has_surrogates = re.compile(
-    '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+def _has_surrogates(s):
+    """Return True if s contains surrogate-escaped binary data."""
+    # This check is based on the fact that unless there are surrogates, utf8
+    # (Python's default encoding) can encode any string. This is the fastest
+    # way to check for surrogates, see issue 11454 for timings.
+    try:
+        s.encode()
+        return False
+    except UnicodeEncodeError:
+        return True
 
 # How to deal with a string containing bytes before handing it to the
 # application through the 'normal' interface.
 def _sanitize(string):
-    # Turn any escaped bytes into unicode 'unknown' char.
-    original_bytes = string.encode('ascii', 'surrogateescape')
-    return original_bytes.decode('ascii', 'replace')
+    # Turn any escaped bytes into unicode 'unknown' char. If the escaped
+    # bytes happen to be utf-8 they will instead get decoded, even if they
+    # were invalid in the charset the source was supposed to be in. This
+    # seems like it is not a bad thing; a defect was still registered.
+    original_bytes = string.encode('utf-8', 'surrogateescape')
+    return original_bytes.decode('utf-8', 'replace')
+
 
 
 # Helpers
```