summaryrefslogtreecommitdiff
path: root/Lib/email
diff options
context:
space:
mode:
authorR David Murray <rdmurray@bitdance.com>2014-02-08 11:51:18 -0500
committerR David Murray <rdmurray@bitdance.com>2014-02-08 11:51:18 -0500
commita055d4d797b347ad45b2becdf5b8ec86dc135f20 (patch)
treed2a97624b5a31dea7b6be6398e95a9430cf1dd42 /Lib/email
parent37b69c6a9e207f93e6a3184bde8d434c81777996 (diff)
parent36c69903355c4b54119021ae16f844bb2f7e3942 (diff)
downloadcpython-a055d4d797b347ad45b2becdf5b8ec86dc135f20.tar.gz
Merge #19772: Do not mutate message when downcoding to 7bit.
Diffstat (limited to 'Lib/email')
-rw-r--r--Lib/email/_header_value_parser.py37
-rw-r--r--Lib/email/contentmanager.py249
-rw-r--r--Lib/email/encoders.py17
-rw-r--r--Lib/email/feedparser.py52
-rw-r--r--Lib/email/iterators.py6
-rw-r--r--Lib/email/message.py260
-rw-r--r--Lib/email/parser.py5
-rw-r--r--Lib/email/policy.py13
-rw-r--r--Lib/email/utils.py24
9 files changed, 578 insertions, 85 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 291437c586..039237936c 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -368,8 +368,7 @@ class TokenList(list):
yield (indent + ' !! invalid element in token '
'list: {!r}'.format(token))
else:
- for line in token._pp(indent+' '):
- yield line
+ yield from token._pp(indent+' ')
if self.defects:
extra = ' Defects: {}'.format(self.defects)
else:
@@ -1315,24 +1314,22 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
# Parser
#
-"""Parse strings according to RFC822/2047/2822/5322 rules.
-
-This is a stateless parser. Each get_XXX function accepts a string and
-returns either a Terminal or a TokenList representing the RFC object named
-by the method and a string containing the remaining unparsed characters
-from the input. Thus a parser method consumes the next syntactic construct
-of a given type and returns a token representing the construct plus the
-unparsed remainder of the input string.
-
-For example, if the first element of a structured header is a 'phrase',
-then:
-
- phrase, value = get_phrase(value)
-
-returns the complete phrase from the start of the string value, plus any
-characters left in the string after the phrase is removed.
-
-"""
+# Parse strings according to RFC822/2047/2822/5322 rules.
+#
+# This is a stateless parser. Each get_XXX function accepts a string and
+# returns either a Terminal or a TokenList representing the RFC object named
+# by the method and a string containing the remaining unparsed characters
+# from the input. Thus a parser method consumes the next syntactic construct
+# of a given type and returns a token representing the construct plus the
+# unparsed remainder of the input string.
+#
+# For example, if the first element of a structured header is a 'phrase',
+# then:
+#
+# phrase, value = get_phrase(value)
+#
+# returns the complete phrase from the start of the string value, plus any
+# characters left in the string after the phrase is removed.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py
new file mode 100644
index 0000000000..d3636529b6
--- /dev/null
+++ b/Lib/email/contentmanager.py
@@ -0,0 +1,249 @@
+import binascii
+import email.charset
+import email.message
+import email.errors
+from email import quoprimime
+
+class ContentManager:
+
+ def __init__(self):
+ self.get_handlers = {}
+ self.set_handlers = {}
+
+ def add_get_handler(self, key, handler):
+ self.get_handlers[key] = handler
+
+ def get_content(self, msg, *args, **kw):
+ content_type = msg.get_content_type()
+ if content_type in self.get_handlers:
+ return self.get_handlers[content_type](msg, *args, **kw)
+ maintype = msg.get_content_maintype()
+ if maintype in self.get_handlers:
+ return self.get_handlers[maintype](msg, *args, **kw)
+ if '' in self.get_handlers:
+ return self.get_handlers[''](msg, *args, **kw)
+ raise KeyError(content_type)
+
+ def add_set_handler(self, typekey, handler):
+ self.set_handlers[typekey] = handler
+
+ def set_content(self, msg, obj, *args, **kw):
+ if msg.get_content_maintype() == 'multipart':
+ # XXX: is this error a good idea or not? We can remove it later,
+ # but we can't add it later, so do it for now.
+ raise TypeError("set_content not valid on multipart")
+ handler = self._find_set_handler(msg, obj)
+ msg.clear_content()
+ handler(msg, obj, *args, **kw)
+
+ def _find_set_handler(self, msg, obj):
+ full_path_for_error = None
+ for typ in type(obj).__mro__:
+ if typ in self.set_handlers:
+ return self.set_handlers[typ]
+ qname = typ.__qualname__
+ modname = getattr(typ, '__module__', '')
+ full_path = '.'.join((modname, qname)) if modname else qname
+ if full_path_for_error is None:
+ full_path_for_error = full_path
+ if full_path in self.set_handlers:
+ return self.set_handlers[full_path]
+ if qname in self.set_handlers:
+ return self.set_handlers[qname]
+ name = typ.__name__
+ if name in self.set_handlers:
+ return self.set_handlers[name]
+ if None in self.set_handlers:
+ return self.set_handlers[None]
+ raise KeyError(full_path_for_error)
+
+
+raw_data_manager = ContentManager()
+
+
+def get_text_content(msg, errors='replace'):
+ content = msg.get_payload(decode=True)
+ charset = msg.get_param('charset', 'ASCII')
+ return content.decode(charset, errors=errors)
+raw_data_manager.add_get_handler('text', get_text_content)
+
+
+def get_non_text_content(msg):
+ return msg.get_payload(decode=True)
+for maintype in 'audio image video application'.split():
+ raw_data_manager.add_get_handler(maintype, get_non_text_content)
+
+
+def get_message_content(msg):
+ return msg.get_payload(0)
+for subtype in 'rfc822 external-body'.split():
+ raw_data_manager.add_get_handler('message/'+subtype, get_message_content)
+
+
+def get_and_fixup_unknown_message_content(msg):
+ # If we don't understand a message subtype, we are supposed to treat it as
+ # if it were application/octet-stream, per
+ # tools.ietf.org/html/rfc2046#section-5.2.4. Feedparser doesn't do that,
+ # so do our best to fix things up. Note that it is *not* appropriate to
+ # model message/partial content as Message objects, so they are handled
+ # here as well. (How to reassemble them is out of scope for this comment :)
+ return bytes(msg.get_payload(0))
+raw_data_manager.add_get_handler('message',
+ get_and_fixup_unknown_message_content)
+
+
+def _prepare_set(msg, maintype, subtype, headers):
+ msg['Content-Type'] = '/'.join((maintype, subtype))
+ if headers:
+ if not hasattr(headers[0], 'name'):
+ mp = msg.policy
+ headers = [mp.header_factory(*mp.header_source_parse([header]))
+ for header in headers]
+ try:
+ for header in headers:
+ if header.defects:
+ raise header.defects[0]
+ msg[header.name] = header
+ except email.errors.HeaderDefect as exc:
+ raise ValueError("Invalid header: {}".format(
+ header.fold(policy=msg.policy))) from exc
+
+
+def _finalize_set(msg, disposition, filename, cid, params):
+ if disposition is None and filename is not None:
+ disposition = 'attachment'
+ if disposition is not None:
+ msg['Content-Disposition'] = disposition
+ if filename is not None:
+ msg.set_param('filename',
+ filename,
+ header='Content-Disposition',
+ replace=True)
+ if cid is not None:
+ msg['Content-ID'] = cid
+ if params is not None:
+ for key, value in params.items():
+ msg.set_param(key, value)
+
+
+# XXX: This is a cleaned-up version of base64mime.body_encode. It would
+# be nice to drop both this and quoprimime.body_encode in favor of
+# enhanced binascii routines that accepted a max_line_length parameter.
+def _encode_base64(data, max_line_length):
+ encoded_lines = []
+ unencoded_bytes_per_line = max_line_length * 3 // 4
+ for i in range(0, len(data), unencoded_bytes_per_line):
+ thisline = data[i:i+unencoded_bytes_per_line]
+ encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii'))
+ return ''.join(encoded_lines)
+
+
+def _encode_text(string, charset, cte, policy):
+ lines = string.encode(charset).splitlines()
+ linesep = policy.linesep.encode('ascii')
+ def embeded_body(lines): return linesep.join(lines) + linesep
+ def normal_body(lines): return b'\n'.join(lines) + b'\n'
+ if cte==None:
+ # Use heuristics to decide on the "best" encoding.
+ try:
+ return '7bit', normal_body(lines).decode('ascii')
+ except UnicodeDecodeError:
+ pass
+ if (policy.cte_type == '8bit' and
+ max(len(x) for x in lines) <= policy.max_line_length):
+ return '8bit', normal_body(lines).decode('ascii', 'surrogateescape')
+ sniff = embeded_body(lines[:10])
+ sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'),
+ policy.max_line_length)
+ sniff_base64 = binascii.b2a_base64(sniff)
+ # This is a little unfair to qp; it includes lineseps, base64 doesn't.
+ if len(sniff_qp) > len(sniff_base64):
+ cte = 'base64'
+ else:
+ cte = 'quoted-printable'
+ if len(lines) <= 10:
+ return cte, sniff_qp
+ if cte == '7bit':
+ data = normal_body(lines).decode('ascii')
+ elif cte == '8bit':
+ data = normal_body(lines).decode('ascii', 'surrogateescape')
+ elif cte == 'quoted-printable':
+ data = quoprimime.body_encode(normal_body(lines).decode('latin-1'),
+ policy.max_line_length)
+ elif cte == 'base64':
+ data = _encode_base64(embeded_body(lines), policy.max_line_length)
+ else:
+ raise ValueError("Unknown content transfer encoding {}".format(cte))
+ return cte, data
+
+
+def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
+ disposition=None, filename=None, cid=None,
+ params=None, headers=None):
+ _prepare_set(msg, 'text', subtype, headers)
+ cte, payload = _encode_text(string, charset, cte, msg.policy)
+ msg.set_payload(payload)
+ msg.set_param('charset',
+ email.charset.ALIASES.get(charset, charset),
+ replace=True)
+ msg['Content-Transfer-Encoding'] = cte
+ _finalize_set(msg, disposition, filename, cid, params)
+raw_data_manager.add_set_handler(str, set_text_content)
+
+
+def set_message_content(msg, message, subtype="rfc822", cte=None,
+ disposition=None, filename=None, cid=None,
+ params=None, headers=None):
+ if subtype == 'partial':
+ raise ValueError("message/partial is not supported for Message objects")
+ if subtype == 'rfc822':
+ if cte not in (None, '7bit', '8bit', 'binary'):
+ # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate.
+ raise ValueError(
+ "message/rfc822 parts do not support cte={}".format(cte))
+ # 8bit will get coerced on serialization if policy.cte_type='7bit'. We
+ # may end up claiming 8bit when it isn't needed, but the only negative
+ # result of that should be a gateway that needs to coerce to 7bit
+ # having to look through the whole embedded message to discover whether
+ # or not it actually has to do anything.
+ cte = '8bit' if cte is None else cte
+ elif subtype == 'external-body':
+ if cte not in (None, '7bit'):
+ # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate.
+ raise ValueError(
+ "message/external-body parts do not support cte={}".format(cte))
+ cte = '7bit'
+ elif cte is None:
+ # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future
+ # subtypes should be restricted to 7bit, so assume that.
+ cte = '7bit'
+ _prepare_set(msg, 'message', subtype, headers)
+ msg.set_payload([message])
+ msg['Content-Transfer-Encoding'] = cte
+ _finalize_set(msg, disposition, filename, cid, params)
+raw_data_manager.add_set_handler(email.message.Message, set_message_content)
+
+
+def set_bytes_content(msg, data, maintype, subtype, cte='base64',
+ disposition=None, filename=None, cid=None,
+ params=None, headers=None):
+ _prepare_set(msg, maintype, subtype, headers)
+ if cte == 'base64':
+ data = _encode_base64(data, max_line_length=msg.policy.max_line_length)
+ elif cte == 'quoted-printable':
+ # XXX: quoprimime.body_encode won't encode newline characters in data,
+ # so we can't use it. This means max_line_length is ignored. Another
+ # bug to fix later. (Note: encoders.quopri is broken on line ends.)
+ data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True)
+ data = data.decode('ascii')
+ elif cte == '7bit':
+ # Make sure it really is only ASCII. The early warning here seems
+ # worth the overhead...if you care write your own content manager :).
+ data.encode('ascii')
+ elif cte in ('8bit', 'binary'):
+ data = data.decode('ascii', 'surrogateescape')
+ msg.set_payload(data)
+ msg['Content-Transfer-Encoding'] = cte
+ _finalize_set(msg, disposition, filename, cid, params)
+for typ in (bytes, bytearray, memoryview):
+ raw_data_manager.add_set_handler(typ, set_bytes_content)
diff --git a/Lib/email/encoders.py b/Lib/email/encoders.py
index f9657f0a25..0a66acb624 100644
--- a/Lib/email/encoders.py
+++ b/Lib/email/encoders.py
@@ -54,21 +54,12 @@ def encode_7or8bit(msg):
# There's no payload. For backwards compatibility we use 7bit
msg['Content-Transfer-Encoding'] = '7bit'
return
- # We play a trick to make this go fast. If encoding/decode to ASCII
- # succeeds, we know the data must be 7bit, otherwise treat it as 8bit.
+ # We play a trick to make this go fast. If decoding from ASCII succeeds,
+ # we know the data must be 7bit, otherwise treat it as 8bit.
try:
- if isinstance(orig, str):
- orig.encode('ascii')
- else:
- orig.decode('ascii')
+ orig.decode('ascii')
except UnicodeError:
- charset = msg.get_charset()
- output_cset = charset and charset.output_charset
- # iso-2022-* is non-ASCII but encodes to a 7-bit representation
- if output_cset and output_cset.lower().startswith('iso-2022-'):
- msg['Content-Transfer-Encoding'] = '7bit'
- else:
- msg['Content-Transfer-Encoding'] = '8bit'
+ msg['Content-Transfer-Encoding'] = '8bit'
else:
msg['Content-Transfer-Encoding'] = '7bit'
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index ea41e9571d..6cf9b91c1f 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
"""Push some new data into this object."""
# Handle any previous leftovers
data, self._partial = self._partial + data, ''
- # Crack into lines, but preserve the newlines on the end of each
- parts = NLCRE_crack.split(data)
- # The *ahem* interesting behaviour of re.split when supplied grouping
- # parentheses is that the last element of the resulting list is the
- # data after the final RE. In the case of a NL/CR terminated string,
- # this is the empty string.
- self._partial = parts.pop()
- #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
- # is there a \n to follow later?
- if not self._partial and parts and parts[-1].endswith('\r'):
- self._partial = parts.pop(-2)+parts.pop()
- # parts is a list of strings, alternating between the line contents
- # and the eol character(s). Gather up a list of lines after
- # re-attaching the newlines.
- lines = []
- for i in range(len(parts) // 2):
- lines.append(parts[i*2] + parts[i*2+1])
- self.pushlines(lines)
+ # Crack into lines, but preserve the linesep characters on the end of each
+ parts = data.splitlines(True)
+ # If the last element of the list does not end in a newline, then treat
+ # it as a partial line. We only check for '\n' here because a line
+ # ending with '\r' might be a line that was split in the middle of a
+ # '\r\n' sequence (see bugs 1555570 and 1721862).
+ if parts and not parts[-1].endswith('\n'):
+ self._partial = parts.pop()
+ self.pushlines(parts)
def pushlines(self, lines):
# Reverse and insert at the front of the lines.
@@ -135,7 +126,7 @@ class BufferedSubFile(object):
class FeedParser:
"""A feed-style parser of email."""
- def __init__(self, _factory=message.Message, *, policy=compat32):
+ def __init__(self, _factory=None, *, policy=compat32):
"""_factory is called with no arguments to create a new message obj
The policy keyword specifies a policy object that controls a number of
@@ -143,14 +134,23 @@ class FeedParser:
backward compatibility.
"""
- self._factory = _factory
self.policy = policy
- try:
- _factory(policy=self.policy)
- self._factory_kwds = lambda: {'policy': self.policy}
- except TypeError:
- # Assume this is an old-style factory
- self._factory_kwds = lambda: {}
+ self._factory_kwds = lambda: {'policy': self.policy}
+ if _factory is None:
+ # What this should be:
+ #self._factory = policy.default_message_factory
+ # but, because we are post 3.4 feature freeze, fix with temp hack:
+ if self.policy is compat32:
+ self._factory = message.Message
+ else:
+ self._factory = message.EmailMessage
+ else:
+ self._factory = _factory
+ try:
+ _factory(policy=self.policy)
+ except TypeError:
+ # Assume this is an old-style factory
+ self._factory_kwds = lambda: {}
self._input = BufferedSubFile()
self._msgstack = []
self._parse = self._parsegen().__next__
diff --git a/Lib/email/iterators.py b/Lib/email/iterators.py
index 3adc4a04ba..b5502ee975 100644
--- a/Lib/email/iterators.py
+++ b/Lib/email/iterators.py
@@ -26,8 +26,7 @@ def walk(self):
yield self
if self.is_multipart():
for subpart in self.get_payload():
- for subsubpart in subpart.walk():
- yield subsubpart
+ yield from subpart.walk()
@@ -40,8 +39,7 @@ def body_line_iterator(msg, decode=False):
for subpart in msg.walk():
payload = subpart.get_payload(decode=decode)
if isinstance(payload, str):
- for line in StringIO(payload):
- yield line
+ yield from StringIO(payload)
def typed_subpart_iterator(msg, maintype='text', subtype=None):
diff --git a/Lib/email/message.py b/Lib/email/message.py
index afe350c902..88b5fa3552 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -8,8 +8,6 @@ __all__ = ['Message']
import re
import uu
-import base64
-import binascii
from io import BytesIO, StringIO
# Intrapackage imports
@@ -132,22 +130,50 @@ class Message:
def __str__(self):
"""Return the entire formatted message as a string.
- This includes the headers, body, and envelope header.
"""
return self.as_string()
- def as_string(self, unixfrom=False, maxheaderlen=0):
+ def as_string(self, unixfrom=False, maxheaderlen=0, policy=None):
"""Return the entire formatted message as a string.
- Optional `unixfrom' when True, means include the Unix From_ envelope
- header.
- This is a convenience method and may not generate the message exactly
- as you intend. For more flexibility, use the flatten() method of a
- Generator instance.
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. For backward compatibility reasons, if maxheaderlen is
+ not specified it defaults to 0, so you must override it explicitly
+ if you want a different maxheaderlen. 'policy' is passed to the
+ Generator instance used to serialize the mesasge; if it is not
+ specified the policy associated with the message instance is used.
+
+ If the message object contains binary data that is not encoded
+ according to RFC standards, the non-compliant data will be replaced by
+ unicode "unknown character" code points.
"""
from email.generator import Generator
+ policy = self.policy if policy is None else policy
fp = StringIO()
- g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen)
+ g = Generator(fp,
+ mangle_from_=False,
+ maxheaderlen=maxheaderlen,
+ policy=policy)
+ g.flatten(self, unixfrom=unixfrom)
+ return fp.getvalue()
+
+ def __bytes__(self):
+ """Return the entire formatted message as a bytes object.
+ """
+ return self.as_bytes()
+
+ def as_bytes(self, unixfrom=False, policy=None):
+ """Return the entire formatted message as a bytes object.
+
+ Optional 'unixfrom', when true, means include the Unix From_ envelope
+ header. 'policy' is passed to the BytesGenerator instance used to
+ serialize the message; if not specified the policy associated with
+ the message instance is used.
+ """
+ from email.generator import BytesGenerator
+ policy = self.policy if policy is None else policy
+ fp = BytesIO()
+ g = BytesGenerator(fp, mangle_from_=False, policy=policy)
g.flatten(self, unixfrom=unixfrom)
return fp.getvalue()
@@ -668,7 +694,7 @@ class Message:
return failobj
def set_param(self, param, value, header='Content-Type', requote=True,
- charset=None, language=''):
+ charset=None, language='', replace=False):
"""Set a parameter in the Content-Type header.
If the parameter already exists in the header, its value will be
@@ -712,8 +738,11 @@ class Message:
else:
ctype = SEMISPACE.join([ctype, append_param])
if ctype != self.get(header):
- del self[header]
- self[header] = ctype
+ if replace:
+ self.replace_header(header, ctype)
+ else:
+ del self[header]
+ self[header] = ctype
def del_param(self, param, header='content-type', requote=True):
"""Remove the given parameter completely from the Content-Type header.
@@ -894,3 +923,208 @@ class Message:
# I.e. def walk(self): ...
from email.iterators import walk
+
+
+class MIMEPart(Message):
+
+ def __init__(self, policy=None):
+ if policy is None:
+ from email.policy import default
+ policy = default
+ Message.__init__(self, policy)
+
+ @property
+ def is_attachment(self):
+ c_d = self.get('content-disposition')
+ if c_d is None:
+ return False
+ return c_d.lower() == 'attachment'
+
+ def _find_body(self, part, preferencelist):
+ if part.is_attachment:
+ return
+ maintype, subtype = part.get_content_type().split('/')
+ if maintype == 'text':
+ if subtype in preferencelist:
+ yield (preferencelist.index(subtype), part)
+ return
+ if maintype != 'multipart':
+ return
+ if subtype != 'related':
+ for subpart in part.iter_parts():
+ yield from self._find_body(subpart, preferencelist)
+ return
+ if 'related' in preferencelist:
+ yield (preferencelist.index('related'), part)
+ candidate = None
+ start = part.get_param('start')
+ if start:
+ for subpart in part.iter_parts():
+ if subpart['content-id'] == start:
+ candidate = subpart
+ break
+ if candidate is None:
+ subparts = part.get_payload()
+ candidate = subparts[0] if subparts else None
+ if candidate is not None:
+ yield from self._find_body(candidate, preferencelist)
+
+ def get_body(self, preferencelist=('related', 'html', 'plain')):
+ """Return best candidate mime part for display as 'body' of message.
+
+ Do a depth first search, starting with self, looking for the first part
+ matching each of the items in preferencelist, and return the part
+ corresponding to the first item that has a match, or None if no items
+ have a match. If 'related' is not included in preferencelist, consider
+ the root part of any multipart/related encountered as a candidate
+ match. Ignore parts with 'Content-Disposition: attachment'.
+ """
+ best_prio = len(preferencelist)
+ body = None
+ for prio, part in self._find_body(self, preferencelist):
+ if prio < best_prio:
+ best_prio = prio
+ body = part
+ if prio == 0:
+ break
+ return body
+
+ _body_types = {('text', 'plain'),
+ ('text', 'html'),
+ ('multipart', 'related'),
+ ('multipart', 'alternative')}
+ def iter_attachments(self):
+ """Return an iterator over the non-main parts of a multipart.
+
+ Skip the first of each occurrence of text/plain, text/html,
+ multipart/related, or multipart/alternative in the multipart (unless
+ they have a 'Content-Disposition: attachment' header) and include all
+ remaining subparts in the returned iterator. When applied to a
+ multipart/related, return all parts except the root part. Return an
+ empty iterator when applied to a multipart/alternative or a
+ non-multipart.
+ """
+ maintype, subtype = self.get_content_type().split('/')
+ if maintype != 'multipart' or subtype == 'alternative':
+ return
+ parts = self.get_payload()
+ if maintype == 'multipart' and subtype == 'related':
+ # For related, we treat everything but the root as an attachment.
+ # The root may be indicated by 'start'; if there's no start or we
+ # can't find the named start, treat the first subpart as the root.
+ start = self.get_param('start')
+ if start:
+ found = False
+ attachments = []
+ for part in parts:
+ if part.get('content-id') == start:
+ found = True
+ else:
+ attachments.append(part)
+ if found:
+ yield from attachments
+ return
+ parts.pop(0)
+ yield from parts
+ return
+ # Otherwise we more or less invert the remaining logic in get_body.
+ # This only really works in edge cases (ex: non-text relateds or
+ # alternatives) if the sending agent sets content-disposition.
+ seen = [] # Only skip the first example of each candidate type.
+ for part in parts:
+ maintype, subtype = part.get_content_type().split('/')
+ if ((maintype, subtype) in self._body_types and
+ not part.is_attachment and subtype not in seen):
+ seen.append(subtype)
+ continue
+ yield part
+
+ def iter_parts(self):
+ """Return an iterator over all immediate subparts of a multipart.
+
+ Return an empty iterator for a non-multipart.
+ """
+ if self.get_content_maintype() == 'multipart':
+ yield from self.get_payload()
+
+ def get_content(self, *args, content_manager=None, **kw):
+ if content_manager is None:
+ content_manager = self.policy.content_manager
+ return content_manager.get_content(self, *args, **kw)
+
+ def set_content(self, *args, content_manager=None, **kw):
+ if content_manager is None:
+ content_manager = self.policy.content_manager
+ content_manager.set_content(self, *args, **kw)
+
+ def _make_multipart(self, subtype, disallowed_subtypes, boundary):
+ if self.get_content_maintype() == 'multipart':
+ existing_subtype = self.get_content_subtype()
+ disallowed_subtypes = disallowed_subtypes + (subtype,)
+ if existing_subtype in disallowed_subtypes:
+ raise ValueError("Cannot convert {} to {}".format(
+ existing_subtype, subtype))
+ keep_headers = []
+ part_headers = []
+ for name, value in self._headers:
+ if name.lower().startswith('content-'):
+ part_headers.append((name, value))
+ else:
+ keep_headers.append((name, value))
+ if part_headers:
+ # There is existing content, move it to the first subpart.
+ part = type(self)(policy=self.policy)
+ part._headers = part_headers
+ part._payload = self._payload
+ self._payload = [part]
+ else:
+ self._payload = []
+ self._headers = keep_headers
+ self['Content-Type'] = 'multipart/' + subtype
+ if boundary is not None:
+ self.set_param('boundary', boundary)
+
+ def make_related(self, boundary=None):
+ self._make_multipart('related', ('alternative', 'mixed'), boundary)
+
+ def make_alternative(self, boundary=None):
+ self._make_multipart('alternative', ('mixed',), boundary)
+
+ def make_mixed(self, boundary=None):
+ self._make_multipart('mixed', (), boundary)
+
+ def _add_multipart(self, _subtype, *args, _disp=None, **kw):
+ if (self.get_content_maintype() != 'multipart' or
+ self.get_content_subtype() != _subtype):
+ getattr(self, 'make_' + _subtype)()
+ part = type(self)(policy=self.policy)
+ part.set_content(*args, **kw)
+ if _disp and 'content-disposition' not in part:
+ part['Content-Disposition'] = _disp
+ self.attach(part)
+
+ def add_related(self, *args, **kw):
+ self._add_multipart('related', *args, _disp='inline', **kw)
+
+ def add_alternative(self, *args, **kw):
+ self._add_multipart('alternative', *args, **kw)
+
+ def add_attachment(self, *args, **kw):
+ self._add_multipart('mixed', *args, _disp='attachment', **kw)
+
+ def clear(self):
+ self._headers = []
+ self._payload = None
+
+ def clear_content(self):
+ self._headers = [(n, v) for n, v in self._headers
+ if not n.lower().startswith('content-')]
+ self._payload = None
+
+
+class EmailMessage(MIMEPart):
+
+ def set_content(self, *args, **kw):
+ super().set_content(*args, **kw)
+ if 'MIME-Version' not in self:
+ self['MIME-Version'] = '1.0'
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
index 752bf35a6e..ed512b114f 100644
--- a/Lib/email/parser.py
+++ b/Lib/email/parser.py
@@ -4,7 +4,8 @@
"""A parser of RFC 2822 and MIME email messages."""
-__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser']
+__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser',
+ 'FeedParser', 'BytesFeedParser']
import warnings
from io import StringIO, TextIOWrapper
@@ -16,7 +17,7 @@ from email._policybase import compat32
class Parser:
- def __init__(self, _class=Message, *, policy=compat32):
+ def __init__(self, _class=None, *, policy=compat32):
"""Parser of RFC 2822 and MIME email messages.
Creates an in-memory object tree representing the email message, which
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
index 38e88afe1d..f0b20f4b19 100644
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -5,6 +5,7 @@ code that adds all the email6 features.
from email._policybase import Policy, Compat32, compat32, _extend_docstrings
from email.utils import _has_surrogates
from email.headerregistry import HeaderRegistry as HeaderRegistry
+from email.contentmanager import raw_data_manager
__all__ = [
'Compat32',
@@ -58,10 +59,22 @@ class EmailPolicy(Policy):
special treatment, while all other fields are
treated as unstructured. This list will be
completed before the extension is marked stable.)
+
+ content_manager -- an object with at least two methods: get_content
+ and set_content. When the get_content or
+ set_content method of a Message object is called,
+ it calls the corresponding method of this object,
+ passing it the message object as its first argument,
+ and any arguments or keywords that were passed to
+ it as additional arguments. The default
+ content_manager is
+ :data:`~email.contentmanager.raw_data_manager`.
+
"""
refold_source = 'long'
header_factory = HeaderRegistry()
+ content_manager = raw_data_manager
def __init__(self, **kw):
# Ensure that each new instance gets a unique header factory
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index f76c21eb1b..95855d81bd 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -54,17 +54,27 @@ TICK = "'"
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
-# How to figure out if we are processing strings that come from a byte
-# source with undecodable characters.
-_has_surrogates = re.compile(
- '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+def _has_surrogates(s):
+ """Return True if s contains surrogate-escaped binary data."""
+ # This check is based on the fact that unless there are surrogates, utf8
+ # (Python's default encoding) can encode any string. This is the fastest
+ # way to check for surrogates, see issue 11454 for timings.
+ try:
+ s.encode()
+ return False
+ except UnicodeEncodeError:
+ return True
# How to deal with a string containing bytes before handing it to the
# application through the 'normal' interface.
def _sanitize(string):
- # Turn any escaped bytes into unicode 'unknown' char.
- original_bytes = string.encode('ascii', 'surrogateescape')
- return original_bytes.decode('ascii', 'replace')
+ # Turn any escaped bytes into unicode 'unknown' char. If the escaped
+ # bytes happen to be utf-8 they will instead get decoded, even if they
+ # were invalid in the charset the source was supposed to be in. This
+ # seems like it is not a bad thing; a defect was still registered.
+ original_bytes = string.encode('utf-8', 'surrogateescape')
+ return original_bytes.decode('utf-8', 'replace')
+
# Helpers