Merge #19772: Do not mutate message when downcoding to 7bit.

author: R David Murray <rdmurray@bitdance.com> 2014-02-08 11:51:18 -0500
committer: R David Murray <rdmurray@bitdance.com> 2014-02-08 11:51:18 -0500
commit: a055d4d797b347ad45b2becdf5b8ec86dc135f20 (patch)
tree: d2a97624b5a31dea7b6be6398e95a9430cf1dd42 /Lib/email
parent: 37b69c6a9e207f93e6a3184bde8d434c81777996 (diff)
parent: 36c69903355c4b54119021ae16f844bb2f7e3942 (diff)
download: cpython-a055d4d797b347ad45b2becdf5b8ec86dc135f20.tar.gz
9 files changed, 578 insertions, 85 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 291437c586..039237936c 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -368,8 +368,7 @@ class TokenList(list):
                 yield (indent + '    !! invalid element in token '
                                         'list: {!r}'.format(token))
             else:
-                for line in token._pp(indent+'    '):
-                    yield line
+                yield from token._pp(indent+'    ')
         if self.defects:
             extra = ' Defects: {}'.format(self.defects)
         else:
@@ -1315,24 +1314,22 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
 # Parser
 #
 
-"""Parse strings according to RFC822/2047/2822/5322 rules.
-
-This is a stateless parser.  Each get_XXX function accepts a string and
-returns either a Terminal or a TokenList representing the RFC object named
-by the method and a string containing the remaining unparsed characters
-from the input.  Thus a parser method consumes the next syntactic construct
-of a given type and returns a token representing the construct plus the
-unparsed remainder of the input string.
-
-For example, if the first element of a structured header is a 'phrase',
-then:
-
-    phrase, value = get_phrase(value)
-
-returns the complete phrase from the start of the string value, plus any
-characters left in the string after the phrase is removed.
-
-"""
+# Parse strings according to RFC822/2047/2822/5322 rules.
+#
+# This is a stateless parser.  Each get_XXX function accepts a string and
+# returns either a Terminal or a TokenList representing the RFC object named
+# by the method and a string containing the remaining unparsed characters
+# from the input.  Thus a parser method consumes the next syntactic construct
+# of a given type and returns a token representing the construct plus the
+# unparsed remainder of the input string.
+#
+# For example, if the first element of a structured header is a 'phrase',
+# then:
+#
+#     phrase, value = get_phrase(value)
+#
+# returns the complete phrase from the start of the string value, plus any
+# characters left in the string after the phrase is removed.
 
 _wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
 _non_atom_end_matcher = re.compile(r"[^{}]+".format(
diff --git a/Lib/email/contentmanager.py b/Lib/email/contentmanager.py
new file mode 100644
index 0000000000..d3636529b6
--- /dev/null
+++ b/Lib/email/contentmanager.py
@@ -0,0 +1,249 @@
+import binascii
+import email.charset
+import email.message
+import email.errors
+from email import quoprimime
+
+class ContentManager:
+
+    def __init__(self):
+        self.get_handlers = {}
+        self.set_handlers = {}
+
+    def add_get_handler(self, key, handler):
+        self.get_handlers[key] = handler
+
+    def get_content(self, msg, *args, **kw):
+        content_type = msg.get_content_type()
+        if content_type in self.get_handlers:
+            return self.get_handlers[content_type](msg, *args, **kw)
+        maintype = msg.get_content_maintype()
+        if maintype in self.get_handlers:
+            return self.get_handlers[maintype](msg, *args, **kw)
+        if '' in self.get_handlers:
+            return self.get_handlers[''](msg, *args, **kw)
+        raise KeyError(content_type)
+
+    def add_set_handler(self, typekey, handler):
+        self.set_handlers[typekey] = handler
+
+    def set_content(self, msg, obj, *args, **kw):
+        if msg.get_content_maintype() == 'multipart':
+            # XXX: is this error a good idea or not?  We can remove it later,
+            # but we can't add it later, so do it for now.
+            raise TypeError("set_content not valid on multipart")
+        handler = self._find_set_handler(msg, obj)
+        msg.clear_content()
+        handler(msg, obj, *args, **kw)
+
+    def _find_set_handler(self, msg, obj):
+        full_path_for_error = None
+        for typ in type(obj).__mro__:
+            if typ in self.set_handlers:
+                return self.set_handlers[typ]
+            qname = typ.__qualname__
+            modname = getattr(typ, '__module__', '')
+            full_path = '.'.join((modname, qname)) if modname else qname
+            if full_path_for_error is None:
+                full_path_for_error = full_path
+            if full_path in self.set_handlers:
+                return self.set_handlers[full_path]
+            if qname in self.set_handlers:
+                return self.set_handlers[qname]
+            name = typ.__name__
+            if name in self.set_handlers:
+                return self.set_handlers[name]
+        if None in self.set_handlers:
+            return self.set_handlers[None]
+        raise KeyError(full_path_for_error)
+
+
+raw_data_manager = ContentManager()
+
+
+def get_text_content(msg, errors='replace'):
+    content = msg.get_payload(decode=True)
+    charset = msg.get_param('charset', 'ASCII')
+    return content.decode(charset, errors=errors)
+raw_data_manager.add_get_handler('text', get_text_content)
+
+
+def get_non_text_content(msg):
+    return msg.get_payload(decode=True)
+for maintype in 'audio image video application'.split():
+    raw_data_manager.add_get_handler(maintype, get_non_text_content)
+
+
+def get_message_content(msg):
+    return msg.get_payload(0)
+for subtype in 'rfc822 external-body'.split():
+    raw_data_manager.add_get_handler('message/'+subtype, get_message_content)
+
+
+def get_and_fixup_unknown_message_content(msg):
+    # If we don't understand a message subtype, we are supposed to treat it as
+    # if it were application/octet-stream, per
+    # tools.ietf.org/html/rfc2046#section-5.2.4.  Feedparser doesn't do that,
+    # so do our best to fix things up.  Note that it is *not* appropriate to
+    # model message/partial content as Message objects, so they are handled
+    # here as well.  (How to reassemble them is out of scope for this comment :)
+    return bytes(msg.get_payload(0))
+raw_data_manager.add_get_handler('message',
+                                 get_and_fixup_unknown_message_content)
+
+
+def _prepare_set(msg, maintype, subtype, headers):
+    msg['Content-Type'] = '/'.join((maintype, subtype))
+    if headers:
+        if not hasattr(headers[0], 'name'):
+            mp = msg.policy
+            headers = [mp.header_factory(*mp.header_source_parse([header]))
+                       for header in headers]
+        try:
+            for header in headers:
+                if header.defects:
+                    raise header.defects[0]
+                msg[header.name] = header
+        except email.errors.HeaderDefect as exc:
+            raise ValueError("Invalid header: {}".format(
+                                header.fold(policy=msg.policy))) from exc
+
+
+def _finalize_set(msg, disposition, filename, cid, params):
+    if disposition is None and filename is not None:
+        disposition = 'attachment'
+    if disposition is not None:
+        msg['Content-Disposition'] = disposition
+    if filename is not None:
+        msg.set_param('filename',
+                      filename,
+                      header='Content-Disposition',
+                      replace=True)
+    if cid is not None:
+        msg['Content-ID'] = cid
+    if params is not None:
+        for key, value in params.items():
+            msg.set_param(key, value)
+
+
+# XXX: This is a cleaned-up version of base64mime.body_encode.  It would
+# be nice to drop both this and quoprimime.body_encode in favor of
+# enhanced binascii routines that accepted a max_line_length parameter.
+def _encode_base64(data, max_line_length):
+    encoded_lines = []
+    unencoded_bytes_per_line = max_line_length * 3 // 4
+    for i in range(0, len(data), unencoded_bytes_per_line):
+        thisline = data[i:i+unencoded_bytes_per_line]
+        encoded_lines.append(binascii.b2a_base64(thisline).decode('ascii'))
+    return ''.join(encoded_lines)
+
+
+def _encode_text(string, charset, cte, policy):
+    lines = string.encode(charset).splitlines()
+    linesep = policy.linesep.encode('ascii')
+    def embedded_body(lines): return linesep.join(lines) + linesep
+    def normal_body(lines): return b'\n'.join(lines) + b'\n'
+    if cte is None:
+        # Use heuristics to decide on the "best" encoding.
+        try:
+            return '7bit', normal_body(lines).decode('ascii')
+        except UnicodeDecodeError:
+            pass
+        if (policy.cte_type == '8bit' and
+                max(len(x) for x in lines) <= policy.max_line_length):
+            return '8bit', normal_body(lines).decode('ascii', 'surrogateescape')
+        sniff = embedded_body(lines[:10])
+        sniff_qp = quoprimime.body_encode(sniff.decode('latin-1'),
+                                          policy.max_line_length)
+        sniff_base64 = binascii.b2a_base64(sniff)
+        # This is a little unfair to qp; it includes lineseps, base64 doesn't.
+        if len(sniff_qp) > len(sniff_base64):
+            cte = 'base64'
+        else:
+            cte = 'quoted-printable'
+            if len(lines) <= 10:
+                return cte, sniff_qp
+    if cte == '7bit':
+        data = normal_body(lines).decode('ascii')
+    elif cte == '8bit':
+        data = normal_body(lines).decode('ascii', 'surrogateescape')
+    elif cte == 'quoted-printable':
+        data = quoprimime.body_encode(normal_body(lines).decode('latin-1'),
+                                      policy.max_line_length)
+    elif cte == 'base64':
+        data = _encode_base64(embedded_body(lines), policy.max_line_length)
+    else:
+        raise ValueError("Unknown content transfer encoding {}".format(cte))
+    return cte, data
+
+
+def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
+                     disposition=None, filename=None, cid=None,
+                     params=None, headers=None):
+    _prepare_set(msg, 'text', subtype, headers)
+    cte, payload = _encode_text(string, charset, cte, msg.policy)
+    msg.set_payload(payload)
+    msg.set_param('charset',
+                  email.charset.ALIASES.get(charset, charset),
+                  replace=True)
+    msg['Content-Transfer-Encoding'] = cte
+    _finalize_set(msg, disposition, filename, cid, params)
+raw_data_manager.add_set_handler(str, set_text_content)
+
+
+def set_message_content(msg, message, subtype="rfc822", cte=None,
+                       disposition=None, filename=None, cid=None,
+                       params=None, headers=None):
+    if subtype == 'partial':
+        raise ValueError("message/partial is not supported for Message objects")
+    if subtype == 'rfc822':
+        if cte not in (None, '7bit', '8bit', 'binary'):
+            # http://tools.ietf.org/html/rfc2046#section-5.2.1 mandate.
+            raise ValueError(
+                "message/rfc822 parts do not support cte={}".format(cte))
+        # 8bit will get coerced on serialization if policy.cte_type='7bit'.  We
+        # may end up claiming 8bit when it isn't needed, but the only negative
+        # result of that should be a gateway that needs to coerce to 7bit
+        # having to look through the whole embedded message to discover whether
+        # or not it actually has to do anything.
+        cte = '8bit' if cte is None else cte
+    elif subtype == 'external-body':
+        if cte not in (None, '7bit'):
+            # http://tools.ietf.org/html/rfc2046#section-5.2.3 mandate.
+            raise ValueError(
+                "message/external-body parts do not support cte={}".format(cte))
+        cte = '7bit'
+    elif cte is None:
+        # http://tools.ietf.org/html/rfc2046#section-5.2.4 says all future
+        # subtypes should be restricted to 7bit, so assume that.
+        cte = '7bit'
+    _prepare_set(msg, 'message', subtype, headers)
+    msg.set_payload([message])
+    msg['Content-Transfer-Encoding'] = cte
+    _finalize_set(msg, disposition, filename, cid, params)
+raw_data_manager.add_set_handler(email.message.Message, set_message_content)
+
+
+def set_bytes_content(msg, data, maintype, subtype, cte='base64',
+                     disposition=None, filename=None, cid=None,
+                     params=None, headers=None):
+    _prepare_set(msg, maintype, subtype, headers)
+    if cte == 'base64':
+        data = _encode_base64(data, max_line_length=msg.policy.max_line_length)
+    elif cte == 'quoted-printable':
+        # XXX: quoprimime.body_encode won't encode newline characters in data,
+        # so we can't use it.  This means max_line_length is ignored.  Another
+        # bug to fix later.  (Note: encoders.quopri is broken on line ends.)
+        data = binascii.b2a_qp(data, istext=False, header=False, quotetabs=True)
+        data = data.decode('ascii')
+    elif cte == '7bit':
+        # Make sure it really is only ASCII.  The early warning here seems
+        # worth the overhead...if you care write your own content manager :).
+        data = data.decode('ascii')
+    elif cte in ('8bit', 'binary'):
+        data = data.decode('ascii', 'surrogateescape')
+    msg.set_payload(data)
+    msg['Content-Transfer-Encoding'] = cte
+    _finalize_set(msg, disposition, filename, cid, params)
+for typ in (bytes, bytearray, memoryview):
+    raw_data_manager.add_set_handler(typ, set_bytes_content)
diff --git a/Lib/email/encoders.py b/Lib/email/encoders.py
index f9657f0a25..0a66acb624 100644
--- a/Lib/email/encoders.py
+++ b/Lib/email/encoders.py
@@ -54,21 +54,12 @@ def encode_7or8bit(msg):
         # There's no payload.  For backwards compatibility we use 7bit
         msg['Content-Transfer-Encoding'] = '7bit'
         return
-    # We play a trick to make this go fast.  If encoding/decode to ASCII
-    # succeeds, we know the data must be 7bit, otherwise treat it as 8bit.
+    # We play a trick to make this go fast.  If decoding from ASCII succeeds,
+    # we know the data must be 7bit, otherwise treat it as 8bit.
     try:
-        if isinstance(orig, str):
-            orig.encode('ascii')
-        else:
-            orig.decode('ascii')
+        orig.decode('ascii')
     except UnicodeError:
-        charset = msg.get_charset()
-        output_cset = charset and charset.output_charset
-        # iso-2022-* is non-ASCII but encodes to a 7-bit representation
-        if output_cset and output_cset.lower().startswith('iso-2022-'):
-            msg['Content-Transfer-Encoding'] = '7bit'
-        else:
-            msg['Content-Transfer-Encoding'] = '8bit'
+        msg['Content-Transfer-Encoding'] = '8bit'
     else:
         msg['Content-Transfer-Encoding'] = '7bit'
 
diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index ea41e9571d..6cf9b91c1f 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -98,24 +98,15 @@ class BufferedSubFile(object):
         """Push some new data into this object."""
         # Handle any previous leftovers
         data, self._partial = self._partial + data, ''
-        # Crack into lines, but preserve the newlines on the end of each
-        parts = NLCRE_crack.split(data)
-        # The *ahem* interesting behaviour of re.split when supplied grouping
-        # parentheses is that the last element of the resulting list is the
-        # data after the final RE.  In the case of a NL/CR terminated string,
-        # this is the empty string.
-        self._partial = parts.pop()
-        #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r:
-        # is there a \n to follow later?
-        if not self._partial and parts and parts[-1].endswith('\r'):
-            self._partial = parts.pop(-2)+parts.pop()
-        # parts is a list of strings, alternating between the line contents
-        # and the eol character(s).  Gather up a list of lines after
-        # re-attaching the newlines.
-        lines = []
-        for i in range(len(parts) // 2):
-            lines.append(parts[i*2] + parts[i*2+1])
-        self.pushlines(lines)
+        # Crack into lines, but preserve the linesep characters on the end of each
+        parts = data.splitlines(True)
+        # If the last element of the list does not end in a newline, then treat
+        # it as a partial line.  We only check for '\n' here because a line
+        # ending with '\r' might be a line that was split in the middle of a
+        # '\r\n' sequence (see bugs 1555570 and 1721862).
+        if parts and not parts[-1].endswith('\n'):
+            self._partial = parts.pop()
+        self.pushlines(parts)
 
     def pushlines(self, lines):
         # Reverse and insert at the front of the lines.
@@ -135,7 +126,7 @@ class BufferedSubFile(object):
 class FeedParser:
     """A feed-style parser of email."""
 
-    def __init__(self, _factory=message.Message, *, policy=compat32):
+    def __init__(self, _factory=None, *, policy=compat32):
         """_factory is called with no arguments to create a new message obj
 
         The policy keyword specifies a policy object that controls a number of
@@ -143,14 +134,23 @@ class FeedParser:
         backward compatibility.
 
         """
-        self._factory = _factory
         self.policy = policy
-        try:
-            _factory(policy=self.policy)
-            self._factory_kwds = lambda: {'policy': self.policy}
-        except TypeError:
-            # Assume this is an old-style factory
-            self._factory_kwds = lambda: {}
+        self._factory_kwds = lambda: {'policy': self.policy}
+        if _factory is None:
+            # What this should be:
+            #self._factory = policy.default_message_factory
+            # but, because we are post 3.4 feature freeze, fix with temp hack:
+            if self.policy is compat32:
+                self._factory = message.Message
+            else:
+                self._factory = message.EmailMessage
+        else:
+            self._factory = _factory
+            try:
+                _factory(policy=self.policy)
+            except TypeError:
+                # Assume this is an old-style factory
+                self._factory_kwds = lambda: {}
         self._input = BufferedSubFile()
         self._msgstack = []
         self._parse = self._parsegen().__next__
diff --git a/Lib/email/iterators.py b/Lib/email/iterators.py
index 3adc4a04ba..b5502ee975 100644
--- a/Lib/email/iterators.py
+++ b/Lib/email/iterators.py
@@ -26,8 +26,7 @@ def walk(self):
     yield self
     if self.is_multipart():
         for subpart in self.get_payload():
-            for subsubpart in subpart.walk():
-                yield subsubpart
+            yield from subpart.walk()
 
 
 
@@ -40,8 +39,7 @@ def body_line_iterator(msg, decode=False):
     for subpart in msg.walk():
         payload = subpart.get_payload(decode=decode)
         if isinstance(payload, str):
-            for line in StringIO(payload):
-                yield line
+            yield from StringIO(payload)
 
 
 def typed_subpart_iterator(msg, maintype='text', subtype=None):
diff --git a/Lib/email/message.py b/Lib/email/message.py
index afe350c902..88b5fa3552 100644
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -8,8 +8,6 @@ __all__ = ['Message']
 
 import re
 import uu
-import base64
-import binascii
 from io import BytesIO, StringIO
 
 # Intrapackage imports
@@ -132,22 +130,50 @@ class Message:
 
     def __str__(self):
         """Return the entire formatted message as a string.
-        This includes the headers, body, and envelope header.
         """
         return self.as_string()
 
-    def as_string(self, unixfrom=False, maxheaderlen=0):
+    def as_string(self, unixfrom=False, maxheaderlen=0, policy=None):
         """Return the entire formatted message as a string.
-        Optional `unixfrom' when True, means include the Unix From_ envelope
-        header.
 
-        This is a convenience method and may not generate the message exactly
-        as you intend.  For more flexibility, use the flatten() method of a
-        Generator instance.
+        Optional 'unixfrom', when true, means include the Unix From_ envelope
+        header.  For backward compatibility reasons, if maxheaderlen is
+        not specified it defaults to 0, so you must override it explicitly
+        if you want a different maxheaderlen.  'policy' is passed to the
+        Generator instance used to serialize the message; if it is not
+        specified the policy associated with the message instance is used.
+
+        If the message object contains binary data that is not encoded
+        according to RFC standards, the non-compliant data will be replaced by
+        unicode "unknown character" code points.
         """
         from email.generator import Generator
+        policy = self.policy if policy is None else policy
         fp = StringIO()
-        g = Generator(fp, mangle_from_=False, maxheaderlen=maxheaderlen)
+        g = Generator(fp,
+                      mangle_from_=False,
+                      maxheaderlen=maxheaderlen,
+                      policy=policy)
+        g.flatten(self, unixfrom=unixfrom)
+        return fp.getvalue()
+
+    def __bytes__(self):
+        """Return the entire formatted message as a bytes object.
+        """
+        return self.as_bytes()
+
+    def as_bytes(self, unixfrom=False, policy=None):
+        """Return the entire formatted message as a bytes object.
+
+        Optional 'unixfrom', when true, means include the Unix From_ envelope
+        header.  'policy' is passed to the BytesGenerator instance used to
+        serialize the message; if not specified the policy associated with
+        the message instance is used.
+        """
+        from email.generator import BytesGenerator
+        policy = self.policy if policy is None else policy
+        fp = BytesIO()
+        g = BytesGenerator(fp, mangle_from_=False, policy=policy)
         g.flatten(self, unixfrom=unixfrom)
         return fp.getvalue()
 
@@ -668,7 +694,7 @@ class Message:
         return failobj
 
     def set_param(self, param, value, header='Content-Type', requote=True,
-                  charset=None, language=''):
+                  charset=None, language='', replace=False):
         """Set a parameter in the Content-Type header.
 
         If the parameter already exists in the header, its value will be
@@ -712,8 +738,11 @@ class Message:
                 else:
                     ctype = SEMISPACE.join([ctype, append_param])
         if ctype != self.get(header):
-            del self[header]
-            self[header] = ctype
+            if replace:
+                self.replace_header(header, ctype)
+            else:
+                del self[header]
+                self[header] = ctype
 
     def del_param(self, param, header='content-type', requote=True):
         """Remove the given parameter completely from the Content-Type header.
@@ -894,3 +923,208 @@ class Message:
 
     # I.e. def walk(self): ...
     from email.iterators import walk
+
+
+class MIMEPart(Message):
+
+    def __init__(self, policy=None):
+        if policy is None:
+            from email.policy import default
+            policy = default
+        Message.__init__(self, policy)
+
+    @property
+    def is_attachment(self):
+        c_d = self.get('content-disposition')
+        if c_d is None:
+            return False
+        return c_d.split(';', 1)[0].strip().lower() == 'attachment'
+
+    def _find_body(self, part, preferencelist):
+        if part.is_attachment:
+            return
+        maintype, subtype = part.get_content_type().split('/')
+        if maintype == 'text':
+            if subtype in preferencelist:
+                yield (preferencelist.index(subtype), part)
+            return
+        if maintype != 'multipart':
+            return
+        if subtype != 'related':
+            for subpart in part.iter_parts():
+                yield from self._find_body(subpart, preferencelist)
+            return
+        if 'related' in preferencelist:
+            yield (preferencelist.index('related'), part)
+        candidate = None
+        start = part.get_param('start')
+        if start:
+            for subpart in part.iter_parts():
+                if subpart['content-id'] == start:
+                    candidate = subpart
+                    break
+        if candidate is None:
+            subparts = part.get_payload()
+            candidate = subparts[0] if subparts else None
+        if candidate is not None:
+            yield from self._find_body(candidate, preferencelist)
+
+    def get_body(self, preferencelist=('related', 'html', 'plain')):
+        """Return best candidate mime part for display as 'body' of message.
+
+        Do a depth first search, starting with self, looking for the first part
+        matching each of the items in preferencelist, and return the part
+        corresponding to the first item that has a match, or None if no items
+        have a match.  If 'related' is not included in preferencelist, consider
+        the root part of any multipart/related encountered as a candidate
+        match.  Ignore parts with 'Content-Disposition: attachment'.
+        """
+        best_prio = len(preferencelist)
+        body = None
+        for prio, part in self._find_body(self, preferencelist):
+            if prio < best_prio:
+                best_prio = prio
+                body = part
+                if prio == 0:
+                    break
+        return body
+
+    _body_types = {('text', 'plain'),
+                   ('text', 'html'),
+                   ('multipart', 'related'),
+                   ('multipart', 'alternative')}
+    def iter_attachments(self):
+        """Return an iterator over the non-main parts of a multipart.
+
+        Skip the first of each occurrence of text/plain, text/html,
+        multipart/related, or multipart/alternative in the multipart (unless
+        they have a 'Content-Disposition: attachment' header) and include all
+        remaining subparts in the returned iterator.  When applied to a
+        multipart/related, return all parts except the root part.  Return an
+        empty iterator when applied to a multipart/alternative or a
+        non-multipart.
+        """
+        maintype, subtype = self.get_content_type().split('/')
+        if maintype != 'multipart' or subtype == 'alternative':
+            return
+        parts = self.get_payload().copy()
+        if maintype == 'multipart' and subtype == 'related':
+            # For related, we treat everything but the root as an attachment.
+            # The root may be indicated by 'start'; if there's no start or we
+            # can't find the named start, treat the first subpart as the root.
+            start = self.get_param('start')
+            if start:
+                found = False
+                attachments = []
+                for part in parts:
+                    if part.get('content-id') == start:
+                        found = True
+                    else:
+                        attachments.append(part)
+                if found:
+                    yield from attachments
+                    return
+            parts.pop(0)
+            yield from parts
+            return
+        # Otherwise we more or less invert the remaining logic in get_body.
+        # This only really works in edge cases (ex: non-text relateds or
+        # alternatives) if the sending agent sets content-disposition.
+        seen = []   # Only skip the first example of each candidate type.
+        for part in parts:
+            maintype, subtype = part.get_content_type().split('/')
+            if ((maintype, subtype) in self._body_types and
+                    not part.is_attachment and subtype not in seen):
+                seen.append(subtype)
+                continue
+            yield part
+
+    def iter_parts(self):
+        """Return an iterator over all immediate subparts of a multipart.
+
+        Return an empty iterator for a non-multipart.
+        """
+        if self.get_content_maintype() == 'multipart':
+            yield from self.get_payload()
+
+    def get_content(self, *args, content_manager=None, **kw):
+        if content_manager is None:
+            content_manager = self.policy.content_manager
+        return content_manager.get_content(self, *args, **kw)
+
+    def set_content(self, *args, content_manager=None, **kw):
+        if content_manager is None:
+            content_manager = self.policy.content_manager
+        content_manager.set_content(self, *args, **kw)
+
+    def _make_multipart(self, subtype, disallowed_subtypes, boundary):
+        if self.get_content_maintype() == 'multipart':
+            existing_subtype = self.get_content_subtype()
+            disallowed_subtypes = disallowed_subtypes + (subtype,)
+            if existing_subtype in disallowed_subtypes:
+                raise ValueError("Cannot convert {} to {}".format(
+                    existing_subtype, subtype))
+        keep_headers = []
+        part_headers = []
+        for name, value in self._headers:
+            if name.lower().startswith('content-'):
+                part_headers.append((name, value))
+            else:
+                keep_headers.append((name, value))
+        if part_headers:
+            # There is existing content, move it to the first subpart.
+            part = type(self)(policy=self.policy)
+            part._headers = part_headers
+            part._payload = self._payload
+            self._payload = [part]
+        else:
+            self._payload = []
+        self._headers = keep_headers
+        self['Content-Type'] = 'multipart/' + subtype
+        if boundary is not None:
+            self.set_param('boundary', boundary)
+
+    def make_related(self, boundary=None):
+        self._make_multipart('related', ('alternative', 'mixed'), boundary)
+
+    def make_alternative(self, boundary=None):
+        self._make_multipart('alternative', ('mixed',), boundary)
+
+    def make_mixed(self, boundary=None):
+        self._make_multipart('mixed', (), boundary)
+
+    def _add_multipart(self, _subtype, *args, _disp=None, **kw):
+        if (self.get_content_maintype() != 'multipart' or
+                self.get_content_subtype() != _subtype):
+            getattr(self, 'make_' + _subtype)()
+        part = type(self)(policy=self.policy)
+        part.set_content(*args, **kw)
+        if _disp and 'content-disposition' not in part:
+            part['Content-Disposition'] = _disp
+        self.attach(part)
+
+    def add_related(self, *args, **kw):
+        self._add_multipart('related', *args, _disp='inline', **kw)
+
+    def add_alternative(self, *args, **kw):
+        self._add_multipart('alternative', *args, **kw)
+
+    def add_attachment(self, *args, **kw):
+        self._add_multipart('mixed', *args, _disp='attachment', **kw)
+
+    def clear(self):
+        self._headers = []
+        self._payload = None
+
+    def clear_content(self):
+        self._headers = [(n, v) for n, v in self._headers
+                         if not n.lower().startswith('content-')]
+        self._payload = None
+
+
+class EmailMessage(MIMEPart):
+
+    def set_content(self, *args, **kw):
+        super().set_content(*args, **kw)
+        if 'MIME-Version' not in self:
+            self['MIME-Version'] = '1.0'
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
index 752bf35a6e..ed512b114f 100644
--- a/Lib/email/parser.py
+++ b/Lib/email/parser.py
@@ -4,7 +4,8 @@
 
 """A parser of RFC 2822 and MIME email messages."""
 
-__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser']
+__all__ = ['Parser', 'HeaderParser', 'BytesParser', 'BytesHeaderParser',
+           'FeedParser', 'BytesFeedParser']
 
 import warnings
 from io import StringIO, TextIOWrapper
@@ -16,7 +17,7 @@ from email._policybase import compat32
 
 
 class Parser:
-    def __init__(self, _class=Message, *, policy=compat32):
+    def __init__(self, _class=None, *, policy=compat32):
         """Parser of RFC 2822 and MIME email messages.
 
         Creates an in-memory object tree representing the email message, which
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
index 38e88afe1d..f0b20f4b19 100644
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -5,6 +5,7 @@ code that adds all the email6 features.
 from email._policybase import Policy, Compat32, compat32, _extend_docstrings
 from email.utils import _has_surrogates
 from email.headerregistry import HeaderRegistry as HeaderRegistry
+from email.contentmanager import raw_data_manager
 
 __all__ = [
     'Compat32',
@@ -58,10 +59,22 @@ class EmailPolicy(Policy):
                            special treatment, while all other fields are
                            treated as unstructured.  This list will be
                            completed before the extension is marked stable.)
+
+    content_manager     -- an object with at least two methods: get_content
+                           and set_content.  When the get_content or
+                           set_content method of a Message object is called,
+                           the message calls the corresponding method of this
+                           object, passing itself as the first argument,
+                           followed by any positional or keyword arguments
+                           that the caller supplied.  The default
+                           content_manager is
+                           :data:`~email.contentmanager.raw_data_manager`.
+
     """
 
     refold_source = 'long'
     header_factory = HeaderRegistry()
+    content_manager = raw_data_manager
 
     def __init__(self, **kw):
         # Ensure that each new instance gets a unique header factory
diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index f76c21eb1b..95855d81bd 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py
@@ -54,17 +54,27 @@ TICK = "'"
 specialsre = re.compile(r'[][\\()<>@,:;".]')
 escapesre = re.compile(r'[\\"]')
 
-# How to figure out if we are processing strings that come from a byte
-# source with undecodable characters.
-_has_surrogates = re.compile(
-    '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+def _has_surrogates(s):
+    """Return True if s contains surrogate-escaped binary data."""
+    # This check is based on the fact that unless there are surrogates, utf-8
+    # (Python's default encoding) can encode any string.  This is the fastest
+    # way to check for surrogates; see issue 11454 for timings.
+    try:
+        s.encode()
+        return False
+    except UnicodeEncodeError:
+        return True
 
 # How to deal with a string containing bytes before handing it to the
 # application through the 'normal' interface.
 def _sanitize(string):
-    # Turn any escaped bytes into unicode 'unknown' char.
-    original_bytes = string.encode('ascii', 'surrogateescape')
-    return original_bytes.decode('ascii', 'replace')
+    # Turn any escaped bytes into unicode 'unknown' char.  If the escaped
+    # bytes happen to be utf-8 they will instead get decoded, even if they
+    # were invalid in the charset the source was supposed to be in.  This
+    # seems acceptable, since a defect was still registered.
+    original_bytes = string.encode('utf-8', 'surrogateescape')
+    return original_bytes.decode('utf-8', 'replace')
+
 
 
 # Helpers
author	R David Murray <rdmurray@bitdance.com>	2014-02-08 11:51:18 -0500
committer	R David Murray <rdmurray@bitdance.com>	2014-02-08 11:51:18 -0500
commit	a055d4d797b347ad45b2becdf5b8ec86dc135f20 (patch)
tree	d2a97624b5a31dea7b6be6398e95a9430cf1dd42 /Lib/email
parent	37b69c6a9e207f93e6a3184bde8d434c81777996 (diff)
parent	36c69903355c4b54119021ae16f844bb2f7e3942 (diff)
download	cpython-a055d4d797b347ad45b2becdf5b8ec86dc135f20.tar.gz