From 0b2647d3d59d29427130ee6cc42ea3f47b473b10 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Wed, 13 Apr 2011 16:46:05 -0400 Subject: #11684: Complete parser bytes interface by adding BytesHeaderParser Patch by Steffen Daode Nurpmeso. --- Lib/email/generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Lib/email/generator.py') diff --git a/Lib/email/generator.py b/Lib/email/generator.py index f0e7a95477..fdd34e4ace 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -297,10 +297,12 @@ class Generator: # message/rfc822. Such messages are generated by, for example, # Groupwise when forwarding unadorned messages. (Issue 7970.) So # in that case we just emit the string body. - payload = msg.get_payload() + payload = msg._payload if isinstance(payload, list): g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL) payload = s.getvalue() + else: + payload = self._encode(payload) self._fp.write(payload) # This used to be a module level function; we use a classmethod for this -- cgit v1.2.1 From 25096720d023b59fead5325c3a2003871004484c Mon Sep 17 00:00:00 2001 From: R David Murray Date: Mon, 18 Apr 2011 13:59:37 -0400 Subject: #11731: simplify/enhance parser/generator API by introducing policy objects. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This new interface will also allow for future planned enhancements in control over the parser/generator without requiring any additional complexity in the parser/generator API. Patch reviewed by Éric Araujo and Barry Warsaw. 
--- Lib/email/generator.py | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'Lib/email/generator.py') diff --git a/Lib/email/generator.py b/Lib/email/generator.py index fdd34e4ace..d8b8fa960b 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -13,8 +13,10 @@ import random import warnings from io import StringIO, BytesIO +from email import policy from email.header import Header from email.message import _has_surrogates +import email.charset as _charset UNDERSCORE = '_' NL = '\n' # XXX: no longer used by the code below. @@ -33,7 +35,8 @@ class Generator: # Public interface # - def __init__(self, outfp, mangle_from_=True, maxheaderlen=78): + def __init__(self, outfp, mangle_from_=True, maxheaderlen=None, *, + policy=policy.default): """Create the generator for message flattening. outfp is the output file-like object for writing the message to. It @@ -49,16 +52,23 @@ class Generator: defined in the Header class. Set maxheaderlen to zero to disable header wrapping. The default is 78, as recommended (but not required) by RFC 2822. + + The policy keyword specifies a policy object that controls a number of + aspects of the generator's operation. The default policy maintains + backward compatibility. + """ self._fp = outfp self._mangle_from_ = mangle_from_ - self._maxheaderlen = maxheaderlen + self._maxheaderlen = (maxheaderlen if maxheaderlen is not None else + policy.max_line_length) + self.policy = policy def write(self, s): # Just delegate to the file object self._fp.write(s) - def flatten(self, msg, unixfrom=False, linesep='\n'): + def flatten(self, msg, unixfrom=False, linesep=None): r"""Print the message object tree rooted at msg to the output file specified when the Generator instance was created. @@ -70,17 +80,15 @@ class Generator: Note that for subobjects, no From_ line is printed. linesep specifies the characters used to indicate a new line in - the output. 
The default value is the most useful for typical - Python applications, but it can be set to \r\n to produce RFC-compliant - line separators when needed. + the output. The default value is determined by the policy. """ # We use the _XXX constants for operating on data that comes directly # from the msg, and _encoded_XXX constants for operating on data that # has already been converted (to bytes in the BytesGenerator) and # inserted into a temporary buffer. - self._NL = linesep - self._encoded_NL = self._encode(linesep) + self._NL = linesep if linesep is not None else self.policy.linesep + self._encoded_NL = self._encode(self._NL) self._EMPTY = '' self._encoded_EMTPY = self._encode('') if unixfrom: @@ -338,7 +346,10 @@ class BytesGenerator(Generator): Functionally identical to the base Generator except that the output is bytes and not string. When surrogates were used in the input to encode - bytes, these are decoded back to bytes for output. + bytes, these are decoded back to bytes for output. If the policy has + must_be_7bit set true, then the message is transformed such that the + non-ASCII bytes are properly content transfer encoded, using the + charset unknown-8bit. The outfp object must accept bytes in its write method. """ @@ -361,21 +372,22 @@ class BytesGenerator(Generator): # strings with 8bit bytes. for h, v in msg._headers: self.write('%s: ' % h) - if isinstance(v, Header): - self.write(v.encode(maxlinelen=self._maxheaderlen)+NL) - elif _has_surrogates(v): - # If we have raw 8bit data in a byte string, we have no idea - # what the encoding is. There is no safe way to split this - # string. If it's ascii-subset, then we could do a normal - # ascii split, but if it's multibyte then we could break the - # string. There's no way to know so the least harm seems to - # be to not split the string and risk it being too long. - self.write(v+NL) - else: - # Header's got lots of smarts and this string is safe... 
- header = Header(v, maxlinelen=self._maxheaderlen, - header_name=h) - self.write(header.encode(linesep=self._NL)+self._NL) + if isinstance(v, str): + if _has_surrogates(v): + if not self.policy.must_be_7bit: + # If we have raw 8bit data in a byte string, we have no idea + # what the encoding is. There is no safe way to split this + # string. If it's ascii-subset, then we could do a normal + # ascii split, but if it's multibyte then we could break the + # string. There's no way to know so the least harm seems to + # be to not split the string and risk it being too long. + self.write(v+NL) + continue + h = Header(v, charset=_charset.UNKNOWN8BIT, header_name=h) + else: + h = Header(v, header_name=h) + self.write(h.encode(linesep=self._NL, + maxlinelen=self._maxheaderlen)+self._NL) # A blank line always separates headers from body self.write(self._NL) @@ -384,7 +396,7 @@ class BytesGenerator(Generator): # just write it back out. if msg._payload is None: return - if _has_surrogates(msg._payload): + if _has_surrogates(msg._payload) and not self.policy.must_be_7bit: self.write(msg._payload) else: super(BytesGenerator,self)._handle_text(msg) -- cgit v1.2.1 From 25a003a10f523a12abae1f81ab0119f1ae170777 Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 25 May 2012 15:01:48 -0400 Subject: #14731: refactor email policy framework. This patch primarily does two things: (1) it adds some internal-interface methods to Policy that allow for Policy to control the parsing and folding of headers in such a way that we can construct a backward compatibility policy that is 100% compatible with the 3.2 API, while allowing a new policy to implement the email6 API. (2) it adds that backward compatibility policy and refactors the test suite so that the only differences between the 3.2 test_email.py file and the 3.3 test_email.py file is some small changes in test framework and the addition of tests for bugs fixed that apply to the 3.2 API. 
There are some additional tweaks, such as moving just the code needed for the compatibility policy into _policybase, so that the library code can import only _policybase. That way the new code that will be added for email6 will only get imported when a non-compatibility policy is imported. --- Lib/email/generator.py | 87 +++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 51 deletions(-) (limited to 'Lib/email/generator.py') diff --git a/Lib/email/generator.py b/Lib/email/generator.py index dcfea4cd8a..bfa288bea4 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -13,9 +13,9 @@ import random import warnings from io import StringIO, BytesIO -from email import policy +from email._policybase import compat32 from email.header import Header -from email.message import _has_surrogates +from email.utils import _has_surrogates import email.charset as _charset UNDERSCORE = '_' @@ -36,7 +36,7 @@ class Generator: # def __init__(self, outfp, mangle_from_=True, maxheaderlen=None, *, - policy=policy.default): + policy=None): """Create the generator for message flattening. outfp is the output file-like object for writing the message to. It @@ -60,8 +60,7 @@ class Generator: """ self._fp = outfp self._mangle_from_ = mangle_from_ - self._maxheaderlen = (maxheaderlen if maxheaderlen is not None else - policy.max_line_length) + self.maxheaderlen = maxheaderlen self.policy = policy def write(self, s): @@ -87,20 +86,33 @@ class Generator: # from the msg, and _encoded_XXX constants for operating on data that # has already been converted (to bytes in the BytesGenerator) and # inserted into a temporary buffer.
- self._NL = linesep if linesep is not None else self.policy.linesep + policy = msg.policy if self.policy is None else self.policy + if linesep is not None: + policy = policy.clone(linesep=linesep) + if self.maxheaderlen is not None: + policy = policy.clone(max_line_length=self.maxheaderlen) + self._NL = policy.linesep self._encoded_NL = self._encode(self._NL) self._EMPTY = '' self._encoded_EMTPY = self._encode('') - if unixfrom: - ufrom = msg.get_unixfrom() - if not ufrom: - ufrom = 'From nobody ' + time.ctime(time.time()) - self.write(ufrom + self._NL) - self._write(msg) + p = self.policy + try: + self.policy = policy + if unixfrom: + ufrom = msg.get_unixfrom() + if not ufrom: + ufrom = 'From nobody ' + time.ctime(time.time()) + self.write(ufrom + self._NL) + self._write(msg) + finally: + self.policy = p def clone(self, fp): """Clone this generator with the exact same options.""" - return self.__class__(fp, self._mangle_from_, self._maxheaderlen) + return self.__class__(fp, + self._mangle_from_, + None, # Use policy setting, which we've adjusted + policy=self.policy) # # Protected interface - undocumented ;/ @@ -175,16 +187,8 @@ class Generator: # def _write_headers(self, msg): - for h, v in msg.items(): - self.write('%s: ' % h) - if isinstance(v, Header): - self.write(v.encode( - maxlinelen=self._maxheaderlen, linesep=self._NL)+self._NL) - else: - # Header's got lots of smarts, so use it. - header = Header(v, maxlinelen=self._maxheaderlen, - header_name=h) - self.write(header.encode(linesep=self._NL)+self._NL) + for h, v in msg.raw_items(): + self.write(self.policy.fold(h, v)) # A blank line always separates headers from body self.write(self._NL) @@ -265,12 +269,12 @@ class Generator: # The contents of signed parts has to stay unmodified in order to keep # the signature intact per RFC1847 2.1, so we disable header wrapping. # RDM: This isn't enough to completely preserve the part, but it helps. 
- old_maxheaderlen = self._maxheaderlen + p = self.policy + self.policy = p.clone(max_line_length=0) try: - self._maxheaderlen = 0 self._handle_multipart(msg) finally: - self._maxheaderlen = old_maxheaderlen + self.policy = p def _handle_message_delivery_status(self, msg): # We can't just write the headers directly to self's file object @@ -347,9 +351,9 @@ class BytesGenerator(Generator): Functionally identical to the base Generator except that the output is bytes and not string. When surrogates were used in the input to encode bytes, these are decoded back to bytes for output. If the policy has - must_be_7bit set true, then the message is transformed such that the - non-ASCII bytes are properly content transfer encoded, using the - charset unknown-8bit. + cte_type set to 7bit, then the message is transformed such that the + non-ASCII bytes are properly content transfer encoded, using the charset + unknown-8bit. The outfp object must accept bytes in its write method. """ @@ -370,27 +374,8 @@ class BytesGenerator(Generator): def _write_headers(self, msg): # This is almost the same as the string version, except for handling # strings with 8bit bytes. - for h, v in msg._headers: - self.write('%s: ' % h) - if isinstance(v, str): - if _has_surrogates(v): - if not self.policy.must_be_7bit: - # If we have raw 8bit data in a byte string, we have no idea - # what the encoding is. There is no safe way to split this - # string. If it's ascii-subset, then we could do a normal - # ascii split, but if it's multibyte then we could break the - # string. There's no way to know so the least harm seems to - # be to not split the string and risk it being too long. - self.write(v+NL) - continue - h = Header(v, charset=_charset.UNKNOWN8BIT, header_name=h) - else: - h = Header(v, header_name=h) - else: - # Assume it is a Header-like object. 
- h = v - self.write(h.encode(linesep=self._NL, - maxlinelen=self._maxheaderlen)+self._NL) + for h, v in msg.raw_items(): + self._fp.write(self.policy.fold_binary(h, v)) # A blank line always separates headers from body self.write(self._NL) @@ -399,7 +384,7 @@ class BytesGenerator(Generator): # just write it back out. if msg._payload is None: return - if _has_surrogates(msg._payload) and not self.policy.must_be_7bit: + if _has_surrogates(msg._payload) and not self.policy.cte_type=='7bit': self.write(msg._payload) else: super(BytesGenerator,self)._handle_text(msg) -- cgit v1.2.1 From 453836e4750201b27e3397f092f342e758b7943e Mon Sep 17 00:00:00 2001 From: R David Murray Date: Fri, 25 May 2012 18:42:14 -0400 Subject: #12586: add provisional email policy with new header parsing and folding. When the new policies are used (and only when the new policies are explicitly used) headers turn into objects that have attributes based on their parsed values, and can be set using objects that encapsulate the values, as well as set directly from unicode strings. The folding algorithm then takes care of encoding unicode where needed, and folding according to the highest level syntactic objects. With this patch only date and time headers are parsed as anything other than unstructured, but that is all the helper methods in the existing API handle. I do plan to add more parsers, and complete the set specified in the RFC before the package becomes stable. 
--- Lib/email/generator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'Lib/email/generator.py') diff --git a/Lib/email/generator.py b/Lib/email/generator.py index bfa288bea4..fcecf939a8 100644 --- a/Lib/email/generator.py +++ b/Lib/email/generator.py @@ -95,9 +95,15 @@ class Generator: self._encoded_NL = self._encode(self._NL) self._EMPTY = '' self._encoded_EMTPY = self._encode('') - p = self.policy + # Because we use clone (below) when we recursively process message + # subparts, and because clone uses the computed policy (not None), + # submessages will automatically get set to the computed policy when + # they are processed by this code. + old_gen_policy = self.policy + old_msg_policy = msg.policy try: self.policy = policy + msg.policy = policy if unixfrom: ufrom = msg.get_unixfrom() if not ufrom: @@ -105,7 +111,8 @@ class Generator: self.write(ufrom + self._NL) self._write(msg) finally: - self.policy = p + self.policy = old_gen_policy + msg.policy = old_msg_policy def clone(self, fp): """Clone this generator with the exact same options.""" -- cgit v1.2.1