#23745: handle duplicate MIME parameter names in new parser.

This mimics get_param's error handling for the most part. It is slightly better in some regards, as get_param can produce some really weird results for duplicate *0* parts. It departs from get_param slightly in that if we have a mix of non-extended and extended pieces for the same parameter name, the new parser assumes they were all supposed to be extended and concatenates all the values, whereas get_param always picks the non-extended parameter value. All of these error recovery choices are pretty much arbitrary decisions...
author: R David Murray <rdmurray@bitdance.com> 2015-03-29 21:53:05 -0400
committer: R David Murray <rdmurray@bitdance.com> 2015-03-29 21:53:05 -0400
commit: dee0f186f266252095a91ee649409ff4c0d5b146 (patch)
tree: 6b53456c9c5a670310bc8791938df6fa9d6df725
parent: 0325ed4f2c6775433a9b835036b368fb3c7a1077 (diff)
download: cpython-dee0f186f266252095a91ee649409ff4c0d5b146.tar.gz
3 files changed, 139 insertions, 7 deletions
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 1806cac905..a9bdf4458b 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -71,6 +71,7 @@ import re
 import urllib   # For urllib.parse.unquote
 from string import hexdigits
 from collections import OrderedDict
+from operator import itemgetter
 from email import _encoded_words as _ew
 from email import errors
 from email import utils
@@ -1098,15 +1099,34 @@ class MimeParameters(TokenList):
                 params[name] = []
             params[name].append((token.section_number, token))
         for name, parts in params.items():
-            parts = sorted(parts)
-            # XXX: there might be more recovery we could do here if, for
-            # example, this is really a case of a duplicate attribute name.
+            parts = sorted(parts, key=itemgetter(0))
+            first_param = parts[0][1]
+            charset = first_param.charset
+            # Our arbitrary error recovery is to ignore duplicate parameters,
+            # to use appearance order if there are duplicate rfc 2231 parts,
+            # and to ignore gaps.  This mimics the error recovery of get_param.
+            if not first_param.extended and len(parts) > 1:
+                if parts[1][0] == 0:
+                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
+                        'duplicate parameter name; duplicate(s) ignored'))
+                    parts = parts[:1]
+                # Else assume the *0* was missing...note that this is different
+                # from get_param, but we registered a defect for this earlier.
             value_parts = []
-            charset = parts[0][1].charset
-            for i, (section_number, param) in enumerate(parts):
+            i = 0
+            for section_number, param in parts:
                 if section_number != i:
-                    param.defects.append(errors.InvalidHeaderDefect(
-                        "inconsistent multipart parameter numbering"))
+                    # We could get fancier here and look for a complete
+                    # duplicate extended parameter and ignore the second one
+                    # seen.  But we're not doing that.  The old code didn't.
+                    if not param.extended:
+                        param.defects.append(errors.InvalidHeaderDefect(
+                            'duplicate parameter name; duplicate ignored'))
+                        continue
+                    else:
+                        param.defects.append(errors.InvalidHeaderDefect(
+                            "inconsistent RFC2231 parameter numbering"))
+                i += 1
                 value = param.param_value
                 if param.extended:
                     try:
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 5404d1913f..d028f7440b 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -2456,6 +2456,115 @@ class TestParser(TestParserMixin, TestEmailBase):
             ";foo", ";foo", ";foo", [errors.InvalidHeaderDefect]*3
         )
 
+
+@parameterize
+class Test_parse_mime_parameters(TestParserMixin, TestEmailBase):
+
+    def mime_parameters_as_value(self,
+                                 value,
+                                 tl_str,
+                                 tl_value,
+                                 params,
+                                 defects):
+        mime_parameters = self._test_parse_x(parser.parse_mime_parameters,
+            value, tl_str, tl_value, defects)
+        self.assertEqual(mime_parameters.token_type, 'mime-parameters')
+        self.assertEqual(list(mime_parameters.params), params)
+
+
+    mime_parameters_params = {
+
+        'simple': (
+            'filename="abc.py"',
+            ' filename="abc.py"',
+            'filename=abc.py',
+            [('filename', 'abc.py')],
+            []),
+
+        'multiple_keys': (
+            'filename="abc.py"; xyz=abc',
+            ' filename="abc.py"; xyz="abc"',
+            'filename=abc.py; xyz=abc',
+            [('filename', 'abc.py'), ('xyz', 'abc')],
+            []),
+
+        'split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            [('filename', '201.tif')],
+            []),
+
+        # Note that it is undefined what we should do for error recovery when
+        # there are duplicate parameter names or duplicate parts in a split
+        # part.  We choose to ignore all duplicate parameters after the first
+        # and to take duplicate or missing rfc 2231 parts in appearance order.
+        # This is backward compatible with get_param's behavior, but the
+        # decisions are arbitrary.
+
+        'duplicate_key': (
+            'filename=abc.gif; filename=def.tiff',
+            ' filename="abc.gif"',
+            "filename=abc.gif; filename=def.tiff",
+            [('filename', 'abc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_key_with_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename=abc.gif",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename=abc.gif",
+            [('filename', '201.tif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_key_with_split_value_other_order': (
+            "filename=abc.gif; "
+                " filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            ' filename="abc.gif"',
+            "filename=abc.gif;"
+                " filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66",
+            [('filename', 'abc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_in_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename*1*=abc.gif",
+            ' filename="201.tifabc.gif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*1*=%74%69%66;"
+                " filename*1*=abc.gif",
+            [('filename', '201.tifabc.gif')],
+            [errors.InvalidHeaderDefect]),
+
+        'missing_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;",
+            ' filename="201.tif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;",
+            [('filename', '201.tif')],
+            [errors.InvalidHeaderDefect]),
+
+        'duplicate_and_missing_split_value': (
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;"
+                " filename*3*=abc.gif",
+            ' filename="201.tifabc.gif"',
+            "filename*0*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66;"
+                " filename*3*=abc.gif",
+            [('filename', '201.tifabc.gif')],
+            [errors.InvalidHeaderDefect]*2),
+
+        # Here we depart from get_param and assume the *0* was missing.
+        'duplicate_with_broken_split_value': (
+            "filename=abc.gif; "
+                " filename*2*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66",
+            ' filename="abc.gif201.tif"',
+            "filename=abc.gif;"
+                " filename*2*=iso-8859-1''%32%30%31%2E; filename*3*=%74%69%66",
+            [('filename', 'abc.gif201.tif')],
+            # Defects are apparent missing *0*, and two 'out of sequence'.
+            [errors.InvalidHeaderDefect]*3),
+
+    }
+
 @parameterize
 class Test_parse_mime_version(TestParserMixin, TestEmailBase):
 
diff --git a/Misc/NEWS b/Misc/NEWS
index b80cbe25f9..ebac8d52f6 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -21,6 +21,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #23745: The new email header parser now handles duplicate MIME
+  parameter names without error, similar to how get_param behaves.
+
 - Issue #23792: Ignore KeyboardInterrupt when the pydoc pager is active.
   This mimics the behavior of the standard unix pagers, and prevents
   pipepager from shutting down while the pager itself is still running.
author	R David Murray <rdmurray@bitdance.com>	2015-03-29 21:53:05 -0400
committer	R David Murray <rdmurray@bitdance.com>	2015-03-29 21:53:05 -0400
commit	dee0f186f266252095a91ee649409ff4c0d5b146 (patch)
tree	6b53456c9c5a670310bc8791938df6fa9d6df725
parent	0325ed4f2c6775433a9b835036b368fb3c7a1077 (diff)
download	cpython-dee0f186f266252095a91ee649409ff4c0d5b146.tar.gz