diff options
author | ptmcg <ptmcg@austin.rr.com> | 2023-03-28 08:00:19 -0500 |
---|---|---|
committer | ptmcg <ptmcg@austin.rr.com> | 2023-03-28 08:00:19 -0500 |
commit | d46eb9e936d753d2836428e64cf1bb4d1f2b92f3 (patch) | |
tree | 859f83711d47ea531d9e0a249e44e74772e98803 | |
parent | 141980203504a1b58425d1770dc2d99da83d3252 (diff) | |
download | pyparsing-git-d46eb9e936d753d2836428e64cf1bb4d1f2b92f3.tar.gz |
Fix #474 - redo QuotedString '\' escape handling as a state machine so that all transforms are done left to right
-rw-r--r-- | CHANGES | 33 | ||||
-rw-r--r-- | pyparsing/core.py | 36 | ||||
-rw-r--r-- | tests/test_unit.py | 57 |
3 files changed, 107 insertions, 19 deletions
@@ -2,8 +2,31 @@ Change Log ========== +NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as +`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should +get released some time later in 2023. I currently plan to completely +drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until +at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to +the new function names before the old functions are completely removed. (Big +help from Devin J. Pohly in structuring the code to enable this peaceful transition.) + +Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7. + + Version 3.1.0a2 - (in development) ---------------------------------- +- API CHANGE: A slight change has been implemented when unquoting a quoted string + parsed using the QuotedString class. Formerly, when unquoting and processing + whitespace markers such as \t and \n, these substitutions would occur first, and + then any additional '\' escaping would be done on the resulting string. This would + parse "\\n" as "\<newline>". Now escapes and whitespace markers are all processed + in a single pass working left to right, so the quoted string "\\n" would get unquoted + to "\n" (a backslash followed by "n"). Fixes issue #474 raised by jakeanq, + thanks! + +- Added named field "url" to pyparsing.common.url, returning the entire + parsed URL string. + - Fixed bug when parse actions returned an empty string for an expression that had a results name, that the results name was not saved. That is: @@ -27,16 +50,6 @@ Version 3.1.0a2 - (in development) Version 3.1.0a1 - March, 2023 ----------------------------- -NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as -`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should -get released some time later in 2023. I currently plan to completely -drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until -at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to -the new function names before the old functions are completely removed. (Big -help from Devin J. Pohly in structuring the code to enable this peaceful transition.) - -Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7. - - API ENHANCEMENT: `Optional(expr)` may now be written as `expr | ""` This will make this code: diff --git a/pyparsing/core.py b/pyparsing/core.py index b51779f..ae7dcb6 100644 --- a/pyparsing/core.py +++ b/pyparsing/core.py @@ -3194,7 +3194,7 @@ class QuotedString(Token): [['This is the "quote"']] [['This is the quote with "embedded" quotes']] """ - ws_map = ((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r")) + ws_map = dict(((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r"))) def __init__( self, @@ -3244,6 +3244,7 @@ class QuotedString(Token): self.escQuote: str = escQuote or "" self.unquoteResults: bool = unquoteResults self.convertWhitespaceEscapes: bool = convertWhitespaceEscapes + self.multiline = multiline sep = "" inner_pattern = "" @@ -3292,6 +3293,17 @@ class QuotedString(Token): ] ) + if self.unquoteResults: + if self.convertWhitespaceEscapes: + self.unquote_scan_re = re.compile( + rf"({'|'.join(re.escape(k) for k in self.ws_map)})|({re.escape(self.escChar)}.)|(\n|.)", + flags=self.flags, + ) + else: + self.unquote_scan_re = re.compile( + rf"({re.escape(self.escChar)}.)|(\n|.)", flags=self.flags + ) + try: self.re = re.compile(self.pattern, self.flags) self.reString = self.pattern @@ -3327,14 +3339,20 @@ class QuotedString(Token): ret = ret[self.quoteCharLen : -self.endQuoteCharLen] if isinstance(ret, str_type): - # replace escaped whitespace - if "\\" in ret and self.convertWhitespaceEscapes: - for wslit, wschar in self.ws_map: - ret = ret.replace(wslit, wschar) - - # replace escaped characters - if self.escChar: - ret = re.sub(self.escCharReplacePattern, r"\g<1>", ret) + if self.convertWhitespaceEscapes: + ret = "".join( + self.ws_map[match.group(1)] + if match.group(1) + else match.group(2)[-1] + if match.group(2) + else match.group(3) + for match in self.unquote_scan_re.finditer(ret) + ) + else: + ret = "".join( + match.group(1)[-1] if match.group(1) else match.group(2) + for match in self.unquote_scan_re.finditer(ret) + ) # replace escaped quotes if self.escQuote: diff --git a/tests/test_unit.py b/tests/test_unit.py index 1ebf3b6..bb60e03 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -1265,6 +1265,63 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase): ) self.assertEqual(source, stripped) + def testQuotedStringUnquotesAndConvertWhitespaceEscapes(self): + # test for Issue #474 + #fmt: off + backslash = chr(92) # a single backslash + tab = "\t" + newline = "\n" + test_string_0 = f'"{backslash}{backslash}n"' # r"\\n" + test_string_1 = f'"{backslash}t{backslash}{backslash}n"' # r"\t\\n" + test_string_2 = f'"a{backslash}tb"' # r"a\tb" + test_string_3 = f'"{backslash}{backslash}{backslash}n"' # r"\\\n" + T, F = True, False # these make the test cases format nicely + for test_parameters in ( + # Parameters are the arguments to creating a QuotedString + # and the expected parsed list of characters): + # - unquote_results + # - convert_whitespace_escapes + # - test string + # - expected parsed characters (broken out as separate + # list items (all those doubled backslashes make it + # difficult to interpret the output) + (T, T, test_string_0, [backslash, "n"]), + (T, F, test_string_0, [backslash, "n"]), + (F, F, test_string_0, ['"', backslash, backslash, "n", '"']), + (T, T, test_string_1, [tab, backslash, "n"]), + (T, F, test_string_1, ["t", backslash, "n"]), + (F, F, test_string_1, ['"', backslash, "t", backslash, backslash, "n", '"']), + (T, T, test_string_2, ["a", tab, "b"]), + (T, F, test_string_2, ["a", "t", "b"]), + (F, F, test_string_2, ['"', "a", backslash, "t", "b", '"']), + (T, T, test_string_3, [backslash, newline]), + (T, F, test_string_3, [backslash, "n"]), + (F, F, test_string_3, ['"', backslash, backslash, backslash, "n", '"']), + ): + unquote_results, convert_ws_escapes, test_string, expected_list = test_parameters + test_description = f"Testing with parameters {test_parameters}" + with self.subTest(msg=test_description): + print(test_description) + print(f"unquote_results: {unquote_results}" + f"\nconvert_whitespace_escapes: {convert_ws_escapes}") + qs_expr = pp.QuotedString( + quoteChar='"', + escChar='\\', + unquote_results=unquote_results, + convert_whitespace_escapes=convert_ws_escapes + ) + result = qs_expr.parse_string(test_string) + + # do this instead of assertParserAndCheckList to explicitly + # check and display the separate items in the list + print("Results:") + control_chars = {newline: "<NEWLINE>", backslash: "<BACKSLASH>", tab: "<TAB>"} + print(f"[{', '.join(control_chars.get(c, repr(c)) for c in result[0])}]") + self.assertEqual(expected_list, list(result[0])) + + print() + #fmt: on + def testCaselessOneOf(self): caseless1 = pp.oneOf("d a b c aA B A C", caseless=True) caseless1str = str(caseless1) |