summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorptmcg <ptmcg@austin.rr.com>2023-03-28 08:00:19 -0500
committerptmcg <ptmcg@austin.rr.com>2023-03-28 08:00:19 -0500
commitd46eb9e936d753d2836428e64cf1bb4d1f2b92f3 (patch)
tree859f83711d47ea531d9e0a249e44e74772e98803
parent141980203504a1b58425d1770dc2d99da83d3252 (diff)
downloadpyparsing-git-d46eb9e936d753d2836428e64cf1bb4d1f2b92f3.tar.gz
Fix #474 - redo QuotedString '\' escape handling as a state machine so that all transforms are done left to right
-rw-r--r--CHANGES33
-rw-r--r--pyparsing/core.py36
-rw-r--r--tests/test_unit.py57
3 files changed, 107 insertions, 19 deletions
diff --git a/CHANGES b/CHANGES
index a73e2c0..19196ba 100644
--- a/CHANGES
+++ b/CHANGES
@@ -2,8 +2,31 @@
Change Log
==========
+NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as
+`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should
+get released some time later in 2023. I currently plan to completely
+drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until
+at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to
+the new function names before the old functions are completely removed. (Big
+help from Devin J. Pohly in structuring the code to enable this peaceful transition.)
+
+Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7.
+
+
Version 3.1.0a2 - (in development)
----------------------------------
+- API CHANGE: A slight change has been implemented when unquoting a quoted string
+ parsed using the QuotedString class. Formerly, when unquoting and processing
+ whitespace markers such as \t and \n, these substitutions would occur first, and
+ then any additional '\' escaping would be done on the resulting string. This would
+ unquote "\\n" to "\<newline>" (a backslash followed by a newline character). Now
+ escapes and whitespace markers are all processed in a single pass working left to
+ right, so the quoted string "\\n" gets unquoted to "\n" (a backslash followed by
+ "n"). Fixes issue #474 raised by jakeanq, thanks!
+
+- Added named field "url" to pyparsing.common.url, returning the entire
+ parsed URL string.
+
- Fixed bug when parse actions returned an empty string for an expression that
had a results name, that the results name was not saved. That is:
@@ -27,16 +50,6 @@ Version 3.1.0a2 - (in development)
Version 3.1.0a1 - March, 2023
-----------------------------
-NOTE: In the future release 3.2.0, use of many of the pre-PEP8 methods (such as
-`ParserElement.parseString`) will start to raise `DeprecationWarnings`. 3.2.0 should
-get released some time later in 2023. I currently plan to completely
-drop the pre-PEP8 methods in pyparsing 4.0, though we won't see that release until
-at least late 2023 if not 2024. So there is plenty of time to convert existing parsers to
-the new function names before the old functions are completely removed. (Big
-help from Devin J. Pohly in structuring the code to enable this peaceful transition.)
-
-Version 3.2.0 will also discontinue support for Python versions 3.6 and 3.7.
-
- API ENHANCEMENT: `Optional(expr)` may now be written as `expr | ""`
This will make this code:
diff --git a/pyparsing/core.py b/pyparsing/core.py
index b51779f..ae7dcb6 100644
--- a/pyparsing/core.py
+++ b/pyparsing/core.py
@@ -3194,7 +3194,7 @@ class QuotedString(Token):
[['This is the "quote"']]
[['This is the quote with "embedded" quotes']]
"""
- ws_map = ((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r"))
+ ws_map = dict(((r"\t", "\t"), (r"\n", "\n"), (r"\f", "\f"), (r"\r", "\r")))
def __init__(
self,
@@ -3244,6 +3244,7 @@ class QuotedString(Token):
self.escQuote: str = escQuote or ""
self.unquoteResults: bool = unquoteResults
self.convertWhitespaceEscapes: bool = convertWhitespaceEscapes
+ self.multiline = multiline
sep = ""
inner_pattern = ""
@@ -3292,6 +3293,17 @@ class QuotedString(Token):
]
)
+ if self.unquoteResults:
+ if self.convertWhitespaceEscapes:
+ self.unquote_scan_re = re.compile(
+ rf"({'|'.join(re.escape(k) for k in self.ws_map)})|({re.escape(self.escChar)}.)|(\n|.)",
+ flags=self.flags,
+ )
+ else:
+ self.unquote_scan_re = re.compile(
+ rf"({re.escape(self.escChar)}.)|(\n|.)", flags=self.flags
+ )
+
try:
self.re = re.compile(self.pattern, self.flags)
self.reString = self.pattern
@@ -3327,14 +3339,20 @@ class QuotedString(Token):
ret = ret[self.quoteCharLen : -self.endQuoteCharLen]
if isinstance(ret, str_type):
- # replace escaped whitespace
- if "\\" in ret and self.convertWhitespaceEscapes:
- for wslit, wschar in self.ws_map:
- ret = ret.replace(wslit, wschar)
-
- # replace escaped characters
- if self.escChar:
- ret = re.sub(self.escCharReplacePattern, r"\g<1>", ret)
+ if self.convertWhitespaceEscapes:
+ ret = "".join(
+ self.ws_map[match.group(1)]
+ if match.group(1)
+ else match.group(2)[-1]
+ if match.group(2)
+ else match.group(3)
+ for match in self.unquote_scan_re.finditer(ret)
+ )
+ else:
+ ret = "".join(
+ match.group(1)[-1] if match.group(1) else match.group(2)
+ for match in self.unquote_scan_re.finditer(ret)
+ )
# replace escaped quotes
if self.escQuote:
diff --git a/tests/test_unit.py b/tests/test_unit.py
index 1ebf3b6..bb60e03 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -1265,6 +1265,63 @@ class Test02_WithoutPackrat(ppt.TestParseResultsAsserts, TestCase):
)
self.assertEqual(source, stripped)
+ def testQuotedStringUnquotesAndConvertWhitespaceEscapes(self):
+ # test for Issue #474
+ #fmt: off
+ backslash = chr(92) # a single backslash
+ tab = "\t"
+ newline = "\n"
+ test_string_0 = f'"{backslash}{backslash}n"' # r"\\n"
+ test_string_1 = f'"{backslash}t{backslash}{backslash}n"' # r"\t\\n"
+ test_string_2 = f'"a{backslash}tb"' # r"a\tb"
+ test_string_3 = f'"{backslash}{backslash}{backslash}n"' # r"\\\n"
+ T, F = True, False # these make the test cases format nicely
+ for test_parameters in (
+ # Parameters are the arguments for creating a QuotedString
+ # and the expected parsed list of characters:
+ # - unquote_results
+ # - convert_whitespace_escapes
+ # - test string
+ # - expected parsed characters (broken out as separate
+ # list items, since all those doubled backslashes make it
+ # difficult to interpret the output)
+ (T, T, test_string_0, [backslash, "n"]),
+ (T, F, test_string_0, [backslash, "n"]),
+ (F, F, test_string_0, ['"', backslash, backslash, "n", '"']),
+ (T, T, test_string_1, [tab, backslash, "n"]),
+ (T, F, test_string_1, ["t", backslash, "n"]),
+ (F, F, test_string_1, ['"', backslash, "t", backslash, backslash, "n", '"']),
+ (T, T, test_string_2, ["a", tab, "b"]),
+ (T, F, test_string_2, ["a", "t", "b"]),
+ (F, F, test_string_2, ['"', "a", backslash, "t", "b", '"']),
+ (T, T, test_string_3, [backslash, newline]),
+ (T, F, test_string_3, [backslash, "n"]),
+ (F, F, test_string_3, ['"', backslash, backslash, backslash, "n", '"']),
+ ):
+ unquote_results, convert_ws_escapes, test_string, expected_list = test_parameters
+ test_description = f"Testing with parameters {test_parameters}"
+ with self.subTest(msg=test_description):
+ print(test_description)
+ print(f"unquote_results: {unquote_results}"
+ f"\nconvert_whitespace_escapes: {convert_ws_escapes}")
+ qs_expr = pp.QuotedString(
+ quoteChar='"',
+ escChar='\\',
+ unquote_results=unquote_results,
+ convert_whitespace_escapes=convert_ws_escapes
+ )
+ result = qs_expr.parse_string(test_string)
+
+ # do this instead of assertParserAndCheckList to explicitly
+ # check and display the separate items in the list
+ print("Results:")
+ control_chars = {newline: "<NEWLINE>", backslash: "<BACKSLASH>", tab: "<TAB>"}
+ print(f"[{', '.join(control_chars.get(c, repr(c)) for c in result[0])}]")
+ self.assertEqual(expected_list, list(result[0]))
+
+ print()
+ #fmt: on
+
def testCaselessOneOf(self):
caseless1 = pp.oneOf("d a b c aA B A C", caseless=True)
caseless1str = str(caseless1)