Fix slow backtracking when parsing strings (no external deps) (#347)

* Fix slow backtracking when parsing strings (no external deps)

  Fixes #61.

  This uses negative lookaheads to avoid ambiguity in how strings should be parsed by the regex.

  - https://docs.python.org/2/library/re.html#regular-expression-syntax
  - Previously, if it didn't immediately succeed at parsing an escape sequence such as `\123`, it would have to try `\1` + `23`, `\12` + `3`, and `\123`, which multiplied the time taken by 3 per additional escape sequence. This solves that by only allowing `\123`.
  - The same fix was added for hex escapes.

  Also fix a test that relied on the previous, incorrect regex handling. The implementation documentation says that it intends to allow **decimal** escapes permissively.

* WIP debug

* Fix ambiguity caused by allowing `#line` directives with Windows paths

  Solve this by allowing "\x" when not followed by hex, in the regular string literal. In the previous commits, `\x12` could be parsed both as `\x` + `12` and as `\x12`, which gave the regex engine exponentially many options to backtrack through.

* Document changes to lexer, remove debug code

* Optimize this for strings
author: Tyson Andre <tysonandre775@hotmail.com> 2019-08-26 17:18:38 -0400
committer: Eli Bendersky <eliben@users.noreply.github.com> 2019-08-26 14:18:38 -0700
commit: 62ee4ba5fbe58f469c72e7b5b02e88584577a147 (patch)
tree: 47a09a14d4eae3ccec918e9474f7c6560e8a4795 /tests
parent: 5d5904d2538e054356ac01ba9ef965783f73e36b (diff)
download: pycparser-62ee4ba5fbe58f469c72e7b5b02e88584577a147.tar.gz
1 files changed, 30 insertions, 1 deletions
diff --git a/tests/test_c_lexer.py b/tests/test_c_lexer.py
index 11c7b26..3a70c18 100644
--- a/tests/test_c_lexer.py
+++ b/tests/test_c_lexer.py
@@ -116,6 +116,7 @@ class TestCLexerNoErrors(unittest.TestCase):
         self.assertTokensTypes(r"""'\t'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\''""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\?'""", ['CHAR_CONST'])
+        self.assertTokensTypes(r"""'\0'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\012'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\x2f'""", ['CHAR_CONST'])
         self.assertTokensTypes(r"""'\x2f12'""", ['CHAR_CONST'])
@@ -149,6 +150,24 @@ class TestCLexerNoErrors(unittest.TestCase):
         self.assertTokensTypes(
             '"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"',
             ['STRING_LITERAL'])
+        # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
+        # directives with Windows paths as filenames (..\..\dir\file)
+        self.assertTokensTypes(
+            r'"\x"',
+            ['STRING_LITERAL'])
+        self.assertTokensTypes(
+            r'"\a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z\A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z"',
+            ['STRING_LITERAL'])
+        self.assertTokensTypes(
+            r'"C:\x\fa\x1e\xited"',
+            ['STRING_LITERAL'])
+        # The lexer is permissive and allows decimal escapes (not just octal)
+        self.assertTokensTypes(
+            '"jx\9"',
+            ['STRING_LITERAL'])
+        self.assertTokensTypes(
+            '"fo\9999999"',
+            ['STRING_LITERAL'])
 
     def test_mess(self):
         self.assertTokensTypes(
@@ -428,14 +447,24 @@ class TestCLexerErrors(unittest.TestCase):
     def test_char_constants(self):
         self.assertLexerError("'", ERR_UNMATCHED_QUOTE)
         self.assertLexerError("'b\n", ERR_UNMATCHED_QUOTE)
+        self.assertLexerError("'\\xaa\n'", ERR_UNMATCHED_QUOTE)
 
+        self.assertLexerError(r"'\12a'", ERR_INVALID_CCONST)
+        self.assertLexerError(r"'\xabg'", ERR_INVALID_CCONST)
+        self.assertLexerError("''", ERR_INVALID_CCONST)
         self.assertLexerError("'jx'", ERR_INVALID_CCONST)
         self.assertLexerError(r"'\*'", ERR_INVALID_CCONST)
 
     def test_string_literals(self):
-        self.assertLexerError(r'"jx\9"', ERR_STRING_ESCAPE)
+        self.assertLexerError(r'"jx\`"', ERR_STRING_ESCAPE)
         self.assertLexerError(r'"hekllo\* on ix"', ERR_STRING_ESCAPE)
         self.assertLexerError(r'L"hekllo\* on ix"', ERR_STRING_ESCAPE)
+        # Should not suffer from slow backtracking
+        self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123"', ERR_STRING_ESCAPE)
+        self.assertLexerError(r'"\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\x23\`\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23\xf1\x23"', ERR_STRING_ESCAPE)
+        # Should not suffer from slow backtracking when there's no end quote
+        self.assertLexerError(r'"\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\`\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\123\12\123456', ERR_ILLEGAL_CHAR)
+        self.assertLexerError(r'"\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\`\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x23\x2\x23456', ERR_ILLEGAL_CHAR)
 
     def test_preprocessor(self):
         self.assertLexerError('#line "ka"', ERR_FILENAME_BEFORE_LINE)
author	Tyson Andre <tysonandre775@hotmail.com>	2019-08-26 17:18:38 -0400
committer	Eli Bendersky <eliben@users.noreply.github.com>	2019-08-26 14:18:38 -0700
commit	62ee4ba5fbe58f469c72e7b5b02e88584577a147 (patch)
tree	47a09a14d4eae3ccec918e9474f7c6560e8a4795 /tests
parent	5d5904d2538e054356ac01ba9ef965783f73e36b (diff)
download	pycparser-62ee4ba5fbe58f469c72e7b5b02e88584577a147.tar.gz