diff options
author | Jean Abou-Samra <jean@abou-samra.fr> | 2023-04-17 18:41:55 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-17 18:41:55 +0200 |
commit | c97762448b1e4eac8d74b8d88415f23c32aa0cdd (patch) | |
tree | fea2ebba54f728956dc532ca2e508b86c488932f /pygments/lexers | |
parent | 50dd4d80e25c4c4afab503d41b471a536ed2af13 (diff) | |
download | pygments-git-c97762448b1e4eac8d74b8d88415f23c32aa0cdd.tar.gz |
Refactor PythonConsoleLexer as a DelegatingLexer (#2412)
This is simpler and more reliable than hand-coding the state machine.
Fixes #2411
Diffstat (limited to 'pygments/lexers')
-rw-r--r-- | pygments/lexers/python.py | 117 |
1 files changed, 55 insertions, 62 deletions
diff --git a/pygments/lexers/python.py b/pygments/lexers/python.py index eaaf6476..6537d4d9 100644 --- a/pygments/lexers/python.py +++ b/pygments/lexers/python.py @@ -11,8 +11,8 @@ import re import keyword -from pygments.lexer import Lexer, RegexLexer, include, bygroups, using, \ - default, words, combined, do_insertions, this, line_re +from pygments.lexer import DelegatingLexer, Lexer, RegexLexer, include, \ + bygroups, using, default, words, combined, do_insertions, this, line_re from pygments.util import get_bool_opt, shebang_matches from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ Number, Punctuation, Generic, Other, Error, Whitespace @@ -635,8 +635,43 @@ class Python2Lexer(RegexLexer): def analyse_text(text): return shebang_matches(text, r'pythonw?2(\.\d)?') +class _PythonConsoleLexerBase(RegexLexer): + name = 'Python console session' + aliases = ['pycon'] + mimetypes = ['text/x-python-doctest'] -class PythonConsoleLexer(Lexer): + """Auxiliary lexer for `PythonConsoleLexer`. + + Code tokens are output as ``Token.Other.Code``, traceback tokens as + ``Token.Other.Traceback``. + """ + tokens = { + 'root': [ + (r'(>>> )(.*\n)', bygroups(Generic.Prompt, Other.Code), 'continuations'), + # This happens, e.g., when tracebacks are embedded in documentation; + # trailing whitespaces are often stripped in such contexts. + (r'(>>>)(\n)', bygroups(Generic.Prompt, Whitespace)), + (r'(\^C)?Traceback \(most recent call last\):\n', Other.Traceback, 'traceback'), + # SyntaxError starts with this + (r' File "[^"]+", line \d+', Other.Traceback, 'traceback'), + (r'.*\n', Generic.Output), + ], + 'continuations': [ + (r'(\.\.\. )(.*\n)', bygroups(Generic.Prompt, Other.Code)), + # See above. + (r'(\.\.\.)(\n)', bygroups(Generic.Prompt, Whitespace)), + default('#pop'), + ], + 'traceback': [ + # As soon as we see a traceback, consume everything until the next + # >>> prompt. + (r'(?=>>>( |$))', Text, '#pop'), + (r'(KeyboardInterrupt)(\n)', bygroups(Name.Class, Whitespace)), + (r'.*\n', Other.Traceback), + ], + } + +class PythonConsoleLexer(DelegatingLexer): """ For Python console output or doctests, such as: @@ -659,70 +694,28 @@ class PythonConsoleLexer(Lexer): .. versionchanged:: 2.5 Now defaults to ``True``. """ + name = 'Python console session' aliases = ['pycon'] mimetypes = ['text/x-python-doctest'] def __init__(self, **options): - self.python3 = get_bool_opt(options, 'python3', True) - Lexer.__init__(self, **options) - - def get_tokens_unprocessed(self, text): - if self.python3: - pylexer = PythonLexer(**self.options) - tblexer = PythonTracebackLexer(**self.options) + python3 = get_bool_opt(options, 'python3', True) + if python3: + pylexer = PythonLexer + tblexer = PythonTracebackLexer else: - pylexer = Python2Lexer(**self.options) - tblexer = Python2TracebackLexer(**self.options) - - curcode = '' - insertions = [] - curtb = '' - tbindex = 0 - in_tb = False - for match in line_re.finditer(text): - line = match.group() - if line.startswith('>>> ') or line.startswith('... '): - in_tb = False - insertions.append((len(curcode), - [(0, Generic.Prompt, line[:4])])) - curcode += line[4:] - elif line.rstrip() == '...' and not in_tb: - # only a new >>> prompt can end an exception block - # otherwise an ellipsis in place of the traceback frames - # will be mishandled - insertions.append((len(curcode), - [(0, Generic.Prompt, '...')])) - curcode += line[3:] - else: - if curcode: - yield from do_insertions( - insertions, pylexer.get_tokens_unprocessed(curcode)) - curcode = '' - insertions = [] - if in_tb: - curtb += line - if not (line.startswith(' ') or line.strip() == '...'): - in_tb = False - for i, t, v in tblexer.get_tokens_unprocessed(curtb): - yield tbindex+i, t, v - curtb = '' - elif (line.startswith('Traceback (most recent call last):') or - re.match(' File "[^"]+", line \\d+\\n$', line)): - in_tb = True - curtb = line - tbindex = match.start() - elif line == 'KeyboardInterrupt\n': - yield match.start(), Name.Class, line - else: - yield match.start(), Generic.Output, line - if curcode: - yield from do_insertions(insertions, - pylexer.get_tokens_unprocessed(curcode)) - if curtb: - for i, t, v in tblexer.get_tokens_unprocessed(curtb): - yield tbindex+i, t, v - + pylexer = Python2Lexer + tblexer = Python2TracebackLexer + # We have two auxiliary lexers. Use DelegatingLexer twice with + # different tokens. TODO: DelegatingLexer should support this + # directly, by accepting a tuplet of auxiliary lexers and a tuple of + # distinguishing tokens. Then we wouldn't need this intermediary + # class. + class _ReplaceInnerCode(DelegatingLexer): + def __init__(self, **options): + super().__init__(pylexer, _PythonConsoleLexerBase, Other.Code, **options) + super().__init__(tblexer, _ReplaceInnerCode, Other.Traceback, **options) class PythonTracebackLexer(RegexLexer): """ @@ -743,7 +736,7 @@ class PythonTracebackLexer(RegexLexer): tokens = { 'root': [ (r'\n', Whitespace), - (r'^Traceback \(most recent call last\):\n', Generic.Traceback, 'intb'), + (r'^(\^C)?Traceback \(most recent call last\):\n', Generic.Traceback, 'intb'), (r'^During handling of the above exception, another ' r'exception occurred:\n\n', Generic.Traceback), (r'^The above exception was the direct cause of the ' |