diff options
Diffstat (limited to 'pygments/lexers/lisp.py')
-rw-r--r-- | pygments/lexers/lisp.py | 289 |
1 files changed, 196 insertions, 93 deletions
diff --git a/pygments/lexers/lisp.py b/pygments/lexers/lisp.py index 798907df..e895a8f5 100644 --- a/pygments/lexers/lisp.py +++ b/pygments/lexers/lisp.py @@ -12,21 +12,19 @@ import re from pygments.lexer import RegexLexer, include, bygroups, words, default from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ - Number, Punctuation, Literal, Error + Number, Punctuation, Literal, Error, Whitespace from pygments.lexers.python import PythonLexer +from pygments.lexers._scheme_builtins import scheme_keywords, scheme_builtins + __all__ = ['SchemeLexer', 'CommonLispLexer', 'HyLexer', 'RacketLexer', 'NewLispLexer', 'EmacsLispLexer', 'ShenLexer', 'CPSALexer', 'XtlangLexer', 'FennelLexer'] - class SchemeLexer(RegexLexer): """ - A Scheme lexer, parsing a stream and outputting the tokens - needed to highlight scheme code. - This lexer could be most probably easily subclassed to parse - other LISP-Dialects like Common Lisp, Emacs Lisp or AutoLisp. + A Scheme lexer. This parser is checked with pastes from the LISP pastebin at http://paste.lisp.org/ to cover as much syntax as possible. @@ -41,60 +39,148 @@ class SchemeLexer(RegexLexer): mimetypes = ['text/x-scheme', 'application/x-scheme'] flags = re.DOTALL | re.MULTILINE - # list of known keywords and builtins taken form vim 6.4 scheme.vim - # syntax file. 
- keywords = ( - 'lambda', 'define', 'if', 'else', 'cond', 'and', 'or', 'case', 'let', - 'let*', 'letrec', 'begin', 'do', 'delay', 'set!', '=>', 'quote', - 'quasiquote', 'unquote', 'unquote-splicing', 'define-syntax', - 'let-syntax', 'letrec-syntax', 'syntax-rules' - ) - builtins = ( - '*', '+', '-', '/', '<', '<=', '=', '>', '>=', 'abs', 'acos', 'angle', - 'append', 'apply', 'asin', 'assoc', 'assq', 'assv', 'atan', - 'boolean?', 'caaaar', 'caaadr', 'caaar', 'caadar', 'caaddr', 'caadr', - 'caar', 'cadaar', 'cadadr', 'cadar', 'caddar', 'cadddr', 'caddr', - 'cadr', 'call-with-current-continuation', 'call-with-input-file', - 'call-with-output-file', 'call-with-values', 'call/cc', 'car', - 'cdaaar', 'cdaadr', 'cdaar', 'cdadar', 'cdaddr', 'cdadr', 'cdar', - 'cddaar', 'cddadr', 'cddar', 'cdddar', 'cddddr', 'cdddr', 'cddr', - 'cdr', 'ceiling', 'char->integer', 'char-alphabetic?', 'char-ci<=?', - 'char-ci<?', 'char-ci=?', 'char-ci>=?', 'char-ci>?', 'char-downcase', - 'char-lower-case?', 'char-numeric?', 'char-ready?', 'char-upcase', - 'char-upper-case?', 'char-whitespace?', 'char<=?', 'char<?', 'char=?', - 'char>=?', 'char>?', 'char?', 'close-input-port', 'close-output-port', - 'complex?', 'cons', 'cos', 'current-input-port', 'current-output-port', - 'denominator', 'display', 'dynamic-wind', 'eof-object?', 'eq?', - 'equal?', 'eqv?', 'eval', 'even?', 'exact->inexact', 'exact?', 'exp', - 'expt', 'floor', 'for-each', 'force', 'gcd', 'imag-part', - 'inexact->exact', 'inexact?', 'input-port?', 'integer->char', - 'integer?', 'interaction-environment', 'lcm', 'length', 'list', - 'list->string', 'list->vector', 'list-ref', 'list-tail', 'list?', - 'load', 'log', 'magnitude', 'make-polar', 'make-rectangular', - 'make-string', 'make-vector', 'map', 'max', 'member', 'memq', 'memv', - 'min', 'modulo', 'negative?', 'newline', 'not', 'null-environment', - 'null?', 'number->string', 'number?', 'numerator', 'odd?', - 'open-input-file', 'open-output-file', 'output-port?', 'pair?', - 
'peek-char', 'port?', 'positive?', 'procedure?', 'quotient', - 'rational?', 'rationalize', 'read', 'read-char', 'real-part', 'real?', - 'remainder', 'reverse', 'round', 'scheme-report-environment', - 'set-car!', 'set-cdr!', 'sin', 'sqrt', 'string', 'string->list', - 'string->number', 'string->symbol', 'string-append', 'string-ci<=?', - 'string-ci<?', 'string-ci=?', 'string-ci>=?', 'string-ci>?', - 'string-copy', 'string-fill!', 'string-length', 'string-ref', - 'string-set!', 'string<=?', 'string<?', 'string=?', 'string>=?', - 'string>?', 'string?', 'substring', 'symbol->string', 'symbol?', - 'tan', 'transcript-off', 'transcript-on', 'truncate', 'values', - 'vector', 'vector->list', 'vector-fill!', 'vector-length', - 'vector-ref', 'vector-set!', 'vector?', 'with-input-from-file', - 'with-output-to-file', 'write', 'write-char', 'zero?' - ) # valid names for identifiers # well, names can only not consist fully of numbers # but this should be good enough for now valid_name = r'[\w!$%&*+,/:<=>?@^~|-]+' + # Use within verbose regexes + token_end = r''' + (?= + \s # whitespace + | ; # comment + | \#[;|!] # fancy comments + | [)\]] # end delimiters + | $ # end of file + ) + ''' + + # Recognizing builtins. + def get_tokens_unprocessed(self, text): + for index, token, value in super().get_tokens_unprocessed(text): + if token is Name.Function or token is Name.Variable: + if value in scheme_keywords: + yield index, Keyword, value + elif value in scheme_builtins: + yield index, Name.Builtin, value + else: + yield index, token, value + else: + yield index, token, value + + # Scheme has funky syntactic rules for numbers. These are all + # valid number literals: 5.0e55|14, 14/13, -1+5j, +1@5, #b110, + # #o#Iinf.0-nan.0i. This is adapted from the formal grammar given + # in http://www.r6rs.org/final/r6rs.pdf, section 4.2.1. Take a + # deep breath ... + + # It would be simpler if we could just not bother about invalid + # numbers like #b35. 
But we cannot parse 'abcdef' without #x as a + # number. + + number_rules = {} + for base in (2, 8, 10, 16): + if base == 2: + digit = r'[01]' + radix = r'( \#[bB] )' + elif base == 8: + digit = r'[0-7]' + radix = r'( \#[oO] )' + elif base == 10: + digit = r'[0-9]' + radix = r'( (\#[dD])? )' + elif base == 16: + digit = r'[0-9a-fA-F]' + radix = r'( \#[xX] )' + + # Radix, optional exactness indicator. + prefix = rf''' + ( + {radix} (\#[iIeE])? + | \#[iIeE] {radix} + ) + ''' + + # Simple unsigned number or fraction. + ureal = rf''' + ( + {digit}+ + ( / {digit}+ )? + ) + ''' + + # Add decimal numbers. + if base == 10: + decimal = r''' + ( + # Decimal part + ( + [0-9]+ ([.][0-9]*)? + | [.][0-9]+ + ) + + # Optional exponent + ( + [eEsSfFdDlL] [+-]? [0-9]+ + )? + + # Optional mantissa width + ( + \|[0-9]+ + )? + ) + ''' + ureal = rf''' + ( + {decimal} (?!/) + | {ureal} + ) + ''' + + naninf = r'(nan.0|inf.0)' + + real = rf''' + ( + [+-] {naninf} # Sign mandatory + | [+-]? {ureal} # Sign optional + ) + ''' + + complex_ = rf''' + ( + {real}? [+-] ({naninf}|{ureal})? i + | {real} (@ {real})? + + ) + ''' + + num = rf'''(?x) + ( + {prefix} + {complex_} + ) + # Need to ensure we have a full token. 1+ is not a + # number followed by something else, but a function + # name. + {token_end} + ''' + + number_rules[base] = num + + # If you have a headache now, say thanks to RnRS editors. + + # Doing it this way is simpler than splitting the number(10) + # regex in a floating-point and a no-floating-point version. + def decimal_cb(self, match): + if '.' in match.group(): + token_type = Number.Float # includes [+-](inf|nan).0 + else: + token_type = Number.Integer + yield match.start(), token_type, match.group() + + # -- + # The 'scheme-root' state parses as many expressions as needed, always # delegating to the 'scheme-value' state. The latter parses one complete # expression and immediately pops back. This is needed for the LilyPondLexer. 
@@ -120,24 +206,26 @@ class SchemeLexer(RegexLexer): # multi-line comment (r'#\|', Comment.Multiline, 'multiline-comment'), # commented form (entire sexpr following) - (r'#;\s*\(', Comment, 'commented-form'), + (r'#;[([]', Comment, 'commented-form'), + # commented datum + (r'#;', Comment, 'commented-datum'), # signifies that the program text that follows is written with the # lexical and datum syntax described in r6rs (r'#!r6rs', Comment), # whitespaces - usually not relevant - (r'\s+', Text), + (r'\s+', Whitespace), # numbers - (r'-?\d+\.\d+', Number.Float, '#pop'), - (r'-?\d+', Number.Integer, '#pop'), - # support for uncommon kinds of numbers - - # have to figure out what the characters mean - # (r'(#e|#i|#b|#o|#d|#x)[\d.]+', Number), + (number_rules[2], Number.Bin, '#pop'), + (number_rules[8], Number.Oct, '#pop'), + (number_rules[10], decimal_cb, '#pop'), + (number_rules[16], Number.Hex, '#pop'), - # strings, symbols and characters - (r'"(\\\\|\\[^\\]|[^"\\])*"', String, "#pop"), + # strings, symbols, keywords and characters + (r'"', String, 'string'), (r"'" + valid_name, String.Symbol, "#pop"), + (r'#:' + valid_name, Keyword.Declaration, '#pop'), (r"#\\([()/'\"._!§$%& ?=+-]|[a-zA-Z0-9]+)", String.Char, "#pop"), # constants @@ -146,23 +234,16 @@ class SchemeLexer(RegexLexer): # special operators (r"('|#|`|,@|,|\.)", Operator), - # highlight the keywords - ('(%s)' % '|'.join(re.escape(entry) + ' ' for entry in keywords), - Keyword, - '#pop'), - # first variable in a quoted string like # '(this is syntactic sugar) (r"(?<='\()" + valid_name, Name.Variable, '#pop'), (r"(?<=#\()" + valid_name, Name.Variable, '#pop'), - # highlight the builtins - (r"(?<=\()(%s)" % '|'.join(re.escape(entry) + ' ' for entry in builtins), - Name.Builtin, - '#pop'), - - # the remaining functions + # Functions -- note that this also catches variables + # defined in let/let*, but there is little that can + # be done about it. 
(r'(?<=\()' + valid_name, Name.Function, '#pop'), + # find the remaining variables (valid_name, Name.Variable, '#pop'), @@ -170,11 +251,11 @@ class SchemeLexer(RegexLexer): # Push scheme-root to enter a state that will parse as many things # as needed in the parentheses. - (r'\(|\[', Punctuation, 'scheme-root'), + (r'[([]', Punctuation, 'scheme-root'), # Pop one 'value', one 'scheme-root', and yet another 'value', so # we get back to a state parsing expressions as needed in the # enclosing context. - (r'\)|\]', Punctuation, '#pop:3'), + (r'[)\]]', Punctuation, '#pop:3'), ], 'multiline-comment': [ (r'#\|', Comment.Multiline, '#push'), @@ -183,10 +264,30 @@ class SchemeLexer(RegexLexer): (r'[|#]', Comment.Multiline), ], 'commented-form': [ - (r'\(', Comment, '#push'), - (r'\)', Comment, '#pop'), - (r'[^()]+', Comment), + (r'[([]', Comment, '#push'), + (r'[)\]]', Comment, '#pop'), + (r'[^()[\]]+', Comment), ], + 'commented-datum': [ + (rf'(?x).*?{token_end}', Comment, '#pop'), + ], + 'string': [ + # Pops back from 'string', and pops 'value' as well. + ('"', String, '#pop:2'), + # Hex escape sequences, R6RS-style. + (r'\\x[0-9a-fA-F]+;', String.Escape), + # We try R6RS style first, but fall back to Guile-style. + (r'\\x[0-9a-fA-F]{2}', String.Escape), + # Other special escape sequences implemented by Guile. + (r'\\u[0-9a-fA-F]{4}', String.Escape), + (r'\\U[0-9a-fA-F]{6}', String.Escape), + # Escape sequences are not overly standardized. Recognizing + # a single character after the backslash should be good enough. + # NB: we have DOTALL. 
+ (r'\\.', String.Escape), + # The rest + (r'[^\\"]+', String), + ] } @@ -271,7 +372,7 @@ class CommonLispLexer(RegexLexer): ], 'body': [ # whitespace - (r'\s+', Text), + (r'\s+', Whitespace), # single-line comment (r';.*$', Comment.Single), @@ -419,7 +520,8 @@ class HyLexer(RegexLexer): (r';.*$', Comment.Single), # whitespaces - usually not relevant - (r'[,\s]+', Text), + (r',+', Text), + (r'\s+', Whitespace), # numbers (r'-?\d+\.\d+', Number.Float), @@ -1299,7 +1401,7 @@ class RacketLexer(RegexLexer): (r'#\|', Comment.Multiline, 'block-comment'), # Whitespaces - (r'(?u)\s+', Text), + (r'(?u)\s+', Whitespace), # Numbers: Keep in mind Racket reader hash prefixes, which # can denote the base or the type. These don't map neatly @@ -1348,7 +1450,7 @@ class RacketLexer(RegexLexer): (r'#(true|false|[tTfF])', Name.Constant, '#pop'), # Keyword argument names (e.g. #:keyword) - (r'(?u)#:%s' % _symbol, Keyword.Declaration, '#pop'), + (r'#:%s' % _symbol, Keyword.Declaration, '#pop'), # Reader extensions (r'(#lang |#!)(\S+)', @@ -1377,9 +1479,9 @@ class RacketLexer(RegexLexer): (r'quasiquote(?=[%s])' % _delimiters, Keyword, ('#pop', 'quasiquoted-datum')), (_opening_parenthesis, Punctuation, ('#pop', 'unquoted-list')), - (words(_keywords, prefix='(?u)', suffix='(?=[%s])' % _delimiters), + (words(_keywords, suffix='(?=[%s])' % _delimiters), Keyword, '#pop'), - (words(_builtins, prefix='(?u)', suffix='(?=[%s])' % _delimiters), + (words(_builtins, suffix='(?=[%s])' % _delimiters), Name.Builtin, '#pop'), (_symbol, Name, '#pop'), include('datum*') @@ -1435,7 +1537,7 @@ class NewLispLexer(RegexLexer): filenames = ['*.lsp', '*.nl', '*.kif'] mimetypes = ['text/x-newlisp', 'application/x-newlisp'] - flags = re.IGNORECASE | re.MULTILINE | re.UNICODE + flags = re.IGNORECASE | re.MULTILINE # list of built-in functions for newLISP version 10.3 builtins = ( @@ -1512,7 +1614,7 @@ class NewLispLexer(RegexLexer): (r'#.*$', Comment.Single), # whitespace - (r'\s+', Text), + (r'\s+', Whitespace), 
# strings, symbols and characters (r'"(\\\\|\\[^\\]|[^"\\])*"', String), @@ -2124,7 +2226,7 @@ class EmacsLispLexer(RegexLexer): ], 'body': [ # whitespace - (r'\s+', Text), + (r'\s+', Whitespace), # single-line comment (r';.*$', Comment.Single), @@ -2271,7 +2373,7 @@ class ShenLexer(RegexLexer): 'root': [ (r'(?s)\\\*.*?\*\\', Comment.Multiline), # \* ... *\ (r'\\\\.*', Comment.Single), # \\ ... - (r'\s+', Text), + (r'\s+', Whitespace), (r'_{5,}', Punctuation), (r'={5,}', Punctuation), (r'(;|:=|\||--?>|<--?)', Punctuation), @@ -2293,7 +2395,7 @@ class ShenLexer(RegexLexer): return tokens def _relevant(self, token): - return token not in (Text, Comment.Single, Comment.Multiline) + return token not in (Text, Whitespace, Comment.Single, Comment.Multiline) def _process_declarations(self, tokens): opening_paren = False @@ -2398,7 +2500,7 @@ class CPSALexer(RegexLexer): (r';.*$', Comment.Single), # whitespaces - usually not relevant - (r'\s+', Text), + (r'\s+', Whitespace), # numbers (r'-?\d+\.\d+', Number.Float), @@ -2611,7 +2713,7 @@ class XtlangLexer(RegexLexer): (r';.*$', Comment.Single), # whitespaces - usually not relevant - (r'\s+', Text), + (r'\s+', Whitespace), # numbers (r'-?\d+\.\d+', Number.Float), @@ -2692,7 +2794,8 @@ class FennelLexer(RegexLexer): # the only comment form is a semicolon; goes to the end of the line (r';.*$', Comment.Single), - (r'[,\s]+', Text), + (r',+', Text), + (r'\s+', Whitespace), (r'-?\d+\.\d+', Number.Float), (r'-?\d+', Number.Integer), |