1 files changed, 196 insertions, 93 deletions
diff --git a/pygments/lexers/lisp.py b/pygments/lexers/lisp.py
index 798907df..e895a8f5 100644
--- a/pygments/lexers/lisp.py
+++ b/pygments/lexers/lisp.py
@@ -12,21 +12,19 @@ import re
 
 from pygments.lexer import RegexLexer, include, bygroups, words, default
 from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
-    Number, Punctuation, Literal, Error
+    Number, Punctuation, Literal, Error, Whitespace
 
 from pygments.lexers.python import PythonLexer
 
+from pygments.lexers._scheme_builtins import scheme_keywords, scheme_builtins
+
 __all__ = ['SchemeLexer', 'CommonLispLexer', 'HyLexer', 'RacketLexer',
            'NewLispLexer', 'EmacsLispLexer', 'ShenLexer', 'CPSALexer',
            'XtlangLexer', 'FennelLexer']
 
-
 class SchemeLexer(RegexLexer):
     """
-    A Scheme lexer, parsing a stream and outputting the tokens
-    needed to highlight scheme code.
-    This lexer could be most probably easily subclassed to parse
-    other LISP-Dialects like Common Lisp, Emacs Lisp or AutoLisp.
+    A Scheme lexer.
 
     This parser is checked with pastes from the LISP pastebin
     at http://paste.lisp.org/ to cover as much syntax as possible.
@@ -41,60 +39,148 @@ class SchemeLexer(RegexLexer):
     mimetypes = ['text/x-scheme', 'application/x-scheme']
 
     flags = re.DOTALL | re.MULTILINE
-    # list of known keywords and builtins taken form vim 6.4 scheme.vim
-    # syntax file.
-    keywords = (
-        'lambda', 'define', 'if', 'else', 'cond', 'and', 'or', 'case', 'let',
-        'let*', 'letrec', 'begin', 'do', 'delay', 'set!', '=>', 'quote',
-        'quasiquote', 'unquote', 'unquote-splicing', 'define-syntax',
-        'let-syntax', 'letrec-syntax', 'syntax-rules'
-    )
-    builtins = (
-        '*', '+', '-', '/', '<', '<=', '=', '>', '>=', 'abs', 'acos', 'angle',
-        'append', 'apply', 'asin', 'assoc', 'assq', 'assv', 'atan',
-        'boolean?', 'caaaar', 'caaadr', 'caaar', 'caadar', 'caaddr', 'caadr',
-        'caar', 'cadaar', 'cadadr', 'cadar', 'caddar', 'cadddr', 'caddr',
-        'cadr', 'call-with-current-continuation', 'call-with-input-file',
-        'call-with-output-file', 'call-with-values', 'call/cc', 'car',
-        'cdaaar', 'cdaadr', 'cdaar', 'cdadar', 'cdaddr', 'cdadr', 'cdar',
-        'cddaar', 'cddadr', 'cddar', 'cdddar', 'cddddr', 'cdddr', 'cddr',
-        'cdr', 'ceiling', 'char->integer', 'char-alphabetic?', 'char-ci<=?',
-        'char-ci<?', 'char-ci=?', 'char-ci>=?', 'char-ci>?', 'char-downcase',
-        'char-lower-case?', 'char-numeric?', 'char-ready?', 'char-upcase',
-        'char-upper-case?', 'char-whitespace?', 'char<=?', 'char<?', 'char=?',
-        'char>=?', 'char>?', 'char?', 'close-input-port', 'close-output-port',
-        'complex?', 'cons', 'cos', 'current-input-port', 'current-output-port',
-        'denominator', 'display', 'dynamic-wind', 'eof-object?', 'eq?',
-        'equal?', 'eqv?', 'eval', 'even?', 'exact->inexact', 'exact?', 'exp',
-        'expt', 'floor', 'for-each', 'force', 'gcd', 'imag-part',
-        'inexact->exact', 'inexact?', 'input-port?', 'integer->char',
-        'integer?', 'interaction-environment', 'lcm', 'length', 'list',
-        'list->string', 'list->vector', 'list-ref', 'list-tail', 'list?',
-        'load', 'log', 'magnitude', 'make-polar', 'make-rectangular',
-        'make-string', 'make-vector', 'map', 'max', 'member', 'memq', 'memv',
-        'min', 'modulo', 'negative?', 'newline', 'not', 'null-environment',
-        'null?', 'number->string', 'number?', 'numerator', 'odd?',
-        'open-input-file', 'open-output-file', 'output-port?', 'pair?',
-        'peek-char', 'port?', 'positive?', 'procedure?', 'quotient',
-        'rational?', 'rationalize', 'read', 'read-char', 'real-part', 'real?',
-        'remainder', 'reverse', 'round', 'scheme-report-environment',
-        'set-car!', 'set-cdr!', 'sin', 'sqrt', 'string', 'string->list',
-        'string->number', 'string->symbol', 'string-append', 'string-ci<=?',
-        'string-ci<?', 'string-ci=?', 'string-ci>=?', 'string-ci>?',
-        'string-copy', 'string-fill!', 'string-length', 'string-ref',
-        'string-set!', 'string<=?', 'string<?', 'string=?', 'string>=?',
-        'string>?', 'string?', 'substring', 'symbol->string', 'symbol?',
-        'tan', 'transcript-off', 'transcript-on', 'truncate', 'values',
-        'vector', 'vector->list', 'vector-fill!', 'vector-length',
-        'vector-ref', 'vector-set!', 'vector?', 'with-input-from-file',
-        'with-output-to-file', 'write', 'write-char', 'zero?'
-    )
 
     # valid names for identifiers
     # well, names can only not consist fully of numbers
     # but this should be good enough for now
     valid_name = r'[\w!$%&*+,/:<=>?@^~|-]+'
 
+    # Lookahead asserting a token boundary; only valid inside verbose regexes.
+    token_end = r'''
+      (?=
+        \s         # whitespace
+        | ;        # comment
+        | \#[;|!] # fancy comments
+        | [)\]]    # end delimiters
+        | $        # end of file
+      )
+    '''
+
+    def get_tokens_unprocessed(self, text):
+        """Yield lexer tokens, re-tagging known keywords and builtins."""
+        for pos, ttype, lexeme in super().get_tokens_unprocessed(text):
+            # Only names tagged Name.Function or Name.Variable are candidates
+            # for promotion; every other token type passes through unchanged.
+            if ttype is Name.Function or ttype is Name.Variable:
+                # Keywords take precedence over builtins.
+                if lexeme in scheme_keywords:
+                    ttype = Keyword
+                elif lexeme in scheme_builtins:
+                    ttype = Name.Builtin
+            yield pos, ttype, lexeme
+
+    # Scheme has funky syntactic rules for numbers. These are all
+    # valid number literals: 5.0e55|14, 14/13, -1+5j, +1@5, #b110,
+    # #o#Iinf.0-nan.0i.  This is adapted from the formal grammar given
+    # in http://www.r6rs.org/final/r6rs.pdf, section 4.2.1.  Take a
+    # deep breath ...
+
+    # It would be simpler if we could just not bother about invalid
+    # numbers like #b35. But we cannot parse 'abcdef' without #x as a
+    # number.
+
+    number_rules = {}
+    for base in (2, 8, 10, 16):
+        if base == 2:
+            digit = r'[01]'
+            radix = r'( \#[bB] )'
+        elif base == 8:
+            digit = r'[0-7]'
+            radix = r'( \#[oO] )'
+        elif base == 10:
+            digit = r'[0-9]'
+            radix = r'( (\#[dD])? )'
+        elif base == 16:
+            digit = r'[0-9a-fA-F]'
+            radix = r'( \#[xX] )'
+
+        # Radix and optional exactness indicator, in either order.
+        prefix = rf'''
+          (
+            {radix} (\#[iIeE])?
+            | \#[iIeE] {radix}
+          )
+        '''
+
+        # Simple unsigned number or fraction.
+        ureal = rf'''
+          (
+            {digit}+
+            ( / {digit}+ )?
+          )
+        '''
+
+        # Decimal notation (point, exponent, mantissa width) is base-10 only.
+        if base == 10:
+            decimal = r'''
+              (
+                # Decimal part
+                (
+                  [0-9]+ ([.][0-9]*)?
+                  | [.][0-9]+
+                )
+
+                # Optional exponent
+                (
+                  [eEsSfFdDlL] [+-]? [0-9]+
+                )?
+
+                # Optional mantissa width
+                (
+                  \|[0-9]+
+                )?
+              )
+            '''
+            ureal = rf'''
+              (
+                {decimal} (?!/)
+                | {ureal}
+              )
+            '''
+        # The dots are escaped so that only the literal spellings match.
+        naninf = r'(nan\.0|inf\.0)'
+
+        real = rf'''
+          (
+            [+-] {naninf}  # Sign mandatory
+            | [+-]? {ureal}    # Sign optional
+          )
+        '''
+
+        complex_ = rf'''
+          (
+            {real}?  [+-]  ({naninf}|{ureal})?  i
+            | {real} (@ {real})?
+
+          )
+        '''
+
+        num = rf'''(?x)
+          (
+            {prefix}
+            {complex_}
+          )
+          # Need to ensure we have a full token. 1+ is not a
+          # number followed by something else, but a function
+          # name.
+          {token_end}
+        '''
+
+        number_rules[base] = num
+
+    # If you have a headache now, say thanks to RnRS editors.
+
+    # Classifying in a callback is simpler than splitting the number_rules[10]
+    # regex into a floating-point and a non-floating-point version.
+    def decimal_cb(self, match):
+        """Emit a base-10 number as Float if it contains '.', else Integer."""
+        lexeme = match.group()
+        # A '.' also occurs in [+-](inf|nan).0, so those count as floats too.
+        kind = Number.Float if '.' in lexeme else Number.Integer
+        yield match.start(), kind, lexeme
+
+    # --
+
     # The 'scheme-root' state parses as many expressions as needed, always
     # delegating to the 'scheme-value' state. The latter parses one complete
     # expression and immediately pops back. This is needed for the LilyPondLexer.
@@ -120,24 +206,26 @@ class SchemeLexer(RegexLexer):
             # multi-line comment
             (r'#\|', Comment.Multiline, 'multiline-comment'),
             # commented form (entire sexpr following)
-            (r'#;\s*\(', Comment, 'commented-form'),
+            (r'#;[([]', Comment, 'commented-form'),
+            # commented datum
+            (r'#;', Comment, 'commented-datum'),
             # signifies that the program text that follows is written with the
             # lexical and datum syntax described in r6rs
             (r'#!r6rs', Comment),
 
             # whitespaces - usually not relevant
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
 
             # numbers
-            (r'-?\d+\.\d+', Number.Float, '#pop'),
-            (r'-?\d+', Number.Integer, '#pop'),
-            # support for uncommon kinds of numbers -
-            # have to figure out what the characters mean
-            # (r'(#e|#i|#b|#o|#d|#x)[\d.]+', Number),
+            (number_rules[2], Number.Bin, '#pop'),
+            (number_rules[8], Number.Oct, '#pop'),
+            (number_rules[10], decimal_cb, '#pop'),
+            (number_rules[16], Number.Hex, '#pop'),
 
-            # strings, symbols and characters
-            (r'"(\\\\|\\[^\\]|[^"\\])*"', String, "#pop"),
+            # strings, symbols, keywords and characters
+            (r'"', String, 'string'),
             (r"'" + valid_name, String.Symbol, "#pop"),
+            (r'#:' + valid_name, Keyword.Declaration, '#pop'),
             (r"#\\([()/'\"._!§$%& ?=+-]|[a-zA-Z0-9]+)", String.Char, "#pop"),
 
             # constants
@@ -146,23 +234,16 @@ class SchemeLexer(RegexLexer):
             # special operators
             (r"('|#|`|,@|,|\.)", Operator),
 
-            # highlight the keywords
-            ('(%s)' % '|'.join(re.escape(entry) + ' ' for entry in keywords),
-             Keyword,
-             '#pop'),
-
             # first variable in a quoted string like
             # '(this is syntactic sugar)
             (r"(?<='\()" + valid_name, Name.Variable, '#pop'),
             (r"(?<=#\()" + valid_name, Name.Variable, '#pop'),
 
-            # highlight the builtins
-            (r"(?<=\()(%s)" % '|'.join(re.escape(entry) + ' ' for entry in builtins),
-             Name.Builtin,
-             '#pop'),
-
-            # the remaining functions
+            # Functions -- note that this also catches variables
+            # defined in let/let*, but there is little that can
+            # be done about it.
             (r'(?<=\()' + valid_name, Name.Function, '#pop'),
+
             # find the remaining variables
             (valid_name, Name.Variable, '#pop'),
 
@@ -170,11 +251,11 @@ class SchemeLexer(RegexLexer):
 
             # Push scheme-root to enter a state that will parse as many things
             # as needed in the parentheses.
-            (r'\(|\[', Punctuation, 'scheme-root'),
+            (r'[([]', Punctuation, 'scheme-root'),
             # Pop one 'value', one 'scheme-root', and yet another 'value', so
             # we get back to a state parsing expressions as needed in the
             # enclosing context.
-            (r'\)|\]', Punctuation, '#pop:3'),
+            (r'[)\]]', Punctuation, '#pop:3'),
         ],
         'multiline-comment': [
             (r'#\|', Comment.Multiline, '#push'),
@@ -183,10 +264,30 @@ class SchemeLexer(RegexLexer):
             (r'[|#]', Comment.Multiline),
         ],
         'commented-form': [
-            (r'\(', Comment, '#push'),
-            (r'\)', Comment, '#pop'),
-            (r'[^()]+', Comment),
+            (r'[([]', Comment, '#push'),
+            (r'[)\]]', Comment, '#pop'),
+            (r'[^()[\]]+', Comment),
         ],
+        'commented-datum': [
+            (rf'(?x).*?{token_end}', Comment, '#pop'),
+        ],
+        'string': [
+            # Pops back from 'string', and pops 'value' as well.
+            ('"', String, '#pop:2'),
+            # Hex escape sequences, R6RS-style.
+            (r'\\x[0-9a-fA-F]+;', String.Escape),
+            # We try R6RS style first, but fall back to Guile-style.
+            (r'\\x[0-9a-fA-F]{2}', String.Escape),
+            # Other special escape sequences implemented by Guile.
+            (r'\\u[0-9a-fA-F]{4}', String.Escape),
+            (r'\\U[0-9a-fA-F]{6}', String.Escape),
+            # Escape sequences are not overly standardized. Recognizing
+            # a single character after the backslash should be good enough.
+            # NB: we have DOTALL.
+            (r'\\.', String.Escape),
+            # The rest
+            (r'[^\\"]+', String),
+        ]
     }
 
 
@@ -271,7 +372,7 @@ class CommonLispLexer(RegexLexer):
         ],
         'body': [
             # whitespace
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
 
             # single-line comment
             (r';.*$', Comment.Single),
@@ -419,7 +520,8 @@ class HyLexer(RegexLexer):
             (r';.*$', Comment.Single),
 
             # whitespaces - usually not relevant
-            (r'[,\s]+', Text),
+            (r',+', Text),
+            (r'\s+', Whitespace),
 
             # numbers
             (r'-?\d+\.\d+', Number.Float),
@@ -1299,7 +1401,7 @@ class RacketLexer(RegexLexer):
             (r'#\|', Comment.Multiline, 'block-comment'),
 
             # Whitespaces
-            (r'(?u)\s+', Text),
+            (r'(?u)\s+', Whitespace),
 
             # Numbers: Keep in mind Racket reader hash prefixes, which
             # can denote the base or the type. These don't map neatly
@@ -1348,7 +1450,7 @@ class RacketLexer(RegexLexer):
             (r'#(true|false|[tTfF])', Name.Constant, '#pop'),
 
             # Keyword argument names (e.g. #:keyword)
-            (r'(?u)#:%s' % _symbol, Keyword.Declaration, '#pop'),
+            (r'#:%s' % _symbol, Keyword.Declaration, '#pop'),
 
             # Reader extensions
             (r'(#lang |#!)(\S+)',
@@ -1377,9 +1479,9 @@ class RacketLexer(RegexLexer):
             (r'quasiquote(?=[%s])' % _delimiters, Keyword,
              ('#pop', 'quasiquoted-datum')),
             (_opening_parenthesis, Punctuation, ('#pop', 'unquoted-list')),
-            (words(_keywords, prefix='(?u)', suffix='(?=[%s])' % _delimiters),
+            (words(_keywords, suffix='(?=[%s])' % _delimiters),
              Keyword, '#pop'),
-            (words(_builtins, prefix='(?u)', suffix='(?=[%s])' % _delimiters),
+            (words(_builtins, suffix='(?=[%s])' % _delimiters),
              Name.Builtin, '#pop'),
             (_symbol, Name, '#pop'),
             include('datum*')
@@ -1435,7 +1537,7 @@ class NewLispLexer(RegexLexer):
     filenames = ['*.lsp', '*.nl', '*.kif']
     mimetypes = ['text/x-newlisp', 'application/x-newlisp']
 
-    flags = re.IGNORECASE | re.MULTILINE | re.UNICODE
+    flags = re.IGNORECASE | re.MULTILINE
 
     # list of built-in functions for newLISP version 10.3
     builtins = (
@@ -1512,7 +1614,7 @@ class NewLispLexer(RegexLexer):
             (r'#.*$', Comment.Single),
 
             # whitespace
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
 
             # strings, symbols and characters
             (r'"(\\\\|\\[^\\]|[^"\\])*"', String),
@@ -2124,7 +2226,7 @@ class EmacsLispLexer(RegexLexer):
         ],
         'body': [
             # whitespace
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
 
             # single-line comment
             (r';.*$', Comment.Single),
@@ -2271,7 +2373,7 @@ class ShenLexer(RegexLexer):
         'root': [
             (r'(?s)\\\*.*?\*\\', Comment.Multiline),  # \* ... *\
             (r'\\\\.*', Comment.Single),              # \\ ...
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
             (r'_{5,}', Punctuation),
             (r'={5,}', Punctuation),
             (r'(;|:=|\||--?>|<--?)', Punctuation),
@@ -2293,7 +2395,7 @@ class ShenLexer(RegexLexer):
         return tokens
 
     def _relevant(self, token):
-        return token not in (Text, Comment.Single, Comment.Multiline)
+        return token not in (Text, Whitespace, Comment.Single, Comment.Multiline)
 
     def _process_declarations(self, tokens):
         opening_paren = False
@@ -2398,7 +2500,7 @@ class CPSALexer(RegexLexer):
             (r';.*$', Comment.Single),
 
             # whitespaces - usually not relevant
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
 
             # numbers
             (r'-?\d+\.\d+', Number.Float),
@@ -2611,7 +2713,7 @@ class XtlangLexer(RegexLexer):
             (r';.*$', Comment.Single),
 
             # whitespaces - usually not relevant
-            (r'\s+', Text),
+            (r'\s+', Whitespace),
 
             # numbers
             (r'-?\d+\.\d+', Number.Float),
@@ -2692,7 +2794,8 @@ class FennelLexer(RegexLexer):
             # the only comment form is a semicolon; goes to the end of the line
             (r';.*$', Comment.Single),
 
-            (r'[,\s]+', Text),
+            (r',+', Text),
+            (r'\s+', Whitespace),
             (r'-?\d+\.\d+', Number.Float),
             (r'-?\d+', Number.Integer),