Scheme: autogenerate lists of builtins

This enlarges the set of recognized builtins. It also fixes a bug where the space following a builtin was included in the builtin's token.
author: Jean Abou Samra <jean@abou-samra.fr> 2022-02-01 13:33:38 +0100
committer: Georg Brandl <georg@python.org> 2022-02-08 16:36:49 +0100
commit: 93d4cce817553c4bc2694747cfa6b3a04c6e5ddf (patch)
tree: 8fb49ebe6ab07a1edf20ec1848a09a8f81ac168b /pygments/lexers/lisp.py
parent: 88f36b5a27ab6b086018a2bb339e17a018345850 (diff)
download: pygments-git-93d4cce817553c4bc2694747cfa6b3a04c6e5ddf.tar.gz
1 files changed, 20 insertions, 64 deletions
diff --git a/pygments/lexers/lisp.py b/pygments/lexers/lisp.py
index 7e5dadb2..7d457d3c 100644
--- a/pygments/lexers/lisp.py
+++ b/pygments/lexers/lisp.py
@@ -16,17 +16,15 @@ from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
 
 from pygments.lexers.python import PythonLexer
 
+from pygments.lexers._scheme_builtins import scheme_keywords, scheme_builtins
+
 __all__ = ['SchemeLexer', 'CommonLispLexer', 'HyLexer', 'RacketLexer',
            'NewLispLexer', 'EmacsLispLexer', 'ShenLexer', 'CPSALexer',
            'XtlangLexer', 'FennelLexer']
 
-
 class SchemeLexer(RegexLexer):
     """
-    A Scheme lexer, parsing a stream and outputting the tokens
-    needed to highlight scheme code.
-    This lexer could be most probably easily subclassed to parse
-    other LISP-Dialects like Common Lisp, Emacs Lisp or AutoLisp.
+    A Scheme lexer.
 
     This parser is checked with pastes from the LISP pastebin
     at http://paste.lisp.org/ to cover as much syntax as possible.
@@ -41,60 +39,25 @@ class SchemeLexer(RegexLexer):
     mimetypes = ['text/x-scheme', 'application/x-scheme']
 
     flags = re.DOTALL | re.MULTILINE
-    # list of known keywords and builtins taken form vim 6.4 scheme.vim
-    # syntax file.
-    keywords = (
-        'lambda', 'define', 'if', 'else', 'cond', 'and', 'or', 'case', 'let',
-        'let*', 'letrec', 'begin', 'do', 'delay', 'set!', '=>', 'quote',
-        'quasiquote', 'unquote', 'unquote-splicing', 'define-syntax',
-        'let-syntax', 'letrec-syntax', 'syntax-rules'
-    )
-    builtins = (
-        '*', '+', '-', '/', '<', '<=', '=', '>', '>=', 'abs', 'acos', 'angle',
-        'append', 'apply', 'asin', 'assoc', 'assq', 'assv', 'atan',
-        'boolean?', 'caaaar', 'caaadr', 'caaar', 'caadar', 'caaddr', 'caadr',
-        'caar', 'cadaar', 'cadadr', 'cadar', 'caddar', 'cadddr', 'caddr',
-        'cadr', 'call-with-current-continuation', 'call-with-input-file',
-        'call-with-output-file', 'call-with-values', 'call/cc', 'car',
-        'cdaaar', 'cdaadr', 'cdaar', 'cdadar', 'cdaddr', 'cdadr', 'cdar',
-        'cddaar', 'cddadr', 'cddar', 'cdddar', 'cddddr', 'cdddr', 'cddr',
-        'cdr', 'ceiling', 'char->integer', 'char-alphabetic?', 'char-ci<=?',
-        'char-ci<?', 'char-ci=?', 'char-ci>=?', 'char-ci>?', 'char-downcase',
-        'char-lower-case?', 'char-numeric?', 'char-ready?', 'char-upcase',
-        'char-upper-case?', 'char-whitespace?', 'char<=?', 'char<?', 'char=?',
-        'char>=?', 'char>?', 'char?', 'close-input-port', 'close-output-port',
-        'complex?', 'cons', 'cos', 'current-input-port', 'current-output-port',
-        'denominator', 'display', 'dynamic-wind', 'eof-object?', 'eq?',
-        'equal?', 'eqv?', 'eval', 'even?', 'exact->inexact', 'exact?', 'exp',
-        'expt', 'floor', 'for-each', 'force', 'gcd', 'imag-part',
-        'inexact->exact', 'inexact?', 'input-port?', 'integer->char',
-        'integer?', 'interaction-environment', 'lcm', 'length', 'list',
-        'list->string', 'list->vector', 'list-ref', 'list-tail', 'list?',
-        'load', 'log', 'magnitude', 'make-polar', 'make-rectangular',
-        'make-string', 'make-vector', 'map', 'max', 'member', 'memq', 'memv',
-        'min', 'modulo', 'negative?', 'newline', 'not', 'null-environment',
-        'null?', 'number->string', 'number?', 'numerator', 'odd?',
-        'open-input-file', 'open-output-file', 'output-port?', 'pair?',
-        'peek-char', 'port?', 'positive?', 'procedure?', 'quotient',
-        'rational?', 'rationalize', 'read', 'read-char', 'real-part', 'real?',
-        'remainder', 'reverse', 'round', 'scheme-report-environment',
-        'set-car!', 'set-cdr!', 'sin', 'sqrt', 'string', 'string->list',
-        'string->number', 'string->symbol', 'string-append', 'string-ci<=?',
-        'string-ci<?', 'string-ci=?', 'string-ci>=?', 'string-ci>?',
-        'string-copy', 'string-fill!', 'string-length', 'string-ref',
-        'string-set!', 'string<=?', 'string<?', 'string=?', 'string>=?',
-        'string>?', 'string?', 'substring', 'symbol->string', 'symbol?',
-        'tan', 'transcript-off', 'transcript-on', 'truncate', 'values',
-        'vector', 'vector->list', 'vector-fill!', 'vector-length',
-        'vector-ref', 'vector-set!', 'vector?', 'with-input-from-file',
-        'with-output-to-file', 'write', 'write-char', 'zero?'
-    )
 
     # valid names for identifiers
     # well, names can only not consist fully of numbers
     # but this should be good enough for now
     valid_name = r'[\w!$%&*+,/:<=>?@^~|-]+'
 
+    # Recognizing builtins: re-tag names found in scheme_keywords / scheme_builtins.
+    def get_tokens_unprocessed(self, text):
+        for index, token, value in super().get_tokens_unprocessed(text):
+            if token is Name.Function or token is Name.Variable:  # only names are candidates
+                if value in scheme_keywords:
+                    yield index, Keyword, value
+                elif value in scheme_builtins:
+                    yield index, Name.Builtin, value
+                else:
+                    yield index, token, value
+            else:
+                yield index, token, value  # every other token passes through unchanged
+
     # Scheme has funky syntactic rules for numbers. These are all
     # valid number literals: 5.0e55|14, 14/13, -1+5j, +1@5, #b110,
     # #o#Iinf.0-nan.0i.  This is adapted from the formal grammar given
@@ -263,23 +226,16 @@ class SchemeLexer(RegexLexer):
             # special operators
             (r"('|#|`|,@|,|\.)", Operator),
 
-            # highlight the keywords
-            ('(%s)' % '|'.join(re.escape(entry) + ' ' for entry in keywords),
-             Keyword,
-             '#pop'),
-
             # first variable in a quoted string like
             # '(this is syntactic sugar)
             (r"(?<='\()" + valid_name, Name.Variable, '#pop'),
             (r"(?<=#\()" + valid_name, Name.Variable, '#pop'),
 
-            # highlight the builtins
-            (r"(?<=\()(%s)" % '|'.join(re.escape(entry) + ' ' for entry in builtins),
-             Name.Builtin,
-             '#pop'),
-
-            # the remaining functions
+            # Functions -- note that this also catches variables
+            # defined in let/let*, but there is little that can
+            # be done about it.
             (r'(?<=\()' + valid_name, Name.Function, '#pop'),
+
             # find the remaining variables
             (valid_name, Name.Variable, '#pop'),
author	Jean Abou Samra <jean@abou-samra.fr>	2022-02-01 13:33:38 +0100
committer	Georg Brandl <georg@python.org>	2022-02-08 16:36:49 +0100
commit	93d4cce817553c4bc2694747cfa6b3a04c6e5ddf (patch)
tree	8fb49ebe6ab07a1edf20ec1848a09a8f81ac168b /pygments/lexers/lisp.py
parent	88f36b5a27ab6b086018a2bb339e17a018345850 (diff)
download	pygments-git-93d4cce817553c4bc2694747cfa6b3a04c6e5ddf.tar.gz