Add Zeek lexer based on the Bro lexer

Bro has been renamed to Zeek, but the language is essentially the same, and .zeek files are treated no differently from .bro files. This change also adds general improvements to the lexer.
author: Jon Siwek <jsiwek@corelight.com> 2019-11-24 09:02:28 -0800
committer: Georg Brandl <georg@python.org> 2019-11-25 20:39:24 +0100
commit: 71835d83fbfe004608609b4d2ae29aa8ddf31359 (patch)
tree: 18a8101d5c2ec6bff4d0e17831f5b7b3adfc9f7a
parent: ed97a09b19cb256c6d0081bfb2f96f9d54e6c404 (diff)
download: pygments-git-71835d83fbfe004608609b4d2ae29aa8ddf31359.tar.gz
5 files changed, 329 insertions, 56 deletions
diff --git a/doc/languages.rst b/doc/languages.rst
index be886683..e365ef2e 100644
--- a/doc/languages.rst
+++ b/doc/languages.rst
@@ -104,6 +104,7 @@ Programming languages
 * Visual Basic.NET
 * Visual FoxPro
 * XQuery
+* `Zeek <https://www.zeek.org>`_
 * Zephir
 
 Template languages
diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
index 7dfb2775..2533f651 100644
--- a/pygments/lexers/_mapping.py
+++ b/pygments/lexers/_mapping.py
@@ -475,6 +475,7 @@ LEXERS = {
     'XtlangLexer': ('pygments.lexers.lisp', 'xtlang', ('extempore',), ('*.xtm',), ()),
     'YamlJinjaLexer': ('pygments.lexers.templates', 'YAML+Jinja', ('yaml+jinja', 'salt', 'sls'), ('*.sls',), ('text/x-yaml+jinja', 'text/x-sls')),
     'YamlLexer': ('pygments.lexers.data', 'YAML', ('yaml',), ('*.yaml', '*.yml'), ('text/x-yaml',)),
+    'ZeekLexer': ('pygments.lexers.dsls', 'Zeek', ('zeek',), ('*.zeek',), ()),
     'ZephirLexer': ('pygments.lexers.php', 'Zephir', ('zephir',), ('*.zep',), ()),
     'ZigLexer': ('pygments.lexers.zig', 'Zig', ('zig',), ('*.zig',), ('text/zig',)),
 }
diff --git a/pygments/lexers/dsls.py b/pygments/lexers/dsls.py
index eb338d79..23fc9a97 100644
--- a/pygments/lexers/dsls.py
+++ b/pygments/lexers/dsls.py
@@ -16,7 +16,7 @@ from pygments.lexer import ExtendedRegexLexer, RegexLexer, bygroups, words, \
 from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
     Number, Punctuation, Literal, Whitespace
 
-__all__ = ['ProtoBufLexer', 'BroLexer', 'PuppetLexer', 'RslLexer',
+__all__ = ['ProtoBufLexer', 'ZeekLexer', 'BroLexer', 'PuppetLexer', 'RslLexer',
            'MscgenLexer', 'VGLLexer', 'AlloyLexer', 'PanLexer',
            'CrmshLexer', 'ThriftLexer', 'FlatlineLexer', 'SnowballLexer']
 
@@ -188,84 +188,174 @@ class ThriftLexer(RegexLexer):
     }
 
 
-class BroLexer(RegexLexer):
+class ZeekLexer(RegexLexer):
     """
-    For `Bro <http://bro-ids.org/>`_ scripts.
+    For `Zeek <https://www.zeek.org/>`_ scripts.
 
-    .. versionadded:: 1.5
+    .. versionadded:: 2.5
     """
-    name = 'Bro'
-    aliases = ['bro']
-    filenames = ['*.bro']
+    name = 'Zeek'
+    aliases = ['zeek']
+    filenames = ['*.zeek']
 
-    _hex = r'[0-9a-fA-F_]'
+    _hex = r'[0-9a-fA-F]'
     _float = r'((\d*\.?\d+)|(\d+\.?\d*))([eE][-+]?\d+)?'
     _h = r'[A-Za-z0-9][-A-Za-z0-9]*'
 
     tokens = {
         'root': [
-            # Whitespace
-            (r'^@.*?\n', Comment.Preproc),
-            (r'#.*?\n', Comment.Single),
+            include('whitespace'),
+            include('comments'),
+            include('directives'),
+            include('attributes'),
+            include('types'),
+            include('keywords'),
+            include('literals'),
+            include('operators'),
+            include('punctuation'),
+            (r'\b((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)(?=\s*\()',
+                Name.Function),
+            include('identifiers'),
+        ],
+
+        'whitespace': [
             (r'\n', Text),
             (r'\s+', Text),
             (r'\\\n', Text),
-            # Keywords
-            (r'(add|alarm|break|case|const|continue|delete|do|else|enum|event'
-             r'|export|for|function|if|global|hook|local|module|next'
-             r'|of|print|redef|return|schedule|switch|type|when|while)\b', Keyword),
-            (r'(addr|any|bool|count|counter|double|file|int|interval|net'
-             r'|pattern|port|record|set|string|subnet|table|time|timer'
-             r'|vector)\b', Keyword.Type),
-            (r'(T|F)\b', Keyword.Constant),
-            (r'(&)((?:add|delete|expire)_func|attr|(?:create|read|write)_expire'
-             r'|default|disable_print_hook|raw_output|encrypt|group|log'
-             r'|mergeable|optional|persistent|priority|redef'
-             r'|rotate_(?:interval|size)|synchronized)\b',
-             bygroups(Punctuation, Keyword)),
-            (r'\s+module\b', Keyword.Namespace),
-            # Addresses, ports and networks
-            (r'\d+/(tcp|udp|icmp|unknown)\b', Number),
-            (r'(\d+\.){3}\d+', Number),
-            (r'(' + _hex + r'){7}' + _hex, Number),
-            (r'0x' + _hex + r'(' + _hex + r'|:)*::(' + _hex + r'|:)*', Number),
-            (r'((\d+|:)(' + _hex + r'|:)*)?::(' + _hex + r'|:)*', Number),
-            (r'(\d+\.\d+\.|(\d+\.){2}\d+)', Number),
+        ],
+
+        'comments': [
+            (r'#.*$', Comment),
+        ],
+
+        'directives': [
+            (r'(@(load-plugin|load-sigs|load|unload))\b.*$', Comment.Preproc),
+            (r'(@(DEBUG|DIR|FILENAME|deprecated|if|ifdef|ifndef|else|endif))\b', Comment.Preproc),
+            (r'(@prefixes)\s*(\+?=).*$', Comment.Preproc),
+        ],
+
+        'attributes': [
+            (words(('redef', 'priority', 'log', 'optional', 'default', 'add_func',
+                    'delete_func', 'expire_func', 'read_expire', 'write_expire',
+                    'create_expire', 'synchronized', 'persistent', 'rotate_interval',
+                    'rotate_size', 'encrypt', 'raw_output', 'mergeable', 'error_handler',
+                    'type_column', 'deprecated'),
+                prefix=r'&', suffix=r'\b'),
+             Keyword.Pseudo),
+        ],
+
+        'types': [
+            (words(('any',
+                    'enum', 'record', 'set', 'table', 'vector',
+                    'function', 'hook', 'event',
+                    'addr', 'bool', 'count', 'double', 'file', 'int', 'interval',
+                    'pattern', 'port', 'string', 'subnet', 'time'),
+                prefix=r'\b', suffix=r'\b'),
+             Keyword.Type),
+
+            (r'\b(opaque)(\s+)(of)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)\b',
+                bygroups(Keyword.Type, Text, Operator.Word, Text, Keyword.Type)),
+
+            (r'\b(type)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)(\s*)(:)(\s*)\b(record|enum)\b',
+                bygroups(Keyword, Text, Name.Class, Text, Operator, Text, Keyword.Type)),
+
+            (r'\b(type)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)(\s*)(:)',
+                bygroups(Keyword, Text, Name, Text, Operator)),
+
+            (r'\b(redef)(\s+)(record|enum)(\s+)((?:[A-Za-z_][A-Za-z_0-9]*)(?:::(?:[A-Za-z_][A-Za-z_0-9]*))*)\b',
+                bygroups(Keyword, Text, Keyword.Type, Text, Name.Class)),
+        ],
+
+        'keywords': [
+            (words(('redef', 'export', 'if', 'else', 'for', 'while',
+                    'return', 'break', 'next', 'continue', 'fallthrough',
+                    'switch', 'default', 'case',
+                    'add', 'delete',
+                    'when', 'timeout', 'schedule'),
+                prefix=r'\b', suffix=r'\b'),
+             Keyword),
+            (r'\b(print)\b', Keyword),
+            (r'\b(global|local|const|option)\b', Keyword.Declaration),
+            (r'\b(module)(\s+)(([A-Za-z_][A-Za-z_0-9]*)(?:::([A-Za-z_][A-Za-z_0-9]*))*)\b',
+                bygroups(Keyword.Namespace, Text, Name.Namespace)),
+        ],
+
+        'literals': [
+            (r'"', String, 'string'),
+
+            # Not the greatest match for patterns, but generally helps
+            # disambiguate between start of a pattern and just a division
+            # operator.
+            (r'/(?=.*/)', String.Regex, 'regex'),
+
+            (r'\b(T|F)\b', Keyword.Constant),
+
+            # Port
+            (r'\b\d{1,5}/(udp|tcp|icmp|unknown)\b', Number),
+
+            # IPv4 Address
+            (r'\b(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\b', Number),
+
+            # IPv6 Address (not 100% correct: that takes more effort)
+            (r'\[([0-9a-fA-F]{0,4}:){2,7}([0-9a-fA-F]{0,4})?((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2})\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[0-9]{1,2}))?\]', Number),
+
+            # Numeric
+            (r'\b0[xX]' + _hex + r'+\b', Number.Hex),
+            (r'\b' + _float + r'\s*(day|hr|min|sec|msec|usec)s?\b', Number.Float),
+            (r'\b' + _float + r'\b', Number.Float),
+            (r'\b(\d+)\b', Number.Integer),
+
             # Hostnames
             (_h + r'(\.' + _h + r')+', String),
-            # Numeric
-            (_float + r'\s+(day|hr|min|sec|msec|usec)s?\b', Literal.Date),
-            (r'0[xX]' + _hex, Number.Hex),
-            (_float, Number.Float),
-            (r'\d+', Number.Integer),
-            (r'/', String.Regex, 'regex'),
-            (r'"', String, 'string'),
-            # Operators
-            (r'[!%*/+:<=>?~|-]', Operator),
+        ],
+
+        'operators': [
+            (r'[!%*/+<=>~|&^-]', Operator),
             (r'([-+=&|]{2}|[+=!><-]=)', Operator),
-            (r'(in|match)\b', Operator.Word),
-            (r'[{}()\[\]$.,;]', Punctuation),
-            # Identfier
-            (r'([_a-zA-Z]\w*)(::)', bygroups(Name, Name.Namespace)),
+            (r'\b(in|as|is|of)\b', Operator.Word),
+            (r'\??\$', Operator),
+            # Technically, colons are often used for punctuation/separation.
+            # E.g. field name/type separation.
+            (r'[?:]', Operator),
+        ],
+
+        'punctuation': [
+            (r'\?\$', Punctuation),
+            (r'[{}()\[\],;:.]', Punctuation),
+        ],
+
+        'identifiers': [
+            (r'([a-zA-Z_]\w*)(::)', bygroups(Name, Punctuation)),
             (r'[a-zA-Z_]\w*', Name)
         ],
+
         'string': [
+            (r'\\.', String.Escape),
+            (r'%-?[0-9]*(\.[0-9]+)?[DTdxsefg]', String.Escape),
             (r'"', String, '#pop'),
-            (r'\\([\\abfnrtv"\']|x[a-fA-F0-9]{2,4}|[0-7]{1,3})', String.Escape),
-            (r'[^\\"\n]+', String),
-            (r'\\\n', String),
-            (r'\\', String)
+            (r'.', String),
         ],
+
         'regex': [
+            (r'\\.', String.Escape),
             (r'/', String.Regex, '#pop'),
-            (r'\\[\\nt/]', String.Regex),  # String.Escape is too intense here.
-            (r'[^\\/\n]+', String.Regex),
-            (r'\\\n', String.Regex),
-            (r'\\', String.Regex)
-        ]
+            (r'.', String.Regex),
+        ],
     }
 
 
+class BroLexer(ZeekLexer):
+    """
+    For `Zeek <https://www.zeek.org/>`_ scripts.
+    Note, this is equivalent to ZeekLexer (Bro is the old name for Zeek).
+
+    .. versionadded:: 1.5
+    """
+    name = 'Bro'
+    aliases = ['bro']
+    filenames = ['*.bro']
+
+
 class PuppetLexer(RegexLexer):
     """
     For `Puppet <http://puppetlabs.com/>`__ configuration DSL.
diff --git a/pygments/lexers/other.py b/pygments/lexers/other.py
index c3a60cef..441ffe26 100644
--- a/pygments/lexers/other.py
+++ b/pygments/lexers/other.py
@@ -27,8 +27,8 @@ from pygments.lexers.graphics import PostScriptLexer, GnuplotLexer, \
 from pygments.lexers.business import ABAPLexer, OpenEdgeLexer, \
     GoodDataCLLexer, MaqlLexer
 from pygments.lexers.automation import AutoItLexer, AutohotkeyLexer
-from pygments.lexers.dsls import ProtoBufLexer, BroLexer, PuppetLexer, \
-    MscgenLexer, VGLLexer
+from pygments.lexers.dsls import ProtoBufLexer, ZeekLexer, BroLexer, \
+    PuppetLexer, MscgenLexer, VGLLexer
 from pygments.lexers.basic import CbmBasicV2Lexer
 from pygments.lexers.pawn import SourcePawnLexer, PawnLexer
 from pygments.lexers.ecl import ECLLexer
diff --git a/tests/examplefiles/test.zeek b/tests/examplefiles/test.zeek
new file mode 100644
index 00000000..ee6bad8d
--- /dev/null
+++ b/tests/examplefiles/test.zeek
@@ -0,0 +1,181 @@
+# An example of the Zeek scripting language.
+
+##! A Zeekygen-style summary comment.
+
+# TODO: just an example of a todo-indicator
+
+@load base/frameworks/notice
+
+@if ( F )
+@endif
+
+module Example;
+
+export {
+
+  type mycount: count;
+
+  type SimpleEnum: enum { ONE, TWO, THREE };
+
+  redef enum SimpleEnum += {
+
+    ## A Zeekygen-style comment.
+    FOUR,
+    FIVE, ##< A Zeekygen-style comment.
+  };
+
+  type SimpleRecord: record {
+    field1: count;
+    field2: bool;
+  } &redef;
+
+  redef record SimpleRecord += {
+
+    field3: string &optional;
+
+    field4: string &default="blah";
+  };
+
+  const init_option: bool = T;
+
+  option runtime_option: bool = F;
+
+  global test_opaque: opaque of md5;
+
+  global test_vector: vector of count;
+
+  global myfunction: function(msg: string, c: count &default=0): count;
+
+  global myhook: hook(tag: string);
+
+  global myevent: event(tag: string);
+}
+
+function myfunction(msg: string, c: count): count
+  {
+  print "in myfunction", msg, c;
+  return 0;
+  }
+
+event myevent(msg: string) &priority=1
+  {
+  print "in myevent";
+  }
+
+hook myhook(msg: string)
+  {
+  print "in myevent";
+  }
+
+event zeek_init()
+  {
+  local b = T;
+  local s = "\xff\xaf\"and more after the escaped quote";
+  local p = /foo|bar\xbe\/and more after the escaped slash/;
+  local c = 10;
+
+  local sr = SimpleRecord($field1 = 0, $field2 = T, $field3 = "hi");
+
+  print sr?$field3, sr$field1;
+
+  local myset: set[string] = set("one", "two", "three");
+
+  add myset["four"];
+  delete myset["one"];
+
+  for ( ms in myset )
+    {
+    print ms is string, s as string;
+
+    print s[1:3];
+
+    local tern: count = s == "two" ? 2 : 0;
+
+    if ( s !in myset )
+       print fmt("error %4.2f: %s", 3.14159, "wtf?");
+    }
+
+  switch ( c ) {
+  case 1:
+    break;
+  case 2:
+    fallthrough;
+  default:
+    break;
+  }
+
+  if ( ! b )
+    print "here";
+  else
+    print "there";
+
+  while ( c != 0 )
+    {
+    if ( c >= 5 )
+      c += 0;
+    else if ( c == 8 )
+      c -= 0;
+
+    c = c / 1;
+    c = c / 1;
+    c = c - 1;
+    }
+
+  print |myset|;
+  print ~5;
+  print 1 & 0xff;
+  print 2 ^ 5;
+
+  myfunction ("hello function");
+  hook myhook("hell hook");
+  event myevent("hello event");
+  schedule 1sec { myevent("hello scheduled event") };
+
+  print 0, 7;
+  print 0xff, 0xdeadbeef;
+
+  print 3.14159;
+  print 1234.0;
+  print 1234e0;
+  print .003E-23;
+  print .003E+23;
+
+  print 123/udp;
+  print 8000/tcp;
+  print 13/icmp;
+  print 42/unknown;
+
+  print google.com;
+  print 192.168.50.1;
+  print 255.255.255.255;
+  print 0.0.0.0;
+
+  print 10.0.0.0/16;
+
+  print [2001:0db8:85a3:0000:0000:8a2e:0370:7334];
+  # test for case insensitivity
+  print [2001:0DB8:85A3:0000:0000:8A2E:0370:7334];
+  # any case mixture is allowed
+  print [2001:0dB8:85a3:0000:0000:8A2E:0370:7334];
+  # leading zeroes of a 16-bit group may be omitted
+  print [2001:db8:85a3:0:0:8a2e:370:7334];
+  # a single occurrence of consecutive groups of zeroes may be replaced by ::
+  print [2001:db8:85a3::8a2e:370:7334];
+  # all zeroes should work
+  print [0:0:0:0:0:0:0:0];
+  # all zeroes condensed should work
+  print [::];
+  # hybrid ipv6-ipv4 address should work
+  print [2001:db8:0:0:0:FFFF:192.168.0.5];
+  # hybrid ipv6-ipv4 address with zero omission should work
+  print [2001:db8::FFFF:192.168.0.5];
+
+  print [2001:0db8:85a3:0000:0000:8a2e:0370:7334]/64;
+
+  print 1day, 1days, 1.0day, 1.0days;
+  print 1hr, 1hrs, 1.0hr, 1.0hrs;
+  print 1min, 1mins, 1.0min, 1.0mins;
+  print 1sec, 1secs, 1.0sec, 1.0secs;
+  print 1msec, 1msecs, 1.0msec, 1.0msecs;
+  print 1usec, 1usecs, 1.0usec, 1.0usecs;
+  }
author	Jon Siwek <jsiwek@corelight.com>	2019-11-24 09:02:28 -0800
committer	Georg Brandl <georg@python.org>	2019-11-25 20:39:24 +0100
commit	71835d83fbfe004608609b4d2ae29aa8ddf31359 (patch)
tree	18a8101d5c2ec6bff4d0e17831f5b7b3adfc9f7a
parent	ed97a09b19cb256c6d0081bfb2f96f9d54e6c404 (diff)
download	pygments-git-71835d83fbfe004608609b4d2ae29aa8ddf31359.tar.gz