Improve various analyse_text methods. [branch: task/add-analyze-text]

* Make Perl less confident in presence of :=.
* Improve brainfuck check to not parse the whole input.
* Improve Unicon by matching \self, /self.
* Fix Ezhil not matching against the input text.
author: Matthäus G. Chajdas <dev@anteru.net> 2020-09-22 20:40:58 +0200
committer: Matthäus G. Chajdas <dev@anteru.net> 2020-09-22 20:40:58 +0200
commit: f0d31da6b26e7057dc488d8ee04e06b1a448a49c (patch)
tree: bb01aa5da7a5f4aa4e3e2a4836b040c535ea977d
parent: 07f596dbb9357c1ec2077bdc4319f594d287ae15 (diff)
download: pygments-git-task/add-analyze-text.tar.gz
7 files changed, 64 insertions, 11 deletions
diff --git a/pygments/lexers/esoteric.py b/pygments/lexers/esoteric.py
index 0997ffe1..0fe89299 100644
--- a/pygments/lexers/esoteric.py
+++ b/pygments/lexers/esoteric.py
@@ -55,17 +55,26 @@ class BrainfuckLexer(RegexLexer):
         and < > is brainfuck."""
         plus_minus_count = 0
         greater_less_count = 0
-        for c in text:
+
+        range_to_check = max(256, len(text))
+
+        for c in text[:range_to_check]:
             if c == '+' or c == '-':
                 plus_minus_count += 1
             if c == '<' or c == '>':
                 greater_less_count += 1
 
-        if plus_minus_count > (0.25 * len(text)):
+        if plus_minus_count > (0.25 * range_to_check):
             return 1.0
-        if greater_less_count > (0.25 * len(text)):
+        if greater_less_count > (0.25 * range_to_check):
             return 1.0
 
+        result = 0
+        if '[-]' in text:
+            result += 0.5
+
+        return result
+
 
 class BefungeLexer(RegexLexer):
     """
diff --git a/pygments/lexers/ezhil.py b/pygments/lexers/ezhil.py
index 109b607b..4af37f33 100644
--- a/pygments/lexers/ezhil.py
+++ b/pygments/lexers/ezhil.py
@@ -69,7 +69,7 @@ class EzhilLexer(RegexLexer):
         decent amount of Tamil-characters, it's this language. This assumption
         is obviously horribly off if someone uses string literals in tamil
         in another language."""
-        if len(re.findall('[\u0b80-\u0bff]')) > 10:
+        if len(re.findall(r'[\u0b80-\u0bff]', text)) > 10:
             return 0.25
 
     def __init__(self, **options):
diff --git a/pygments/lexers/modula2.py b/pygments/lexers/modula2.py
index c4b95b38..05144222 100644
--- a/pygments/lexers/modula2.py
+++ b/pygments/lexers/modula2.py
@@ -1563,9 +1563,20 @@ class Modula2Lexer(RegexLexer):
     def analyse_text(text):
         """Not much we can go by. (* for comments is our best guess."""
         result = 0
+
+        is_pascal_like = 0
         if '(*' in text and '*)' in text:
-            result += 0.01
+            is_pascal_like += 0.5
         if ':=' in text:
-            result += 0.01
+            is_pascal_like += 0.5
+
+        if is_pascal_like == 1:
+            # Procedure is in Modula2
+            if re.search(r'\bPROCEDURE\b', text):
+                result += 0.6
+
+            # FUNCTION is only valid in Pascal, but not in Modula2
+            if re.search(r'\bFUNCTION\b', text):
+                result = 0.0
 
         return result
diff --git a/pygments/lexers/perl.py b/pygments/lexers/perl.py
index 741de3fd..95fb94e7 100644
--- a/pygments/lexers/perl.py
+++ b/pygments/lexers/perl.py
@@ -208,8 +208,18 @@ class PerlLexer(RegexLexer):
     def analyse_text(text):
         if shebang_matches(text, r'perl'):
             return True
+
+        result = 0
+
         if re.search(r'(?:my|our)\s+[$@%(]', text):
-            return 0.9
+            result += 0.9
+
+        if ':=' in text:
+            # := is not valid Perl, but it appears in unicon, so we should
+            # become less confident if we think we found Perl with :=
+            result /= 2
+
+        return result
 
 
 class Perl6Lexer(ExtendedRegexLexer):
@@ -711,6 +721,10 @@ class Perl6Lexer(ExtendedRegexLexer):
                 continue
             break
 
+        if ':=' in text:
+            # Same logic as above for PerlLexer
+            rating /= 2
+
         return rating
 
     def __init__(self, **options):
diff --git a/pygments/lexers/theorem.py b/pygments/lexers/theorem.py
index c4c857d4..a4fa24de 100644
--- a/pygments/lexers/theorem.py
+++ b/pygments/lexers/theorem.py
@@ -154,8 +154,8 @@ class CoqLexer(RegexLexer):
     }
 
     def analyse_text(text):
-        if text.startswith('(*'):
-            return True
+        if 'qed' in text and 'tauto' in text:
+            return 1
 
 
 class IsabelleLexer(RegexLexer):
diff --git a/pygments/lexers/unicon.py b/pygments/lexers/unicon.py
index 45b4c15a..95815907 100644
--- a/pygments/lexers/unicon.py
+++ b/pygments/lexers/unicon.py
@@ -387,7 +387,8 @@ class UcodeLexer(RegexLexer):
     }
 
     def analyse_text(text):
-        """endsuspend and endrepeat are unique to this language."""
+        """endsuspend and endrepeat are unique to this language, and
+        \\self, /self doesn't seem to get used anywhere else either."""
         result = 0
 
         if 'endsuspend' in text:
@@ -402,4 +403,10 @@ class UcodeLexer(RegexLexer):
         if 'procedure' in text and 'end' in text:
             result += 0.01
 
+        # This seems quite unique to unicon -- doesn't appear in any other
+        # example source we have (A quick search reveals that \SELF appears in
+        # Perl/Raku code)
+        if r'\self' in text and r'/self' in text:
+            result += 0.5
+
         return result
diff --git a/tests/test_analyze_lexer.py b/tests/test_analyze_lexer.py
index 933ea15b..1d80bb01 100644
--- a/tests/test_analyze_lexer.py
+++ b/tests/test_analyze_lexer.py
@@ -45,4 +45,16 @@ def test_guess_lexer_hybris():
 
 def test_guess_lexer_forth():
     l = _guess_lexer_for_file('demo.frt')
-    assert l.__class__.__name__ == 'ForthLexer'
-\ No newline at end of file
+    assert l.__class__.__name__ == 'ForthLexer'
+
+def test_guess_lexer_modula2():
+    l = _guess_lexer_for_file('modula2_test_cases.def')
+    assert l.__class__.__name__ == 'Modula2Lexer'
+
+def test_guess_lexer_unicon():
+    l = _guess_lexer_for_file('example.icn')
+    assert l.__class__.__name__ == 'UcodeLexer'
+
+def test_guess_lexer_ezhil():
+    l = _guess_lexer_for_file('ezhil_primefactors.n')
+    assert l.__class__.__name__ == 'EzhilLexer'
+\ No newline at end of file
author	Matthäus G. Chajdas <dev@anteru.net>	2020-09-22 20:40:58 +0200
committer	Matthäus G. Chajdas <dev@anteru.net>	2020-09-22 20:40:58 +0200
commit	f0d31da6b26e7057dc488d8ee04e06b1a448a49c (patch)
tree	bb01aa5da7a5f4aa4e3e2a4836b040c535ea977d
parent	07f596dbb9357c1ec2077bdc4319f594d287ae15 (diff)
download	pygments-git-task/add-analyze-text.tar.gz