summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Matthäus G. Chajdas <dev@anteru.net> 2020-09-22 20:40:58 +0200
committer: Matthäus G. Chajdas <dev@anteru.net> 2020-09-22 20:40:58 +0200
commit: f0d31da6b26e7057dc488d8ee04e06b1a448a49c (patch)
tree: bb01aa5da7a5f4aa4e3e2a4836b040c535ea977d
parent: 07f596dbb9357c1ec2077bdc4319f594d287ae15 (diff)
download: pygments-git-task/add-analyze-text.tar.gz
Improve various analyse_text methods. (branch: task/add-analyze-text)
* Make Perl less confident in presence of :=.
* Improve brainfuck check to not parse the whole input.
* Improve Unicon by matching \self, /self
* Fix Ezhil not matching against the input text
-rw-r--r-- pygments/lexers/esoteric.py 15
-rw-r--r-- pygments/lexers/ezhil.py 2
-rw-r--r-- pygments/lexers/modula2.py 15
-rw-r--r-- pygments/lexers/perl.py 16
-rw-r--r-- pygments/lexers/theorem.py 4
-rw-r--r-- pygments/lexers/unicon.py 9
-rw-r--r-- tests/test_analyze_lexer.py 14
7 files changed, 64 insertions, 11 deletions
diff --git a/pygments/lexers/esoteric.py b/pygments/lexers/esoteric.py
index 0997ffe1..0fe89299 100644
--- a/pygments/lexers/esoteric.py
+++ b/pygments/lexers/esoteric.py
@@ -55,17 +55,26 @@ class BrainfuckLexer(RegexLexer):
and < > is brainfuck."""
plus_minus_count = 0
greater_less_count = 0
- for c in text:
+
+ range_to_check = max(256, len(text))
+
+ for c in text[:range_to_check]:
if c == '+' or c == '-':
plus_minus_count += 1
if c == '<' or c == '>':
greater_less_count += 1
- if plus_minus_count > (0.25 * len(text)):
+ if plus_minus_count > (0.25 * range_to_check):
return 1.0
- if greater_less_count > (0.25 * len(text)):
+ if greater_less_count > (0.25 * range_to_check):
return 1.0
+ result = 0
+ if '[-]' in text:
+ result += 0.5
+
+ return result
+
class BefungeLexer(RegexLexer):
"""
diff --git a/pygments/lexers/ezhil.py b/pygments/lexers/ezhil.py
index 109b607b..4af37f33 100644
--- a/pygments/lexers/ezhil.py
+++ b/pygments/lexers/ezhil.py
@@ -69,7 +69,7 @@ class EzhilLexer(RegexLexer):
decent amount of Tamil-characters, it's this language. This assumption
is obviously horribly off if someone uses string literals in tamil
in another language."""
- if len(re.findall('[\u0b80-\u0bff]')) > 10:
+ if len(re.findall(r'[\u0b80-\u0bff]', text)) > 10:
return 0.25
def __init__(self, **options):
diff --git a/pygments/lexers/modula2.py b/pygments/lexers/modula2.py
index c4b95b38..05144222 100644
--- a/pygments/lexers/modula2.py
+++ b/pygments/lexers/modula2.py
@@ -1563,9 +1563,20 @@ class Modula2Lexer(RegexLexer):
def analyse_text(text):
"""Not much we can go by. (* for comments is our best guess."""
result = 0
+
+ is_pascal_like = 0
if '(*' in text and '*)' in text:
- result += 0.01
+ is_pascal_like += 0.5
if ':=' in text:
- result += 0.01
+ is_pascal_like += 0.5
+
+ if is_pascal_like == 1:
+ # Procedure is in Modula2
+ if re.search(r'\bPROCEDURE\b', text):
+ result += 0.6
+
+ # FUNCTION is only valid in Pascal, but not in Modula2
+ if re.search(r'\bFUNCTION\b', text):
+ result = 0.0
return result
diff --git a/pygments/lexers/perl.py b/pygments/lexers/perl.py
index 741de3fd..95fb94e7 100644
--- a/pygments/lexers/perl.py
+++ b/pygments/lexers/perl.py
@@ -208,8 +208,18 @@ class PerlLexer(RegexLexer):
def analyse_text(text):
if shebang_matches(text, r'perl'):
return True
+
+ result = 0
+
if re.search(r'(?:my|our)\s+[$@%(]', text):
- return 0.9
+ result += 0.9
+
+ if ':=' in text:
+ # := is not valid Perl, but it appears in unicon, so we should
+ # become less confident if we think we found Perl with :=
+ result /= 2
+
+ return result
class Perl6Lexer(ExtendedRegexLexer):
@@ -711,6 +721,10 @@ class Perl6Lexer(ExtendedRegexLexer):
continue
break
+ if ':=' in text:
+ # Same logic as above for PerlLexer
+ rating /= 2
+
return rating
def __init__(self, **options):
diff --git a/pygments/lexers/theorem.py b/pygments/lexers/theorem.py
index c4c857d4..a4fa24de 100644
--- a/pygments/lexers/theorem.py
+++ b/pygments/lexers/theorem.py
@@ -154,8 +154,8 @@ class CoqLexer(RegexLexer):
}
def analyse_text(text):
- if text.startswith('(*'):
- return True
+ if 'qed' in text and 'tauto' in text:
+ return 1
class IsabelleLexer(RegexLexer):
diff --git a/pygments/lexers/unicon.py b/pygments/lexers/unicon.py
index 45b4c15a..95815907 100644
--- a/pygments/lexers/unicon.py
+++ b/pygments/lexers/unicon.py
@@ -387,7 +387,8 @@ class UcodeLexer(RegexLexer):
}
def analyse_text(text):
- """endsuspend and endrepeat are unique to this language."""
+ """endsuspend and endrepeat are unique to this language, and
+ \\self, /self doesn't seem to get used anywhere else either."""
result = 0
if 'endsuspend' in text:
@@ -402,4 +403,10 @@ class UcodeLexer(RegexLexer):
if 'procedure' in text and 'end' in text:
result += 0.01
+ # This seems quite unique to unicon -- doesn't appear in any other
+ # example source we have (A quick search reveals that \SELF appears in
+ # Perl/Raku code)
+ if r'\self' in text and r'/self' in text:
+ result += 0.5
+
return result
diff --git a/tests/test_analyze_lexer.py b/tests/test_analyze_lexer.py
index 933ea15b..1d80bb01 100644
--- a/tests/test_analyze_lexer.py
+++ b/tests/test_analyze_lexer.py
@@ -45,4 +45,16 @@ def test_guess_lexer_hybris():
def test_guess_lexer_forth():
l = _guess_lexer_for_file('demo.frt')
- assert l.__class__.__name__ == 'ForthLexer' \ No newline at end of file
+ assert l.__class__.__name__ == 'ForthLexer'
+
+def test_guess_lexer_modula2():
+ l = _guess_lexer_for_file('modula2_test_cases.def')
+ assert l.__class__.__name__ == 'Modula2Lexer'
+
+def test_guess_lexer_unicon():
+ l = _guess_lexer_for_file('example.icn')
+ assert l.__class__.__name__ == 'UcodeLexer'
+
+def test_guess_lexer_ezhil():
+ l = _guess_lexer_for_file('ezhil_primefactors.n')
+ assert l.__class__.__name__ == 'EzhilLexer' \ No newline at end of file