From f0d31da6b26e7057dc488d8ee04e06b1a448a49c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matth=C3=A4us=20G=2E=20Chajdas?= Date: Tue, 22 Sep 2020 20:40:58 +0200 Subject: Improve various analyse_text methods. * Make Perl less confident in presence of :=. * Improve brainfuck check to not parse the whole input. * Improve Unicon by matching \self, /self * Fix Ezhil not matching against the input text --- pygments/lexers/esoteric.py | 15 ++++++++++++--- pygments/lexers/ezhil.py | 2 +- pygments/lexers/modula2.py | 15 +++++++++++++-- pygments/lexers/perl.py | 16 +++++++++++++++- pygments/lexers/theorem.py | 4 ++-- pygments/lexers/unicon.py | 9 ++++++++- tests/test_analyze_lexer.py | 14 +++++++++++++- 7 files changed, 64 insertions(+), 11 deletions(-) diff --git a/pygments/lexers/esoteric.py b/pygments/lexers/esoteric.py index 0997ffe1..0fe89299 100644 --- a/pygments/lexers/esoteric.py +++ b/pygments/lexers/esoteric.py @@ -55,17 +55,26 @@ class BrainfuckLexer(RegexLexer): and < > is brainfuck.""" plus_minus_count = 0 greater_less_count = 0 - for c in text: + + range_to_check = max(256, len(text)) + + for c in text[:range_to_check]: if c == '+' or c == '-': plus_minus_count += 1 if c == '<' or c == '>': greater_less_count += 1 - if plus_minus_count > (0.25 * len(text)): + if plus_minus_count > (0.25 * range_to_check): return 1.0 - if greater_less_count > (0.25 * len(text)): + if greater_less_count > (0.25 * range_to_check): return 1.0 + result = 0 + if '[-]' in text: + result += 0.5 + + return result + class BefungeLexer(RegexLexer): """ diff --git a/pygments/lexers/ezhil.py b/pygments/lexers/ezhil.py index 109b607b..4af37f33 100644 --- a/pygments/lexers/ezhil.py +++ b/pygments/lexers/ezhil.py @@ -69,7 +69,7 @@ class EzhilLexer(RegexLexer): decent amount of Tamil-characters, it's this language. This assumption is obviously horribly off if someone uses string literals in tamil in another language.""" - if len(re.findall('[\u0b80-\u0bff]')) > 10: + if len(re.findall(r'[\u0b80-\u0bff]', text)) > 10: return 0.25 def __init__(self, **options): diff --git a/pygments/lexers/modula2.py b/pygments/lexers/modula2.py index c4b95b38..05144222 100644 --- a/pygments/lexers/modula2.py +++ b/pygments/lexers/modula2.py @@ -1563,9 +1563,20 @@ class Modula2Lexer(RegexLexer): def analyse_text(text): """Not much we can go by. (* for comments is our best guess.""" result = 0 + + is_pascal_like = 0 if '(*' in text and '*)' in text: - result += 0.01 + is_pascal_like += 0.5 if ':=' in text: - result += 0.01 + is_pascal_like += 0.5 + + if is_pascal_like == 1: + # Procedure is in Modula2 + if re.search(r'\bPROCEDURE\b', text): + result += 0.6 + + # FUNCTION is only valid in Pascal, but not in Modula2 + if re.search(r'\bFUNCTION\b', text): + result = 0.0 return result diff --git a/pygments/lexers/perl.py b/pygments/lexers/perl.py index 741de3fd..95fb94e7 100644 --- a/pygments/lexers/perl.py +++ b/pygments/lexers/perl.py @@ -208,8 +208,18 @@ class PerlLexer(RegexLexer): def analyse_text(text): if shebang_matches(text, r'perl'): return True + + result = 0 + if re.search(r'(?:my|our)\s+[$@%(]', text): - return 0.9 + result += 0.9 + + if ':=' in text: + # := is not valid Perl, but it appears in unicon, so we should + # become less confident if we think we found Perl with := + result /= 2 + + return result class Perl6Lexer(ExtendedRegexLexer): @@ -711,6 +721,10 @@ class Perl6Lexer(ExtendedRegexLexer): continue break + if ':=' in text: + # Same logic as above for PerlLexer + rating /= 2 + return rating def __init__(self, **options): diff --git a/pygments/lexers/theorem.py b/pygments/lexers/theorem.py index c4c857d4..a4fa24de 100644 --- a/pygments/lexers/theorem.py +++ b/pygments/lexers/theorem.py @@ -154,8 +154,8 @@ class CoqLexer(RegexLexer): } def analyse_text(text): - if text.startswith('(*'): - return True + if 'qed' in text and 'tauto' in text: + return 1 class IsabelleLexer(RegexLexer): diff --git a/pygments/lexers/unicon.py b/pygments/lexers/unicon.py index 45b4c15a..95815907 100644 --- a/pygments/lexers/unicon.py +++ b/pygments/lexers/unicon.py @@ -387,7 +387,8 @@ class UcodeLexer(RegexLexer): } def analyse_text(text): - """endsuspend and endrepeat are unique to this language.""" + """endsuspend and endrepeat are unique to this language, and + \\self, /self doesn't seem to get used anywhere else either.""" result = 0 if 'endsuspend' in text: @@ -402,4 +403,10 @@ class UcodeLexer(RegexLexer): if 'procedure' in text and 'end' in text: result += 0.01 + # This seems quite unique to unicon -- doesn't appear in any other + # example source we have (A quick search reveals that \SELF appears in + # Perl/Raku code) + if r'\self' in text and r'/self' in text: + result += 0.5 + return result diff --git a/tests/test_analyze_lexer.py b/tests/test_analyze_lexer.py index 933ea15b..1d80bb01 100644 --- a/tests/test_analyze_lexer.py +++ b/tests/test_analyze_lexer.py @@ -45,4 +45,16 @@ def test_guess_lexer_hybris(): def test_guess_lexer_forth(): l = _guess_lexer_for_file('demo.frt') - assert l.__class__.__name__ == 'ForthLexer' \ No newline at end of file + assert l.__class__.__name__ == 'ForthLexer' + +def test_guess_lexer_modula2(): + l = _guess_lexer_for_file('modula2_test_cases.def') + assert l.__class__.__name__ == 'Modula2Lexer' + +def test_guess_lexer_unicon(): + l = _guess_lexer_for_file('example.icn') + assert l.__class__.__name__ == 'UcodeLexer' + +def test_guess_lexer_ezhil(): + l = _guess_lexer_for_file('ezhil_primefactors.n') + assert l.__class__.__name__ == 'EzhilLexer' \ No newline at end of file -- cgit v1.2.1