summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
author Georg Brandl <georg@python.org> 2020-12-19 18:37:36 +0100
committer Georg Brandl <georg@python.org> 2020-12-19 18:38:24 +0100
commit 873298b4d8511f635dba1d0d29fdf2a677f9d85f (patch)
tree a2eaa122fd979e76e2ab7753c7b186c834c7e8f6 /scripts
parent a56ed8a1f2eb0db14aad50cb7e4eaaf7f2f0d3b3 (diff)
download pygments-git-873298b4d8511f635dba1d0d29fdf2a677f9d85f.tar.gz
scripts/debug_lexer: allow guessing from content
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/debug_lexer.py 31
1 files changed, 23 insertions, 8 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
index 0f1dc0bd..059a489f 100755
--- a/scripts/debug_lexer.py
+++ b/scripts/debug_lexer.py
@@ -24,7 +24,7 @@ if os.path.isdir(os.path.join(srcpath, 'pygments')):
from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
ProfilingRegexLexer, ProfilingRegexLexerMeta
from pygments.lexers import get_lexer_by_name, find_lexer_class, \
- find_lexer_class_for_filename
+ find_lexer_class_for_filename, guess_lexer
from pygments.token import Error, Text, _TokenType
from pygments.cmdline import _parse_options
@@ -96,8 +96,24 @@ class DebuggingRegexLexer(ExtendedRegexLexer):
def main(fn, lexer=None, options={}):
+ if fn == '-':
+ text = sys.stdin.read()
+ else:
+ try:
+ with open(fn, 'rb') as fp:
+ text = fp.read().decode('utf-8')
+ except UnicodeError:
+ print('Warning: non-UTF8 input, using latin1')
+ with open(fn, 'rb') as fp:
+ text = fp.read().decode('latin1')
+ text = text.strip('\n') + '\n'
+
if lexer is not None:
lxcls = get_lexer_by_name(lexer).__class__
+ elif guess:
+ lxcls = guess_lexer(text).__class__
+ print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
+ lxcls.__name__))
else:
lxcls = find_lexer_class_for_filename(os.path.basename(fn))
if lxcls is None:
@@ -129,12 +145,6 @@ def main(fn, lexer=None, options={}):
lx = lxcls(**options)
lno = 1
- if fn == '-':
- text = sys.stdin.read()
- else:
- with open(fn, 'rb') as fp:
- text = fp.read().decode('utf-8')
- text = text.strip('\n') + '\n'
tokens = []
states = []
@@ -188,6 +198,7 @@ Selecting lexer and options:
-l NAME use lexer named NAME (default is to guess from
the given filenames)
+ -g guess lexer from content
-O OPTIONSTR use lexer options parsed from OPTIONSTR
Debugging lexing errors:
@@ -205,6 +216,7 @@ Profiling:
column 4, the time per call)
''')
+
num = 10
showall = False
ignerror = False
@@ -212,10 +224,11 @@ lexer = None
options = {}
profile = False
profsort = 4
+guess = False
if __name__ == '__main__':
import getopt
- opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:h')
+ opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hg')
for opt, val in opts:
if opt == '-n':
num = int(val)
@@ -231,6 +244,8 @@ if __name__ == '__main__':
profsort = int(val)
elif opt == '-O':
options = _parse_options([val])
+ elif opt == '-g':
+ guess = True
elif opt == '-h':
print_help()
sys.exit(0)