1 files changed, 73 insertions, 38 deletions
diff --git a/tests/test_examplefiles.py b/tests/test_examplefiles.py
index d785cf3b..924e1184 100644
--- a/tests/test_examplefiles.py
+++ b/tests/test_examplefiles.py
@@ -3,59 +3,94 @@
     Pygments tests with example files
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2015 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 
+from __future__ import print_function
+
 import os
 import pprint
 import difflib
-import cPickle as pickle
+import pickle
 
 from pygments.lexers import get_lexer_for_filename, get_lexer_by_name
 from pygments.token import Error
-from pygments.util import ClassNotFound, b
+from pygments.util import ClassNotFound
+
+import support
 
 STORE_OUTPUT = False
 
-# generate methods
+STATS = {}
+
+TESTDIR = os.path.dirname(__file__)
+
+# Jython generates a StackOverflowError for repetitions of the form (a|b)+,
+# which are commonly used in string patterns, when matching more than about 1000
+# chars.  These tests do not complete.  See http://bugs.jython.org/issue1965
+BAD_FILES_FOR_JYTHON = ('Object.st', 'all.nit', 'genclass.clj',
+                        'ragel-cpp_rlscan')
+
 def test_example_files():
-    testdir = os.path.dirname(__file__)
-    outdir = os.path.join(testdir, 'examplefiles', 'output')
+    global STATS
+    STATS = {}
+    outdir = os.path.join(TESTDIR, 'examplefiles', 'output')
     if STORE_OUTPUT and not os.path.isdir(outdir):
         os.makedirs(outdir)
-    for fn in os.listdir(os.path.join(testdir, 'examplefiles')):
+    for fn in os.listdir(os.path.join(TESTDIR, 'examplefiles')):
         if fn.startswith('.') or fn.endswith('#'):
             continue
 
-        absfn = os.path.join(testdir, 'examplefiles', fn)
+        absfn = os.path.join(TESTDIR, 'examplefiles', fn)
         if not os.path.isfile(absfn):
             continue
-        outfn = os.path.join(outdir, fn)
 
+        print(absfn)
+        with open(absfn, 'rb') as f:
+            code = f.read()
         try:
-            lx = get_lexer_for_filename(absfn)
-        except ClassNotFound:
-            if "_" not in fn:
+            code = code.decode('utf-8')
+        except UnicodeError:
+            code = code.decode('latin1')
+
+        lx = None
+        if '_' in fn:
+            try:
+                lx = get_lexer_by_name(fn.split('_')[0])
+            except ClassNotFound:
+                pass
+        if lx is None:
+            try:
+                lx = get_lexer_for_filename(absfn, code=code)
+            except ClassNotFound:
                 raise AssertionError('file %r has no registered extension, '
                                      'nor is of the form <lexer>_filename '
                                      'for overriding, thus no lexer found.'
-                                    % fn)
-            try:
-                name, rest = fn.split("_", 1)
-                lx = get_lexer_by_name(name)
-            except ClassNotFound:
-                raise AssertionError('no lexer found for file %r' % fn)
-        yield check_lexer, lx, absfn, outfn
+                                     % fn)
+        yield check_lexer, lx, fn
 
-def check_lexer(lx, absfn, outfn):
-    fp = open(absfn, 'rb')
-    try:
+    N = 7
+    stats = list(STATS.items())
+    stats.sort(key=lambda x: x[1][1])
+    print('\nExample files that took longest absolute time:')
+    for fn, t in stats[-N:]:
+        print('%-30s  %6d chars  %8.2f ms  %7.3f ms/char' % ((fn,) + t))
+    print()
+    stats.sort(key=lambda x: x[1][2])
+    print('\nExample files that took longest relative time:')
+    for fn, t in stats[-N:]:
+        print('%-30s  %6d chars  %8.2f ms  %7.3f ms/char' % ((fn,) + t))
+
+
+def check_lexer(lx, fn):
+    if os.name == 'java' and fn in BAD_FILES_FOR_JYTHON:
+        raise support.SkipTest
+    absfn = os.path.join(TESTDIR, 'examplefiles', fn)
+    with open(absfn, 'rb') as fp:
         text = fp.read()
-    finally:
-        fp.close()
-    text = text.replace(b('\r\n'), b('\n'))
-    text = text.strip(b('\n')) + b('\n')
+    text = text.replace(b'\r\n', b'\n')
+    text = text.strip(b'\n') + b'\n'
     try:
         text = text.decode('utf-8')
         if text.startswith(u'\ufeff'):
@@ -64,36 +99,36 @@ def check_lexer(lx, absfn, outfn):
         text = text.decode('latin1')
     ntext = []
     tokens = []
+    import time
+    t1 = time.time()
     for type, val in lx.get_tokens(text):
         ntext.append(val)
         assert type != Error, \
             'lexer %s generated error token for %s: %r at position %d' % \
             (lx, absfn, val, len(u''.join(ntext)))
         tokens.append((type, val))
+    t2 = time.time()
+    STATS[os.path.basename(absfn)] = (len(text),
+                                      1000 * (t2 - t1), 1000 * (t2 - t1) / len(text))
     if u''.join(ntext) != text:
-        print '\n'.join(difflib.unified_diff(u''.join(ntext).splitlines(),
-                                             text.splitlines()))
+        print('\n'.join(difflib.unified_diff(u''.join(ntext).splitlines(),
+                                             text.splitlines())))
         raise AssertionError('round trip failed for ' + absfn)
 
     # check output against previous run if enabled
     if STORE_OUTPUT:
         # no previous output -- store it
+        outfn = os.path.join(TESTDIR, 'examplefiles', 'output', fn)
         if not os.path.isfile(outfn):
-            fp = open(outfn, 'wb')
-            try:
+            with open(outfn, 'wb') as fp:
                 pickle.dump(tokens, fp)
-            finally:
-                fp.close()
             return
         # otherwise load it and compare
-        fp = open(outfn, 'rb')
-        try:
+        with open(outfn, 'rb') as fp:
             stored_tokens = pickle.load(fp)
-        finally:
-            fp.close()
         if stored_tokens != tokens:
             f1 = pprint.pformat(stored_tokens)
             f2 = pprint.pformat(tokens)
-            print '\n'.join(difflib.unified_diff(f1.splitlines(),
-                                                 f2.splitlines()))
+            print('\n'.join(difflib.unified_diff(f1.splitlines(),
+                                                 f2.splitlines())))
             assert False, absfn