# -*- coding: utf-8 -*-
"""
    Pygments tests with example files
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

from __future__ import print_function

import difflib
import os
import pickle
import pprint
import time

import pytest

from pygments.lexers import get_lexer_by_name, get_lexer_for_filename
from pygments.token import Error
from pygments.util import ClassNotFound

# You can set this to True to store the exact token type output of example
# files in tests/examplefiles/output, and on the next run the test will
# want them to stay the same.  In the repository, this should stay False.
STORE_OUTPUT = False

# Per-file timing statistics, filled in by test_examplefile():
# maps basename -> (number of chars, total lexing time in ms, ms per char).
STATS = {}

# Absolute path of the directory containing this test module (tests/).
TESTDIR = os.path.dirname(__file__)

# Jython generates a StackOverflowError for repetitions of the form (a|b)+,
# which are commonly used in string patterns, when matching more than about 1000
# chars.  These tests do not complete.  See http://bugs.jython.org/issue1965
BAD_FILES_FOR_JYTHON = ('Object.st', 'all.nit', 'genclass.clj',
                        'ragel-cpp_rlscan')


def get_example_files():
    """Yield the names of the example files that should be lexed.

    Hidden files (leading '.') and editor backups (trailing '#') are
    skipped, as are subdirectories; if the TEST_EXT environment variable
    is set, only files whose path ends with that extension are yielded.
    """
    # TODO: move stats to a fixture
    # global STATS
    # STATS = {}
    examples_dir = os.path.join(TESTDIR, 'examplefiles')
    outdir = os.path.join(examples_dir, 'output')
    if STORE_OUTPUT and not os.path.isdir(outdir):
        os.makedirs(outdir)

    wanted_ext = os.getenv('TEST_EXT')
    for name in os.listdir(examples_dir):
        if name.startswith('.') or name.endswith('#'):
            continue

        path = os.path.join(examples_dir, name)
        if not os.path.isfile(path):
            continue

        if wanted_ext and not path.endswith(wanted_ext):
            continue

        print(path)
        yield name

    # N = 7
    # stats = list(STATS.items())
    # stats.sort(key=lambda x: x[1][1])
    # print('\nExample files that took longest absolute time:')
    # for fn, t in stats[-N:]:
    #     print('%-30s  %6d chars  %8.2f ms  %7.3f ms/char' % ((fn,) + t))
    # print()
    # stats.sort(key=lambda x: x[1][2])
    # print('\nExample files that took longest relative time:')
    # for fn, t in stats[-N:]:
    #     print('%-30s  %6d chars  %8.2f ms  %7.3f ms/char' % ((fn,) + t))


def _resolve_lexer(filename, absfn, utext):
    """Return the lexer to use for *filename*.

    A file named ``<lexer>_foo`` explicitly selects the lexer called
    ``<lexer>``, overriding filename-based guessing; otherwise the lexer
    is looked up from the file extension (and content).

    Raises AssertionError if no lexer can be found either way.
    """
    if '_' in filename:
        try:
            return get_lexer_by_name(filename.split('_')[0])
        except ClassNotFound:
            pass  # fall back to filename-based lookup
    try:
        return get_lexer_for_filename(absfn, code=utext)
    except ClassNotFound:
        raise AssertionError('file %r has no registered extension, '
                             'nor is of the form <lexer>_filename '
                             'for overriding, thus no lexer found.'
                             % filename)


def _normalize_input(text):
    """Normalize raw example-file bytes for lexing.

    Unifies CRLF to LF, collapses leading/trailing newlines to exactly
    one trailing newline, and decodes as UTF-8 (stripping a BOM) with a
    latin-1 fallback.  Returns the decoded text.
    """
    text = text.replace(b'\r\n', b'\n')
    text = text.strip(b'\n') + b'\n'
    try:
        text = text.decode('utf-8')
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        text = text.decode('latin1')
    return text


def _check_stored_output(filename, absfn, tokens):
    """Compare *tokens* against the stored reference token stream.

    On the first run (no stored output) the token stream is pickled to
    tests/examplefiles/output/<filename>; afterwards a mismatch fails
    the test with a unified diff of the two pretty-printed streams.
    """
    outfn = os.path.join(TESTDIR, 'examplefiles', 'output', filename)
    if not os.path.isfile(outfn):
        # no previous output -- store it
        with open(outfn, 'wb') as fp:
            pickle.dump(tokens, fp)
        return
    # otherwise load it and compare
    with open(outfn, 'rb') as fp:
        stored_tokens = pickle.load(fp)
    if stored_tokens != tokens:
        f1 = pprint.pformat(stored_tokens)
        f2 = pprint.pformat(tokens)
        print('\n'.join(difflib.unified_diff(f1.splitlines(),
                                             f2.splitlines())))
        assert False, absfn


@pytest.mark.parametrize('filename', get_example_files())
def test_examplefile(filename):
    """Lex one example file and check the lexer's output.

    Fails if the lexer emits an Error token, or if the concatenation of
    all token values does not round-trip to the (normalized) input text.
    Timing data is recorded in STATS, and when STORE_OUTPUT is enabled
    the token stream is compared against a stored reference run.
    """
    if os.name == 'java' and filename in BAD_FILES_FOR_JYTHON:
        pytest.skip('%s is a known bad file on Jython' % filename)

    absfn = os.path.join(TESTDIR, 'examplefiles', filename)
    with open(absfn, 'rb') as f:
        raw = f.read()
    # Decoded copy for content-based lexer guessing only; the text that
    # is actually lexed is normalized separately below.
    try:
        utext = raw.decode('utf-8')
    except UnicodeError:
        utext = raw.decode('latin1')

    lx = _resolve_lexer(filename, absfn, utext)
    text = _normalize_input(raw)

    ntext = []
    tokens = []
    t1 = time.time()
    # 'tok_type' rather than 'type' to avoid shadowing the builtin.
    for tok_type, val in lx.get_tokens(text):
        ntext.append(val)
        # The message expression (including the join) is only evaluated
        # when the assertion fails.
        assert tok_type != Error, \
            'lexer %s generated error token for %s: %r at position %d' % \
            (lx, absfn, val, len(u''.join(ntext)))
        tokens.append((tok_type, val))
    t2 = time.time()
    STATS[os.path.basename(absfn)] = (len(text),
                                      1000 * (t2 - t1),
                                      1000 * (t2 - t1) / len(text))

    joined = u''.join(ntext)
    if joined != text:
        print('\n'.join(difflib.unified_diff(joined.splitlines(),
                                             text.splitlines())))
        raise AssertionError('round trip failed for ' + absfn)

    # check output against previous run if enabled
    if STORE_OUTPUT:
        _check_stored_output(filename, absfn, tokens)