summaryrefslogtreecommitdiff
path: root/benchmarks/generate_json.py
blob: ec5a1aaa4cc0a40c45bff1282c99d51d1775b405 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Generate a JSON file for json decode benchmarking purposes using freqdata.py.
Works with pypy and python.

Initial setup::

    $ git clone -b pypy-support git://github.com/simplejson/simplejson.git; \
      cd simplejson

To generate::

    $ mkdir -p build/bench; \
      PYTHONPATH=. python benchmarks/generate_json.py > build/bench/feed.json

To run the read benchmark::

    $ python -m timeit -n 1 -r 5 \
      -s 'import simplejson;s = open("build/bench/feed.json", "rb").read()' \
      'simplejson.loads(s)'

To run the write benchmark::

    $ python -m timeit -n 1 -r 5 \
      -s 'import simplejson;s = open("build/bench/feed.json", "rb").read()' \
      -s 'd = simplejson.loads(s)' \
      'simplejson.dumps(d)'

To run the read benchmark with speedups disabled::

    $ python -m timeit -n 1 -r 5 \
      -s 'import simplejson' \
      -s 'simplejson._use_speedups(False)' \
      -s 's = open("build/bench/feed.json", "rb").read()' \
      'simplejson.loads(s)'

To run the write benchmark with speedups disabled::

    $ python -m timeit -n 1 -r 5 \
      -s 'import simplejson' \
      -s 'simplejson._use_speedups(False)' \
      -s 's = open("build/bench/feed.json", "rb").read()' \
      -s 'd = simplejson.loads(s)' \
      'simplejson.dumps(d)'

"""
import sys
import operator
from random import Random

import simplejson

from freqdata import FREQ

def main(num_games, freq=FREQ):
    """Return a JSON document of pseudo-random games shaped by ``freq``.

    The random generator is seeded with a fixed constant, so the sequence of
    random draws is the same on every run.

    :param num_games: number of game objects to put in the ``games`` list.
    :param freq: frequency table.  Non-tuple keys name the fields of a game;
        ``(key, typ)`` tuple keys describe the contents of a string or list
        field.  Each value maps a choice (a type or, for strings, a
        ``(low, high)`` code point range) to an integer weight.
    :returns: ``simplejson.dumps(..., sort_keys=True)`` of a dict with a
        ``'generated'`` timestamp-shaped string and a ``'games'`` list.
    :raises NotImplementedError: if a chosen type is not one this generator
        knows how to produce.
    """
    # Top-level game fields; sorted so games are filled in a fixed order.
    game_keys = sorted([k for k in freq.iterkeys() if not isinstance(k, tuple)])
    rng = Random()
    rng.seed(6935051420896)

    def type_chooser(type_freq):
        """Return a function picking a key of ``type_freq`` by weight."""
        # Sort by (weight, choice) so the cumulative walk below does not
        # depend on dict iteration order.
        types = sorted(type_freq.iteritems(), key=operator.itemgetter(1, 0))
        type_sum = sum([weight for (_, weight) in types])

        def chooser():
            orig_i = rng.randrange(0, type_sum)
            i = orig_i
            for typ, weight in types:
                if i < weight:
                    return typ
                i -= weight
            # Unreachable while the weights add up to type_sum.  Raised
            # explicitly (rather than ``assert False``) so the guard is not
            # stripped when running with -O.
            raise AssertionError(
                "orig_i=%r type_sum=%r" % (orig_i, type_sum))
        return chooser

    choosers = dict((k, type_chooser(v)) for (k, v) in freq.iteritems())

    def generate_string(typ, bounds, coderangegen):
        """Return a random ``typ`` string whose length is in ``bounds``.

        ``bounds`` is a ``(start, stop)`` pair passed to ``randrange``;
        ``coderangegen`` returns a ``(low, high)`` code point range per call.
        """
        start, stop = bounds
        if typ is str:
            empty, char = '', chr
        else:
            empty, char = u'', unichr
        # NOTE: the values drawn in this loop are never used, but every
        # iteration advances the seeded RNG.  The loop is kept so that the
        # generated feed stays identical to what this script has always
        # produced; removing it would change the benchmark input.
        for _ in xrange(rng.randrange(start, stop)):
            _lo, _hi = coderangegen()
        return empty.join([char(rng.randrange(*coderangegen()))
                           for _ in xrange(rng.randrange(start, stop))])

    def generate_list(key, typ):
        """Return a random-length list of values for the list field ``key``."""
        start, stop = typ[1]
        return [key_chooser((key, typ))
                for _ in xrange(rng.randrange(start, stop))]

    def key_chooser(key):
        """Pick a type for ``key`` and return a random value of that type."""
        typ = choosers[key]()
        if typ is None:
            return None
        elif typ is bool:
            return rng.choice((False, True))
        elif typ is float:
            # actually 100.0 is included in the actual data
            return rng.uniform(0.0, 100.0)
        elif typ is int:
            # this is arbitrary, not from data
            return rng.randrange(0, 10000000)
        elif isinstance(typ, tuple):
            (parent, rest) = typ
            if parent in (str, unicode):
                return generate_string(parent, rest, choosers.get((key, typ)))
            elif parent is list:
                return generate_list(key, typ)
        raise NotImplementedError(typ)

    def digits(n):
        """Return ``n`` random decimal digits as a zero-padded string."""
        return str(rng.randrange(0, 10 ** n)).zfill(n)

    def timestamp():
        """Return a timestamp-shaped string built from random digits.

        Each field is drawn independently, so the result is not necessarily
        a valid date or time.
        """
        return '{}-{}-{}T{}:{}:{}.{}'.format(
            *(digits(n) for n in (4, 2, 2, 2, 2, 2, 6)))

    def game():
        """Return one game: a dict with a random value per game key."""
        return dict((k, key_chooser(k)) for k in game_keys)

    # The timestamp is generated before the games on purpose: the order of
    # RNG draws determines the output.
    feed = {}
    feed['generated'] = timestamp()
    feed['games'] = [game() for _ in xrange(num_games)]
    return simplejson.dumps(feed, sort_keys=True)

if __name__ == '__main__':
    # The optional first command-line argument is the number of games to
    # generate; without it, 20000 games are produced.
    if len(sys.argv) > 1:
        num = int(sys.argv[1])
    else:
        num = 20000
    # Emit the JSON document followed by a newline on standard output.
    sys.stdout.write(main(num) + '\n')