"""
Dump the GitHub issues of the current project to a file (.json.gz).

Usage:  python3 Tools/dump_github_issues.py
"""

import configparser
import gzip
import json
import os.path

from datetime import datetime
from urllib.request import urlopen

GIT_CONFIG_FILE = ".git/config"


class RateLimitReached(Exception):
    pass


def gen_urls(repo):
    # GitHub's pagination is 1-based; page=0 is treated like page=1,
    # so starting at 0 would fetch the first page twice.
    page = 1
    while True:
        yield f"https://api.github.com/repos/{repo}/issues?state=all&per_page=100&page={page}"
        page += 1


def read_rate_limit():
    with urlopen("https://api.github.com/rate_limit") as p:
        return json.load(p)


def parse_rate_limit(limits):
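    # Pick out the "core" REST API bucket and return
    # (total limit, remaining calls, reset time as a datetime).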
    limits = limits['resources']['core']
    return limits['limit'], limits['remaining'], datetime.fromtimestamp(limits['reset'])


def load_url(url):
    with urlopen(url) as p:
        data = json.load(p)
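    # When rate-limited, GitHub returns an error object instead of a
    # list, with a "message" such as "API rate limit exceeded for ...".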
    if isinstance(data, dict) and 'rate limit' in data.get('message', ''):
        raise RateLimitReached()

    assert isinstance(data, list), type(data)
    return data or None  # None indicates empty last page
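

def load_url_authenticated(url, token=None):
    """Hedged sketch, not called by this script: how a request could be
    authenticated to lift GitHub's rate limit well above the anonymous
    one.  GITHUB_TOKEN is an assumed environment variable holding a
    personal access token; only the authentication is shown here, the
    response handling of load_url() would apply unchanged.
    """
    from urllib.request import Request
    token = token or os.environ.get("GITHUB_TOKEN", "")
    request = Request(url, headers={"Authorization": f"token {token}"})
    with urlopen(request) as p:
        return json.load(p)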


def join_list_data(lists):
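    # Concatenate the per-page lists; load_url() returns None for the
    # empty page that marks the end, so stop at the first falsy entry.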
    result = []
    for data in lists:
        if not data:
            break
        result.extend(data)
    return result


def output_filename(repo):
    timestamp = datetime.now()
    return f"github_issues_{repo.replace('/', '_')}_{timestamp.strftime('%Y%m%d_%H%M%S')}.json.gz"


def write_gzjson(file_name, data, indent=2):
    with gzip.open(file_name, "wt", encoding='utf-8') as gz:
        json.dump(data, gz, indent=indent)
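

def read_gzjson(file_name):
    """Convenience counterpart to write_gzjson() for loading a dump back,
    e.g. for ad-hoc analysis.  Not used by the script itself.
    """
    with gzip.open(file_name, "rt", encoding='utf-8') as gz:
        return json.load(gz)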


def find_origin_url(git_config=GIT_CONFIG_FILE):
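    # .git/config is INI-style, so configparser can read the clone URL
    # straight out of its [remote "origin"] section.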
    assert os.path.exists(git_config)
    parser = configparser.ConfigParser()
    parser.read(git_config)
    return parser.get('remote "origin"', 'url')


def parse_repo_name(git_url):
    if git_url.endswith('.git'):
        git_url = git_url[:-4]
    # Turn the ':' of scp-like SSH URLs ("git@github.com:user/repo") into
    # a '/' so that owner and repo are always the last two path segments.
    return '/'.join(git_url.replace(':', '/').split('/')[-2:])


def dump_issues(repo):
    """Main entry point."""
    print(f"Reading issues from repo '{repo}'")
    urls = gen_urls(repo)
    try:
        paged_data = map(load_url, urls)
        issues = join_list_data(paged_data)
    except RateLimitReached:
        limit, remaining, reset_time = parse_rate_limit(read_rate_limit())
        print(f"FAILURE: Rate limits ({limit}) reached, remaining: {remaining}, reset at {reset_time}")
        return

    filename = output_filename(repo)
    print(f"Writing {len(issues)} to {filename}")
    write_gzjson(filename, issues)


### TESTS

def test_join_list_data():
    assert join_list_data([]) == []
    assert join_list_data([[1, 2]]) == [1, 2]
    assert join_list_data([[1, 2], [3]]) == [1, 2, 3]
    assert join_list_data([[0], [1, 2], [3]]) == [0, 1, 2, 3]
    assert join_list_data([[0], [1, 2], [[[]], []]]) == [0, 1, 2, [[]], []]


def test_output_filename():
    filename = output_filename("re/po")
    import re
    assert re.match(r"github_issues_re_po_[0-9]{8}_[0-9]{6}\.json", filename)


def test_find_origin_url():
    assert find_origin_url()


def test_parse_repo_name():
    assert parse_repo_name("https://github.com/cython/cython") == "cython/cython"
    assert parse_repo_name("git+ssh://git@github.com/cython/cython.git") == "cython/cython"
    assert parse_repo_name("git+ssh://git@github.com/fork/cython.git") == "fork/cython"


def test_write_gzjson():
    import tempfile
    with tempfile.NamedTemporaryFile() as tmp:
        write_gzjson(tmp.name, [{}])

        # test JSON format
        with gzip.open(tmp.name) as f:
            assert json.load(f) == [{}]

        # test indentation
        with gzip.open(tmp.name) as f:
            assert f.read() == b'[\n  {}\n]'


### MAIN

if __name__ == '__main__':
    repo_name = parse_repo_name(find_origin_url())
    dump_issues(repo_name)