1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
|
"""
Dump the GitHub issues of the current project to a file (.json.gz).
Usage: python3 Tools/dump_github_issues.py
"""
import configparser
import gzip
import json
import os.path
from datetime import datetime
from urllib.request import urlopen
# Relative path of the Git configuration file; used as the default argument
# of find_origin_url() to look up the URL of the "origin" remote.
GIT_CONFIG_FILE = ".git/config"
class RateLimitReached(Exception):
    """Raised when a GitHub API response reports that the rate limit was hit."""
def gen_urls(repo):
    """Generate the paginated GitHub API URLs listing all issues of *repo*.

    *repo* is the "owner/name" identifier of the repository.  The generator
    is endless: the consumer is expected to stop requesting URLs once a page
    comes back empty.

    Page numbering starts at 1 because the GitHub REST API numbers result
    pages from 1.  Starting at 0 risks fetching the first page twice and
    thereby duplicating its issues in the dump.
    """
    page = 1
    while True:
        yield f"https://api.github.com/repos/{repo}/issues?state=all&per_page=100&page={page}"
        page += 1
def read_rate_limit():
    """Fetch the GitHub rate limit status and return the decoded JSON document."""
    with urlopen("https://api.github.com/rate_limit") as response:
        status = json.load(response)
    return status
def parse_rate_limit(limits):
    """Extract the core API quota from a decoded rate limit document.

    Returns a tuple ``(limit, remaining, reset_time)`` taken from
    ``limits['resources']['core']``, where ``reset_time`` is the 'reset'
    timestamp converted to a local ``datetime``.
    """
    core = limits['resources']['core']
    quota = core['limit']
    left = core['remaining']
    reset_at = datetime.fromtimestamp(core['reset'])
    return quota, left, reset_at
def _mentions_rate_limit(payload):
    """Tell whether a decoded response body is an error object about rate limits."""
    return isinstance(payload, dict) and 'rate limit' in (payload.get('message') or '')


def load_url(url):
    """Download *url* and return the decoded JSON list of one result page.

    Returns the list of items, or None when the page is empty, which marks
    the end of the paginated results.

    Raises RateLimitReached when the server reports that the rate limit was
    exceeded, and TypeError when the response is valid JSON but not a list.
    Other HTTP errors propagate unchanged.
    """
    from urllib.error import HTTPError

    try:
        with urlopen(url) as response:
            data = json.load(response)
    except HTTPError as err:
        # Rate limiting is reported with an HTTP error status, which urlopen()
        # turns into an exception before the body is parsed.  Inspect the
        # error body so that the caller still sees RateLimitReached.
        try:
            payload = json.load(err)
        except (ValueError, OSError):
            payload = None
        if _mentions_rate_limit(payload):
            raise RateLimitReached() from err
        raise

    # Keep recognising a rate limit message delivered in a successful response.
    if _mentions_rate_limit(data):
        raise RateLimitReached()
    # Explicit check instead of 'assert', which is stripped under 'python -O'
    # and would let a non-list payload leak into the collected issues.
    if not isinstance(data, list):
        raise TypeError(f"Expected a JSON list from {url}, got {type(data).__name__}")
    return data or None  # None indicates empty last page
def join_list_data(lists):
    """Concatenate the pages in *lists* into one flat list.

    Pages are consumed lazily and in order.  The first empty (falsy) page
    ends the iteration, so later pages are never requested from the iterable.
    """
    merged = []
    for page in lists:
        if not page:
            return merged
        merged += page
    return merged
def output_filename(repo):
    """Build the name of the dump file for *repo*, stamped with the current local time.

    Slashes in the repository name are replaced by underscores.
    """
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    slug = repo.replace('/', '_')
    return f"github_issues_{slug}_{stamp}.json.gz"
def write_gzjson(file_name, data, indent=2):
    """Serialise *data* as JSON into the gzip-compressed text file *file_name*.

    The text is encoded as UTF-8 and pretty-printed with *indent*.
    """
    with gzip.open(file_name, mode="wt", encoding='utf-8') as stream:
        json.dump(data, stream, indent=indent)
def find_origin_url(git_config=GIT_CONFIG_FILE):
    """Return the URL of the "origin" remote configured in *git_config*.

    Raises FileNotFoundError when *git_config* does not exist, and a
    configparser error when the file has no 'remote "origin"' section or
    that section has no 'url' entry.
    """
    # Explicit check instead of 'assert', which is stripped under 'python -O'.
    if not os.path.exists(git_config):
        raise FileNotFoundError(f"Git config file not found: {git_config!r}")
    # Git config files may repeat keys within a section (e.g. several
    # 'fetch' lines per remote), which strict parsing rejects, and values may
    # contain '%' (e.g. URL escapes), which must not be read as interpolation.
    parser = configparser.ConfigParser(strict=False, interpolation=None)
    parser.read(git_config)
    return parser.get('remote "origin"', 'url')
def parse_repo_name(git_url):
    """Extract the "owner/name" repository identifier from a Git remote URL.

    Handles URL-style remotes ("https://host/owner/name[.git]") as well as
    scp-style ones ("git@host:owner/name[.git]").  A trailing slash and a
    trailing ".git" suffix are ignored.
    """
    path = git_url.rstrip('/')
    if path.endswith('.git'):
        path = path[:-4]
    # In scp-style remotes the owner is separated from the host by ':' rather
    # than '/', so treat both characters as path separators.
    return '/'.join(path.replace(':', '/').split('/')[-2:])
def dump_issues(repo):
    """Main entry point: download all issues of *repo* and write them to a file.

    The pages are fetched lazily, one URL at a time, until an empty page is
    returned.  If the rate limit is hit while downloading, the current limits
    are printed and nothing is written.
    """
    print(f"Reading issues from repo '{repo}'")
    pages = (load_url(url) for url in gen_urls(repo))
    try:
        # The downloads happen here, while the lazy pages are being consumed.
        issues = join_list_data(pages)
    except RateLimitReached:
        limit, remaining, reset_time = parse_rate_limit(read_rate_limit())
        print(f"FAILURE: Rate limits ({limit}) reached, remaining: {remaining}, reset at {reset_time}")
    else:
        filename = output_filename(repo)
        print(f"Writing {len(issues)} to {filename}")
        write_gzjson(filename, issues)
### TESTS
def test_join_list_data():
    """Check that join_list_data() flattens exactly one level of its pages."""
    cases = [
        ([], []),
        ([[1, 2]], [1, 2]),
        ([[1, 2], [3]], [1, 2, 3]),
        ([[0], [1, 2], [3]], [0, 1, 2, 3]),
        ([[0], [1, 2], [[[]], []]], [0, 1, 2, [[]], []]),
    ]
    for pages, expected in cases:
        assert join_list_data(pages) == expected
def test_output_filename():
    """Check the complete shape of the generated file name, including '.json.gz'."""
    import re
    filename = output_filename("re/po")
    # Anchor the pattern at the end so that a missing or wrong suffix fails.
    assert re.match(r"github_issues_re_po_[0-9]{8}_[0-9]{6}\.json\.gz$", filename)
def test_find_origin_url():
    """Check that the default Git config yields a non-empty origin URL."""
    origin = find_origin_url()
    assert origin
def test_parse_repo_name():
    """Check repository name extraction for https and ssh style URLs."""
    expectations = (
        ("https://github.com/cython/cython", "cython/cython"),
        ("git+ssh://git@github.com/cython/cython.git", "cython/cython"),
        ("git+ssh://git@github.com/fork/cython.git", "fork/cython"),
    )
    for url, repo in expectations:
        assert parse_repo_name(url) == repo
def test_write_gzjson():
    """Check that write_gzjson() writes gzip-compressed JSON with 2-space indentation."""
    import tempfile
    # Write into a temporary directory rather than reopening an open
    # NamedTemporaryFile by name, which does not work on every platform.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_name = os.path.join(tmp_dir, "issues.json.gz")
        write_gzjson(file_name, [{}])

        # test JSON format
        with gzip.open(file_name) as f:
            assert json.load(f) == [{}]

        # test indentation (write_gzjson() defaults to indent=2)
        with gzip.open(file_name) as f:
            assert f.read() == b'[\n  {}\n]'
### MAIN
if __name__ == '__main__':
    # Derive the repository name from the "origin" remote, then dump its issues.
    origin_url = find_origin_url()
    repo_name = parse_repo_name(origin_url)
    dump_issues(repo_name)
|