author      Stefan Behnel <stefan_ml@behnel.de>    2021-07-16 15:21:48 +0200
committer   Stefan Behnel <stefan_ml@behnel.de>    2021-07-16 15:25:50 +0200
commit      5bf5aa63d6b6742144071a2af896067c21b3752a (patch)
tree        584b0735cf7fdb88313576a8f5a35131eccb8165
parent      6e2d9fd2d7f47b87c8cfa2e23a880177e8d69630 (diff)
download    cython-5bf5aa63d6b6742144071a2af896067c21b3752a.tar.gz
Add a script for backing up GitHub issues.
-rw-r--r--    Tools/dump_github_issues.py    142
1 file changed, 142 insertions, 0 deletions
diff --git a/Tools/dump_github_issues.py b/Tools/dump_github_issues.py
new file mode 100644
index 000000000..daec51c50
--- /dev/null
+++ b/Tools/dump_github_issues.py
@@ -0,0 +1,142 @@
+"""
+Dump the GitHub issues of the current project to a file (.json.gz).
+
+Usage: python3 Tools/dump_github_issues.py
+"""
+
+import configparser
+import gzip
+import json
+import os.path
+
+from datetime import datetime
+from urllib.request import urlopen
+
+GIT_CONFIG_FILE = ".git/config"
+
+
+class RateLimitReached(Exception):
+ pass
+
+
+def gen_urls(repo):
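+    """Generate the paginated GitHub issues API URLs for a repo, one page per URL."""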
+    i = 1  # GitHub API pages are numbered starting at 1
+ while True:
+ yield f"https://api.github.com/repos/{repo}/issues?state=all&per_page=100&page={i}"
+ i += 1
+
+
+def read_rate_limit():
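+    """Fetch the current GitHub API rate limit status."""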
+ with urlopen("https://api.github.com/rate_limit") as p:
+ return json.load(p)
+
+
+def parse_rate_limit(limits):
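+    """Extract (limit, remaining, reset_time) from a rate limit response."""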
+ limits = limits['resources']['core']
+ return limits['limit'], limits['remaining'], datetime.fromtimestamp(limits['reset'])
+
+
+def load_url(url):
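+    """Load one page of issues, raising RateLimitReached if the API limit was hit."""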
+ with urlopen(url) as p:
+ data = json.load(p)
+ if isinstance(data, dict) and 'rate limit' in data.get('message', ''):
+ raise RateLimitReached()
+
+ assert isinstance(data, list), type(data)
+ return data or None # None indicates empty last page
+
+
+def join_list_data(lists):
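+    """Concatenate the per-page lists, stopping at the first empty page."""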
+ result = []
+ for data in lists:
+ if not data:
+ break
+ result.extend(data)
+ return result
+
+
+def output_filename(repo):
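+    """Build a timestamped .json.gz output file name for the given repo."""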
+ timestamp = datetime.now()
+ return f"github_issues_{repo.replace('/', '_')}_{timestamp.strftime('%Y%m%d_%H%M%S')}.json.gz"
+
+
+def write_gzjson(file_name, data, indent=2):
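+    """Write the data into a gzip-compressed JSON file."""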
+ with gzip.open(file_name, "wt", encoding='utf-8') as gz:
+ json.dump(data, gz, indent=indent)
+
+
+def find_origin_url(git_config=GIT_CONFIG_FILE):
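+    """Read the URL of the 'origin' remote from the local git config."""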
+ assert os.path.exists(git_config)
+ parser = configparser.ConfigParser()
+ parser.read(git_config)
+ return parser.get('remote "origin"', 'url')
+
+
+def parse_repo_name(git_url):
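+    """Extract the 'owner/name' part from a GitHub remote URL."""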
+ if git_url.endswith('.git'):
+ git_url = git_url[:-4]
+ return '/'.join(git_url.split('/')[-2:])
+
+
+def dump_issues(repo):
+ """Main entry point."""
+ print(f"Reading issues from repo '{repo}'")
+ urls = gen_urls(repo)
+ try:
+ paged_data = map(load_url, urls)
+ issues = join_list_data(paged_data)
+ except RateLimitReached:
+ limit, remaining, reset_time = parse_rate_limit(read_rate_limit())
+ print(f"FAILURE: Rate limits ({limit}) reached, remaining: {remaining}, reset at {reset_time}")
+ return
+
+ filename = output_filename(repo)
+ print(f"Writing {len(issues)} to {filename}")
+ write_gzjson(filename, issues)
+
+
+### TESTS
+
+def test_join_list_data():
+ assert join_list_data([]) == []
+ assert join_list_data([[1,2]]) == [1,2]
+ assert join_list_data([[1,2], [3]]) == [1,2,3]
+ assert join_list_data([[0], [1,2], [3]]) == [0,1,2,3]
+ assert join_list_data([[0], [1,2], [[[]],[]]]) == [0,1,2,[[]],[]]
+
+
+def test_output_filename():
+ filename = output_filename("re/po")
+ import re
+    assert re.match(r"github_issues_re_po_[0-9]{8}_[0-9]{6}\.json\.gz$", filename)
+
+
+def test_find_origin_url():
+ assert find_origin_url()
+
+
+def test_parse_repo_name():
+ assert parse_repo_name("https://github.com/cython/cython") == "cython/cython"
+ assert parse_repo_name("git+ssh://git@github.com/cython/cython.git") == "cython/cython"
+ assert parse_repo_name("git+ssh://git@github.com/fork/cython.git") == "fork/cython"
+
+
+def test_write_gzjson():
+ import tempfile
+ with tempfile.NamedTemporaryFile() as tmp:
+ write_gzjson(tmp.name, [{}])
+
+ # test JSON format
+ with gzip.open(tmp.name) as f:
+ assert json.load(f) == [{}]
+
+ # test indentation
+ with gzip.open(tmp.name) as f:
+ assert f.read() == b'[\n {}\n]'
+
+
+### MAIN
+
+if __name__ == '__main__':
+ repo_name = parse_repo_name(find_origin_url())
+ dump_issues(repo_name)