-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithub_fetch.py
More file actions
78 lines (65 loc) · 2 KB
/
github_fetch.py
File metadata and controls
78 lines (65 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import click
import requests
import pickle
def fetch_responses(token, url, timeout=None):
    """Fetch ``url`` and every following page, returning the raw responses.

    Pagination is followed through the ``next`` relation exposed by
    ``Response.links``; fetching stops at the first response that carries
    no such relation.

    Args:
        token: GitHub API authentication token, sent as
            ``Authorization: token <token>``.
        url: URL of the first page to request.
        timeout: Optional timeout (seconds) forwarded to ``requests.get``.
            The default ``None`` keeps the previous behavior of waiting
            indefinitely.

    Returns:
        A list of response objects in the order they were fetched. Status
        codes are not checked here; callers must validate them.
    """
    # The authorization header is the same for every page, so build it once.
    headers = {'Authorization': 'token ' + token}
    responses = []
    while True:
        rsp = requests.get(url, headers=headers, timeout=timeout)
        responses.append(rsp)
        next_link = rsp.links.get('next')
        if next_link is None:
            # No further page advertised: this was the last response.
            return responses
        url = next_link['url']
def parse_pull_requests(responses):
    """Collect the pull-request entries from a sequence of issue responses.

    Each response's ``json()`` payload is iterated and every entry that
    contains a ``'pull_request'`` key is kept, preserving the original order.

    Args:
        responses: Iterable of objects exposing a ``json()`` method.

    Returns:
        A list of the entries that contain ``'pull_request'``.
    """
    found = []
    for response in responses:
        # Every PR is an issue, but not every issue is a PR.
        found.extend(
            entry for entry in response.json() if 'pull_request' in entry
        )
    return found
# General fetch+parse function
def fetch_data(token, url, get_remaining=False):
    """Fetch all pages starting at ``url`` and merge their JSON payloads.

    Args:
        token: GitHub API authentication token passed to ``fetch_responses``.
        url: URL of the first page to request.
        get_remaining: When true, also return the value of the
            ``X-RateLimit-Remaining`` header of the last response.

    Returns:
        A flat list of decoded payloads: list payloads are concatenated,
        any other payload is appended as a single element. When
        ``get_remaining`` is true, a ``(items, remaining)`` tuple is
        returned instead.

    Raises:
        KeyError: If ``get_remaining`` is true and the last response has no
            ``X-RateLimit-Remaining`` header.
    """
    responses = fetch_responses(token, url)
    remaining = None
    if get_remaining:
        # Only touch the header when the caller asked for it, so a response
        # without rate-limit headers does not break the plain fetch.
        remaining = responses[-1].headers['X-RateLimit-Remaining']
    ret = []
    for r in responses:
        data = r.json()
        if isinstance(data, list):
            ret.extend(data)
        else:
            ret.append(data)
    if get_remaining:
        return ret, remaining
    return ret
def parse_requests(input_file, output_file):
    """Extract pull requests from pickled responses and pickle the result.

    Args:
        input_file: Path of a pickle file holding a list of response objects
            (as written by the ``run`` command).
        output_file: Path the list of pull-request entries is pickled to.

    Raises:
        AssertionError: If the set of status codes in the input is not
            exactly ``{200}`` (this includes an empty input list).
    """
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only use this on pickles produced by this script.
    with open(input_file, 'rb') as f:
        data = pickle.load(f)
    # Explicit check instead of ``assert`` so the validation is not stripped
    # when Python runs with -O; the exception type is kept for callers.
    statuses = {r.status_code for r in data}
    if statuses != {200}:
        raise AssertionError(
            f"expected only HTTP 200 responses, got status codes {sorted(statuses)}"
        )
    pr = parse_pull_requests(data)
    print(f"Found {len(pr)} prs")
    with open(output_file, 'wb') as f:
        pickle.dump(pr, f, protocol=pickle.HIGHEST_PROTOCOL)
@click.command()
@click.option('--token', '-t', required=True, help='GitHub API user authentication token')
@click.option('--url', '-u', required=True, help='URL to start fetching')
@click.option('--out', '-o', required=True, help='Output file')
def run(token: str, url: str, out: str):
    """Fetch all pages starting at the given URL and pickle the raw responses.

    \b
    Ex. usage:
      python3 github_fetch.py -t $token -u "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=100&state=open" -o issues.pickle

    \b
    Then, in a python3 session:
      import pickle
      from github_fetch import *
      parse_requests('issues.pickle', 'pr.pickle')
    """
    # All three options are required: without them the code below would fail
    # with a TypeError traceback instead of a usage message.
    responses = fetch_responses(token, url)
    print(len(responses), "responses", responses[:3])
    with open(out, 'wb') as f:
        pickle.dump(responses, f, protocol=pickle.HIGHEST_PROTOCOL)
    if responses:
        # The rate-limit header may be absent; the report is informational
        # only, so skip it rather than fail after the data has been saved.
        remaining = responses[-1].headers.get('X-RateLimit-Remaining')
        if remaining is not None:
            print(remaining, "remaining api calls.")
# Invoke the click command only when this file is executed as a script,
# not when it is imported (e.g. to call parse_requests interactively).
if __name__ == '__main__':
    run()