-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithub_scraper.py
More file actions
109 lines (84 loc) · 3.87 KB
/
github_scraper.py
File metadata and controls
109 lines (84 loc) · 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import requests
# Lowercase file extensions, each including its leading dot, that the scraper
# is meant to leave out of a repository's file listing.
IGNORE_EXTS = [
    # Images
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg',
    # Audio / video
    '.mp4', '.mp3',
    # Office documents
    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    # Archives
    '.zip', '.tar', '.gz', '.rar', '.7z',
    # Compiled / executable artifacts
    '.exe', '.dll', '.jar', '.war', '.class',
]
class GitHubScraper:
    """Fetch the file listing and root README of a GitHub repository.

    On construction the scraper parses the owner and repository name out of
    ``github_url``, settles on a branch (the repository's default branch when
    ``branch`` is ``None``) and loads the file listing through the GitHub
    REST API.

    Attributes:
        github_url: The URL supplied by the caller.
        owner: Repository owner parsed from the URL.
        repo: Repository name parsed from the URL.
        branch: Branch used for every request; ``None`` when the
            default-branch lookup failed.
        root_readme: Text of the root-level ``README.md``, or ``""`` when
            none was retrieved.
        file_paths: List of kept file paths, or a single indented-tree
            string when ``condensed`` was requested.
    """

    # Seconds to wait on a GitHub request before giving up; without a
    # timeout, ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, github_url, branch=None, condensed=False):
        """Resolve repository metadata and load its file listing.

        Args:
            github_url: Repository URL whose last two path segments are the
                owner and the repository name.
            branch: Branch to read; ``None`` selects the default branch.
            condensed: When true, ``file_paths`` is stored as an indented
                tree string instead of a list of paths.
        """
        self.github_url = github_url
        self.owner, self.repo = self.get_github_repo_info()
        self.branch = self.get_default_branch() if branch is None else branch
        self.root_readme = ""
        self.file_paths = []
        self.set_files(condensed)

    # Getters
    def get_repo_name(self):
        """Return the repository name parsed from the URL."""
        return self.repo

    def get_file_paths(self):
        """Return the stored file listing (list, or string when condensed)."""
        return self.file_paths

    def get_readme(self):
        """Return the text of the root README ("" if none was retrieved)."""
        return self.root_readme

    def get_github_repo_info(self):
        """Return ``(owner, repo)`` taken from the last two URL segments.

        A trailing ``/`` and a trailing ``.git`` are ignored, so
        ``.../owner/repo``, ``.../owner/repo/`` and ``.../owner/repo.git``
        all give the same result.

        Raises:
            IndexError: If the URL has fewer than two ``/``-separated parts.
        """
        url = self.github_url.strip().rstrip('/')
        if url.endswith('.git'):
            url = url[:-len('.git')]
        parts = url.split('/')
        owner = parts[-2]
        repo = parts[-1]
        return owner, repo

    def get_default_branch(self):
        """Return the repository's default branch name, or ``None`` on error.

        Falls back to ``'master'`` when the API response carries no
        ``default_branch`` field. A non-200 response is reported on stdout.
        """
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print("Error fetching default branch:", response.status_code, response.text)
            return None
        return response.json().get('default_branch', 'master')

    def set_files(self, condensed=False):
        """Load the repository's file listing into ``self.file_paths``.

        Only ``blob`` entries of the recursive tree are kept, and files whose
        name ends with one of ``IGNORE_EXTS`` are skipped. A root-level
        ``README.md`` (matched case-insensitively) is also downloaded into
        ``self.root_readme``. A non-200 response is reported on stdout and
        leaves the current state untouched.

        Args:
            condensed: When true, store an indented tree string instead of
                the list of paths.
        """
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/git/trees/{self.branch}?recursive=1"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print("Error:", response.status_code, response.text)
            return

        files = []
        for entry in response.json().get('tree', []):
            if entry['type'] != 'blob':
                continue
            path = entry['path']
            if self._has_ignored_extension(path, IGNORE_EXTS):
                continue
            if path.lower() == 'readme.md':
                # The download must use the path's original casing.
                readme = self.get_file_raw(path)
                # Keep the previous value when the download fails, so
                # root_readme never silently turns into None.
                if readme is not None:
                    self.root_readme = readme
            files.append(path)

        if condensed:
            files = self._condense_file_structure(files)
        self.file_paths = files

    @staticmethod
    def _has_ignored_extension(path, ignore_exts):
        """Return True if ``path`` ends with one of ``ignore_exts``.

        The comparison is case-insensitive on ``path``; the extensions are
        expected in lowercase with their leading dot (e.g. ``'.png'``).
        """
        return path.lower().endswith(tuple(ignore_exts))

    def get_file_raw(self, file_path):
        """Return the raw text of ``file_path`` on the branch, or ``None``.

        The path is percent-encoded so characters such as ``#``, ``?`` or
        spaces cannot truncate or corrupt the request URL. A non-200
        response is reported on stdout.
        """
        from urllib.parse import quote

        url = (
            f'https://api.github.com/repos/{self.owner}/{self.repo}'
            f'/contents/{quote(file_path)}?ref={self.branch}'
        )
        headers = {'Accept': 'application/vnd.github.v3.raw'}
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to retrieve file: {response.status_code}")
            return None
        return response.text

    def _condense_file_structure(self, file_paths):
        """Render ``file_paths`` as an indented directory tree.

        Each directory is printed once as ``name/`` above its contents, and
        every nesting level adds one space of indentation. Directories are
        compared by their full position in the path, so equally named
        folders under different parents each get their own header. The
        input list is not modified.

        Args:
            file_paths: Iterable of ``/``-separated file paths.

        Returns:
            The tree as one string, one entry per line ("" for no paths).
        """
        lines = []
        previous_dirs = []
        # Sorting makes everything under one directory contiguous, so only
        # the previous path is needed to know which headers are already out.
        for file_path in sorted(file_paths):
            *dirs, file_name = file_path.split('/')

            # Count the leading directories shared with the previous path.
            shared = 0
            for seen, current in zip(previous_dirs, dirs):
                if seen != current:
                    break
                shared += 1

            # Emit headers only for the directories that are new here.
            for depth in range(shared, len(dirs)):
                lines.append(" " * depth + dirs[depth] + "/\n")
            lines.append(" " * len(dirs) + file_name + "\n")
            previous_dirs = dirs
        return "".join(lines)
if __name__ == "__main__":
    # Demo run: swap in the URL of the repository you want to inspect.
    github_url = 'https://github.com/Stability-AI/generative-models'
    scraper = GitHubScraper(github_url)
    # Show the README text first, then the collected file paths.
    for output in (scraper.root_readme, scraper.file_paths):
        print(output)