-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathindexer.py
More file actions
84 lines (75 loc) · 3.16 KB
/
indexer.py
File metadata and controls
84 lines (75 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import glob
import json
import os
from tqdm import tqdm
from bs4 import BeautifulSoup
class Indexer:
"""
Utility class to index OSF pages.
Specifically, the project dashboard page, the registration dashboard page, and the user profile page.
This utility class could be deprecated if SHARE harvesters can be used to index the OSF.
"""
def __init__(self):
"""
Constructor of the Indexer class.
self.index ==> the index object
self.project_path_list ==> the list of all .html file in the root folder of each project
self.profile_path_list ==> the list of all .html file in the root folder of each profile
self.registration_path_list ==> the list of all .html file in the root folder of each registration
"""
self.index = {}
self.project_path_list = glob.glob("archive/project/*/*.html")
self.profile_path_list = glob.glob("archive/profile/*/*.html")
self.registration_path_list = glob.glob("archive/registration/*/*.html")
def index_projects(self):
"""
Method for indexing the project dashboard pages.
:return:
"""
for element in tqdm(self.project_path_list):
page = BeautifulSoup(open(element), "html.parser")
page_url = element.replace("archive/project", "")
page_id = page_url.strip("/")
title = page.find("title").text
content = ' '.join(page.find(id="projectScope").text.split())
entry = {}
entry['title'] = title
entry['description'] = content
entry['url'] = page_url
self.index[page_id] = entry
def index_registrations(self):
"""
Method for indexing the registration dashboard pages.
:return:
"""
for element in tqdm(self.registration_path_list):
page = BeautifulSoup(open(element), "html.parser")
page_url = element.replace("archive/registration", "")
print(page_url)
title = page.find("title").text
content = ' '.join(page.find(id="projectScope").text.split())
entry = {}
entry['title'] = title
entry['description'] = content
entry['url'] = page_url
self.index[page_url] = entry
def index_profiles(self):
"""
Method for indexing the user profile pages.
:return:
"""
for element in tqdm(self.profile_path_list):
page = BeautifulSoup(open(element), "html.parser")
page_url = element.replace("archive/profile", "")
title = page.find("title").text
description = (' '.join(page.find(id="social").text.split()) + ' ' +
' '.join(page.find(id="jobs").text.split()) + ' ' +
' '.join(page.find(id="schools").text.split()))\
.replace("Not provided ", "").replace("Not provided", "")
entry = {
'url': page_url,
'title': title
}
if description:
entry['description'] = description
self.index[page_url] = entry