-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_scholar.py
More file actions
46 lines (41 loc) · 1.77 KB
/
scraper_scholar.py
File metadata and controls
46 lines (41 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# scraper_scholar.py
import logging
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
_logger = logging.getLogger(__name__)


def _expand_article_list(driver, pause_seconds=3):
    """Click the ``gsc_bpf_more`` button until it reports ``disabled``.

    Expansion also stops, with a logged warning, when the button cannot be
    found or clicked. Rows already present on the page are left untouched so
    the caller can still read them.
    """
    while True:
        try:
            more_button = driver.find_element(By.ID, 'gsc_bpf_more')
            # Check before clicking so an already-complete list does not
            # cost an extra click and pause.
            if more_button.get_attribute('disabled'):
                return
            more_button.click()
        except WebDriverException as exc:
            _logger.warning('Stopped expanding the article list: %s', exc)
            return
        # Give the page time to react to the click before re-checking.
        time.sleep(pause_seconds)


def _parse_article_row(row):
    """Build one article dict from a ``.gsc_a_tr`` row element.

    The first and second ``.gs_gray`` elements supply ``authors`` and
    ``journal``; a missing one becomes ``''``. A missing title link, citation
    cell or year cell raises Selenium's ``NoSuchElementException``.
    """
    # Look each element up once; the title link supplies both text and href.
    title_link = row.find_element(By.CSS_SELECTOR, '.gsc_a_at')
    gray_cells = row.find_elements(By.CSS_SELECTOR, '.gs_gray')
    return {
        'title': title_link.text,
        'link': title_link.get_attribute('href'),
        'authors': gray_cells[0].text if len(gray_cells) > 0 else '',
        'journal': gray_cells[1].text if len(gray_cells) > 1 else '',
        'citations': row.find_element(By.CSS_SELECTOR, '.gsc_a_ac').text,
        'year': row.find_element(By.CSS_SELECTOR, '.gsc_a_h').text,
    }


def get_scholar_data(scholar_id):
    """Scrape the article table of a Google Scholar profile page.

    Starts headless Chrome, opens
    ``https://scholar.google.com/citations?user=<scholar_id>``, expands the
    article list via the ``gsc_bpf_more`` button, then reads every
    ``.gsc_a_tr`` row.

    Args:
        scholar_id: Value placed in the ``user`` query parameter of the
            profile URL.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``authors``,
        ``journal``, ``citations`` and ``year``. Text fields hold the
        element text reported by Selenium; ``link`` is the ``href`` of the
        title anchor. Rows that cannot be read are skipped with a logged
        warning, so the list may be empty or partial.

    Raises:
        Errors raised while starting the browser, loading the page or
        listing the rows propagate to the caller. The browser is always
        shut down first.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    try:
        driver.get(f'https://scholar.google.com/citations?user={scholar_id}')
        time.sleep(1)
        _expand_article_list(driver)

        articles = []
        for row in driver.find_elements(By.CSS_SELECTOR, '.gsc_a_tr'):
            try:
                articles.append(_parse_article_row(row))
            except WebDriverException as exc:
                # One unreadable row must not discard the rest of the table.
                _logger.warning('Skipping an unreadable article row: %s', exc)
        return articles
    finally:
        # Always release the browser process, even if the scrape failed.
        driver.quit()