python_microservice_scraper/scraper_scholar.py at main · GrimsAlphaDev/python_microservice_scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# scraper_scholar.py
import logging
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

def get_scholar_data(scholar_id):
    """Scrape the article table of a Google Scholar profile page.

    Opens ``https://scholar.google.com/citations?user=<scholar_id>`` in a
    headless Chrome session, repeatedly clicks the ``gsc_bpf_more`` button
    (presumably Scholar's "show more" pager -- confirm against the live page)
    until the button reports itself disabled, and then reads every
    ``.gsc_a_tr`` row.

    Args:
        scholar_id: Value substituted into the ``user`` query parameter of
            the profile URL.

    Returns:
        A list of dicts, one per row, with the keys ``title``, ``link``,
        ``authors``, ``journal``, ``citations`` and ``year``. Values are the
        element texts as rendered on the page (``link`` is the ``href`` of
        the title anchor). Scraping is best-effort: if expanding the list or
        reading a row fails, the error is logged and the rows collected so
        far (possibly none) are returned.

    Raises:
        Whatever ``webdriver.Chrome(...)`` or ``driver.get(...)`` raise; those
        are not swallowed. The browser is closed in every case once it has
        been started.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    articles = []
    # try/finally guarantees the Chrome process is shut down even when page
    # loading or scraping raises; otherwise each failure leaks a browser.
    try:
        url = f'https://scholar.google.com/citations?user={scholar_id}'
        driver.get(url)
        time.sleep(1)

        try:
            # Step 1: expand the table. The button is clicked at least once
            # and the fixed pause gives the page time to append more rows.
            while True:
                button = driver.find_element(By.ID, 'gsc_bpf_more')
                button.click()
                time.sleep(3)
                if button.get_attribute('disabled'):
                    break

            # Step 2: read every row of the fully expanded table.
            for row in driver.find_elements(By.CSS_SELECTOR, '.gsc_a_tr'):
                title_anchor = row.find_element(By.CSS_SELECTOR, '.gsc_a_at')
                # First .gs_gray element is read as the author list, the
                # second as the journal/venue line.
                gray_cells = row.find_elements(By.CSS_SELECTOR, '.gs_gray')
                articles.append({
                    'title': title_anchor.text,
                    'link': title_anchor.get_attribute('href'),
                    'authors': gray_cells[0].text,
                    'journal': gray_cells[1].text,
                    'citations': row.find_element(By.CSS_SELECTOR, '.gsc_a_ac').text,
                    'year': row.find_element(By.CSS_SELECTOR, '.gsc_a_h').text,
                })
        except Exception:
            # Deliberate best-effort boundary: keep returning what was
            # collected, but record the failure instead of hiding it.
            logging.getLogger(__name__).exception(
                "Scraping Google Scholar profile %r failed after %d article(s)",
                scholar_id,
                len(articles),
            )
    finally:
        driver.quit()
    return articles