-
Notifications
You must be signed in to change notification settings - Fork 0
Feat/fetch image script #41
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f2f8ea9
501055d
29a38fa
c37cd66
fdfe9b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,3 +22,5 @@ dist-ssr | |
| *.njsproj | ||
| *.sln | ||
| *.sw? | ||
|
|
||
| public/data/rolling-images.json | ||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| .venv | ||
| __pycache__/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| 3.13 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| # Pexels Image Fetcher | ||
|
|
||
| A Python script that fetches the links of all my featured photos on Pexels. |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,156 @@ | ||||||
| from fileinput import filename | ||||||
| import sys | ||||||
| import time | ||||||
| import json | ||||||
| from typing import List | ||||||
| from bs4 import BeautifulSoup | ||||||
| from selenium import webdriver | ||||||
| from selenium.webdriver.chrome.service import Service | ||||||
| from selenium.webdriver.common.by import By | ||||||
| from selenium.common.exceptions import StaleElementReferenceException | ||||||
| from chromedriver_autoinstaller import install as install_chromedriver | ||||||
|
|
||||||
def find_load_more_button(driver):
    """
    Find the "Load More" button by scanning every <button> and inspecting its text.

    Args:
        driver: Selenium WebDriver whose current page is searched.

    Returns:
        The first displayed button whose lower-cased text (nested elements
        included) contains "load more", or None when no such button exists
        or the search itself fails.
    """
    # Executed in the browser: collects the button's own text plus the text of
    # every descendant, then returns it trimmed and lower-cased for matching.
    text_script = """
        function getTextContent(element) {
            // Get text from the element itself
            let text = element.textContent || element.innerText || '';

            // Also check all child elements for text
            const children = element.querySelectorAll('*');
            for (let child of children) {
                if (child.textContent) {
                    text += ' ' + child.textContent;
                }
            }

            return text.trim().toLowerCase();
        }
        return getTextContent(arguments[0]);
    """

    try:
        # Find all buttons on the page
        buttons = driver.find_elements(By.TAG_NAME, "button")

        for button in buttons:
            try:
                # is_displayed() queries the browser as well, so it lives inside
                # the stale-element guard: a button detached by a re-render is
                # skipped instead of aborting the whole search.
                if not (button and button.is_displayed()):
                    continue

                button_text = driver.execute_script(text_script, button)

                # Check if the button contains "load more" text
                if button_text and "load more" in button_text:
                    return button

            except StaleElementReferenceException:
                continue

        return None

    except Exception as e:
        # Best effort: report the failure and let the caller treat it as
        # "no button found".
        print(f"Error finding load more button: {e}")
        return None
|
|
||||||
| def get_image_links_selenium(url): | ||||||
| """ | ||||||
| Crawls a Pexels page using Selenium to fetch the links of all featured images. | ||||||
| Auto-clicks the "Load More" button if exists to load more images. | ||||||
| """ | ||||||
| driver = None # Initialize driver to None | ||||||
| try: | ||||||
| # Automatically install and set up chromedriver | ||||||
| service = Service(install_chromedriver()) | ||||||
|
|
||||||
| # Set up Chrome options for headless mode | ||||||
| options = webdriver.ChromeOptions() | ||||||
| options.add_argument('--headless') | ||||||
| options.add_argument('--disable-gpu') | ||||||
| options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36') | ||||||
|
|
||||||
| driver = webdriver.Chrome(service=service, options=options) | ||||||
|
|
||||||
| print("Fetching URL with Selenium...") | ||||||
| driver.get(url) | ||||||
| # Wait for initial page load | ||||||
| time.sleep(3) | ||||||
|
|
||||||
| max_clicks = 5 # Safety limit to prevent infinite loops | ||||||
| click_count = 0 | ||||||
|
|
||||||
| # Try clicking the Load More button | ||||||
| while click_count < max_clicks: | ||||||
| load_more_button = find_load_more_button(driver) | ||||||
|
|
||||||
| if load_more_button and load_more_button.is_displayed(): | ||||||
| try: | ||||||
| # Click the button | ||||||
| print(f"Found Load More button, clicking... (attempt {click_count + 1})") | ||||||
| click_count += 1 | ||||||
| load_more_button.click() | ||||||
|
|
||||||
| except (StaleElementReferenceException, Exception) as e: | ||||||
| # print(f"Error clicking button: {e}") | ||||||
|
||||||
| # print(f"Error clicking button: {e}") | |
| print(f"Error clicking button: {e}") |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
#!/bin/bash
# Generates public/data/rolling-images.json by running scripts/main.py through uv,
# unless that file is already present.

# Guard clause: nothing to do when the output already exists.
[ ! -f "public/data/rolling-images.json" ] || {
    echo "rolling-images.json already exists, skipping..."
    exit 0
}

# Leave with cd's failure status if the scripts directory is missing.
cd scripts || exit

uv sync
uv run python main.py "../public/data/rolling-images.json"
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| [project] | ||
| name = "scripts" | ||
| version = "0.1.0" | ||
| description = "Add your description here" | ||
| readme = "README.md" | ||
| requires-python = ">=3.13" | ||
| dependencies = [ | ||
| "bs4>=0.0.2", | ||
| "chromedriver-autoinstaller>=0.6.4", | ||
| "selenium>=4.36.0", | ||
| ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `fileinput` module import is unused and should be removed. The `filename` identifier from this import is not used anywhere in the code.