From 8cc882a7bf046c8a1edc142cd324b30af9f76ae4 Mon Sep 17 00:00:00 2001 From: Grace Date: Fri, 27 Jun 2025 08:50:57 -0700 Subject: [PATCH 1/4] Adding to readme --- readme.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/readme.md b/readme.md index fa61a1f..50f1932 100644 --- a/readme.md +++ b/readme.md @@ -45,4 +45,11 @@ Note: When running in background you can use "docker logs "task": "justinsight.tasks.your_task_name", "schedule": x, #where x is the number of seconds between when the task should happen "args": (), #potential arguments for your task - }, \ No newline at end of file + }, + + +## How to check what's in the database +Please run: docker compose up -d +Then: docker exec -it mongo mongosh -u myuser -p mypassword +Then: db.articles.find().pretty() +Note - you may need to download mongosh for this to work and to exit the mongosh environment just run 'exit'. Remember to docker compose down as the containers will be running in the background \ No newline at end of file From 7f3246ecec07ceb4d90f333133bebf804440013d Mon Sep 17 00:00:00 2001 From: Grace Date: Fri, 27 Jun 2025 08:51:41 -0700 Subject: [PATCH 2/4] Readmen line adjustments --- readme.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/readme.md b/readme.md index 50f1932..6fe53e5 100644 --- a/readme.md +++ b/readme.md @@ -49,7 +49,7 @@ Note: When running in background you can use "docker logs ## How to check what's in the database -Please run: docker compose up -d -Then: docker exec -it mongo mongosh -u myuser -p mypassword -Then: db.articles.find().pretty() -Note - you may need to download mongosh for this to work and to exit the mongosh environment just run 'exit'. 
Remember to docker compose down as the containers will be running in the background \ No newline at end of file +Please run: docker compose up -d +Then: docker exec -it mongo mongosh -u myuser -p mypassword +Then: db.articles.find().pretty() +Note - you may need to download mongosh for this to work and to exit the mongosh environment just run 'exit'. Remember to docker compose down as the containers will be running in the background. \ No newline at end of file From 1b7d87e7383e3b732f48a1ac90da1483fefd96fe Mon Sep 17 00:00:00 2001 From: Grace Date: Fri, 27 Jun 2025 09:56:59 -0700 Subject: [PATCH 3/4] Scaffolding for local testing without docker up and running - instructions in the test_trivial.py find in /tests --- .gitignore | 9 ++++++++- docker-compose.yml | 7 ++++++- pyproject.toml | 19 +++++++++++++++++++ readme.md | 3 ++- src/ingest/base_ingestor.py | 2 +- src/ingest/bbc_ingestor.py | 2 +- src/ingest/save_to_database.py | 11 ++++++++--- src/justinsight/tasks.py | 9 ++++----- tests/test_trivial.py | 23 +++++++++++++++++++++-- 9 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index 3742713..290f23e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ celerybeat-schedule.* src/celerybeat-schedule.* feed*.json .DS_Store +.venv # Ignore all __pycache__ directories at any level **/__pycache__/ @@ -12,4 +13,10 @@ feed*.json *.pyd #Ignore saved data -data/* \ No newline at end of file +data/* + +# Packaging +*.egg +*.egg-info/ +dist/ +build/ \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 8d43ec9..b6fe913 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,8 @@ services: build: context: . 
dockerfile: .docker/Dockerfile - + environment: + - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin # bind-mount your repo and the shared EBS volume volumes: - ./:/workspace:cached @@ -26,6 +27,8 @@ services: build: context: . dockerfile: .docker/Dockerfile + environment: + - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin volumes: - .:/workspace:cached - mongo_data:/data/db @@ -39,6 +42,8 @@ services: build: context: . dockerfile: .docker/Dockerfile + environment: + - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin volumes: - .:/workspace:cached depends_on: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e90d35c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "justinsight" +version = "0.1.0" +description = "Your project description" +authors = [{ name = "Grace Madison" }] +dependencies = [] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = ["tests"] +pythonpath = ["src"] diff --git a/readme.md b/readme.md index 6fe53e5..8335988 100644 --- a/readme.md +++ b/readme.md @@ -51,5 +51,6 @@ Note: When running in background you can use "docker logs ## How to check what's in the database Please run: docker compose up -d Then: docker exec -it mongo mongosh -u myuser -p mypassword +Then: use justinsightdb Then: db.articles.find().pretty() -Note - you may need to download mongosh for this to work and to exit the mongosh environment just run 'exit'. Remember to docker compose down as the containers will be running in the background. \ No newline at end of file +Note - you may need to download mongosh for this to work and to exit the mongosh environment just run 'exit'. 
Remember to 'docker compose down' as the containers will be running in the background. \ No newline at end of file diff --git a/src/ingest/base_ingestor.py b/src/ingest/base_ingestor.py index 0c5eaee..e2ce315 100644 --- a/src/ingest/base_ingestor.py +++ b/src/ingest/base_ingestor.py @@ -2,7 +2,7 @@ import feedparser import hashlib import re -from src.ingest.save_to_database import save_entry +from ingest.save_to_database import save_entry class BaseIngestor: RSS_URL = None #will be set by the subclasses diff --git a/src/ingest/bbc_ingestor.py b/src/ingest/bbc_ingestor.py index 71d5542..5bcf2e7 100644 --- a/src/ingest/bbc_ingestor.py +++ b/src/ingest/bbc_ingestor.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup import requests -from src.ingest.base_ingestor import BaseIngestor +from ingest.base_ingestor import BaseIngestor class BBCIngestor(BaseIngestor): RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml" diff --git a/src/ingest/save_to_database.py b/src/ingest/save_to_database.py index 3feb6f7..72ac544 100644 --- a/src/ingest/save_to_database.py +++ b/src/ingest/save_to_database.py @@ -1,7 +1,13 @@ +import os from pymongo import MongoClient +# Default to local MongoDB when not using docker +mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017") + +#mongodb_uri = os.getenv("mongodb://localhost:27017", "mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin") + # Include username, password, and authentication database -client = MongoClient("mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin") +client = MongoClient(mongodb_uri) # Get (or create) a database db = client["justinsightdb"] @@ -14,5 +20,4 @@ def save_entry(entry): entry_hash = entry["id"] if collection.count_documents({"id": entry_hash}) == 0: collection.insert_one(entry) - else: - print(f"{entry['title']} already in use!") + print(f"I have now saved: {entry['title']}") diff --git a/src/justinsight/tasks.py b/src/justinsight/tasks.py index 
7066141..c00475e 100644 --- a/src/justinsight/tasks.py +++ b/src/justinsight/tasks.py @@ -1,4 +1,3 @@ -# tasks.py from celery import shared_task from ingest.bbc_ingestor import BBCIngestor @@ -15,9 +14,9 @@ def bbcLogger_task(): ingestor.check_and_save_new_entries() # this will invoke the inherited logic return "BBC RSS Feed checked." -@shared_task -def nytLogger_task(): - check_and_save_nyt() - return "NYT RSS Feed checked." +# @shared_task +# def nytLogger_task(): +# check_and_save_nyt() +# return "NYT RSS Feed checked." #Add more tasks here in the format of the one above \ No newline at end of file diff --git a/tests/test_trivial.py b/tests/test_trivial.py index bacf00e..728f100 100644 --- a/tests/test_trivial.py +++ b/tests/test_trivial.py @@ -1,6 +1,25 @@ #this is a dummy test for checking unit test system -import sys -sys.path.insert(0, 'src') + +#NEED to run to set up testing +#pip install pytest-cov +#python3 -m venv .venv +#source .venv/bin/activate +#pip install -r requirements.txt +#brew tap mongodb/brew +#brew install mongodb-community +#brew services start mongodb-community +#deactivate + +#then we can start testing +#first, activate your virtual environment: source .venv/bin/activate +#then, run: pytest +#or if you want to see printed output from your tests: pytest -s + +from justinsight.tasks import bbcLogger_task + def test_add(): assert 1 + 1 == 2 + +def test_bbcCheck(): + bbcLogger_task() From 25147e5d931065a846acf0935f53bd477b9ffc1b Mon Sep 17 00:00:00 2001 From: Grace Date: Fri, 27 Jun 2025 11:04:40 -0700 Subject: [PATCH 4/4] CNN ingestor running and BBC full text bug fixed --- src/ingest/base_ingestor.py | 8 -------- src/ingest/bbc_ingestor.py | 2 +- src/ingest/cnn_ingestor.py | 33 +++++++++++++++++++++++++++++++++ src/justinsight/celery.py | 10 +++++----- src/justinsight/tasks.py | 10 ++++++---- 5 files changed, 45 insertions(+), 18 deletions(-) create mode 100644 src/ingest/cnn_ingestor.py diff --git a/src/ingest/base_ingestor.py 
b/src/ingest/base_ingestor.py index e2ce315..e399c46 100644 --- a/src/ingest/base_ingestor.py +++ b/src/ingest/base_ingestor.py @@ -7,12 +7,6 @@ class BaseIngestor: RSS_URL = None #will be set by the subclasses - def slugify(self, text): - # Convert title to a filesystem-friendly slug - text = text.lower() - text = re.sub(r'[^a-z0-9]+', '-', text) - return text.strip('-') - def format_date(self, entry): # Extract and format the published date try: @@ -31,8 +25,6 @@ def generate_entry_hash(self, entry): return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() def format_entry(self, entry): - title_slug = self.slugify(entry.title) - date_str = self.format_date(entry) full_text = self.fetch_full_text(entry.link) data = { diff --git a/src/ingest/bbc_ingestor.py b/src/ingest/bbc_ingestor.py index 5bcf2e7..fb1d8b6 100644 --- a/src/ingest/bbc_ingestor.py +++ b/src/ingest/bbc_ingestor.py @@ -6,7 +6,7 @@ class BBCIngestor(BaseIngestor): RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml" def fetch_full_text(self, article_url): - response = requests.get(self.RSS_URL) + response = requests.get(article_url) soup = BeautifulSoup(response.content, 'html.parser') article = soup.find('article') diff --git a/src/ingest/cnn_ingestor.py b/src/ingest/cnn_ingestor.py new file mode 100644 index 0000000..116fdb9 --- /dev/null +++ b/src/ingest/cnn_ingestor.py @@ -0,0 +1,33 @@ +from bs4 import BeautifulSoup +import requests +from ingest.base_ingestor import BaseIngestor + +class CNNIngestor(BaseIngestor): + RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss" + + def fetch_full_text(self, url): + try: + headers = { + "User-Agent": "Mozilla/5.0" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + + # CNN article content is usually within
<section id="body-text"> or <div class="article__content">
+ article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content') + + if not article_section: + print("No CNN article body found.") + return "" + + paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p') + + full_text = "\n".join(p.get_text(strip=True) for p in paragraphs) + + return full_text.strip() + + except Exception as e: + print(f"Error fetching CNN article: {e}") + return "" \ No newline at end of file diff --git a/src/justinsight/celery.py b/src/justinsight/celery.py index b5b241a..e93a418 100644 --- a/src/justinsight/celery.py +++ b/src/justinsight/celery.py @@ -22,11 +22,11 @@ "args": (), }, - # "check-NYTfeed-every-5-minutes": { - # "task": "justinsight.tasks.nytLogger_task", - # "schedule": 300.0, - # "args": (), - # }, + "check-CNNfeed-every-5-minutes": { + "task": "justinsight.tasks.cnnLogger_task", + "schedule": 5.0, + "args": (), + }, #schedule more tasks here } diff --git a/src/justinsight/tasks.py b/src/justinsight/tasks.py index c00475e..f545dd2 100644 --- a/src/justinsight/tasks.py +++ b/src/justinsight/tasks.py @@ -1,5 +1,6 @@ from celery import shared_task from ingest.bbc_ingestor import BBCIngestor +from ingest.cnn_ingestor import CNNIngestor @shared_task @@ -14,9 +15,10 @@ def bbcLogger_task(): ingestor.check_and_save_new_entries() # this will invoke the inherited logic return "BBC RSS Feed checked." -# @shared_task -# def nytLogger_task(): -# check_and_save_nyt() -# return "NYT RSS Feed checked." +@shared_task +def cnnLogger_task(): + ingestor = CNNIngestor() + ingestor.check_and_save_new_entries() # this will invoke the inherited logic + return "CNN RSS Feed checked." #Add more tasks here in the format of the one above \ No newline at end of file