diff --git a/.gitignore b/.gitignore index 3742713..290f23e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ celerybeat-schedule.* src/celerybeat-schedule.* feed*.json .DS_Store +.venv # Ignore all __pycache__ directories at any level **/__pycache__/ @@ -12,4 +13,10 @@ feed*.json *.pyd #Ignore saved data -data/* \ No newline at end of file +data/* + +# Packaging +*.egg +*.egg-info/ +dist/ +build/ \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 8d43ec9..b6fe913 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,8 @@ services: build: context: . dockerfile: .docker/Dockerfile - + environment: + - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin # bind-mount your repo and the shared EBS volume volumes: - ./:/workspace:cached @@ -26,6 +27,8 @@ services: build: context: . dockerfile: .docker/Dockerfile + environment: + - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin volumes: - .:/workspace:cached - mongo_data:/data/db @@ -39,6 +42,8 @@ services: build: context: . 
dockerfile: .docker/Dockerfile + environment: + - MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin volumes: - .:/workspace:cached depends_on: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e90d35c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "justinsight" +version = "0.1.0" +description = "Your project description" +authors = [{ name = "Grace Madison" }] +dependencies = [] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = ["tests"] +pythonpath = ["src"] diff --git a/readme.md b/readme.md index fa61a1f..8335988 100644 --- a/readme.md +++ b/readme.md @@ -45,4 +45,12 @@ Note: When running in background you can use "docker logs "task": "justinsight.tasks.your_task_name", "schedule": x, #where x is the number of seconds between when the task should happen "args": (), #potential arguments for your task - }, \ No newline at end of file + }, + + +## How to check what's in the database +Please run: docker compose up -d +Then: docker exec -it mongo mongosh -u myuser -p mypassword +Then: use justinsightdb +Then: db.articles.find().pretty() +Note - you may need to download mongosh for this to work and to exit the mongosh environment just run 'exit'. Remember to 'docker compose down' as the containers will be running in the background. 
\ No newline at end of file diff --git a/src/ingest/base_ingestor.py b/src/ingest/base_ingestor.py index 0c5eaee..e399c46 100644 --- a/src/ingest/base_ingestor.py +++ b/src/ingest/base_ingestor.py @@ -2,17 +2,11 @@ import feedparser import hashlib import re -from src.ingest.save_to_database import save_entry +from ingest.save_to_database import save_entry class BaseIngestor: RSS_URL = None #will be set by the subclasses - def slugify(self, text): - # Convert title to a filesystem-friendly slug - text = text.lower() - text = re.sub(r'[^a-z0-9]+', '-', text) - return text.strip('-') - def format_date(self, entry): # Extract and format the published date try: @@ -31,8 +25,6 @@ def generate_entry_hash(self, entry): return hashlib.sha256(hash_input.encode('utf-8')).hexdigest() def format_entry(self, entry): - title_slug = self.slugify(entry.title) - date_str = self.format_date(entry) full_text = self.fetch_full_text(entry.link) data = { diff --git a/src/ingest/bbc_ingestor.py b/src/ingest/bbc_ingestor.py index 71d5542..fb1d8b6 100644 --- a/src/ingest/bbc_ingestor.py +++ b/src/ingest/bbc_ingestor.py @@ -1,12 +1,12 @@ from bs4 import BeautifulSoup import requests -from src.ingest.base_ingestor import BaseIngestor +from ingest.base_ingestor import BaseIngestor class BBCIngestor(BaseIngestor): RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml" def fetch_full_text(self, article_url): - response = requests.get(self.RSS_URL) + response = requests.get(article_url) soup = BeautifulSoup(response.content, 'html.parser') article = soup.find('article') diff --git a/src/ingest/cnn_ingestor.py b/src/ingest/cnn_ingestor.py new file mode 100644 index 0000000..116fdb9 --- /dev/null +++ b/src/ingest/cnn_ingestor.py @@ -0,0 +1,33 @@ +from bs4 import BeautifulSoup +import requests +from ingest.base_ingestor import BaseIngestor + +class CNNIngestor(BaseIngestor): + RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss" + + def fetch_full_text(self, url): + try: + headers = { + "User-Agent": 
"Mozilla/5.0" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + + # CNN article content is usually within
section#body-text or div.article__content
+ article_section = soup.find('section', id='body-text') or soup.find('div', class_='article__content') + + if not article_section: + print("No CNN article body found.") + return "" + + paragraphs = article_section.find_all('div', class_='paragraph') or article_section.find_all('p') + + full_text = "\n".join(p.get_text(strip=True) for p in paragraphs) + + return full_text.strip() + + except Exception as e: + print(f"Error fetching CNN article: {e}") + return "" \ No newline at end of file diff --git a/src/ingest/save_to_database.py b/src/ingest/save_to_database.py index 3feb6f7..72ac544 100644 --- a/src/ingest/save_to_database.py +++ b/src/ingest/save_to_database.py @@ -1,7 +1,13 @@ +import os from pymongo import MongoClient +# Default to local MongoDB when not using docker +mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017") + +#mongodb_uri = os.getenv("mongodb://localhost:27017", "mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin") + # Include username, password, and authentication database -client = MongoClient("mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin") +client = MongoClient(mongodb_uri) # Get (or create) a database db = client["justinsightdb"] @@ -14,5 +20,4 @@ def save_entry(entry): entry_hash = entry["id"] if collection.count_documents({"id": entry_hash}) == 0: collection.insert_one(entry) - else: - print(f"{entry['title']} already in use!") + print(f"I have now saved: {entry['title']}") diff --git a/src/justinsight/celery.py b/src/justinsight/celery.py index b5b241a..e93a418 100644 --- a/src/justinsight/celery.py +++ b/src/justinsight/celery.py @@ -22,11 +22,11 @@ "args": (), }, - # "check-NYTfeed-every-5-minutes": { - # "task": "justinsight.tasks.nytLogger_task", - # "schedule": 300.0, - # "args": (), - # }, + "check-CNNfeed-every-5-minutes": { + "task": "justinsight.tasks.cnnLogger_task", + "schedule": 5.0, + "args": (), + }, #schedule more tasks here } diff --git 
a/src/justinsight/tasks.py b/src/justinsight/tasks.py index 7066141..f545dd2 100644 --- a/src/justinsight/tasks.py +++ b/src/justinsight/tasks.py @@ -1,6 +1,6 @@ -# tasks.py from celery import shared_task from ingest.bbc_ingestor import BBCIngestor +from ingest.cnn_ingestor import CNNIngestor @shared_task @@ -16,8 +16,9 @@ def bbcLogger_task(): return "BBC RSS Feed checked." @shared_task -def nytLogger_task(): - check_and_save_nyt() - return "NYT RSS Feed checked." +def cnnLogger_task(): + ingestor = CNNIngestor() + ingestor.check_and_save_new_entries() # this will invoke the inherited logic + return "CNN RSS Feed checked." #Add more tasks here in the format of the one above \ No newline at end of file diff --git a/tests/test_trivial.py b/tests/test_trivial.py index bacf00e..728f100 100644 --- a/tests/test_trivial.py +++ b/tests/test_trivial.py @@ -1,6 +1,25 @@ #this is a dummy test for checking unit test system -import sys -sys.path.insert(0, 'src') + +#NEED to run to set up testing +#pip install pytest-cov +#python3 -m venv .venv +#source .venv/bin/activate +#pip install -r requirements.txt +#brew tap mongodb/brew +#brew install mongodb-community +#brew services start mongodb-community +#deactivate + +#then we can start testing +#first, activate your virtual environment: source .venv/bin/activate +#then, run: pytest +#or if you want to see printed output from your tests: pytest -s + +from justinsight.tasks import bbcLogger_task + def test_add(): assert 1 + 1 == 2 + +def test_bbcCheck(): + bbcLogger_task()