diff --git a/docker-compose.yml b/docker-compose.yml
index 1fe54f8..8d43ec9 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -31,6 +31,7 @@ services:
       - mongo_data:/data/db
     depends_on:
       - redis
+      - mongo
     #entrypoint: ["/usr/local/bin/entrypoint.sh"]
     command: ["celery", "-A", justinsight.celery, "worker", "--loglevel=info"] #the lowercase j is actually so important
 
diff --git a/src/ingest/base_ingestor.py b/src/ingest/base_ingestor.py
new file mode 100644
index 0000000..0c5eaee
--- /dev/null
+++ b/src/ingest/base_ingestor.py
@@ -0,0 +1,61 @@
+import datetime
+import feedparser
+import hashlib
+import re
+from src.ingest.save_to_database import save_entry
+
+class BaseIngestor:
+    """Common RSS ingestion workflow shared by the per-source ingestors.
+
+    Subclasses set RSS_URL and implement fetch_full_text().
+    """
+
+    RSS_URL = None #will be set by the subclasses
+
+    def slugify(self, text):
+        """Return text lowercased, with non-alphanumeric runs collapsed to '-'."""
+        # Convert title to a filesystem-friendly slug
+        text = text.lower()
+        text = re.sub(r'[^a-z0-9]+', '-', text)
+        return text.strip('-')
+
+    def format_date(self, entry):
+        """Return the entry's published date as YYYY-MM-DD, or "unknown-date"."""
+        try:
+            dt = datetime.datetime(*entry.published_parsed[:6])
+            return dt.strftime("%Y-%m-%d")
+        except (AttributeError, TypeError, ValueError):
+            # published_parsed is missing, None, or not a usable time tuple
+            return "unknown-date"
+
+    def fetch_full_text(self, url):
+        """Return the article text for url; subclasses must override this."""
+        raise NotImplementedError(
+            "Subclass must implement fetch_full_text()"
+        )
+
+    def generate_entry_hash(self, entry):
+        """Return the SHA-256 hex digest of the entry's title, link and published text."""
+        hash_input = f"{entry.title}{entry.link}{entry.get('published', '')}"
+        return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()
+
+    def format_entry(self, entry):
+        """Build the dict handed to save_entry() for one feed entry."""
+        return {
+            "title": entry.title,
+            "link": entry.link,
+            "published": entry.get("published", ""),
+            "summary": entry.get("summary", ""),
+            "full_text": self.fetch_full_text(entry.link),
+            "id": self.generate_entry_hash(entry)
+        }
+
+    def check_and_save_new_entries(self):
+        """Parse RSS_URL and pass every formatted entry to save_entry()."""
+        if not self.RSS_URL:
+            # Fail with a clear message when a subclass forgot to set the feed URL
+            raise NotImplementedError("Subclass must set RSS_URL")
+        feed = feedparser.parse(self.RSS_URL)
+
+        for entry in feed.entries:
+            save_entry(self.format_entry(entry))
diff --git a/src/ingest/bbc_ingestor.py
b/src/ingest/bbc_ingestor.py
new file mode 100644
index 0000000..71d5542
--- /dev/null
+++ b/src/ingest/bbc_ingestor.py
@@ -0,0 +1,24 @@
+from bs4 import BeautifulSoup
+import requests
+from src.ingest.base_ingestor import BaseIngestor
+
+class BBCIngestor(BaseIngestor):
+    """Ingestor for the BBC World News RSS feed."""
+
+    RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml"
+
+    def fetch_full_text(self, article_url):
+        """Download article_url and return the text of its <article> element.
+
+        Returns None when the page contains no <article> element.
+        """
+        # Fetch the article page itself, not the RSS feed; the timeout keeps a
+        # stalled connection from blocking the worker indefinitely.
+        response = requests.get(article_url, timeout=10)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        article = soup.find('article')
+
+        if article:
+            return article.get_text()
+        return None
diff --git a/src/ingest/save_to_database.py b/src/ingest/save_to_database.py
new file mode 100644
index 0000000..3feb6f7
--- /dev/null
+++ b/src/ingest/save_to_database.py
@@ -0,0 +1,26 @@
+import os
+
+from pymongo import MongoClient
+
+# Connection string with username, password, and authentication database.
+# MONGO_URI, when set, overrides the default below.
+MONGO_URI = os.environ.get(
+    "MONGO_URI",
+    "mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin",
+)
+client = MongoClient(MONGO_URI)
+
+# Get (or create) a database
+db = client["justinsightdb"]
+
+# Get (or create) a collection
+collection = db["articles"]
+
+def save_entry(entry):
+    """Insert entry into the articles collection unless its "id" is already stored."""
+    entry_hash = entry["id"]
+    # limit=1: only existence matters, so stop counting at the first match
+    if collection.count_documents({"id": entry_hash}, limit=1) == 0:
+        collection.insert_one(entry)
+    else:
+        print(f"{entry['title']} already in use!")
diff --git a/src/justinsight/celery.py b/src/justinsight/celery.py
index 0292a0b..b5b241a 100644
--- a/src/justinsight/celery.py
+++ b/src/justinsight/celery.py
@@ -18,15 +18,15 @@
 
     "check-BBCfeed-every-5-minutes": {
         "task": "justinsight.tasks.bbcLogger_task",
         "schedule": 300.0,
         "args": (),
     },
 
-    "check-NYTfeed-every-5-minutes": {
-        "task": "justinsight.tasks.nytLogger_task",
-        "schedule": 300.0,
-        "args": (),
-    },
+    # "check-NYTfeed-every-5-minutes": {
+    #     "task": "justinsight.tasks.nytLogger_task",
+    #     "schedule": 300.0,
+    #     "args": (),
+    # },
     #schedule more tasks here
 }
 
diff --git a/src/justinsight/tasks.py
b/src/justinsight/tasks.py
index 36b6c33..7066141 100644
--- a/src/justinsight/tasks.py
+++ b/src/justinsight/tasks.py
@@ -1,7 +1,6 @@
 # tasks.py
 from celery import shared_task
-from ingest.bbc_rss import check_and_save_new_entries as check_and_save_bbc
-from ingest.nyt_rss import check_and_save_new_entries as check_and_save_nyt
+from ingest.bbc_ingestor import BBCIngestor
 
 
 @shared_task
@@ -11,7 +10,8 @@ def sample_task():
 
 @shared_task
 def bbcLogger_task():
-    check_and_save_bbc()
+    """Run BBCIngestor.check_and_save_new_entries() and report completion."""
+    BBCIngestor().check_and_save_new_entries()
     return "BBC RSS Feed checked."
 
 @shared_task