Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ celerybeat-schedule.*
src/celerybeat-schedule.*
feed*.json
.DS_Store
.venv

# Ignore all __pycache__ directories at any level
**/__pycache__/
Expand All @@ -12,4 +13,10 @@ feed*.json
*.pyd

#Ignore saved data
data/*
data/*

# Packaging
*.egg
*.egg-info/
dist/
build/
7 changes: 6 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ services:
build:
context: .
dockerfile: .docker/Dockerfile

environment:
- MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
# bind-mount your repo and the shared EBS volume
volumes:
- ./:/workspace:cached
Expand All @@ -26,6 +27,8 @@ services:
build:
context: .
dockerfile: .docker/Dockerfile
environment:
- MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
volumes:
- .:/workspace:cached
- mongo_data:/data/db
Expand All @@ -39,6 +42,8 @@ services:
build:
context: .
dockerfile: .docker/Dockerfile
environment:
- MONGODB_URI=mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin
volumes:
- .:/workspace:cached
depends_on:
Expand Down
19 changes: 19 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "justinsight"
version = "0.1.0"
description = "Your project description"
authors = [{ name = "Grace Madison" }]
dependencies = []

[tool.setuptools.packages.find]
where = ["src"]

[tool.pytest.ini_options]
minversion = "6.0"
addopts = "-ra -q"
testpaths = ["tests"]
pythonpath = ["src"]
10 changes: 9 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,12 @@ Note: When running in background you can use "docker logs <container_name_or_id>
"task": "justinsight.tasks.your_task_name",
"schedule": x, #where x is the number of seconds between when the task should happen
"args": (), #potential arguments for your task
},
},


## How to check what's in the database
Please run: docker compose up -d
Then: docker exec -it mongo mongosh -u myuser -p mypassword
Then: use justinsightdb
Then: db.articles.find().pretty()
Note: you may need to install mongosh for this to work. To exit the mongosh environment, just run 'exit'. Remember to run 'docker compose down' when you are finished, as the containers will otherwise keep running in the background.
10 changes: 1 addition & 9 deletions src/ingest/base_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,11 @@
import feedparser
import hashlib
import re
from src.ingest.save_to_database import save_entry
from ingest.save_to_database import save_entry

class BaseIngestor:
RSS_URL = None #will be set by the subclasses

def slugify(self, text):
# Convert title to a filesystem-friendly slug
text = text.lower()
text = re.sub(r'[^a-z0-9]+', '-', text)
return text.strip('-')

def format_date(self, entry):
# Extract and format the published date
try:
Expand All @@ -31,8 +25,6 @@ def generate_entry_hash(self, entry):
return hashlib.sha256(hash_input.encode('utf-8')).hexdigest()

def format_entry(self, entry):
title_slug = self.slugify(entry.title)
date_str = self.format_date(entry)
full_text = self.fetch_full_text(entry.link)

data = {
Expand Down
4 changes: 2 additions & 2 deletions src/ingest/bbc_ingestor.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from bs4 import BeautifulSoup
import requests
from src.ingest.base_ingestor import BaseIngestor
from ingest.base_ingestor import BaseIngestor

class BBCIngestor(BaseIngestor):
RSS_URL = "http://feeds.bbci.co.uk/news/world/rss.xml"

def fetch_full_text(self, article_url):
response = requests.get(self.RSS_URL)
response = requests.get(article_url)
soup = BeautifulSoup(response.content, 'html.parser')

article = soup.find('article')
Expand Down
33 changes: 33 additions & 0 deletions src/ingest/cnn_ingestor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from bs4 import BeautifulSoup
import requests
from ingest.base_ingestor import BaseIngestor

class CNNIngestor(BaseIngestor):
    """Ingestor for the CNN world RSS feed."""

    RSS_URL = "http://rss.cnn.com/rss/cnn_world.rss"

    def fetch_full_text(self, url):
        """Download the article at ``url`` and return its body text.

        Returns the paragraph texts joined by newlines, or an empty string
        when no article body is found or any error occurs while fetching
        or parsing the page.
        """
        request_headers = {"User-Agent": "Mozilla/5.0"}
        try:
            page = requests.get(url, headers=request_headers, timeout=10)
            page.raise_for_status()

            document = BeautifulSoup(page.content, 'html.parser')

            # CNN article content is usually within <section id="body-text">
            # or <div class="article__content">; try them in that order.
            body = document.find('section', id='body-text')
            if not body:
                body = document.find('div', class_='article__content')
            if not body:
                print("No CNN article body found.")
                return ""

            # Prefer <div class="paragraph"> blocks; fall back to plain <p> tags.
            blocks = body.find_all('div', class_='paragraph')
            if not blocks:
                blocks = body.find_all('p')

            lines = [block.get_text(strip=True) for block in blocks]
            return "\n".join(lines).strip()
        except Exception as e:
            # Best-effort: a failed article fetch yields empty text instead of raising.
            print(f"Error fetching CNN article: {e}")
            return ""
11 changes: 8 additions & 3 deletions src/ingest/save_to_database.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import os
from pymongo import MongoClient

# Default to local MongoDB when not using docker
mongodb_uri = os.getenv("MONGODB_URI", "mongodb://localhost:27017")

#mongodb_uri = os.getenv("mongodb://localhost:27017", "mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin")

# Include username, password, and authentication database
client = MongoClient("mongodb://myuser:mypassword@mongo:27017/justinsightdb?authSource=admin")
client = MongoClient(mongodb_uri)

# Get (or create) a database
db = client["justinsightdb"]
Expand All @@ -14,5 +20,4 @@ def save_entry(entry):
entry_hash = entry["id"]
if collection.count_documents({"id": entry_hash}) == 0:
collection.insert_one(entry)
else:
print(f"{entry['title']} already in use!")
print(f"I have now saved: {entry['title']}")
10 changes: 5 additions & 5 deletions src/justinsight/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
"args": (),
},

# "check-NYTfeed-every-5-minutes": {
# "task": "justinsight.tasks.nytLogger_task",
# "schedule": 300.0,
# "args": (),
# },
"check-CNNfeed-every-5-minutes": {
"task": "justinsight.tasks.cnnLogger_task",
"schedule": 5.0,
"args": (),
},

#schedule more tasks here
}
9 changes: 5 additions & 4 deletions src/justinsight/tasks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# tasks.py
from celery import shared_task
from ingest.bbc_ingestor import BBCIngestor
from ingest.cnn_ingestor import CNNIngestor


@shared_task
Expand All @@ -16,8 +16,9 @@ def bbcLogger_task():
return "BBC RSS Feed checked."

@shared_task
def nytLogger_task():
check_and_save_nyt()
return "NYT RSS Feed checked."
def cnnLogger_task():
    """Run the CNN ingestor's check-and-save step and return a status message."""
    # check_and_save_new_entries is not defined on CNNIngestor itself; the inherited logic runs.
    CNNIngestor().check_and_save_new_entries()
    return "CNN RSS Feed checked."

#Add more tasks here in the format of the one above
23 changes: 21 additions & 2 deletions tests/test_trivial.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
#this is a dummy test for checking unit test system
import sys
sys.path.insert(0, 'src')

#NEED to run to set up testing
#pip install pytest-cov
#python3 -m venv .venv
#source .venv/bin/activate
#pip install -r requirements.txt
#brew tap mongodb/brew
#brew install mongodb-community
#brew services start mongodb-community
#deactivate

#then we can start testing
#first, activate your virtual environment: source .venv/bin/activate
#then, run: pytest
#or if you want to see printed output from your tests: pytest -s

from justinsight.tasks import bbcLogger_task


def test_add():
    """Trivial sanity check that the test runner itself works."""
    total = 1 + 1
    assert total == 2

def test_bbcCheck():
    """Smoke test: call the BBC logger task directly; passes if no exception is raised."""
    bbcLogger_task()
Loading