diff --git a/.DS_Store b/.DS_Store index 04725a5..58839f9 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..007d4ed --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +# Use an official Python runtime as a parent image +FROM python:3.9-slim + +# Set environment variable to force Python output to be unbuffered +ENV PYTHONUNBUFFERED=1 + +# Set the working directory to /app +WORKDIR /app + +# Copy requirements.txt and install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the entire project into the container +COPY . . + +# Expose port 5000 (Cloud Run sets the PORT env variable) +EXPOSE 8080 + +# Run the API Gateway. +# Cloud Run will set PORT, so we use that environment variable. +CMD ["sh", "-c", "python backend/api_gateway/api_gateway.py ${PORT:-8080}"] \ No newline at end of file diff --git a/backend/.DS_Store b/backend/.DS_Store index 5ef017d..58da69c 100644 Binary files a/backend/.DS_Store and b/backend/.DS_Store differ diff --git a/backend/api_gateway/api_gateway.py b/backend/api_gateway/api_gateway.py index 5b69660..f05ed18 100644 --- a/backend/api_gateway/api_gateway.py +++ b/backend/api_gateway/api_gateway.py @@ -1,68 +1,129 @@ + #!/usr/bin/env python3 -"""api_gateway.py - API Gateway for the News Aggregator Backend -This Flask application aggregates endpoints from various microservices. +"""API Gateway for the News Aggregator Backend + +This module serves as the central API Gateway for the News Aggregator application. +It provides a unified interface for clients to interact with various microservices +including news fetching, summarization, authentication, and story tracking. 
+ +Key Features: +- RESTful API endpoints using Flask-RestX +- JWT-based authentication +- CORS support for cross-origin requests +- Swagger documentation +- Error handling and logging +- Integration with multiple microservices + +Endpoints: +- /api/news: News fetching and processing +- /health: System health check +- /summarize: Article summarization +- /api/user: User profile management +- /api/auth: Authentication operations +- /api/bookmarks: Bookmark management +- /api/story_tracking: Story tracking functionality """ -from flask import Flask, jsonify, request, make_response +# Standard library imports +from flask import Blueprint, Flask, jsonify, request, make_response from flask_cors import CORS -from flask_restx import Api, Resource, fields +from flask_restx import Api, Resource, fields, Namespace import sys import os import jwt import json import uuid import datetime +from datetime import datetime, timedelta from functools import wraps + +# Add project root to Python path for relative imports sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -# load env +print("[DEBUG] [api_gateway] [startup] API Gateway starting up...") + +# Load environment variables from .env file from dotenv import load_dotenv load_dotenv() +print("[DEBUG] [api_gateway] [startup] Environment variables loaded") - - +# Import microservices and utilities from backend.microservices.summarization_service import run_summarization, process_articles from backend.microservices.news_fetcher import fetch_news from backend.core.config import Config from backend.core.utils import setup_logger, log_exception from backend.microservices.auth_service import load_users from backend.microservices.news_storage import store_article_in_supabase, log_user_search, add_bookmark, get_user_bookmarks, delete_bookmark -# Initialize logger +from backend.microservices.story_tracking_service import get_tracked_stories, create_tracked_story, get_story_details, delete_tracked_story + + +# 
Initialize logger for the API Gateway logger = setup_logger(__name__) +print("[DEBUG] [api_gateway] [startup] Logger initialized") -# Initialize Flask app with CORS support +# Initialize Flask application with security configurations app = Flask(__name__) -app.config['SECRET_KEY'] = os.getenv('JWT_SECRET_KEY', 'your-secret-key') # Change this in production -CORS(app, origins=Config.CORS_ORIGINS, supports_credentials=True, allow_headers=['Content-Type', 'Authorization', 'Access-Control-Allow-Origin'], methods=['GET', 'POST', 'PUT', 'DELETE', 'OPTIONS'], expose_headers=['Access-Control-Allow-Origin']) - -# Initialize Flask-RestX +app.config['SECRET_KEY'] = os.getenv('JWT_SECRET_KEY', 'your-secret-key') # JWT secret key for token signing +print("[DEBUG] [api_gateway] [startup] Flask app initialized with secret key") + +# Configure CORS to allow specific origins and methods +CORS(app, + origins=["http://localhost:5173", "http://localhost:5001"], + supports_credentials=True, + allow_headers=["Content-Type", "Authorization"], + methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"]) +print("[DEBUG] [api_gateway] [startup] CORS configured") + +# Initialize Flask-RestX for API documentation api = Api(app, version='1.0', title='News Aggregator API', description='A news aggregation and summarization API') +print("[DEBUG] [api_gateway] [startup] Flask-RestX API initialized") -# Define namespaces +# Define API namespaces for logical grouping of endpoints news_ns = api.namespace('api/news', description='News operations') health_ns = api.namespace('health', description='Health check operations') summarize_ns = api.namespace('summarize', description='Text summarization operations') user_ns = api.namespace('api/user', description='User operations') auth_ns = api.namespace('api/auth', description='Authentication operations') bookmark_ns = api.namespace('api/bookmarks', description='Bookmark operations') +story_tracking_ns = api.namespace('api/story_tracking', description='Story 
tracking operations') +print("[DEBUG] [api_gateway] [startup] API namespaces defined") def token_required(f): + """Decorator to protect routes that require authentication. + + This decorator validates the JWT token in the Authorization header. + It ensures that only authenticated users can access protected endpoints. + + Args: + f: The function to be decorated. + + Returns: + decorated: The decorated function that includes token validation. + + Raises: + 401: If the token is missing or invalid. + """ @wraps(f) def decorated(*args, **kwargs): + print("[DEBUG] [api_gateway] [token_required] Checking token in request") auth_header = request.headers.get('Authorization') if not auth_header: + print("[DEBUG] [api_gateway] [token_required] Authorization header missing") return {'error': 'Authorization header missing'}, 401 try: - token = auth_header.split()[1] + token = auth_header.split()[1] # Extract token from 'Bearer ' + print(f"[DEBUG] [api_gateway] [token_required] Decoding token: {token[:10]}...") payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'],audience='authenticated') + print(f"[DEBUG] [api_gateway] [token_required] Token decoded successfully, user: {payload.get('sub', 'unknown')}") return f(*args, **kwargs) except Exception as e: + print(f"[DEBUG] [api_gateway] [token_required] Token validation error: {str(e)}") return {'error': 'Invalid token', 'message': str(e)}, 401 return decorated -# Define models for documentation +# Define API models for request/response documentation article_model = api.model('Article', { 'article_text': fields.String(required=True, description='The text to summarize') }) @@ -76,7 +137,7 @@ def decorated(*args, **kwargs): 'avatarUrl': fields.String(description='URL to user avatar') }) -# Model for user signup +# Model for user registration signup_model = api.model('Signup', { 'username': fields.String(required=True, description='Username'), 'password': fields.String(required=True, description='Password'), @@ 
-85,22 +146,41 @@ def decorated(*args, **kwargs): 'lastName': fields.String(required=False, description='Last name') }) -# Health check endpoint +print("[DEBUG] [api_gateway] [startup] API models defined") + +# Health check endpoint for system monitoring @health_ns.route('/') class HealthCheck(Resource): def get(self): - """Check if API Gateway is healthy""" + """Check the health status of the API Gateway. + + Returns: + dict: A dictionary containing the health status. + int: HTTP 200 status code indicating success. + """ + print("[DEBUG] [api_gateway] [health_check] Called") return {"status": "API Gateway is healthy"}, 200 -# Summarization endpoint +# Endpoint for article summarization @summarize_ns.route('/') class Summarize(Resource): @summarize_ns.expect(article_model) def post(self): - """Summarize the given article text""" + """Summarize the provided article text. + + Expects a JSON payload with 'article_text' field. + Uses the summarization service to generate a concise summary. + + Returns: + dict: Contains the generated summary. + int: HTTP 200 status code on success. + """ + print("[DEBUG] [api_gateway] [summarize] Called") data = request.get_json() article_text = data.get('article_text', '') + print(f"[DEBUG] [api_gateway] [summarize] Summarizing text of length: {len(article_text)}") summary = run_summarization(article_text) + print(f"[DEBUG] [api_gateway] [summarize] Summarization complete, summary length: {len(summary)}") return {"summary": summary}, 200 @news_ns.route('/fetch') @@ -109,48 +189,77 @@ class NewsFetch(Resource): @news_ns.param('user_id', 'User ID for logging search history') @news_ns.param('session_id', 'Session ID for tracking requests') def get(self): - """ - Fetch news articles, store them in Supabase, and log user search history if a user ID is provided. + """Fetch news articles based on a keyword and store them in Supabase. 
+ + This endpoint fetches news articles matching the provided keyword, + stores them in Supabase, and logs the search history if a user ID is provided. + + Args: + keyword (str): The search term for fetching news articles. + user_id (str, optional): User ID for logging search history. + session_id (str): Session ID for tracking the request. + + Returns: + dict: Contains the stored article IDs and success status. + int: HTTP 200 on success, 500 on error. """ try: keyword = request.args.get('keyword', '') user_id = request.args.get('user_id') # optional session_id = request.args.get('session_id') + print(f"[DEBUG] [api_gateway] [news_fetch] Called with keyword: '{keyword}', user_id: {user_id}, session_id: {session_id}") - # Fetch articles from your existing news_fetcher module. + print(f"[DEBUG] [api_gateway] [news_fetch] Fetching news articles for keyword: '{keyword}'") articles = fetch_news(keyword) # This returns a list of articles. + print(f"[DEBUG] [api_gateway] [news_fetch] Found {len(articles) if articles else 0} articles") stored_article_ids = [] for article in articles: - # Store each article in the database; get its unique id. + print(f"[DEBUG] [api_gateway] [news_fetch] Storing article: {article.get('title', 'No title')}") article_id = store_article_in_supabase(article) stored_article_ids.append(article_id) + print(f"[DEBUG] [api_gateway] [news_fetch] Stored article with ID: {article_id}") - # If the request included a user_id, log the search for this article. 
if user_id: + print(f"[DEBUG] [api_gateway] [news_fetch] Logging search for user {user_id}, article {article_id}") log_user_search(user_id, article_id, session_id) + print(f"[DEBUG] [api_gateway] [news_fetch] Returning {len(stored_article_ids)} article IDs") return make_response(jsonify({ 'status': 'success', 'data': stored_article_ids }), 200) except Exception as e: + print(f"[DEBUG] [api_gateway] [news_fetch] Error: {str(e)}") return make_response(jsonify({ 'status': 'error', 'message': str(e) }), 500) -# News processing endpoint @news_ns.route('/process') class NewsProcess(Resource): @news_ns.param('session_id', 'Session ID for tracking requests') def post(self): - """Process and summarize articles""" + """Process and summarize a batch of articles. + + This endpoint processes articles associated with the provided session ID, + generating summaries and performing any necessary data transformations. + + Args: + session_id (str): Session ID for tracking the request and identifying articles. + + Returns: + dict: Contains processed articles data and success status. + int: HTTP 200 on success, 500 on error. 
+ """ try: session_id = request.args.get('session_id') + print(f"[DEBUG] [api_gateway] [news_process] Called with session_id: {session_id}") + print("[DEBUG] [api_gateway] [news_process] Processing articles...") summarized_articles = process_articles(session_id) + print(f"[DEBUG] [api_gateway] [news_process] Processed {len(summarized_articles) if summarized_articles else 0} articles") return { 'status': 'success', 'message': 'Articles processed and summarized successfully', @@ -158,32 +267,54 @@ def post(self): 'session_id': session_id }, 200 except Exception as e: + print(f"[DEBUG] [api_gateway] [news_process] Error: {str(e)}") logger.error(f"Error processing articles: {str(e)}") return { 'status': 'error', 'message': str(e) }, 500 -# User authentication endpoints @auth_ns.route('/signup') class Signup(Resource): @auth_ns.expect(signup_model) def post(self): - """Register a new user""" + """Register a new user in the system. + + Creates a new user account with the provided information and generates + a JWT token for immediate authentication. + + Expected JSON payload: + { + 'username': str (required), + 'password': str (required), + 'email': str (required), + 'firstName': str (optional), + 'lastName': str (optional) + } + + Returns: + dict: Contains user data (excluding password) and JWT token. + int: HTTP 201 on success, 400 on validation error, 500 on server error. 
+ """ + print("[DEBUG] [api_gateway] [signup] User signup endpoint called") data = request.get_json() username = data.get('username') password = data.get('password') email = data.get('email') firstName = data.get('firstName', '') lastName = data.get('lastName', '') + print(f"[DEBUG] [api_gateway] [signup] Request for username: {username}, email: {email}") if not username or not password or not email: + print("[DEBUG] [api_gateway] [signup] Validation failed: missing required fields") return {'error': 'Username, password, and email are required'}, 400 users = load_users() + print(f"[DEBUG] [api_gateway] [signup] Loaded {len(users)} existing users") # Check if username already exists if any(u.get('username') == username for u in users): + print(f"[DEBUG] [api_gateway] [signup] Username {username} already exists") return {'error': 'Username already exists'}, 400 # Create new user with unique ID @@ -195,51 +326,80 @@ def post(self): 'firstName': firstName, 'lastName': lastName } + print(f"[DEBUG] [api_gateway] [signup] Created new user with ID: {new_user['id']}") users.append(new_user) try: # Save updated users list + print("[DEBUG] [api_gateway] [signup] Saving updated users list") with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'users.txt'), 'w') as f: json.dump(users, f, indent=4) + print("[DEBUG] [api_gateway] [signup] Users list saved successfully") except Exception as e: + print(f"[DEBUG] [api_gateway] [signup] Error saving user data: {str(e)}") return {'error': 'Failed to save user data', 'message': str(e)}, 500 # Generate JWT token + print("[DEBUG] [api_gateway] [signup] Generating JWT token") token = jwt.encode({ 'id': new_user['id'], 'username': new_user['username'], 'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1) }, app.config['SECRET_KEY'], algorithm='HS256') + print(f"[DEBUG] [api_gateway] [signup] Token generated: {token[:10]}...") # Exclude password from response user_data = {k: new_user[k] for k in new_user if 
k != 'password'} + print("[DEBUG] [api_gateway] [signup] Signup successful") return {'message': 'User registered successfully', 'user': user_data, 'token': token}, 201 @auth_ns.route('/login') class Login(Resource): def post(self): - """Login and get authentication token""" + """Authenticate user and generate JWT token. + + Validates user credentials and generates a JWT token for authenticated access. + + Expected JSON payload: + { + 'username': str (required), + 'password': str (required) + } + + Returns: + dict: Contains user data (excluding password) and JWT token. + int: HTTP 200 on success, 400 on validation error, 401 on invalid credentials. + """ + print("[DEBUG] [api_gateway] [login] Login endpoint called") data = request.get_json() username = data.get('username') password = data.get('password') + print(f"[DEBUG] [api_gateway] [login] Login attempt for username: {username}") if not username or not password: + print("[DEBUG] [api_gateway] [login] Validation failed: missing username or password") return {'error': 'Username and password are required'}, 400 users = load_users() + print(f"[DEBUG] [api_gateway] [login] Loaded {len(users)} users") user = next((u for u in users if u.get('username') == username and u.get('password') == password), None) if not user: + print(f"[DEBUG] [api_gateway] [login] Invalid credentials for username: {username}") return {'error': 'Invalid credentials'}, 401 + print(f"[DEBUG] [api_gateway] [login] Valid credentials for user: {user.get('id')}") + print("[DEBUG] [api_gateway] [login] Generating JWT token") token = jwt.encode({ 'id': user['id'], 'username': user['username'], 'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1) }, app.config['SECRET_KEY'], algorithm='HS256') + print(f"[DEBUG] [api_gateway] [login] Token generated: {token[:10]}...") user_data = {k: user[k] for k in user if k != 'password'} + print("[DEBUG] [api_gateway] [login] Login successful") return {'token': token, 'user': user_data} 
@user_ns.route('/profile') @@ -247,32 +407,55 @@ class UserProfile(Resource): @token_required @user_ns.marshal_with(user_profile_model) def get(self): - """Get user profile information""" + """Retrieve authenticated user's profile information. + + Requires a valid JWT token in the Authorization header. + Returns the user's profile data excluding sensitive information. + + Returns: + dict: User profile data including id, username, email, and names. + int: HTTP 200 on success, 404 if user not found. + """ + print("[DEBUG] [api_gateway] [user_profile] Called") auth_header = request.headers.get('Authorization') token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [user_profile] Decoding token: {token[:10]}...") payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256']) + print(f"[DEBUG] [api_gateway] [user_profile] Looking up user with ID: {payload.get('id')}") users = load_users() user = next((u for u in users if u.get('id') == payload.get('id')), None) if not user: + print(f"[DEBUG] [api_gateway] [user_profile] User not found with ID: {payload.get('id')}") return {'error': 'User not found'}, 404 + print(f"[DEBUG] [api_gateway] [user_profile] Found user: {user.get('username')}") return {k: user[k] for k in user if k != 'password'}, 200 @bookmark_ns.route('/') class Bookmark(Resource): @token_required def get(self): - """Get all bookmarked articles for the authenticated user""" + """Retrieve all bookmarks for the authenticated user. + + Requires a valid JWT token in the Authorization header. + Returns a list of bookmarked articles for the current user. + + Returns: + dict: Contains list of bookmarked articles and success status. + int: HTTP 200 on success, 500 on error. 
+ """ try: - # Get the user ID from the token + print("[DEBUG] [api_gateway] [get_bookmarks] Called") auth_header = request.headers.get('Authorization') token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [get_bookmarks] Decoding token: {token[:10]}...") payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'],audience='authenticated') user_id = payload.get('sub') + print(f"[DEBUG] [api_gateway] [get_bookmarks] Getting bookmarks for user: {user_id}") - # Get bookmarks using the news_storage service bookmarks = get_user_bookmarks(user_id) + print(f"[DEBUG] [api_gateway] [get_bookmarks] Found {len(bookmarks)} bookmarks") return { 'status': 'success', @@ -280,6 +463,7 @@ def get(self): }, 200 except Exception as e: + print(f"[DEBUG] [api_gateway] [get_bookmarks] Error: {str(e)}") logger.error(f"Error fetching bookmarks: {str(e)}") return { 'status': 'error', @@ -288,24 +472,40 @@ def get(self): @token_required def post(self): - """Add a bookmark for a news article""" + """Add a new bookmark for the authenticated user. + + Requires a valid JWT token in the Authorization header. + Creates a bookmark linking the user to a specific news article. + + Expected JSON payload: + { + 'news_id': str (required) + } + + Returns: + dict: Contains bookmark ID and success status. + int: HTTP 201 on success, 400 on validation error, 500 on server error. 
+ """ try: - # Get the user ID from the token + print("[DEBUG] [api_gateway] [add_bookmark] Called") auth_header = request.headers.get('Authorization') token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [add_bookmark] Decoding token: {token[:10]}...") payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'],audience='authenticated') user_id = payload.get('sub') + print(f"[DEBUG] [api_gateway] [add_bookmark] Adding bookmark for user: {user_id}") - # Get the news article ID from the request body data = request.get_json() news_id = data.get('news_id') + print(f"[DEBUG] [api_gateway] [add_bookmark] News article ID: {news_id}") if not news_id: + print("[DEBUG] [api_gateway] [add_bookmark] News article ID missing in request") return {'error': 'News article ID is required'}, 400 - # Add the bookmark using the news_storage service - # bookmark = add_bookmark(user_id, '054c021a-f6f3-44b2-a43f-1ca0d211eb15') + print(f"[DEBUG] [api_gateway] [add_bookmark] Adding bookmark for user {user_id}, article {news_id}") bookmark = add_bookmark(user_id, news_id) + print(f"[DEBUG] [api_gateway] [add_bookmark] Bookmark added with ID: {bookmark['id'] if isinstance(bookmark, dict) else bookmark}") return { 'status': 'success', @@ -316,6 +516,7 @@ def post(self): }, 201 except Exception as e: + print(f"[DEBUG] [api_gateway] [add_bookmark] Error: {str(e)}") logger.error(f"Error adding bookmark: {str(e)}") return { 'status': 'error', @@ -326,16 +527,29 @@ def post(self): class BookmarkDelete(Resource): @token_required def delete(self, bookmark_id): - """Remove a bookmark for a news article""" + """Remove a bookmark for a news article. + + Requires a valid JWT token in the Authorization header. + Deletes the specified bookmark for the authenticated user. + + Args: + bookmark_id (str): The ID of the bookmark to be deleted. + + Returns: + dict: Contains success message. + int: HTTP 200 on success, 500 on error. 
+ """ try: - # Get the user ID from the token + print(f"[DEBUG] [api_gateway] [delete_bookmark] Called for bookmark: {bookmark_id}") auth_header = request.headers.get('Authorization') token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [delete_bookmark] Decoding token: {token[:10]}...") payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'],audience='authenticated') user_id = payload.get('sub') + print(f"[DEBUG] [api_gateway] [delete_bookmark] Deleting bookmark {bookmark_id} for user {user_id}") - # Delete the bookmark using the news_storage service result = delete_bookmark(user_id, bookmark_id) + print(f"[DEBUG] [api_gateway] [delete_bookmark] Deletion result: {result}") return { 'status': 'success', @@ -343,12 +557,280 @@ def delete(self, bookmark_id): }, 200 except Exception as e: + print(f"[DEBUG] [api_gateway] [delete_bookmark] Error: {str(e)}") logger.error(f"Error removing bookmark: {str(e)}") return { 'status': 'error', 'message': str(e) }, 500 +@story_tracking_ns.route('/') +class StoryTracking(Resource): + @story_tracking_ns.param('keyword', 'Keyword to track for news updates') + def get(self): + """Fetch latest news for a tracked keyword. + + Retrieves and processes the latest news articles for a given keyword. + + Args: + keyword (str): The keyword to search for news articles. + + Returns: + dict: Contains list of processed articles and success status. + int: HTTP 200 on success, 400 if keyword is missing, 500 on error. 
+ """ + try: + print("[DEBUG] [api_gateway] [story_tracking] Story tracking get endpoint called") + keyword = request.args.get('keyword') + print(f"[DEBUG] [api_gateway] [story_tracking] Requested keyword: '{keyword}'") + if not keyword: + print("[DEBUG] [api_gateway] [story_tracking] Keyword parameter missing") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Keyword parameter is required' + }), 400) + + print(f"[DEBUG] [api_gateway] [story_tracking] Fetching news for keyword: '{keyword}'") + articles = fetch_news(keyword) + print(f"[DEBUG] [api_gateway] [story_tracking] Found {len(articles) if articles else 0} articles") + + processed_articles = [] + for article in articles: + print(f"[DEBUG] [api_gateway] [story_tracking] Processing article: {article.get('title', 'No title')}") + article_id = store_article_in_supabase(article) + print(f"[DEBUG] [api_gateway] [story_tracking] Stored article with ID: {article_id}") + processed_articles.append({ + 'id': article_id, + 'title': article.get('title'), + 'url': article.get('url'), + 'source': article.get('source', {}).get('name') if isinstance(article.get('source'), dict) else article.get('source'), + 'publishedAt': article.get('publishedAt', datetime.now().isoformat()) + }) + + print(f"[DEBUG] [api_gateway] [story_tracking] Returning {len(processed_articles)} processed articles") + return make_response(jsonify({ + 'status': 'success', + 'articles': processed_articles + }), 200) + + except Exception as e: + print(f"[DEBUG] [api_gateway] [story_tracking] Error: {str(e)}") + logger.error(f"Error in story tracking: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + + @token_required + def post(self): + """Create a new tracked story. + + Requires a valid JWT token in the Authorization header. + Creates a new tracked story for the authenticated user based on a keyword and source article. 
+ + Expected JSON payload: + { + 'keyword': str (required), + 'sourceArticleId': str (optional) + } + + Returns: + dict: Contains created story details and success status. + int: HTTP 201 on success, 400 on validation error, 500 on server error. + """ + try: + print("[DEBUG] [api_gateway] [story_tracking] Called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [story_tracking] Decoding token: {token[:10]}...") + payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + print(f"[DEBUG] [api_gateway] [story_tracking] Creating tracked story for user: {user_id}") + + data = request.get_json() + keyword = data.get('keyword') + source_article_id = data.get('sourceArticleId') + print(f"[DEBUG] [api_gateway] [story_tracking] Story details - Keyword: '{keyword}', Source article: {source_article_id}") + + if not keyword: + print("[DEBUG] [api_gateway] [story_tracking] Keyword parameter missing in request") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Keyword is required' + }), 400) + + print(f"[DEBUG] [api_gateway] [story_tracking] Calling create_tracked_story with user_id: {user_id}, keyword: '{keyword}'") + tracked_story = create_tracked_story(user_id, keyword, source_article_id) + print(f"[DEBUG] [api_gateway] [story_tracking] Tracked story created with ID: {tracked_story['id'] if tracked_story else 'unknown'}") + + print(f"[DEBUG] [api_gateway] [story_tracking] Getting full story details for story: {tracked_story['id']}") + story_with_articles = get_story_details(tracked_story['id']) + print(f"[DEBUG] [api_gateway] [story_tracking] Found {len(story_with_articles.get('articles', [])) if story_with_articles else 0} related articles") + + return make_response(jsonify({ + 'status': 'success', + 'data': story_with_articles + }), 201) + + except Exception as e: + print(f"[DEBUG] [api_gateway] [story_tracking] 
Error: {str(e)}") + logger.error(f"Error creating tracked story: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@story_tracking_ns.route('/user') +class UserStoryTracking(Resource): + @token_required + def get(self): + """Get all tracked stories for the authenticated user. + + Requires a valid JWT token in the Authorization header. + Retrieves all tracked stories associated with the authenticated user. + + Returns: + dict: Contains list of tracked stories and success status. + int: HTTP 200 on success, 500 on error. + """ + try: + print("[DEBUG] [api_gateway] [user_story_tracking] Called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [user_story_tracking] Decoding token: {token[:10]}...") + payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + print(f"[DEBUG] [api_gateway] [user_story_tracking] Getting tracked stories for user: {user_id}") + + print(f"[DEBUG] [api_gateway] [user_story_tracking] Calling get_tracked_stories") + tracked_stories = get_tracked_stories(user_id) + print(f"[DEBUG] [api_gateway] [user_story_tracking] Found {len(tracked_stories)} tracked stories") + + return make_response(jsonify({ + 'status': 'success', + 'data': tracked_stories + }), 200) + + except Exception as e: + print(f"[DEBUG] [api_gateway] [user_story_tracking] Error: {str(e)}") + logger.error(f"Error getting tracked stories: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@story_tracking_ns.route('/') +class StoryTrackingDetail(Resource): + @token_required + def get(self, story_id): + """Get details for a specific tracked story. + + Requires a valid JWT token in the Authorization header. + Retrieves detailed information about a specific tracked story. + + Args: + story_id (str): The ID of the tracked story to retrieve. 
+ + Returns: + dict: Contains story details and success status. + int: HTTP 200 on success, 404 if story not found, 500 on error. + """ + try: + print(f"[DEBUG] [api_gateway] [story_tracking_detail] Called for story: {story_id}") + print(f"[DEBUG] [api_gateway] [story_tracking_detail] Calling get_story_details for story: {story_id}") + story = get_story_details(story_id) + + if not story: + print(f"[DEBUG] [api_gateway] [story_tracking_detail] No story found with ID: {story_id}") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Tracked story not found' + }), 404) + + print(f"[DEBUG] [api_gateway] [story_tracking_detail] Found story: {story['keyword']}") + print(f"[DEBUG] [api_gateway] [story_tracking_detail] Story has {len(story.get('articles', []))} articles") + return make_response(jsonify({ + 'status': 'success', + 'data': story + }), 200) + + except Exception as e: + print(f"[DEBUG] [api_gateway] [story_tracking_detail] Error: {str(e)}") + logger.error(f"Error getting story details: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + + @token_required + def delete(self, story_id): + """Stop tracking a story. + + Requires a valid JWT token in the Authorization header. + Deletes a tracked story for the authenticated user. + + Args: + story_id (str): The ID of the tracked story to delete. + + Returns: + dict: Contains success message. + int: HTTP 200 on success, 404 if story not found, 500 on error. 
+ """ + try: + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Called for story: {story_id}") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Decoding token: {token[:10]}...") + payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Deleting tracked story {story_id} for user {user_id}") + + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Calling delete_tracked_story") + success = delete_tracked_story(user_id, story_id) + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Delete result: {success}") + + if not success: + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Failed to delete story or story not found") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Failed to delete tracked story or story not found' + }), 404) + + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Story deleted successfully") + return make_response(jsonify({ + 'status': 'success', + 'message': 'Tracked story deleted successfully' + }), 200) + + except Exception as e: + print(f"[DEBUG] [api_gateway] [delete_story_tracking] Error: {str(e)}") + logger.error(f"Error deleting tracked story: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@app.route('/api/story_tracking', methods=['OPTIONS']) +def story_tracking_options(): + """Handle OPTIONS requests for the story tracking endpoint. + + This function sets the necessary CORS headers for preflight requests + to the story tracking endpoint. + + Returns: + Response: A Flask response object with appropriate CORS headers. 
+ """ + print("[DEBUG] [api_gateway] [story_tracking_options] Called") + response = make_response() + response.headers.add("Access-Control-Allow-Origin", "*") + response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization") + response.headers.add("Access-Control-Allow-Methods", "GET,POST,PUT,DELETE,OPTIONS") + print("[DEBUG] [api_gateway] [story_tracking_options] Responding with CORS headers") + return response + if __name__ == '__main__': - port = int(sys.argv[1]) if len(sys.argv) > 1 else Config.API_PORT - app.run(host=Config.API_HOST, port=port, debug=True) + # Read the port from the environment (Cloud Run sets the PORT variable) + port = int(os.environ.get("PORT", 8080)) + # Listen on 0.0.0.0 so the service is reachable externally + print(f"[DEBUG] [api_gateway] [main] Starting on {Config.API_HOST}:{port} with debug={True}") + app.run(host="0.0.0.0", port=port, debug=True) diff --git a/backend/core/__pycache__/config.cpython-312.pyc b/backend/core/__pycache__/config.cpython-312.pyc index a3e04fc..551f678 100644 Binary files a/backend/core/__pycache__/config.cpython-312.pyc and b/backend/core/__pycache__/config.cpython-312.pyc differ diff --git a/backend/core/config.py b/backend/core/config.py index 3116038..08b791c 100644 --- a/backend/core/config.py +++ b/backend/core/config.py @@ -11,7 +11,7 @@ class Config: # API Configuration API_HOST = os.getenv('API_HOST', 'localhost') - API_PORT = int(os.getenv('API_PORT', 5001)) + API_PORT = int(os.getenv('API_PORT', 8080)) # Redis Configuration REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') diff --git a/backend/data/story_tracking_schema.sql b/backend/data/story_tracking_schema.sql new file mode 100644 index 0000000..60e3636 --- /dev/null +++ b/backend/data/story_tracking_schema.sql @@ -0,0 +1,70 @@ +-- story_tracking_schema.sql +-- Schema for story tracking feature + +-- Table for tracked stories +CREATE TABLE tracked_stories ( + id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), + user_id UUID NOT 
NULL REFERENCES auth.users(id), + keyword VARCHAR(255) NOT NULL, + created_at TIMESTAMP NOT NULL DEFAULT NOW(), + last_updated TIMESTAMP NOT NULL DEFAULT NOW() +); + +-- Table for articles related to tracked stories +CREATE TABLE tracked_story_articles ( + tracked_story_id UUID REFERENCES tracked_stories(id) ON DELETE CASCADE, + news_id UUID REFERENCES news_articles(id) ON DELETE CASCADE, + added_at TIMESTAMP NOT NULL DEFAULT NOW(), + PRIMARY KEY (tracked_story_id, news_id) +); + +-- Index for faster lookups +CREATE INDEX idx_tracked_stories_user_id ON tracked_stories(user_id); +CREATE INDEX idx_tracked_stories_keyword ON tracked_stories(keyword); +CREATE INDEX idx_tracked_story_articles_story_id ON tracked_story_articles(tracked_story_id); + +-- RLS Policies for tracked_stories +ALTER TABLE tracked_stories ENABLE ROW LEVEL SECURITY; + +-- Allow users to view only their own tracked stories +CREATE POLICY tracked_stories_select_policy ON tracked_stories + FOR SELECT USING (auth.uid() = user_id); + +-- Allow users to insert their own tracked stories +CREATE POLICY tracked_stories_insert_policy ON tracked_stories + FOR INSERT WITH CHECK (auth.uid() = user_id); + +-- Allow users to update only their own tracked stories +CREATE POLICY tracked_stories_update_policy ON tracked_stories + FOR UPDATE USING (auth.uid() = user_id); + +-- Allow users to delete only their own tracked stories +CREATE POLICY tracked_stories_delete_policy ON tracked_stories + FOR DELETE USING (auth.uid() = user_id); + +-- RLS Policies for tracked_story_articles +ALTER TABLE tracked_story_articles ENABLE ROW LEVEL SECURITY; + +-- Allow users to view only articles related to their tracked stories +CREATE POLICY tracked_story_articles_select_policy ON tracked_story_articles + FOR SELECT USING ( + tracked_story_id IN ( + SELECT id FROM tracked_stories WHERE user_id = auth.uid() + ) + ); + +-- Allow users to insert only articles related to their tracked stories +CREATE POLICY 
tracked_story_articles_insert_policy ON tracked_story_articles + FOR INSERT WITH CHECK ( + tracked_story_id IN ( + SELECT id FROM tracked_stories WHERE user_id = auth.uid() + ) + ); + +-- Allow users to delete only articles related to their tracked stories +CREATE POLICY tracked_story_articles_delete_policy ON tracked_story_articles + FOR DELETE USING ( + tracked_story_id IN ( + SELECT id FROM tracked_stories WHERE user_id = auth.uid() + ) + ); \ No newline at end of file diff --git a/backend/microservices/__pycache__/summarization_service.cpython-312.pyc b/backend/microservices/__pycache__/summarization_service.cpython-312.pyc index 8a0c2c1..8c95596 100644 Binary files a/backend/microservices/__pycache__/summarization_service.cpython-312.pyc and b/backend/microservices/__pycache__/summarization_service.cpython-312.pyc differ diff --git a/backend/microservices/auth_service.py b/backend/microservices/auth_service.py index 9f065e3..2864313 100644 --- a/backend/microservices/auth_service.py +++ b/backend/microservices/auth_service.py @@ -1,7 +1,23 @@ #!/usr/bin/env python3 """ -auth_service.py - Microservice for Authentication -Handles user authentication, JWT token generation, and user profile management. +Authentication Service Module + +This microservice handles all authentication-related functionality including: +- User authentication and authorization +- JWT token generation and validation +- User profile management and storage +- Session handling + +The service uses a file-based storage system for user data (users.txt) +and JWT tokens for maintaining user sessions. It provides RESTful endpoints +for user registration, login, and profile management. 
+ +Environment Variables: + JWT_SECRET_KEY: Secret key for JWT token generation (required) + +Typical usage example: + POST /auth/login {'username': 'user', 'password': 'pass'} + GET /auth/profile with JWT token in Authorization header """ from flask import Flask, request, jsonify @@ -35,7 +51,27 @@ ], f) def load_users(): - """Load users from the users.txt file""" + """Load user data from the users.txt file. + + This function reads the JSON-formatted user data from the users.txt file + and returns it as a Python list of dictionaries. Each dictionary contains + user information including id, username, password, email, and name. + + Returns: + list: A list of dictionaries containing user data. + Each dictionary has the following structure: + { + 'id': int, + 'username': str, + 'password': str, + 'email': str, + 'firstName': str, + 'lastName': str + } + + Raises: + Exception: If there's an error reading or parsing the users file. + """ try: with open(USERS_FILE, 'r') as f: return json.load(f) diff --git a/backend/microservices/news_fetcher.py b/backend/microservices/news_fetcher.py index 818c52a..b5ab9d9 100644 --- a/backend/microservices/news_fetcher.py +++ b/backend/microservices/news_fetcher.py @@ -1,3 +1,21 @@ +"""News Fetcher Service + +This module is responsible for fetching news articles from the News API based on +keywords and managing the storage of fetched articles. It provides functionality +to search for news articles and optionally save them to files with session-based +organization. + +The module uses the News API (https://newsapi.org/) as its primary data source +and supports session-based article management for multi-user scenarios. 
+ +Typical usage: + articles = fetch_news('technology') + write_to_file(articles, 'user_session_123') + +Environment Variables Required: + NEWS_API_KEY: API key for accessing the News API service +""" + import os import requests from dotenv import load_dotenv @@ -5,20 +23,39 @@ from pathlib import Path from backend.core.config import Config - -# Load environment variables from .env file +# Load environment variables from .env file for configuration load_dotenv() -# Get the News API key from environment variables +# Initialize the News API key from environment variables NEWS_API_KEY = os.getenv('NEWS_API_KEY') def fetch_news(keyword='', session_id=None): - # Define the News API endpoint and parameters + """Fetch news articles from News API based on a keyword search. + + This function queries the News API to retrieve articles matching the provided + keyword. It supports session-based tracking of requests and can handle empty + keyword searches. + + Args: + keyword (str, optional): The search term to find relevant articles. + Defaults to empty string which returns top headlines. + session_id (str, optional): Unique identifier for the current user session. + Used for organizing saved articles. Defaults to None. + + Returns: + list: A list of dictionaries containing article data with fields like + 'title', 'description', 'url', etc. Returns None on error. + + Raises: + requests.exceptions.RequestException: If there's an error communicating + with the News API. 
+ """ + # Configure the News API endpoint and request parameters url = "https://newsapi.org/v2/everything" params = { - 'q': keyword, # Use the keyword for search + 'q': keyword, # Search query parameter 'apiKey': NEWS_API_KEY, - 'pageSize': 10 + 'pageSize': 1 # Limit results to 10 articles per request } try: @@ -52,17 +89,36 @@ def fetch_news(keyword='', session_id=None): print(f"Error fetching news: {e}") def write_to_file(articles, session_id=None): - # Define the file path with session_id + """Save fetched news articles to a JSON file. + + This function stores the provided articles in a JSON file, organizing them + by session ID. It creates the necessary directories if they don't exist. + + Args: + articles (list): List of article dictionaries to save. + session_id (str, optional): Unique identifier for the current session. + Used to create a unique filename. Defaults to 'default' if None. + + Returns: + None + + Raises: + IOError: If there's an error writing to the file system. + """ + # Use default session ID if none provided if not session_id: session_id = 'default' + + # Generate a unique filename using the session ID file_name = f'{session_id}_news_data.json' + # Construct the full file path using the configured data directory file_path = Config.NEWS_DATA_DIR / file_name try: - # Write articles to the file in JSON format + # Save the articles as formatted JSON for better readability with open(file_path, 'w') as file: json.dump(articles, file, indent=4) - print(f"Articles saved to {file_path}") + print(f"Articles successfully saved to {file_path}") except IOError as e: print(f"Error writing to file: {e}") diff --git a/backend/microservices/news_storage.py b/backend/microservices/news_storage.py index a8ae7e9..c9eedf4 100644 --- a/backend/microservices/news_storage.py +++ b/backend/microservices/news_storage.py @@ -1,34 +1,68 @@ # backend/microservices/news_storage.py +""" +News Storage Service - Supabase Database Integration Module + +This module provides 
functions for storing and retrieving news articles and user interactions +with the Supabase database. It handles article storage, user search history logging, and bookmark +management operations. + +The module uses the Supabase client to interact with the following tables: +- news_articles: Stores article content and metadata +- user_search_history: Tracks user search interactions +- user_bookmarks: Manages user article bookmarks + +Environment Variables Required: +- VITE_SUPABASE_URL: Supabase project URL +- VITE_SUPABASE_ANON_KEY: Supabase anonymous key for client operations +""" import os import datetime -from supabase import create_client, Client # Make sure you're using supabase-py or your preferred client +from supabase import create_client, Client from dotenv import load_dotenv +# Load environment variables from .env file load_dotenv('../../.env') -# Use your service key here for secure server-side operations. +# Initialize Supabase client with environment variables SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") -SUPABASE_SERVICE_KEY = os.getenv("VITE_SUPABASE_ANON_KEY") +SUPABASE_SERVICE_KEY = os.getenv("VITE_SUPABASE_ANON_KEY") # Using anon key for server-side operations supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) def store_article_in_supabase(article): """ Inserts a news article into the Supabase news_articles table if it doesn't already exist. - Uniqueness is enforced by the URL field (which is UNIQUE in the table). + + This function first checks if an article with the same URL already exists in the database. + If it does, the function returns the existing article's ID. Otherwise, it inserts the new + article and returns the newly created ID. Uniqueness is enforced by the URL field (which + is UNIQUE in the table). 
+ + Args: + article (dict): A dictionary containing article data with the following keys: + - title (str): The article title + - summary (str, optional): A summary of the article + - content (str, optional): The full article content + - source (dict or str): The source of the article + - publishedAt (str): Publication date in ISO format + - url (str): The unique URL to the article + - urlToImage (str, optional): URL to the article's image + + Returns: + str: The ID of the article (either existing or newly created) """ - # Check if the article already exists using the URL as unique identifier. + # Check if the article already exists using the URL as unique identifier existing = supabase.table("news_articles").select("*").eq("url", article["url"]).execute() if existing.data and len(existing.data) > 0: - # Article already exists; return its id. + # Article already exists; return its id return existing.data[0]["id"] else: - # Insert a new article. + # Insert a new article with all available fields result = supabase.table("news_articles").insert({ "title": article["title"], "summary": article.get("summary", ""), "content": article.get("content", ""), - # The source can be a dict (from API) or a plain string. + # Handle source field which can be a dict (from API) or a plain string "source": article["source"]["name"] if isinstance(article.get("source"), dict) else article["source"], "published_at": article["publishedAt"], "url": article["url"], @@ -39,11 +73,26 @@ def store_article_in_supabase(article): def log_user_search(user_id, news_id, session_id): """ Logs a search event by inserting a record into the user_search_history join table. + + This function creates a record of a user viewing or searching for a specific article, + which can be used for analytics, personalization, and tracking user activity across sessions. 
+ + Args: + user_id (str): The ID of the user performing the search + news_id (str): The ID of the news article that was viewed/searched + session_id (str): The current session identifier for tracking user activity + + Returns: + dict: The Supabase response object containing the result of the insert operation """ + # Create a timestamp for when the search occurred + current_time = datetime.datetime.utcnow().isoformat() + + # Insert the search record with all required fields result = supabase.table("user_search_history").insert({ "user_id": user_id, "news_id": news_id, - "searched_at": datetime.datetime.utcnow().isoformat(), + "searched_at": current_time, "session_id": session_id, }).execute() return result @@ -51,25 +100,56 @@ def log_user_search(user_id, news_id, session_id): def add_bookmark(user_id, news_id): """ Adds a bookmark by inserting a record into the user_bookmarks table. - Returns the created bookmark record if successful. + + This function creates a bookmark relationship between a user and a news article, + allowing users to save articles for later reading. + + Args: + user_id (str): The ID of the user adding the bookmark + news_id (str): The ID of the news article to bookmark + + Returns: + dict or None: The created bookmark record if successful, None otherwise + + Raises: + Exception: If there's an error during the database operation """ try: + # Insert a new bookmark record linking user to article result = supabase.table("user_bookmarks").insert({ "user_id": user_id, "news_id": news_id, }).execute() + + # Return the first data item if available, otherwise None return result.data[0] if result.data else None except Exception as e: print(f"Error adding bookmark: {str(e)}") + # Re-raise the exception for proper error handling upstream raise e def get_user_bookmarks(user_id): """ Retrieves all bookmarked articles for a user with full article details. - Returns a list of bookmarked articles with their details. 
+ + This function performs a join between the user_bookmarks table and the news_articles table + to retrieve complete article information for all articles bookmarked by the specified user. + The results are transformed into a more user-friendly format where each article includes its + bookmark_id for reference. + + Args: + user_id (str): The ID of the user whose bookmarks should be retrieved + + Returns: + list: A list of dictionaries, each containing the full details of a bookmarked article + with an additional 'bookmark_id' field + + Raises: + Exception: If there's an error during the database operation """ try: # Query user_bookmarks and join with news_articles to get full article details + # This uses Supabase's foreign key relationships to perform the join result = supabase.table("user_bookmarks") \ .select( "id," @@ -78,26 +158,51 @@ def get_user_bookmarks(user_id): .eq("user_id", user_id) \ .execute() - # Transform the result to a more friendly format + # Transform the nested result structure to a more friendly format + # by flattening the news_articles data and adding the bookmark_id bookmarks = [] for item in result.data: article = item["news_articles"] - article["bookmark_id"] = item["id"] + article["bookmark_id"] = item["id"] # Add bookmark ID to article for reference bookmarks.append(article) return bookmarks except Exception as e: print(f"Error fetching bookmarks: {str(e)}") + # Re-raise the exception for proper error handling upstream raise e def delete_bookmark(user_id, bookmark_id): """ Deletes a bookmark from the user_bookmarks table. - Returns True if successful, False otherwise. + + This function removes a bookmark relationship between a user and an article. + It ensures that users can only delete their own bookmarks by checking both the + bookmark_id and user_id in the query. 
+ + Args: + user_id (str): The ID of the user who owns the bookmark + bookmark_id (str): The ID of the bookmark to delete + + Returns: + bool: True if the bookmark was successfully deleted, False if no bookmark was found + or if the deletion was unsuccessful + + Raises: + Exception: If there's an error during the database operation """ try: - result = supabase.table("user_bookmarks").delete().eq("id", bookmark_id).eq("user_id", user_id).execute() + # Delete the bookmark, ensuring it belongs to the specified user + # This double condition prevents users from deleting other users' bookmarks + result = supabase.table("user_bookmarks") \ + .delete() \ + .eq("id", bookmark_id) \ + .eq("user_id", user_id) \ + .execute() + + # Return True if at least one record was deleted, False otherwise return len(result.data) > 0 except Exception as e: print(f"Error deleting bookmark: {str(e)}") + # Re-raise the exception for proper error handling upstream raise e \ No newline at end of file diff --git a/backend/microservices/story_tracking_service.py b/backend/microservices/story_tracking_service.py index 51f1911..89b4975 100755 --- a/backend/microservices/story_tracking_service.py +++ b/backend/microservices/story_tracking_service.py @@ -1,18 +1,425 @@ #!/usr/bin/env python3 """ story_tracking_service.py - Microservice for Story Tracking -Wraps the story clustering logic and provides API endpoints for tracking stories. + +This service provides functionality for tracking news stories by keyword and finding related articles. +It integrates with Supabase for data persistence and manages user story tracking preferences. + +Key Features: +- Story tracking by keywords +- Related article discovery +- User story management +- Automatic story updates + +The service uses clustering algorithms to group similar articles and maintains +relationships between tracked stories and their associated articles. 
+
+Database Tables Used:
+- tracked_stories: Stores user story tracking preferences
+- tracked_story_articles: Links stories to their related articles
+- news_articles: Stores article content and metadata
+
+Environment Variables Required:
+- VITE_SUPABASE_URL: Supabase project URL
+- SUPABASE_SERVICE_ROLE_KEY: Service role key for admin access
 """
-from summarization.story_tracking.story_tracking import cluster_articles
+#TODO: Implement proper background processing: Use a task queue like Celery to handle article fetching in the background
+
+import os
+import datetime
+from supabase import create_client, Client
+from dotenv import load_dotenv
+# from summarization.story_tracking.story_tracking import cluster_articles  # NOTE(review): with this import disabled, run_story_tracking() below raises NameError on cluster_articles — restore the import or remove the call
+from backend.microservices.news_fetcher import fetch_news
+
+# Service initialization logging
+print("[DEBUG] [story_tracking_service] [main] Story tracking service starting...")
+
+# Load environment variables from .env file
+load_dotenv()
+print("[DEBUG] [story_tracking_service] [main] Environment variables loaded")
+
+# Initialize Supabase client with service role key for admin access to bypass RLS
+# RLS (Row Level Security) policies are bypassed when using the service role key
+SUPABASE_URL = os.getenv("VITE_SUPABASE_URL")
+SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
+
+print(f"[DEBUG] [story_tracking_service] [main] Supabase URL: {SUPABASE_URL}")
+print(f"[DEBUG] [story_tracking_service] [main] Supabase Key: {SUPABASE_SERVICE_KEY[:5]}..." if SUPABASE_SERVICE_KEY else "[DEBUG] [story_tracking_service] [main] Supabase Key: None")
+
+# Create Supabase client for database operations
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
+
+print("[DEBUG] [story_tracking_service] [main] Supabase client initialized")
 
 def run_story_tracking(article_embeddings):
+    """
+    Runs the story tracking algorithm on a set of article embeddings to identify story clusters.
+ + This function uses clustering algorithms to group similar articles together based on their + vector embeddings, helping to identify distinct news stories or topics. + + Args: + article_embeddings: List of vector embeddings for articles. Each embedding should be + a numerical vector representing the article's content. + + Returns: + list: A list of cluster labels indicating which story cluster each article belongs to. + Empty list is returned if article_embeddings is None or empty. + """ + print(f"[DEBUG] [story_tracking_service] [run_story_tracking] Running story tracking with {len(article_embeddings) if article_embeddings else 0} embeddings") labels = cluster_articles(article_embeddings) + print(f"[DEBUG] [story_tracking_service] [run_story_tracking] Clustering complete, found {len(labels) if labels else 0} labels") return labels +def create_tracked_story(user_id, keyword, source_article_id=None): + """ + Creates a new tracked story for a user based on a keyword. + + Args: + user_id: The ID of the user tracking the story + keyword: The keyword/topic to track + source_article_id: Optional ID of the source article that initiated tracking + + Returns: + The created tracked story record + """ + + print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Creating tracked story for user {user_id}, keyword: '{keyword}', source_article: {source_article_id}") + try: + # Check if the user is already tracking this keyword + print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Checking if user already tracks keyword '{keyword}'") + existing = supabase.table("tracked_stories") \ + .select("*") \ + .eq("user_id", user_id) \ + .eq("keyword", keyword) \ + .execute() + + if existing.data and len(existing.data) > 0: + # User is already tracking this keyword + print(f"[DEBUG] [story_tracking_service] [create_tracked_story] User already tracking this keyword, found {len(existing.data)} existing entries") + return existing.data[0] + + # Create a new tracked story 
+        print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Creating new tracked story record")
+        current_time = datetime.datetime.utcnow().isoformat()
+        result = supabase.table("tracked_stories").insert({
+            "user_id": user_id,
+            "keyword": keyword,
+            "created_at": current_time,
+            "last_updated": current_time
+        }).execute()
+
+        if not result.data:
+            print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Failed to create tracked story: {result}")
+            return None
+
+        tracked_story = result.data[0] if result.data else None
+        print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Tracked story created with ID: {tracked_story['id'] if tracked_story else None}")
+
+        # If a source article was provided, link it to the tracked story
+        if tracked_story and source_article_id:
+            print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Linking source article {source_article_id} to tracked story")
+            supabase.table("tracked_story_articles").insert({
+                "tracked_story_id": tracked_story["id"],
+                "news_id": source_article_id,
+                "added_at": datetime.datetime.utcnow().isoformat()
+            }).execute()
+
+        # NOTE(review): the log below claims fetching is skipped, but find_related_articles runs synchronously on the next line — reconcile with the background-processing TODO at the top of this module
+        print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Skipping synchronous article fetching to avoid resource contention")
+        find_related_articles(tracked_story["id"], keyword)
+
+        return tracked_story
+
+    except Exception as e:
+        print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Error creating tracked story: {str(e)}")
+        raise e
+
+def get_tracked_stories(user_id):
+    """
+    Gets all tracked stories for a user.
+ + Args: + user_id: The ID of the user + + Returns: + List of tracked stories with their related articles + """ + print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Getting tracked stories for user {user_id}") + try: + # Get all tracked stories for the user + result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("user_id", user_id) \ + .order("created_at", desc=True) \ + .execute() + + tracked_stories = result.data if result.data else [] + print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Found {len(tracked_stories)} tracked stories") + + # For each tracked story, get its related articles + for story in tracked_stories: + print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Getting articles for story {story['id']}") + story["articles"] = get_story_articles(story["id"]) + print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Found {len(story['articles'])} articles for story {story['id']}") + + return tracked_stories + + except Exception as e: + print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Error getting tracked stories: {str(e)}") + raise e + +def get_story_details(story_id): + """ + Gets details for a specific tracked story including related articles. 
+ + Args: + story_id: The ID of the tracked story + + Returns: + The tracked story with its related articles + """ + print(f"[DEBUG] [story_tracking_service] [get_story_details] Getting story details for story ID {story_id}") + try: + # Get the tracked story + result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("id", story_id) \ + .execute() + + if not result.data or len(result.data) == 0: + print(f"[DEBUG] [story_tracking_service] [get_story_details] No story found with ID {story_id}") + return None + + story = result.data[0] + print(f"[DEBUG] [story_tracking_service] [get_story_details] Found story: {story['keyword']}") + + # Get related articles + print(f"[DEBUG] [story_tracking_service] [get_story_details] Getting related articles") + story["articles"] = get_story_articles(story_id) + print(f"[DEBUG] [story_tracking_service] [get_story_details] Found {len(story['articles'])} related articles") + + return story + + except Exception as e: + print(f"[DEBUG] [story_tracking_service] [get_story_details] Error getting story details: {str(e)}") + raise e + +def delete_tracked_story(user_id, story_id): + """ + Deletes a tracked story for a user. 
+ + Args: + user_id: The ID of the user + story_id: The ID of the tracked story to delete + + Returns: + True if successful, False otherwise + """ + print(f"[DEBUG] [story_tracking_service] [delete_tracked_story] Deleting tracked story {story_id} for user {user_id}") + try: + # Delete the tracked story (related articles will be deleted via CASCADE) + result = supabase.table("tracked_stories") \ + .delete() \ + .eq("id", story_id) \ + .eq("user_id", user_id) \ + .execute() + + success = len(result.data) > 0 + print(f"[DEBUG] [story_tracking_service] [delete_tracked_story] Delete operation {'successful' if success else 'failed'}") + return success + + except Exception as e: + print(f"[DEBUG] [story_tracking_service] [delete_tracked_story] Error deleting tracked story: {str(e)}") + raise e + +def get_story_articles(story_id): + """ + Gets all articles related to a tracked story. + + Args: + story_id: The ID of the tracked story + + Returns: + List of articles related to the tracked story + """ + print(f"[DEBUG] [story_tracking_service] [get_story_articles] Getting articles for story {story_id}") + try: + # Get all article IDs related to the tracked story + result = supabase.table("tracked_story_articles") \ + .select("news_id, added_at") \ + .eq("tracked_story_id", story_id) \ + .order("added_at", desc=True) \ + .execute() + + article_refs = result.data if result.data else [] + print(f"[DEBUG] [story_tracking_service] [get_story_articles] Found {len(article_refs)} article references") + + if not article_refs: + return [] + + # Get the full article details for each article ID + articles = [] + for ref in article_refs: + print(f"[DEBUG] [story_tracking_service] [get_story_articles] Getting details for article {ref['news_id']}") + article_result = supabase.table("news_articles") \ + .select("*") \ + .eq("id", ref["news_id"]) \ + .execute() + + if article_result.data and len(article_result.data) > 0: + article = article_result.data[0] + # Add the added_at timestamp from 
the join table + article["added_at"] = ref["added_at"] + articles.append(article) + print(f"[DEBUG] [story_tracking_service] [get_story_articles] Added article: {article.get('title', 'No title')}") + else: + print(f"[DEBUG] [story_tracking_service] [get_story_articles] No data found for article {ref['news_id']}") + + return articles + + except Exception as e: + print(f"[DEBUG] [story_tracking_service] [get_story_articles] Error getting story articles: {str(e)}") + raise e + +def find_related_articles(story_id, keyword): + """ + Finds and adds articles related to a tracked story based on its keyword. + + Args: + story_id: The ID of the tracked story + keyword: The keyword to search for + + Returns: + Number of new articles added + """ + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Finding related articles for story {story_id}, keyword: '{keyword}'") + try: + # Get the tracked story to check when it was last updated + story_result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("id", story_id) \ + .execute() + + if not story_result.data or len(story_result.data) == 0: + print(f"[DEBUG] [story_tracking_service] [find_related_articles] No story found with ID {story_id}") + return 0 + + story = story_result.data[0] + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Found story: {story['keyword']}") + + # Fetch articles related to the keyword + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Fetching articles for keyword '{keyword}'") + articles = fetch_news(keyword) + + if not articles: + print(f"[DEBUG] [story_tracking_service] [find_related_articles] No articles found for keyword '{keyword}'") + return 0 + + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Found {len(articles)} articles for keyword '{keyword}'") + + # Get existing article IDs for this story to avoid duplicates + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Getting existing article IDs for story 
{story_id}") + existing_result = supabase.table("tracked_story_articles") \ + .select("news_id") \ + .eq("tracked_story_id", story_id) \ + .execute() + + existing_ids = [item["news_id"] for item in existing_result.data] if existing_result.data else [] + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Found {len(existing_ids)} existing article IDs") + + # Process and add new articles + new_articles_count = 0 + for article in articles: + # First, store the article in the news_articles table + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Storing article: {article.get('title', 'No title')}") + from backend.microservices.news_storage import store_article_in_supabase + article_id = store_article_in_supabase(article) + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Article stored with ID: {article_id}") + + # If this article is not already linked to the story, add it + if article_id not in existing_ids: + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Linking new article {article_id} to story {story_id}") + supabase.table("tracked_story_articles").insert({ + "tracked_story_id": story_id, + "news_id": article_id, + "added_at": datetime.datetime.utcnow().isoformat() + }).execute() + new_articles_count += 1 + else: + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Article {article_id} already linked to story") + + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Added {new_articles_count} new articles to story {story_id}") + + # Update the last_updated timestamp of the tracked story + if new_articles_count > 0: + print(f"[DEBUG] [story_tracking_service] [find_related_articles] Updating last_updated timestamp for story {story_id}") + supabase.table("tracked_stories") \ + .update({"last_updated": datetime.datetime.utcnow().isoformat()}) \ + .eq("id", story_id) \ + .execute() + + return new_articles_count + + except Exception as e: + print(f"[DEBUG] [story_tracking_service] 
[find_related_articles] Error finding related articles: {str(e)}") + raise e + +def update_all_tracked_stories(): + """ + Background job to update all tracked stories with new related articles. + + This function is designed to be run as a scheduled task to keep all tracked stories + up-to-date with the latest news articles. It iterates through all tracked stories in the + database and calls find_related_articles() for each one to fetch and link new articles. + + Returns: + dict: A dictionary containing statistics about the update operation: + - stories_updated: Number of stories that received new articles + - new_articles: Total number of new articles added across all stories + """ + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Starting update of all tracked stories") + try: + # Get all tracked stories + result = supabase.table("tracked_stories") \ + .select("id, keyword") \ + .execute() + + tracked_stories = result.data if result.data else [] + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Found {len(tracked_stories)} tracked stories to update") + + if not tracked_stories: + return {"stories_updated": 0, "new_articles": 0} + + # Update each tracked story + stories_updated = 0 + total_new_articles = 0 + + for story in tracked_stories: + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Updating story {story['id']}, keyword: '{story['keyword']}'") + new_articles = find_related_articles(story["id"], story["keyword"]) + if new_articles > 0: + stories_updated += 1 + total_new_articles += new_articles + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Added {new_articles} new articles to story {story['id']}") + else: + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] No new articles found for story {story['id']}") + + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Update complete. 
Updated {stories_updated} stories with {total_new_articles} new articles") + return { + "stories_updated": stories_updated, + "new_articles": total_new_articles + } + + except Exception as e: + print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Error updating tracked stories: {str(e)}") + raise e + if __name__ == '__main__': - # Example dummy embeddings: - import numpy as np - dummy_embeddings = [np.random.rand(10) for _ in range(10)] - print("Story Tracking Service Output:") - print(run_story_tracking(dummy_embeddings)) + # Example usage - this code runs when the script is executed directly + print("[DEBUG] [story_tracking_service] [main] Running story_tracking_service.py as main") + result = update_all_tracked_stories() + print(f"[DEBUG] [story_tracking_service] [main] Updated {result['stories_updated']} stories with {result['new_articles']} new articles") + print("[DEBUG] [story_tracking_service] [main] Execution completed") diff --git a/backend/microservices/summarization_service.py b/backend/microservices/summarization_service.py index 52c123b..c6074da 100755 --- a/backend/microservices/summarization_service.py +++ b/backend/microservices/summarization_service.py @@ -1,6 +1,15 @@ #!/usr/bin/env python3 """ -summarization_service.py - Microservice for Summarization +Summarization Service Module + +This module provides functionality for fetching, processing, and summarizing news articles. +It includes capabilities for content extraction, text summarization, and keyword extraction. + +Key Features: +- Article content fetching from URLs +- Text summarization using OpenAI's GPT models +- Keyword extraction using YAKE +- Integration with Supabase for data persistence """ import json @@ -31,6 +40,19 @@ @log_exception(logger) def fetch_article_content(url): + """ + Fetches and extracts the main content from a given URL. + + Args: + url (str): The URL of the article to fetch content from. 
+ + Returns: + str or None: The extracted article content as plain text. + Returns None if the fetch fails or content is invalid. + + Raises: + Various requests exceptions are caught and logged internally. + """ try: # Check if URL is valid if not url or not url.startswith('http'): @@ -72,6 +94,21 @@ def fetch_article_content(url): @log_exception(logger) def run_summarization(text): + """ + Generates a concise summary of the provided text using OpenAI's GPT model. + + Args: + text (str): The input text to be summarized. + + Returns: + str: A summarized version of the input text (approximately 150 words). + Returns an error message if summarization fails. + + Note: + Uses OpenAI's GPT-4 model with specific parameters for optimal summarization: + - Temperature: 0.5 (balanced between creativity and consistency) + - Max tokens: 200 (ensures concise output) + """ try: return "Summarized text here" @@ -91,7 +128,17 @@ def run_summarization(text): return "Error generating summary" @log_exception(logger) -def get_keywords(text,num_keywords=1): +def get_keywords(text, num_keywords=1): + """ + Extracts key phrases from the input text using YAKE keyword extraction. + + Args: + text (str): The input text to extract keywords from. + num_keywords (int, optional): Number of keywords to extract. Defaults to 1. + + Returns: + list: A list of extracted keywords/key phrases. + """ kw_extractor = yake.KeywordExtractor(top=num_keywords, lan='en') keywords = kw_extractor.extract_keywords(text) return [kw[0] for kw in keywords] @@ -99,6 +146,28 @@ def get_keywords(text,num_keywords=1): @log_exception(logger) def process_articles(session_id): + """ + Processes a batch of articles associated with a specific session ID. + + This function performs the following operations: + 1. Retrieves articles from Supabase based on the session ID + 2. Fetches missing content for articles if needed + 3. Generates summaries for each article + 4. 
Extracts keywords for filtering + + Args: + session_id (str): The unique identifier for the user session. + + Returns: + list: A list of dictionaries containing processed article data including: + - Basic article metadata (title, author, source, etc.) + - Generated summary + - Extracted keywords + - Original and fetched content + + Raises: + Exception: If there's an error during processing, it's logged and re-raised. + """ try: # Query only articles that belong to the current session. # result = supabase.table("news_articles").select("*").eq("session_id", session_id).execute() diff --git a/backend/realtime/realtime.py b/backend/realtime/realtime.py deleted file mode 100644 index b899649..0000000 --- a/backend/realtime/realtime.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -""" -realtime.py - Real-time Communication Module -This example uses Flask-SocketIO to enable real-time updates. -""" - -from flask import Flask, render_template -from flask_socketio import SocketIO, emit - -app = Flask(__name__) -app.config['SECRET_KEY'] = 'secret!' -socketio = SocketIO(app) - -@app.route('/') -def index(): - return "Real-time communication server running." 
- -@socketio.on('connect') -def test_connect(): - emit('message', {'data': 'Connected to real-time server.'}) - -if __name__ == '__main__': - socketio.run(app, host='0.0.0.0', port=6000) diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 0000000..c04853d --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,8 @@ +steps: + - name: 'gcr.io/cloud-builders/docker' + args: ['build', '-t', 'gcr.io/$PROJECT_ID/news-aggregator:$COMMIT_SHA', '.'] +images: + - 'gcr.io/$PROJECT_ID/news-aggregator:$COMMIT_SHA' +options: + logging: CLOUD_LOGGING_ONLY # or NONE, if you prefer no logs +serviceAccount: "projects/$PROJECT_ID/serviceAccounts/99775608725-compute@developer.gserviceaccount.com" \ No newline at end of file diff --git a/processing/.DS_Store b/data/.DS_Store similarity index 79% rename from processing/.DS_Store rename to data/.DS_Store index 53efefb..d6eb9db 100644 Binary files a/processing/.DS_Store and b/data/.DS_Store differ diff --git a/frontend/.DS_Store b/frontend/.DS_Store deleted file mode 100644 index 406d437..0000000 Binary files a/frontend/.DS_Store and /dev/null differ diff --git a/frontend/mobile/README.md b/frontend/mobile/README.md deleted file mode 100644 index e87230e..0000000 --- a/frontend/mobile/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Mobile Application - -This directory is reserved for the mobile application of the News Aggregator. - -You may choose a cross-platform framework such as: -- React Native -- Flutter -- Ionic - -Alternatively, you can develop native applications for iOS and Android. - -Please include the appropriate project files and documentation in this folder. diff --git a/frontend/web/app.js b/frontend/web/app.js deleted file mode 100644 index fcc755e..0000000 --- a/frontend/web/app.js +++ /dev/null @@ -1,6 +0,0 @@ -// app.js - Main JavaScript file for the web interface - -document.addEventListener('DOMContentLoaded', function() { - const content = document.getElementById('content'); - content.innerHTML = '

Welcome to the News Aggregator! This is a placeholder for dynamic news content.

'; -}); diff --git a/frontend/web/index.html b/frontend/web/index.html deleted file mode 100644 index effb949..0000000 --- a/frontend/web/index.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - News Aggregator - - - -

News Aggregator

-
-
-

Test Article

- -

- -
-
-
- - - - diff --git a/frontend/web/style.css b/frontend/web/style.css deleted file mode 100644 index 06442d3..0000000 --- a/frontend/web/style.css +++ /dev/null @@ -1,18 +0,0 @@ -/* style.css - Basic styling for the web interface */ -body { - font-family: Arial, sans-serif; - margin: 0; - padding: 0; - background-color: #f5f5f5; -} - -header { - background-color: #004080; - color: #fff; - padding: 20px; - text-align: center; -} - -main { - padding: 20px; -} diff --git a/ingestion/.DS_Store b/ingestion/.DS_Store deleted file mode 100644 index 5961be2..0000000 Binary files a/ingestion/.DS_Store and /dev/null differ diff --git a/ingestion/api_connectors/api_connector.py b/ingestion/api_connectors/api_connector.py deleted file mode 100755 index cd729d5..0000000 --- a/ingestion/api_connectors/api_connector.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -""" -api_connector.py - Module to interface with external news APIs. -This script demonstrates how to fetch data from a news API. -""" - -import requests - -def fetch_news(api_url, params): - """ - Fetches news articles from the given API endpoint. - :param api_url: The API endpoint URL. - :param params: Dictionary of query parameters (including API keys). - :return: JSON response as a Python dict if successful, else empty dict. - """ - try: - response = requests.get(api_url, params=params) - response.raise_for_status() - return response.json() - except requests.RequestException as e: - print(f"API request error: {e}") - return {} - -if __name__ == '__main__': - # Example usage: Replace YOUR_API_KEY with a valid key. 
- sample_api_url = "https://newsapi.org/v2/top-headlines" - sample_params = {"country": "us", "apiKey": "YOUR_API_KEY"} - data = fetch_news(sample_api_url, sample_params) - print("API Response:") - print(data) diff --git a/ingestion/rss_reader/rss_reader.py b/ingestion/rss_reader/rss_reader.py deleted file mode 100755 index 0ada21d..0000000 --- a/ingestion/rss_reader/rss_reader.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -""" -rss_reader.py - RSS feed reader for the news aggregator. -This script uses the feedparser library to parse RSS feeds. -""" - -import feedparser - -def parse_feed(feed_url): - """ - Parses an RSS feed from the provided URL. - :param feed_url: URL of the RSS feed. - :return: Parsed feed object. - """ - return feedparser.parse(feed_url) - -if __name__ == '__main__': - # Example usage: Parse a sample RSS feed. - sample_feed = "https://example.com/rss" - feed = parse_feed(sample_feed) - print("Feed Title:", feed.feed.get("title", "No Title")) - for entry in feed.entries: - print("Article Title:", entry.get("title", "No Title")) diff --git a/ingestion/scraper/scraper.py b/ingestion/scraper/scraper.py deleted file mode 100755 index 6912435..0000000 --- a/ingestion/scraper/scraper.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 -""" -scraper.py - Web scraper module for the news aggregator. -This script uses requests and BeautifulSoup to fetch and parse news articles. -""" - -import requests -from bs4 import BeautifulSoup - -def scrape(url): - """ - Fetches the HTML content of a URL and extracts text. - :param url: The URL of the news page to scrape. - :return: The extracted text content. - """ - try: - response = requests.get(url) - response.raise_for_status() - soup = BeautifulSoup(response.text, 'html.parser') - return soup.get_text() - except requests.RequestException as e: - print(f"Error scraping {url}: {e}") - return None - -if __name__ == '__main__': - # Example usage: Scrape a sample URL. 
- sample_url = "https://example.com/news" - content = scrape(sample_url) - if content: - print("Scraped Content:") - print(content) - else: - print("Scraping failed.") diff --git a/monitoring/.DS_Store b/monitoring/.DS_Store index b49efc1..e4e1aac 100644 Binary files a/monitoring/.DS_Store and b/monitoring/.DS_Store differ diff --git a/monitoring/ci_cd/Dockerfile b/monitoring/ci_cd/Dockerfile deleted file mode 100644 index 1b72406..0000000 --- a/monitoring/ci_cd/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -# Dockerfile - Container image for the News Aggregator backend services -FROM python:3.9-slim - -# Set working directory -WORKDIR /app - -# Install curl for health checks -RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/* - -# Copy requirements file and install dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the entire project -COPY . . - -# Expose port 5000 for the API gateway -EXPOSE 5000 - -# Add health check -HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:5000/health || exit 1 - -# Set resource limits -ENV PYTHONUNBUFFERED=1 -ENV PYTHONIOENCODING=UTF-8 - -# Command to run the API gateway -CMD ["python", "news-aggregator/backend/api_gateway/api_gateway.py"] diff --git a/monitoring/ci_cd/docker-compose.yml b/monitoring/ci_cd/docker-compose.yml deleted file mode 100644 index 1e7edec..0000000 --- a/monitoring/ci_cd/docker-compose.yml +++ /dev/null @@ -1,60 +0,0 @@ -# docker-compose.yml - Compose file to run multiple services -version: '3.8' - -services: - api_gateway: - build: . 
- container_name: api_gateway - ports: - - "5000:5000" - environment: - - FLASK_ENV=production - - API_HOST=0.0.0.0 - - API_PORT=5000 - - CORS_ORIGINS=http://localhost:3000,http://localhost:5173 - - LOG_LEVEL=INFO - volumes: - - ./data:/app/data - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:5000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 5s - deploy: - resources: - limits: - cpus: '0.50' - memory: 512M - reservations: - cpus: '0.25' - memory: 256M - restart: unless-stopped - - realtime: - build: - context: . - dockerfile: Dockerfile - container_name: realtime_service - ports: - - "6000:6000" - environment: - - FLASK_ENV=production - - API_HOST=0.0.0.0 - - API_PORT=6000 - - LOG_LEVEL=INFO - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:6000"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 5s - deploy: - resources: - limits: - cpus: '0.50' - memory: 512M - reservations: - cpus: '0.25' - memory: 256M - restart: unless-stopped diff --git a/processing/caching/cache.py b/processing/caching/cache.py deleted file mode 100755 index 639f4b0..0000000 --- a/processing/caching/cache.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 -""" -cache.py - Caching module. -This script sets up Redis caching to store frequently accessed data. -""" - -import redis - -# Configure Redis client -redis_client = redis.Redis(host='localhost', port=6379, db=0) - -def set_cache(key, value, expire=3600): - """ - Stores a key-value pair in Redis cache with expiration. - :param key: Cache key. - :param value: Value to cache. - :param expire: Expiration time in seconds (default: 1 hour). - """ - redis_client.setex(key, expire, value) - -def get_cache(key): - """ - Retrieves a value from Redis cache by key. - :param key: Cache key. - :return: Cached value or None. 
- """ - return redis_client.get(key) - -if __name__ == '__main__': - set_cache('sample_key', 'sample_value') - cached = get_cache('sample_key') - print("Cached Value:", cached.decode('utf-8') if cached else "None") diff --git a/processing/preprocessing/preprocess.py b/processing/preprocessing/preprocess.py deleted file mode 100755 index 47d1e81..0000000 --- a/processing/preprocessing/preprocess.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python3 -""" -preprocess.py - Data preprocessing module. -This script includes functions for cleaning raw data, -removing unwanted HTML, and normalizing text. -""" - -def clean_data(raw_data): - """ - Cleans raw text data. - :param raw_data: The raw text to be cleaned. - :return: Cleaned text. - """ - # TODO: Implement more sophisticated cleaning if needed. - return raw_data.strip() - -if __name__ == '__main__': - sample_data = " Sample News Content " - cleaned = clean_data(sample_data) - print("Cleaned Data:", cleaned) diff --git a/processing/storage/config.py b/processing/storage/config.py deleted file mode 100644 index 1bbe4a7..0000000 --- a/processing/storage/config.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -config.py - Storage configuration module. -This file contains configuration settings for databases and blob storage. 
-""" - -# Database configuration (example: PostgreSQL) -DATABASE_CONFIG = { - "host": "localhost", - "port": 5432, - "user": "news_user", - "password": "password", - "database": "news_db" -} - -# Blob storage configuration (example: AWS S3) -S3_CONFIG = { - "bucket_name": "news-aggregator-bucket", - "region": "us-east-1" -} diff --git a/requirements.txt b/requirements.txt index 6287bf2..f24af4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,8 @@ requests beautifulsoup4 feedparser +aiohttp + # transformer torch @@ -11,14 +13,25 @@ torch numpy scikit-learn openai==0.28 +yake +nltk # Backend & API Flask flask-socketio python-dotenv +flask-cors==4.0.0 +gunicorn==21.2.0 +pydantic +tenacity +loguru +PyJWT==2.8.0 +flask-jwt-extended==4.5.3 # Caching & Storage redis +supabase +psycopg2-binary # Testing pytest diff --git a/run.sh b/run.sh deleted file mode 100644 index 23b21e9..0000000 --- a/run.sh +++ /dev/null @@ -1,6 +0,0 @@ - # Create and activate a virtual environment (recommended) -python -m venv venv -source venv/bin/activate # On Windows, use: venv\Scripts\activate - -# Install dependencies -pip install -r /Users/akalpitdawkhar/prog_news/news-aggregator/requirements.txt \ No newline at end of file diff --git a/summarization/.DS_Store b/summarization/.DS_Store deleted file mode 100644 index 1724e27..0000000 Binary files a/summarization/.DS_Store and /dev/null differ diff --git a/summarization/story_tracking/story_tracking.py b/summarization/story_tracking/story_tracking.py deleted file mode 100755 index 99c7ad8..0000000 --- a/summarization/story_tracking/story_tracking.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -""" -story_tracking.py - Story Clustering & Tracking Module -This module groups similar articles together and tracks the evolution -of news stories over time. 
-""" - -from sklearn.cluster import KMeans -import numpy as np - -def cluster_articles(article_embeddings, num_clusters=5): - """ - Clusters article embeddings into stories. - :param article_embeddings: List of vector embeddings for articles. - :param num_clusters: Number of clusters/stories to form. - :return: Cluster labels for each article. - """ - if not article_embeddings: - return [] - kmeans = KMeans(n_clusters=num_clusters, random_state=42) - labels = kmeans.fit_predict(np.array(article_embeddings)) - return labels - -if __name__ == '__main__': - # Example usage with dummy embeddings: - dummy_embeddings = [np.random.rand(10) for _ in range(20)] - labels = cluster_articles(dummy_embeddings) - print("Article Cluster Labels:") - print(labels) diff --git a/summarization/summarization_service/summarize.py b/summarization/summarization_service/summarize.py deleted file mode 100755 index 4b44268..0000000 --- a/summarization/summarization_service/summarize.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 -""" -summarize.py - LLM Summarization Service -This module uses an LLM (or an external API like OpenAI's GPT) -to generate summaries of news articles. -""" - -def generate_summary(article_text, style="default"): - """ - Generates a summary for the provided article text. - :param article_text: Raw text of the article. - :param style: Summary style (e.g., 'default', 'Opposite Sides', 'Explain Like I\'m 5'). - :return: Generated summary as a string. - """ - # Placeholder for LLM integration: - summary = f"Summary ({style}): " + article_text[:150] + "..." - return summary - -if __name__ == '__main__': - sample_article = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." 
- print("Generated Summary:") - print(generate_summary(sample_article)) diff --git a/summarization/summary_cache/summary_cache.py b/summarization/summary_cache/summary_cache.py deleted file mode 100755 index d1a2876..0000000 --- a/summarization/summary_cache/summary_cache.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python3 -""" -summary_cache.py - Caching for Summaries -This module caches generated summaries to avoid redundant processing. -It can be integrated with the caching layer (e.g., Redis) if needed. -""" - -# For simplicity, using a dictionary as an in-memory cache. -cache = {} - -def cache_summary(article_id, summary): - """ - Stores the summary in the cache. - :param article_id: Unique identifier for the article. - :param summary: Generated summary text. - """ - cache[article_id] = summary - -def get_cached_summary(article_id): - """ - Retrieves a cached summary if it exists. - :param article_id: Unique identifier for the article. - :return: Cached summary or None. - """ - return cache.get(article_id) - -if __name__ == '__main__': - # Example usage: - article_id = "article_123" - summary_text = "This is a sample summary." - cache_summary(article_id, summary_text) - print("Cached Summary for article_123:") - print(get_cached_summary(article_id)) diff --git a/tests/.DS_Store b/tests/.DS_Store deleted file mode 100644 index a7b610f..0000000 Binary files a/tests/.DS_Store and /dev/null differ diff --git a/tests/test_sample.py b/tests/test_sample.py deleted file mode 100755 index b48121a..0000000 --- a/tests/test_sample.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -""" -test_sample.py - Sample test file for the News Aggregator Project. -This file contains a simple test to verify the testing setup. -""" - -def test_example(): - # Basic arithmetic test. - assert 1 + 1 == 2 - -if __name__ == '__main__': - import pytest - pytest.main()