Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/justinsight/celery.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
"args": (),
},

# Checked and good
# "check-CBSfeed-every-5-minutes": {
# "task": "justinsight.tasks.cbsLogger_task",
# "schedule": 5.0,
Expand Down
7 changes: 4 additions & 3 deletions src/nlp/base_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ def __init__(self, task: str, model_name: str, aggregation_strategy: str = None)
self.pipeline = self.load_pipeline()

def load_pipeline(self):
args = {"model": self.model_name}
if self.task == "ner" and self.aggregation_strategy:
return pipeline(self.task, model=self.model_name, aggregation_strategy=self.aggregation_strategy)

args["aggregation_strategy"] = self.aggregation_strategy
print(f"pipeline loading...")
return pipeline(self.task, model=self.model_name)
return pipeline(self.task, args)

def process_article(self, article_id: str):
#Retrieve article by ID
Expand Down
2 changes: 1 addition & 1 deletion src/nlp/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import argparse
from nlp.core import process_article # adjust if your actual import path differs
from nlp.ner_core import process_article # adjust if your actual import path differs

def run_cli(article_id: str):
entities = process_article(article_id)
Expand Down
8 changes: 5 additions & 3 deletions src/nlp/ner_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def process_article(self, article_id: str):
print(f"No article found with ID: {article_id}")
return []

if article.get("processed") is True:
if article.get("ner_processed") is True:
print(f"Article {article_id} already processed.")
return []

Expand All @@ -30,12 +30,14 @@ def process_article(self, article_id: str):
# return

# Run NER
print("about to run NER")
entities = self.pipeline(full_text) # run_ner_hf(full_text)

print("ran NER yay")

# Update article in DB
self.addToEntryInDB(article_id, {
"ner": entities,
"processed": True
"ner_processed": True
})

return entities
Expand Down
54 changes: 54 additions & 0 deletions src/nlp/sent_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from nlp.base_core import BaseCore
from transformers import pipeline
from bson import ObjectId

class SentCore(BaseCore):
    """Sentiment-analysis stage.

    Runs a DistilBERT SST-2 sentiment pipeline over an article's full text
    and writes the results back onto the article document in MongoDB.
    """

    def __init__(self):
        super().__init__(
            task="sentiment-analysis",
            model_name="distilbert-base-uncased-finetuned-sst-2-english"
        )

    def process_article(self, article_id: str):
        """Run sentiment analysis on the article with the given id.

        Returns the pipeline results, or [] when the article is missing
        or has already been processed by this stage.
        """
        # Retrieve article by ID
        article = self.collection.find_one({"_id": ObjectId(article_id)})

        if not article:
            print(f"No article found with ID: {article_id}")
            return []

        # Idempotence guard: skip articles already marked by this stage.
        if article.get("sentiment_processed") is True:
            print(f"Article {article_id} already processed.")
            return []

        # We have a check running so only articles with full text are saved
        full_text = article.get("full_text", "")

        # Run Sentiment Analysis
        print("about to run Sentiment Analysis")
        results = self.pipeline(full_text)
        print("ran Sentiment Analysis yay")

        # Update article in DB
        self.addToEntryInDB(article_id, {
            "sentiment analysis": results,
            "sentiment_processed": True
        })

        return results

    def addToEntryInDB(self, entry_id, updates):
        """Persist sentiment results onto the article document.

        Converts numpy float scores to plain Python floats first, because
        BSON cannot encode np.float32.
        """
        print("Adding Sentiment Analysis results to database\r\r\r")

        # BUG FIX: results are stored under the key "sentiment analysis",
        # not "sentiment" — the old lookup never matched, so np.float32
        # scores reached pymongo unconverted.
        for res in updates.get("sentiment analysis", []):
            if "score" in res:
                res["score"] = float(res["score"])  # convert np.float32 to Python float

        # (renamed local: `id` shadowed the builtin)
        entry_oid = ObjectId(entry_id)
        self.collection.update_one(
            {"_id": entry_oid},
            {"$set": updates}
        )
58 changes: 58 additions & 0 deletions src/nlp/summ_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from nlp.base_core import BaseCore
from transformers import pipeline
from bson import ObjectId

class SummCore(BaseCore):
    """Summarization stage.

    Runs the facebook/bart-large-cnn summarization pipeline over an
    article's full text and writes the summary back onto the article
    document in MongoDB.
    """

    def __init__(self):
        super().__init__(
            task="summarization",
            model_name="facebook/bart-large-cnn"
        )

    def process_article(self, article_id: str):
        """Summarize the article with the given id.

        Returns the pipeline output, or [] when the article is missing
        or has already been processed by this stage.
        """
        # Retrieve article by ID
        article = self.collection.find_one({"_id": ObjectId(article_id)})

        if not article:
            print(f"No article found with ID: {article_id}")
            return []

        # Idempotence guard: skip articles already marked by this stage.
        if article.get("summary_processed") is True:
            print(f"Article {article_id} already processed.")
            return []

        # We have a check running so only articles with full text are saved
        full_text = article.get("full_text", "")

        # Run Summarization
        print("about to run Summarization")
        summary = self.pipeline(
            full_text,
            max_length=150,
            min_length=30,
            do_sample=False)
        print("ran Summarization yay")

        # Update article in DB
        self.addToEntryInDB(article_id, {
            "hf summary": summary,
            "summary_processed": True
        })

        return summary

    def addToEntryInDB(self, entry_id, updates):
        """Persist summarization results onto the article document.

        Normalizes any numpy float values to plain Python floats first,
        because BSON cannot encode np.float32.
        """
        print("Adding Summarization results to database\r\r\r")

        # BUG FIX: the old code checked for "hf summary" but then iterated
        # updates["summarization"], which raised KeyError. It also accessed
        # ent["score"] unconditionally, but summarization results carry
        # "summary_text" rather than "score" — so guard the key access.
        for ent in updates.get("hf summary", []):
            if "score" in ent:
                ent["score"] = float(ent["score"])  # convert np.float32 to Python float

        # (renamed local: `id` shadowed the builtin)
        entry_oid = ObjectId(entry_id)
        self.collection.update_one(
            {"_id": entry_oid},
            {"$set": updates}
        )
2 changes: 1 addition & 1 deletion src/nlp/tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from nlp.core import run_ner_hf
from nlp.ner_core import run_ner_hf
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

Expand Down