Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion app/api/endpoints/analysis_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from app.schemas.analysis_schema import AnalysisRead
from app.core.exceptions import NotFoundException
from fastapi.responses import StreamingResponse
from app.core.scoring import get_percentile

router = APIRouter(prefix="/analysis", tags=["analysis"])
logger = logging.getLogger(__name__)
Expand All @@ -41,7 +42,11 @@ async def get_analysis(
analysis = await analysis_service.get_analysis(
analysis_id=analysis_id, include_sources=include_sources, include_feedback=include_feedback
)
return AnalysisRead.model_validate(analysis)
raw_score = analysis.confidence_score
percentile = (get_percentile(raw_score)) / 100.0
analysis = AnalysisRead.model_validate(analysis)
analysis.confidence_percentile = percentile
return analysis
except NotFoundException as e:
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=str(e))

Expand Down
40 changes: 40 additions & 0 deletions app/core/scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
import logging
import os

logger = logging.getLogger(__name__)

# Module-level cache of the reference score distribution, held in RAM.
# _REFERENCE_SCORES is kept sorted ascending (load_distribution sorts it)
# so get_percentile can binary-search it with np.searchsorted.
_REFERENCE_SCORES = np.array([])
# Flipped to True only after load_distribution successfully populates the array.
_IS_LOADED = False


def load_distribution(csv_path: str):
    """Load the reference score distribution from a CSV into module state.

    Reads the ``confidence_score`` column, drops missing or non-numeric
    values, and stores the scores sorted ascending in ``_REFERENCE_SCORES``
    so ``get_percentile`` can binary-search them.  Fails gracefully: a
    missing file or a read/parse error leaves the app running without
    normalization (``_IS_LOADED`` stays False).

    Args:
        csv_path: Path to a CSV file containing a ``confidence_score`` column.
    """
    global _REFERENCE_SCORES, _IS_LOADED

    if not os.path.exists(csv_path):
        # Best-effort preload: absence of the file is expected in some envs.
        logger.warning("Distribution file not found at %s. Skipping preload.", csv_path)
        return  # App continues running, just without normalization

    try:
        logger.info("Loading distribution from %s...", csv_path)
        df = pd.read_csv(csv_path, usecols=["confidence_score"])
        # Coerce junk values (stray text etc.) to NaN instead of leaving an
        # object-dtype array that would break searchsorted comparisons,
        # then drop them.  np.sort returns a sorted copy, so we never
        # mutate a buffer potentially shared with the DataFrame.
        scores = pd.to_numeric(df["confidence_score"], errors="coerce").dropna().values
        _REFERENCE_SCORES = np.sort(scores)
        _IS_LOADED = True
        logger.info("Distribution loaded into RAM.")
    except Exception:
        # Deliberate catch-all: a bad file must not crash app startup.
        # logger.exception records the full traceback for diagnosis.
        logger.exception("Failed to load distribution from %s", csv_path)


def get_percentile(score: float) -> float:
    """Return the percentile (0-100) of *score* within the reference data.

    Binary-searches the pre-sorted ``_REFERENCE_SCORES`` array; the result
    is the fraction of reference scores strictly below *score*, times 100.
    Returns 0.0 when no distribution has been loaded.
    """
    ref = _REFERENCE_SCORES
    count = len(ref)
    if not _IS_LOADED or count == 0:
        return 0.0

    # Index of the first element >= score == number of elements strictly below.
    rank = np.searchsorted(ref, score, side="left")
    return (rank / count) * 100
Loading