diff --git a/.github/workflows/deploy-api-dev.yml b/.github/workflows/deploy-api-dev.yml index 63cf3cce..e0a31cca 100644 --- a/.github/workflows/deploy-api-dev.yml +++ b/.github/workflows/deploy-api-dev.yml @@ -70,7 +70,7 @@ jobs: image: ${{ steps.build-image.outputs.image }} - name: Deploy Amazon ECS task definition - uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + uses: aws-actions/amazon-ecs-deploy-task-definition@v2 with: task-definition: ${{ steps.task-def.outputs.task-definition }} service: pephub-service-dev diff --git a/.github/workflows/deploy-api.yml b/.github/workflows/deploy-api.yml index 214dff0c..7ad0df8c 100644 --- a/.github/workflows/deploy-api.yml +++ b/.github/workflows/deploy-api.yml @@ -70,7 +70,7 @@ jobs: image: ${{ steps.build-image.outputs.image }} - name: Deploy Amazon ECS task definition - uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + uses: aws-actions/amazon-ecs-deploy-task-definition@v2 with: task-definition: ${{ steps.task-def.outputs.task-definition }} service: pephub-service-primary diff --git a/deployment/dockerhub/dev.Dockerfile b/deployment/dockerhub/dev.Dockerfile index e5e112dc..3e344171 100644 --- a/deployment/dockerhub/dev.Dockerfile +++ b/deployment/dockerhub/dev.Dockerfile @@ -1,7 +1,7 @@ # ------------- # BUILD BACKEND # ------------- -FROM python:3.10-slim +FROM python:3.13-slim LABEL authors="Nathan LeRoy, Nathan Sheffield, Oleksandr Khoroshevskyi" RUN apt-get update @@ -17,6 +17,7 @@ WORKDIR /app COPY . /app RUN python -m pip install --upgrade pip -RUN pip install -r requirements/requirements-all.txt --no-cache-dir +RUN python -m pip install uv +RUN uv pip install -r requirements/requirements-all.txt --no-cache-dir --system CMD ["uvicorn", "pephub.main:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/deployment/dockerhub/primary.Dockerfile b/deployment/dockerhub/primary.Dockerfile index d2332677..3e344171 100644 --- a/deployment/dockerhub/primary.Dockerfile +++ b/deployment/dockerhub/primary.Dockerfile @@ -1,8 +1,8 @@ # ------------- # BUILD BACKEND # ------------- -FROM python:3.10-slim -LABEL authors="Nathan LeRoy, Nathan Sheffield" +FROM python:3.13-slim +LABEL authors="Nathan LeRoy, Nathan Sheffield, Oleksandr Khoroshevskyi" RUN apt-get update RUN apt-get install -y gcc @@ -17,6 +17,7 @@ WORKDIR /app COPY . /app RUN python -m pip install --upgrade pip -RUN pip install -r requirements/requirements-all.txt --no-cache-dir +RUN python -m pip install uv +RUN uv pip install -r requirements/requirements-all.txt --no-cache-dir --system CMD ["uvicorn", "pephub.main:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/pephub/_version.py b/pephub/_version.py index 8066fa27..8782a8b4 100644 --- a/pephub/_version.py +++ b/pephub/_version.py @@ -1 +1 @@ -__version__ = "0.15.3" +__version__ = "0.15.4" diff --git a/pephub/const.py b/pephub/const.py index d54c536a..cf8e9c84 100644 --- a/pephub/const.py +++ b/pephub/const.py @@ -135,9 +135,11 @@ DEFAULT_PEP_SCHEMA = "databio/pep:2.1.0" DEFAULT_TAG = "default" -DEFAULT_QDRANT_SCORE_THRESHOLD = ( - 0.72 # empirical value, highly dependent on the model used -) +# DEFAULT_QDRANT_SCORE_THRESHOLD = ( +# 0.72 # empirical value, highly dependent on the model used +# ) + +DEFAULT_QDRANT_SCORE_THRESHOLD = 0.15 ARCHIVE_URL_PATH = "https://cloud2.databio.org/pephub/" diff --git a/pephub/dependencies.py b/pephub/dependencies.py index 8d4587b4..6c47a92e 100644 --- a/pephub/dependencies.py +++ b/pephub/dependencies.py @@ -98,12 +98,57 @@ def jwt_encode_user_data(user_data: dict, exp: datetime = None) -> str: ) # sentence_transformer model +_LOGGER_PEPHUB.info(f"HF MODEL IN USE: {os.getenv('HF_MODEL', DEFAULT_HF_MODEL)}") embedding_model = Embedding( model_name=os.getenv("HF_MODEL", DEFAULT_HF_MODEL), max_length=512 ) # embedding_model = None +## Qdrant connection +def parse_boolean_env_var(env_var: str) -> bool: + """ + Helper function to parse a boolean environment variable + """ + return env_var.lower() in ["true", "1", "t", "y", "yes"] + + +def initialize_qdrant_client() -> Union[QdrantClient, None]: + """ + Initialize Qdrant client if enabled + """ + + if parse_boolean_env_var(os.environ.get("QDRANT_ENABLED", "false")): + try: + qdrant = QdrantClient( + url=os.environ.get("QDRANT_HOST", DEFAULT_QDRANT_HOST), + port=os.environ.get("QDRANT_PORT", DEFAULT_QDRANT_PORT), + api_key=os.environ.get("QDRANT_API_KEY", None), + ) + qdrant.list_full_snapshots() + return qdrant + except Exception as e: + _LOGGER_PEPHUB.error(f"Error connecting to Qdrant: {e}") + + else: + _LOGGER_PEPHUB.warning( + "QDRANT_ENABLED is not set to true. Qdrant features will be disabled.\ + To enable Qdrant, set the environment variable QDRANT_ENABLED to 'true'." + ) + return None + + +qdrant = initialize_qdrant_client() + + +def get_qdrant() -> Union[QdrantClient, None]: + """ + Return connection to qdrant client + """ + + return qdrant + + def generate_random_auth_code() -> str: """ Generate a random 32-digit code. @@ -337,50 +382,6 @@ def verify_user_can_fork( raise HTTPException(401, "Unauthorized to fork this repo") -def parse_boolean_env_var(env_var: str) -> bool: - """ - Helper function to parse a boolean environment variable - """ - return env_var.lower() in ["true", "1", "t", "y", "yes"] - - -def get_qdrant_enabled() -> bool: - """ - Check if qdrant is enabled - """ - return parse_boolean_env_var(os.environ.get("QDRANT_ENABLED", "false")) - - -def get_qdrant( - qdrant_enabled: bool = Depends(get_qdrant_enabled), -) -> Union[QdrantClient, None]: # type: ignore - """ - Return connection to qdrant client - """ - # return None if qdrant is not enabled - if not qdrant_enabled: - try: - yield None - finally: - pass - # else try to connect, test connectiona and return client if connection is successful. - qdrant = QdrantClient( - url=os.environ.get("QDRANT_HOST", DEFAULT_QDRANT_HOST), - port=os.environ.get("QDRANT_PORT", DEFAULT_QDRANT_PORT), - api_key=os.environ.get("QDRANT_API_KEY", None), - ) - try: - # test the connection first - qdrant.list_full_snapshots() - yield qdrant - except ResponseHandlingException as e: - print(f"Error getting qdrant client: {e}") - yield None - finally: - # no need to close the connection - pass - - def get_sentence_transformer() -> Embedding: """ Return sentence transformer encoder diff --git a/pephub/routers/api/v1/search.py b/pephub/routers/api/v1/search.py index ed071921..fb648a62 100644 --- a/pephub/routers/api/v1/search.py +++ b/pephub/routers/api/v1/search.py @@ -15,7 +15,9 @@ get_qdrant, get_sentence_transformer, ) -from ...models import SearchQuery +from ...models import SearchQuery, SearchReturnModel +from qdrant_client.models import ScoredPoint +from pepdbagent.models import Namespace load_dotenv() @@ -35,14 +37,14 @@ async def search_for_namespaces( # perform a search -@search.post("/", summary="Search for a PEP") +@search.post("/", summary="Search for a PEP", response_model=SearchReturnModel) async def search_for_pep( query: SearchQuery, qdrant: QdrantClient = Depends(get_qdrant), model: Embedding = Depends(get_sentence_transformer), agent: PEPDatabaseAgent = Depends(get_db), namespace_access: List[str] = Depends(get_namespace_access_list), -): +) -> SearchReturnModel: """ Perform a search for PEPs. This can be done using qdrant (semantic search), or with basic SQL string matches. @@ -50,127 +52,57 @@ async def search_for_pep( limit = query.limit offset = query.offset score_threshold = query.score_threshold - if qdrant is not None: - try: - # get the embeding for the query - query_vec = list(model.embed(query.query))[0] - # get actual results using the limit and offset - vector_results = qdrant.search( - collection_name=( - query.collection_name or DEFAULT_QDRANT_COLLECTION_NAME - ), - query_vector=query_vec, - limit=limit, - offset=offset, - score_threshold=score_threshold, - ) + # get namespaces: + namespaces: list[Namespace] = agent.namespace.get( + query=query.query, admin=namespace_access, limit=limit, offset=offset + ).results - # get sql results using the limit and offset - sql_results = agent.annotation.get( - query=query.query, - limit=limit, - offset=offset, - namespace=None, - admin=namespace_access, - ) + if qdrant is not None: + query_vec = list(model.embed(query.query))[0] - # map the results to the format we want - vector_results_mapped = [r.model_dump() for r in vector_results] - sql_results_mapped = [ - { - "id": r.digest, - "version": 0, - "score": 1.0, # Its a SQL search, so we just set the score to 1.0 - "payload": { - "description": r.description, - "registry": f"{r.namespace}/{r.name}:{r.tag}", - }, - "vector": None, - } - for r in sql_results.results - ] - results = vector_results_mapped + sql_results_mapped - namespaces = agent.namespace.get(admin=namespace_access) - namespace_hits = [ - n.namespace - for n in namespaces.results - if query.query.lower() in n.namespace.lower() - ] - namespace_hits.extend( - [ - n - for n in list( - set( - [ - r.model_dump()["payload"]["registry"].split("/")[0] - for r in vector_results - ] - ) - ) - if n not in namespace_hits - ] - ) + vector_results = qdrant.query_points( + collection_name=(query.collection_name or DEFAULT_QDRANT_COLLECTION_NAME), + query=query_vec, + limit=limit, + offset=offset, + score_threshold=score_threshold, + ).points - # finally, sort the results by score - results = sorted(results, key=lambda x: x["score"], reverse=True) + return SearchReturnModel( + query=query.query, + results=vector_results, + namespace_hits=namespaces, + limit=limit, + offset=offset, + total=len(vector_results), + ) - return JSONResponse( - content={ - "query": query.query, - "results": results, - "namespace_hits": namespace_hits, - "limit": limit, - "offset": offset, - "total": len(vector_results) + sql_results.count, - } - ) - except Exception as e: - # TODO: this isnt proper error handling. Also we need to use a logger - print("Qdrant search failed, falling back to SQL search. Reason: ", e) else: # fallback to SQL search - namespaces = agent.namespace.get(admin=namespace_access).results - results = agent.annotation.get( - query=query.query, limit=limit, offset=offset - ).results + results = agent.annotation.get(query=query.query, limit=limit, offset=offset) # emulate qdrant response from the SQL search # for frontend compatibility parsed_results = [ - { - "id": None, - "version": 0, - "score": None, - "payload": { + ScoredPoint( + id=f"{r.namespace}/{r.name}:{r.tag}", + version=0, + score=1.0, # SQL search, so we just set the score to 1.0 + payload={ "description": r.description, "registry": f"{r.namespace}/{r.name}:{r.tag}", }, - "vector": None, - } - for r in results + vector=None, + ) + for r in results.results ] - namespace_hits = [ - n.namespace - for n in namespaces - if query.query.lower() in n.namespace.lower() - ] - namespace_hits.extend( - [ - n - for n in list( - set( - [r["payload"]["registry"].split("/")[0] for r in parsed_results] - ) - ) - if n not in namespace_hits - ] - ) - return JSONResponse( - content={ - "query": query.query, - "results": parsed_results, - "namespace_hits": namespace_hits, - } + return SearchReturnModel( + query=query.query, + results=parsed_results, + namespace_hits=namespaces, + limit=limit, + offset=offset, + total=results.count, ) diff --git a/pephub/routers/models.py b/pephub/routers/models.py index 37bb3ee7..defeb78f 100644 --- a/pephub/routers/models.py +++ b/pephub/routers/models.py @@ -1,9 +1,11 @@ from typing import List, Optional, Dict, Union from pepdbagent.const import DEFAULT_TAG -from pepdbagent.models import UpdateItems, ListOfNamespaceInfo +from pepdbagent.models import UpdateItems, ListOfNamespaceInfo, Namespace from pydantic import BaseModel, ConfigDict, Field +from qdrant_client.models import ScoredPoint + from ..const import DEFAULT_QDRANT_SCORE_THRESHOLD @@ -24,6 +26,15 @@ class SearchQuery(BaseModel): score_threshold: Optional[float] = DEFAULT_QDRANT_SCORE_THRESHOLD +class SearchReturnModel(BaseModel): + query: str + results: List[ScoredPoint] + namespace_hits: List[Namespace] + limit: int + offset: int + total: int + + class RawValidationQuery(BaseModel): project_config: str sample_table: Optional[str] = None diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 1c1d2408..c718a7d1 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -8,12 +8,12 @@ jinja2>=3.1.2 python-multipart>=0.0.5 uvicorn python-dotenv -qdrant-client +qdrant-client>=1.16.0 requests pyjwt[crypto] coloredlogs -fastembed -numpy<2.0.0 +fastembed>=0.7.4 +numpy>2.0.0 slowapi cachetools>=4.2.4 # bedms>=0.2.0 \ No newline at end of file diff --git a/web/package.json b/web/package.json index 30c06791..c0a240fe 100644 --- a/web/package.json +++ b/web/package.json @@ -1,7 +1,7 @@ { "name": "web", "private": true, - "version": "0.0.0", + "version": "0.15.4", "type": "module", "scripts": { "dev": "vite", @@ -63,7 +63,7 @@ "@vitejs/plugin-react-swc": "^3.0.0", "browserslist": "^4.22.2", "browserslist-to-esbuild": "^2.1.1", - "typescript": "^4.9.3", + "typescript": "^5.3.3", "vite": "^5.0.0" } } diff --git a/web/src/api/search.ts b/web/src/api/search.ts index 2014de7e..a348062a 100644 --- a/web/src/api/search.ts +++ b/web/src/api/search.ts @@ -1,6 +1,7 @@ import axios from 'axios'; import { SearchHit } from '../../types'; +import { NamespaceHit } from '../components/search/search-results'; const API_HOST = import.meta.env.VITE_API_HOST || ''; const API_BASE = `${API_HOST}/api/v1`; @@ -8,7 +9,7 @@ const API_BASE = `${API_HOST}/api/v1`; export interface SearchResult { query: string; results: SearchHit[]; - namespace_hits: string[]; + namespace_hits: NamespaceHit[]; limit: number; offset: number; total: number; diff --git a/web/src/components/search/search-results.tsx b/web/src/components/search/search-results.tsx index 1498ea4b..1c53a41d 100644 --- a/web/src/components/search/search-results.tsx +++ b/web/src/components/search/search-results.tsx @@ -143,15 +143,21 @@ export const ProjectSearchResults: FC = ({ hits, offset, setO ); }; +export type NamespaceHit = { + namespace: string; + number_of_projects: number; + number_of_samples: number; +}; + interface NamespaceProps { - hits: string[]; + hits: NamespaceHit[]; } export const NamespaceSearchResults: FC = ({ hits }) => { if (hits.length === 0) { return (
-

No namespaces found :(

+

No namespaces found with current query, and offset(

Try broadening your search

); @@ -160,11 +166,12 @@ export const NamespaceSearchResults: FC = ({ hits }) => {

Namespaces

{hits.map((hit) => ( -
- - -
{hit}
+
+ + +
{hit.namespace}
+ {hit.number_of_projects} projects
))} diff --git a/web/src/hooks/queries/useSearch.ts b/web/src/hooks/queries/useSearch.ts index 4372013d..5ae8a8f8 100644 --- a/web/src/hooks/queries/useSearch.ts +++ b/web/src/hooks/queries/useSearch.ts @@ -6,7 +6,7 @@ import { useSession } from '../../contexts/session-context'; const DEFAULT_LIMIT = 10; const DEFAULT_OFFSET = 0; const DEFAULT_AUTO_RUN = false; -const DEFAULT_SCORE_THRESHOLD = 0.65; +const DEFAULT_SCORE_THRESHOLD = 0.15; interface SearchParams { q: string;