From 8d178aeb8153daa0708559bde39470cc3a327712 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 8 Sep 2025 16:36:40 -0700 Subject: [PATCH 1/8] Update S3 client type hints and refactor workspace naming convention in integration tests --- extralit-server/pdm.lock | 14 +- extralit-server/pyproject.toml | 1 + .../src/extralit_server/contexts/files.py | 134 +++++++++++++----- .../src/extralit_server/helpers.py | 6 +- extralit/tests/integration/conftest.py | 8 +- 5 files changed, 125 insertions(+), 38 deletions(-) diff --git a/extralit-server/pdm.lock b/extralit-server/pdm.lock index 8569fb8e8..7ef544fcd 100644 --- a/extralit-server/pdm.lock +++ b/extralit-server/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "postgresql", "test"] strategy = [] lock_version = "4.5.0" -content_hash = "sha256:fbc8bf451fd50920ff7fdacd4851dcf9d32afa86cb37091bf0e672f87bf09799" +content_hash = "sha256:4c6bbe5afeffb90981e7b5edf598edebf662bbdad882f7658b1cf1b7986e4e07" [[metadata.targets]] requires_python = ">=3.10" @@ -2634,6 +2634,18 @@ files = [ {file = "typer-0.9.4.tar.gz", hash = "sha256:f714c2d90afae3a7929fcd72a3abb08df305e1ff61719381384211c4070af57f"}, ] +[[package]] +name = "types-aiobotocore-s3" +version = "2.24.2" +summary = "" +dependencies = [ + "typing-extensions; python_full_version < \"3.12\"", +] +files = [ + {file = "types_aiobotocore_s3-2.24.2-py3-none-any.whl", hash = "sha256:b5541e994f9c5f41d551570be8788d4189fb55759f8f7d15a2893b29eae3114f"}, + {file = "types_aiobotocore_s3-2.24.2.tar.gz", hash = "sha256:7274bcca558385ef5f98037f5474f4583663634141d46d9d978ab7d37ca96324"}, +] + [[package]] name = "types-deprecated" version = "1.2.9.20240311" diff --git a/extralit-server/pyproject.toml b/extralit-server/pyproject.toml index a716e5d75..b5f6231ae 100644 --- a/extralit-server/pyproject.toml +++ b/extralit-server/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "Jinja2>=3.1.4", # Used by huggingface-hub to render dataset card templates # For file storage "aioboto3>=13.1.1", + "types-aiobotocore-s3==2.24.2", # For document processing "ocrmypdf>=16.10.4", "pdf2image>=1.17.0", diff --git a/extralit-server/src/extralit_server/contexts/files.py b/extralit-server/src/extralit_server/contexts/files.py index 1ed041c68..076a8d492 100644 --- a/extralit-server/src/extralit_server/contexts/files.py +++ b/extralit-server/src/extralit_server/contexts/files.py @@ -15,7 +15,7 @@ import hashlib import logging from collections.abc import AsyncGenerator -from typing import Any, BinaryIO +from typing import TYPE_CHECKING, Any, BinaryIO from uuid import UUID from botocore.exceptions import ClientError @@ -24,13 +24,16 @@ from extralit_server.api.schemas.v1.files import FileObjectResponse, ListObjectsResponse, ObjectMetadata from extralit_server.helpers import shared_resources +if TYPE_CHECKING: + from types_aiobotocore_s3.client import S3Client + EXCLUDED_VERSIONING_PREFIXES = ["pdf"] CHUNK_LENGTH_MB = 10 * 1024 * 1024 _LOGGER = logging.getLogger(__name__) -async def get_s3_client(): +async def get_s3_client() -> "S3Client": """Dependency function to get shared S3 client.""" s3_client = shared_resources.get("s3_client") if s3_client is None: @@ -45,7 +48,9 @@ async def get_s3_client(): return s3_client -async def get_file_chunk(s3_client, bucket_name: str, key: str, chunk_length: int) -> AsyncGenerator[bytes, None]: +async def get_file_chunk( + s3_client: "S3Client", bucket_name: str, key: str, chunk_length: int +) -> AsyncGenerator[bytes, None]: """Async generator to get file chunks for streaming.""" head = await 
s3_client.head_object(Bucket=bucket_name, Key=key) content_length = head["ContentLength"] @@ -58,19 +63,24 @@ async def get_file_chunk(s3_client, bucket_name: str, key: str, chunk_length: in yield await stream.read() -async def get_object_with_range(s3_client, bucket: str, key: str, start: int, end: int): +async def get_object_with_range(s3_client: "S3Client", bucket: str, key: str, start: int, end: int): """Get S3 object with byte range support for streaming.""" response = await s3_client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}") return response -async def get_object_metadata(s3_client, bucket: str, key: str): +async def get_object_metadata(s3_client: "S3Client", bucket: str, key: str): """Get S3 object metadata using head_object.""" return await s3_client.head_object(Bucket=bucket, Key=key) async def put_object_to_s3( - s3_client, bucket: str, key: str, data: BinaryIO | bytes, content_type: str, metadata: dict[str, Any] | None = None + s3_client: "S3Client", + bucket: str, + key: str, + data: BinaryIO | bytes, + content_type: str, + metadata: dict[str, Any] | None = None, ): """Put object to S3.""" kwargs = { @@ -85,12 +95,15 @@ async def put_object_to_s3( return await s3_client.put_object(**kwargs) -async def delete_object_from_s3(s3_client, bucket: str, key: str): +async def delete_object_from_s3(s3_client: "S3Client", bucket: str, key: str, version_id: str | None = None): """Delete object from S3.""" - return await s3_client.delete_object(Bucket=bucket, Key=key) + kwargs = {"Bucket": bucket, "Key": key} + if version_id: + kwargs["VersionId"] = version_id + return await s3_client.delete_object(**kwargs) -async def list_objects_from_s3(s3_client, bucket: str, prefix: str | None = None): +async def list_objects_from_s3(s3_client: "S3Client", bucket: str, prefix: str | None = None): """List objects in S3 bucket.""" kwargs = {"Bucket": bucket} if prefix: @@ -98,7 +111,7 @@ async def list_objects_from_s3(s3_client, bucket: str, prefix: str | None = None return await s3_client.list_objects_v2(**kwargs) -async def create_bucket_in_s3(s3_client, bucket_name: str): +async def create_bucket_in_s3(s3_client: "S3Client", bucket_name: str): """Create S3 bucket.""" try: return await s3_client.create_bucket(Bucket=bucket_name) @@ -189,7 +202,7 @@ async def get_presigned_url_from_document_url(s3_client, document_url: str, expi async def list_objects( - s3_client, + s3_client: "S3Client", bucket: str, prefix: str | None = None, include_version=True, @@ -201,24 +214,74 @@ async def list_objects( kwargs = {"Bucket": bucket} if prefix: kwargs["Prefix"] = prefix - if start_after: - kwargs["StartAfter"] = start_after + if not recursive: + kwargs["Delimiter"] = "/" - response = await s3_client.list_objects_v2(**kwargs) objects = [] - for obj in response.get("Contents", []): - objects.append( - ObjectMetadata( - bucket_name=bucket, - object_name=obj["Key"], - etag=obj["ETag"].strip('"'), - size=obj["Size"], - last_modified=obj["LastModified"], - content_type="application/octet-stream", # Default, would need head_object for actual - metadata={}, + if include_version: + # Use list_object_versions to get all versions of objects + version_kwargs = {"Bucket": bucket} + if prefix: + version_kwargs["Prefix"] = prefix + if start_after: + version_kwargs["KeyMarker"] = start_after + if not recursive: + version_kwargs["Delimiter"] = "/" + + response = await s3_client.list_object_versions(**version_kwargs) + + # Process versions + for version in response.get("Versions", []): + objects.append( + 
ObjectMetadata( + bucket_name=bucket, + object_name=version.get("Key") or "", + etag=version.get("ETag", "").strip('"'), + size=version.get("Size"), + last_modified=version.get("LastModified"), + content_type="application/octet-stream", # Default, would need head_object for actual + version_id=version.get("VersionId"), + is_latest=version.get("IsLatest", False), + metadata={}, + ) + ) + + # Process delete markers if needed + for delete_marker in response.get("DeleteMarkers", []): + objects.append( + ObjectMetadata( + bucket_name=bucket, + object_name=delete_marker.get("Key") or "", + etag="", # Delete markers don't have ETags + size=0, + last_modified=delete_marker.get("LastModified"), + content_type="application/octet-stream", + version_id=delete_marker.get("VersionId"), + is_latest=delete_marker.get("IsLatest", False), + metadata={}, + ) + ) + else: + # Use list_objects_v2 for current versions only + if start_after: + kwargs["StartAfter"] = start_after + + response = await s3_client.list_objects_v2(**kwargs) + + for obj in response.get("Contents", []): + objects.append( + ObjectMetadata( + bucket_name=bucket, + object_name=obj.get("Key") or "", + etag=obj.get("ETag", "").strip('"'), + size=obj.get("Size"), + last_modified=obj.get("LastModified"), + content_type="application/octet-stream", # Default, would need head_object for actual + is_latest=True, # All objects from list_objects_v2 are latest versions + metadata={}, + ) ) - ) return ListObjectsResponse(objects=objects) except ClientError as e: @@ -227,7 +290,7 @@ async def list_objects( async def get_object( - s3_client, + s3_client: "S3Client", bucket: str, object: str, version_id: str | None = None, @@ -236,10 +299,16 @@ async def get_object( """Get object from S3 and return as FileObjectResponse.""" try: # Get object metadata first - head_response = await s3_client.head_object(Bucket=bucket, Key=object) + head_kwargs = {"Bucket": bucket, "Key": object} + if version_id: + head_kwargs["VersionId"] = version_id + head_response = await s3_client.head_object(**head_kwargs) # Get the actual object - get_response = await s3_client.get_object(Bucket=bucket, Key=object) + get_kwargs = {"Bucket": bucket, "Key": object} + if version_id: + get_kwargs["VersionId"] = version_id + get_response = await s3_client.get_object(**get_kwargs) metadata = ObjectMetadata( bucket_name=bucket, @@ -248,6 +317,7 @@ async def get_object( size=head_response["ContentLength"], last_modified=head_response["LastModified"], content_type=head_response.get("ContentType", "application/octet-stream"), + version_id=head_response.get("VersionId") or version_id, metadata=head_response.get("Metadata", {}), ) @@ -271,7 +341,7 @@ async def get_object( async def put_object( - s3_client, + s3_client: "S3Client", bucket: str, object: str, data: BinaryIO | bytes | str, @@ -313,7 +383,7 @@ async def put_object( async def delete_object(s3_client, bucket: str, object: str, version_id: str | None = None): """Delete object from S3.""" try: - await delete_object_from_s3(s3_client, bucket, object) + await delete_object_from_s3(s3_client, bucket, object, version_id=version_id) except ClientError as e: _LOGGER.error(f"Error deleting object {object} from bucket {bucket}: {e}") raise HTTPException(status_code=500, detail=f"Error deleting file: {e!s}") @@ -323,7 +393,7 @@ async def delete_object(s3_client, bucket: str, object: str, version_id: str | N async def create_bucket( - s3_client, + s3_client: "S3Client", workspace_name: str, excluded_prefixes: list[str] = 
EXCLUDED_VERSIONING_PREFIXES, ): @@ -346,7 +416,7 @@ async def create_bucket( async def put_document_file( - s3_client, + s3_client: "S3Client", workspace_name: str, document_id: UUID, file_data: bytes, diff --git a/extralit-server/src/extralit_server/helpers.py b/extralit-server/src/extralit_server/helpers.py index 4345d36af..91a723b1e 100644 --- a/extralit-server/src/extralit_server/helpers.py +++ b/extralit-server/src/extralit_server/helpers.py @@ -17,16 +17,20 @@ """ import logging +from typing import TYPE_CHECKING import aioboto3 from extralit_server.settings import settings +if TYPE_CHECKING: + from types_aiobotocore_s3.client import S3Client + _LOGGER = logging.getLogger("extralit_server") shared_resources = {} -async def create_s3_client(): +async def create_s3_client() -> "S3Client": """Initialize S3 client with settings configuration.""" if not all([settings.s3_endpoint, settings.s3_access_key, settings.s3_secret_key]): raise ValueError("S3 configuration required: s3_endpoint, s3_access_key, s3_secret_key") diff --git a/extralit/tests/integration/conftest.py b/extralit/tests/integration/conftest.py index 75aae0b8a..53629f1d2 100644 --- a/extralit/tests/integration/conftest.py +++ b/extralit/tests/integration/conftest.py @@ -24,7 +24,7 @@ def client() -> ex.Extralit: client = ex.Extralit() if len(list(client.workspaces)) == 0: - client.workspaces.add(ex.Workspace(name=f"test-{uuid.uuid4()}")) + client.workspaces.add(ex.Workspace(name=f"test_{uuid.uuid4()}")) yield client @@ -33,17 +33,17 @@ def client() -> ex.Extralit: def _cleanup(client: ex.Extralit): for dataset in client.datasets: - if dataset.name.startswith("test-"): + if dataset.name.startswith("test_"): dataset.delete() for workspace in client.workspaces: - if workspace.name.startswith("test-"): + if workspace.name.startswith("test_"): for dataset in workspace.datasets: dataset.delete() workspace.delete() for user in client.users: - if user.username.startswith("test-"): + if user.username.startswith("test_"): user.delete() From 37578726750f46368739321c130ad49c68647e64 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 8 Sep 2025 16:42:18 -0700 Subject: [PATCH 2/8] Enable versioning for S3 buckets during creation in the files context --- extralit-server/src/extralit_server/contexts/files.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/extralit-server/src/extralit_server/contexts/files.py b/extralit-server/src/extralit_server/contexts/files.py index 076a8d492..c189e3be7 100644 --- a/extralit-server/src/extralit_server/contexts/files.py +++ b/extralit-server/src/extralit_server/contexts/files.py @@ -401,8 +401,13 @@ async def create_bucket( try: await create_bucket_in_s3(s3_client, workspace_name) - # Note: Versioning configuration would need separate implementation - # depending on S3 provider support + await s3_client.put_bucket_versioning( + Bucket=workspace_name, + VersioningConfiguration={ + "Status": "Enabled", + "MFADelete": "Disabled", + }, + ) except ClientError as e: if e.response["Error"]["Code"] in ["BucketAlreadyOwnedByYou", "BucketAlreadyExists"]: From b72539971d0426e2bd506207db108d2d3d0b6c06 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 8 Sep 2025 16:49:18 -0700 Subject: [PATCH 3/8] refactor --- .../src/extralit_server/contexts/files.py | 51 +++---------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/extralit-server/src/extralit_server/contexts/files.py b/extralit-server/src/extralit_server/contexts/files.py index c189e3be7..7318c0f6e 100644 --- 
a/extralit-server/src/extralit_server/contexts/files.py +++ b/extralit-server/src/extralit_server/contexts/files.py @@ -63,18 +63,7 @@ async def get_file_chunk( yield await stream.read() -async def get_object_with_range(s3_client: "S3Client", bucket: str, key: str, start: int, end: int): - """Get S3 object with byte range support for streaming.""" - response = await s3_client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}") - return response - - -async def get_object_metadata(s3_client: "S3Client", bucket: str, key: str): - """Get S3 object metadata using head_object.""" - return await s3_client.head_object(Bucket=bucket, Key=key) - - -async def put_object_to_s3( +async def _put_object_to_s3( s3_client: "S3Client", bucket: str, key: str, @@ -95,33 +84,6 @@ async def put_object_to_s3( return await s3_client.put_object(**kwargs) -async def delete_object_from_s3(s3_client: "S3Client", bucket: str, key: str, version_id: str | None = None): - """Delete object from S3.""" - kwargs = {"Bucket": bucket, "Key": key} - if version_id: - kwargs["VersionId"] = version_id - return await s3_client.delete_object(**kwargs) - - -async def list_objects_from_s3(s3_client: "S3Client", bucket: str, prefix: str | None = None): - """List objects in S3 bucket.""" - kwargs = {"Bucket": bucket} - if prefix: - kwargs["Prefix"] = prefix - return await s3_client.list_objects_v2(**kwargs) - - -async def create_bucket_in_s3(s3_client: "S3Client", bucket_name: str): - """Create S3 bucket.""" - try: - return await s3_client.create_bucket(Bucket=bucket_name) - except ClientError as e: - if e.response["Error"]["Code"] in ["BucketAlreadyOwnedByYou", "BucketAlreadyExists"]: - pass # Bucket already exists, that's fine - else: - raise - - def compute_hash(data: bytes) -> str: return hashlib.md5(data).hexdigest() @@ -357,7 +319,7 @@ async def put_object( data = data.read() # Upload to S3 - await put_object_to_s3(s3_client, bucket, object, data, content_type, metadata) + await _put_object_to_s3(s3_client, bucket, object, data, content_type, metadata) # Get metadata for response head_response = await s3_client.head_object(Bucket=bucket, Key=object) @@ -383,7 +345,10 @@ async def put_object( async def delete_object(s3_client, bucket: str, object: str, version_id: str | None = None): """Delete object from S3.""" try: - await delete_object_from_s3(s3_client, bucket, object, version_id=version_id) + kwargs = {"Bucket": bucket, "Key": object} + if version_id: + kwargs["VersionId"] = version_id + await s3_client.delete_object(**kwargs) except ClientError as e: _LOGGER.error(f"Error deleting object {object} from bucket {bucket}: {e}") raise HTTPException(status_code=500, detail=f"Error deleting file: {e!s}") @@ -399,7 +364,7 @@ async def create_bucket( ): """Create S3 bucket.""" try: - await create_bucket_in_s3(s3_client, workspace_name) + await s3_client.create_bucket(Bucket=workspace_name) await s3_client.put_bucket_versioning( Bucket=workspace_name, @@ -462,7 +427,7 @@ async def put_document_file( should_upload = False if should_upload: - await put_object_to_s3( + await _put_object_to_s3( s3_client, workspace_name, object_path, From 52d64ac2cb3c1c66bf291e6907ae23d7deec8e33 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 8 Sep 2025 17:12:36 -0700 Subject: [PATCH 4/8] Add LocalFileClient for local file storage emulating S3Client interface, including methods for bucket and object management, versioning, and metadata handling. Enhance create_s3_client to fallback to LocalFileClient if S3 is not configured. 
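As a rough usage sketch (not part of this patch), the local backend is meant to be exercised through the same call shapes the server already issues against a real S3 client. The storage directory below is a made-up example, and the snippet assumes the patched `extralit_server.helpers` module and its `aiofiles` dependency are importable:

```python
# Minimal sketch, assuming the LocalFileClient added in this patch; the base
# directory "/tmp/extralit-local-store" is a hypothetical example path.
import asyncio

from extralit_server.helpers import LocalFileClient


async def demo() -> None:
    client = LocalFileClient("/tmp/extralit-local-store")

    # Same keyword arguments the server passes to a real S3 client.
    await client.create_bucket(Bucket="workspace-demo")
    await client.put_bucket_versioning(
        Bucket="workspace-demo",
        VersioningConfiguration={"Status": "Enabled"},
    )

    put = await client.put_object(
        Bucket="workspace-demo",
        Key="pdf/paper.pdf",
        Body=b"%PDF-1.7 demo bytes",
        ContentType="application/pdf",
    )

    obj = await client.get_object(Bucket="workspace-demo", Key="pdf/paper.pdf")
    body = await obj["Body"].read()
    print(put["VersionId"], obj["ContentType"], len(body))


asyncio.run(demo())
```

Versions are kept as sibling files under a `.versions/` directory inside each bucket, with per-object `*.metadata.json` sidecars carrying the ETag, content type, and current version id.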
--- .../src/extralit_server/helpers.py | 483 +++++++++++++++++- 1 file changed, 477 insertions(+), 6 deletions(-) diff --git a/extralit-server/src/extralit_server/helpers.py b/extralit-server/src/extralit_server/helpers.py index 91a723b1e..b2fe81979 100644 --- a/extralit-server/src/extralit_server/helpers.py +++ b/extralit-server/src/extralit_server/helpers.py @@ -16,25 +16,496 @@ Common helper functions """ +import hashlib +import io +import json import logging -from typing import TYPE_CHECKING +import os +import uuid +from datetime import datetime +from pathlib import Path +from typing import Any import aioboto3 +import aiofiles +import aiofiles.os +from botocore.exceptions import ClientError +from types_aiobotocore_s3.client import S3Client from extralit_server.settings import settings -if TYPE_CHECKING: - from types_aiobotocore_s3.client import S3Client - _LOGGER = logging.getLogger("extralit_server") shared_resources = {} +def _compute_hash(data: bytes) -> str: + """Compute MD5 hash for data.""" + return hashlib.md5(data).hexdigest() + + +class LocalFileClient(S3Client): + """Local file storage implementation that mimics S3Client interface.""" + + def __init__(self, base_dir: str | Path): + self.base_dir = Path(base_dir) + + async def _ensure_base_dir(self): + """Ensure base directory exists.""" + await aiofiles.os.makedirs(self.base_dir, exist_ok=True) + + def _get_bucket_path(self, bucket_name: str) -> Path: + """Get bucket directory path.""" + return self.base_dir / bucket_name + + def _get_object_path(self, bucket_name: str, key: str) -> Path: + """Get object file path.""" + bucket_path = self._get_bucket_path(bucket_name) + return bucket_path / key + + def _get_version_path(self, bucket_name: str, key: str) -> Path: + """Get versions directory path for an object.""" + bucket_path = self._get_bucket_path(bucket_name) + return bucket_path / ".versions" / key + + def _get_metadata_path(self, bucket_name: str, key: str) -> Path: + """Get metadata file path for an object.""" + object_path = self._get_object_path(bucket_name, key) + return object_path.with_suffix(object_path.suffix + ".metadata.json") + + async def create_bucket(self, Bucket: str, **kwargs) -> dict[str, Any]: + """Create a bucket (directory).""" + bucket_path = self._get_bucket_path(Bucket) + await aiofiles.os.makedirs(bucket_path, exist_ok=True) + # Create versions directory + versions_path = bucket_path / ".versions" + await aiofiles.os.makedirs(versions_path, exist_ok=True) + return {} + + async def put_bucket_versioning(self, Bucket: str, VersioningConfiguration: dict, **kwargs) -> dict[str, Any]: + """Enable bucket versioning (just ensure versions directory exists).""" + bucket_path = self._get_bucket_path(Bucket) + versions_path = bucket_path / ".versions" + await aiofiles.os.makedirs(versions_path, exist_ok=True) + return {} + + async def head_object(self, Bucket: str, Key: str, VersionId: str | None = None, **kwargs) -> dict[str, Any]: + """Get object metadata.""" + try: + if VersionId: + version_path = self._get_version_path(Bucket, Key).with_suffix(f".{VersionId}") + if not version_path.exists(): + raise ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "The specified version does not exist"}}, + "HeadObject", + ) + file_path = version_path + else: + object_path = self._get_object_path(Bucket, Key) + if not object_path.exists(): + raise ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "The specified key does not exist"}}, "HeadObject" + ) + file_path = object_path + + # Get file stats + 
stat_result = await aiofiles.os.stat(file_path) + + # Get metadata + metadata_path = self._get_metadata_path(Bucket, Key) + if metadata_path.exists(): + async with aiofiles.open(metadata_path) as f: + metadata = json.loads(await f.read()) + else: + metadata = {} + + return { + "ContentLength": stat_result.st_size, + "LastModified": datetime.fromtimestamp(stat_result.st_mtime), + "ETag": f'"{metadata.get("etag", "")}"', + "VersionId": VersionId or metadata.get("version_id"), + "ContentType": metadata.get("content_type", "application/octet-stream"), + "Metadata": metadata.get("metadata", {}), + } + + except (FileNotFoundError, OSError): + raise ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "The specified key does not exist"}}, "HeadObject" + ) + + async def get_object( + self, Bucket: str, Key: str, VersionId: str | None = None, Range: str | None = None, **kwargs + ) -> dict[str, Any]: + """Get object content.""" + try: + if VersionId: + version_path = self._get_version_path(Bucket, Key).with_suffix(f".{VersionId}") + if not version_path.exists(): + raise ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "The specified version does not exist"}}, "GetObject" + ) + file_path = version_path + else: + object_path = self._get_object_path(Bucket, Key) + if not object_path.exists(): + raise ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "The specified key does not exist"}}, "GetObject" + ) + file_path = object_path + + # Read file content + async with aiofiles.open(file_path, "rb") as f: + if Range: + # Parse range like "bytes=0-1023" + range_match = Range.replace("bytes=", "").split("-") + start = int(range_match[0]) if range_match[0] else 0 + end = int(range_match[1]) if len(range_match) > 1 and range_match[1] else None + + await f.seek(start) + if end is not None: + content = await f.read(end - start + 1) + else: + content = await f.read() + else: + content = await f.read() + + # Get metadata for response + head_info = await self.head_object(Bucket, Key, VersionId) + + # Create a mock response body that can be read + body = MockAsyncStreamingBody(content) + + return { + "Body": body, + "ContentLength": len(content), + "LastModified": head_info["LastModified"], + "ETag": head_info["ETag"], + "ContentType": head_info["ContentType"], + "VersionId": head_info.get("VersionId"), + "Metadata": head_info.get("Metadata", {}), + } + + except (FileNotFoundError, OSError): + raise ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "The specified key does not exist"}}, "GetObject" + ) + + async def put_object( + self, + Bucket: str, + Key: str, + Body: Any, + ContentType: str = "application/octet-stream", + Metadata: dict | None = None, + **kwargs, + ) -> dict[str, Any]: + """Put object to storage.""" + await self._ensure_base_dir() + bucket_path = self._get_bucket_path(Bucket) + await aiofiles.os.makedirs(bucket_path, exist_ok=True) + + # Convert body to bytes + if hasattr(Body, "read"): + if hasattr(Body, "seek"): + Body.seek(0) + data_bytes = Body.read() + elif isinstance(Body, str): + data_bytes = Body.encode("utf-8") + else: + data_bytes = Body + + # Generate version ID and hash + content_hash = _compute_hash(data_bytes) + version_id = str(uuid.uuid4()) + + # Ensure versions directory exists + version_dir = self._get_version_path(Bucket, Key).parent + await aiofiles.os.makedirs(version_dir, exist_ok=True) + + # Write to version file + version_path = self._get_version_path(Bucket, Key).with_suffix(f".{version_id}") + async with aiofiles.open(version_path, "wb") as f: 
+ await f.write(data_bytes) + + # Update main object path (symlink or copy) + object_path = self._get_object_path(Bucket, Key) + await aiofiles.os.makedirs(object_path.parent, exist_ok=True) + + # Remove existing file/symlink + if object_path.exists(): + await aiofiles.os.remove(object_path) + + # Create symlink to version file + try: + object_path.symlink_to(version_path) + except OSError: + # Fallback to copy if symlink fails + async with aiofiles.open(version_path, "rb") as src, aiofiles.open(object_path, "wb") as dst: + content = await src.read() + await dst.write(content) + + # Save metadata + metadata_info = { + "etag": content_hash, + "content_type": ContentType, + "version_id": version_id, + "metadata": Metadata or {}, + } + + metadata_path = self._get_metadata_path(Bucket, Key) + async with aiofiles.open(metadata_path, "w") as f: + await f.write(json.dumps(metadata_info, default=str)) + + return { + "ETag": f'"{content_hash}"', + "VersionId": version_id, + } + + async def delete_object(self, Bucket: str, Key: str, VersionId: str | None = None, **kwargs) -> dict[str, Any]: + """Delete object or specific version.""" + if VersionId: + version_path = self._get_version_path(Bucket, Key).with_suffix(f".{VersionId}") + if version_path.exists(): + await aiofiles.os.remove(version_path) + else: + object_path = self._get_object_path(Bucket, Key) + if object_path.exists(): + await aiofiles.os.remove(object_path) + + # Remove metadata + metadata_path = self._get_metadata_path(Bucket, Key) + if metadata_path.exists(): + await aiofiles.os.remove(metadata_path) + + return {} + + async def list_objects_v2( + self, + Bucket: str, + Prefix: str | None = None, + Delimiter: str | None = None, + StartAfter: str | None = None, + **kwargs, + ) -> dict[str, Any]: + """List current objects in bucket.""" + bucket_path = self._get_bucket_path(Bucket) + if not bucket_path.exists(): + return {"Contents": []} + + contents = [] + + # Recursively find files if no delimiter, otherwise only direct children + if Delimiter: + search_path = bucket_path / (Prefix or "") + else: + search_path = bucket_path + + try: + if search_path.exists(): + # Use Path.rglob for recursive search + if Delimiter: + files = [f for f in search_path.iterdir() if f.is_file()] + else: + files = [f for f in search_path.rglob("*") if f.is_file()] + + for file_path in files: + # Skip metadata files and version files + if file_path.name.endswith(".metadata.json") or ".versions" in str( + file_path.relative_to(bucket_path) + ): + continue + + key = str(file_path.relative_to(bucket_path)) + + # Apply prefix filter + if Prefix and not key.startswith(Prefix): + continue + + # Apply start_after filter + if StartAfter and key <= StartAfter: + continue + + try: + stat_result = await aiofiles.os.stat(file_path) + + # Try to get metadata + metadata_path = self._get_metadata_path(Bucket, key) + etag = "" + if metadata_path.exists(): + async with aiofiles.open(metadata_path) as f: + metadata = json.loads(await f.read()) + etag = metadata.get("etag", "") + + contents.append( + { + "Key": key, + "LastModified": datetime.fromtimestamp(stat_result.st_mtime), + "ETag": f'"{etag}"', + "Size": stat_result.st_size, + } + ) + except (OSError, json.JSONDecodeError): + continue + + except OSError: + pass + + # Sort by key + contents.sort(key=lambda x: x["Key"]) + + return {"Contents": contents} + + async def list_object_versions( + self, + Bucket: str, + Prefix: str | None = None, + Delimiter: str | None = None, + KeyMarker: str | None = None, + **kwargs, + ) -> 
dict[str, Any]: + """List all versions of objects.""" + bucket_path = self._get_bucket_path(Bucket) + versions_path = bucket_path / ".versions" + + versions = [] + delete_markers = [] + + if not versions_path.exists(): + return {"Versions": versions, "DeleteMarkers": delete_markers} + + try: + # Find all version files + for version_file in versions_path.rglob("*"): + if not version_file.is_file(): + continue + + # Parse version file name (key.version_id) + relative_path = version_file.relative_to(versions_path) + key_parts = str(relative_path).rsplit(".", 1) + if len(key_parts) != 2: + continue + + key, version_id = key_parts + + # Apply prefix filter + if Prefix and not key.startswith(Prefix): + continue + + # Apply key marker filter + if KeyMarker and key <= KeyMarker: + continue + + try: + stat_result = await aiofiles.os.stat(version_file) + + # Check if this is the latest version + current_object = self._get_object_path(Bucket, key) + is_latest = False + if current_object.exists(): + # Get current version from metadata + metadata_path = self._get_metadata_path(Bucket, key) + if metadata_path.exists(): + async with aiofiles.open(metadata_path) as f: + metadata = json.loads(await f.read()) + is_latest = metadata.get("version_id") == version_id + + # Get etag from version metadata (or compute from file) + etag = "" + version_metadata_path = version_file.with_suffix(version_file.suffix + ".metadata.json") + if version_metadata_path.exists(): + async with aiofiles.open(version_metadata_path) as f: + metadata = json.loads(await f.read()) + etag = metadata.get("etag", "") + else: + # Fallback: compute from file content + async with aiofiles.open(version_file, "rb") as f: + content = await f.read() + etag = _compute_hash(content) + + versions.append( + { + "Key": key, + "VersionId": version_id, + "IsLatest": is_latest, + "LastModified": datetime.fromtimestamp(stat_result.st_mtime), + "ETag": f'"{etag}"', + "Size": stat_result.st_size, + } + ) + + except (OSError, json.JSONDecodeError): + continue + + except OSError: + pass + + # Sort by key and version + versions.sort(key=lambda x: (x["Key"], x["LastModified"]), reverse=True) + + return {"Versions": versions, "DeleteMarkers": delete_markers} + + async def delete_bucket(self, Bucket: str, **kwargs) -> dict[str, Any]: + """Delete bucket and all its contents.""" + bucket_path = self._get_bucket_path(Bucket) + + if bucket_path.exists(): + # Remove all files recursively + import shutil + + shutil.rmtree(bucket_path) + + return {} + + async def generate_presigned_url( + self, ClientMethod: str, Params: dict | None = None, ExpiresIn: int = 3600, **kwargs + ) -> str: + """Generate a presigned URL (return local file path for local storage).""" + if not Params: + return "" + + bucket = Params.get("Bucket", "") + key = Params.get("Key", "") + + # For local files, return a proxy URL that matches the expected format + return f"/api/v1/file/{bucket}/{key}" + + # Additional methods needed for compatibility + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + pass + + +class MockAsyncStreamingBody: + """Mock streaming body for local file content.""" + + def __init__(self, content: bytes): + self._content = content + self._stream = io.BytesIO(content) + + async def read(self, amt: int | None = None) -> bytes: + """Read content.""" + return self._stream.read(amt) + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + pass + + async def create_s3_client() -> 
"S3Client": - """Initialize S3 client with settings configuration.""" + """Initialize S3 client with settings configuration or LocalFileClient as fallback.""" + # Check if S3 is configured if not all([settings.s3_endpoint, settings.s3_access_key, settings.s3_secret_key]): - raise ValueError("S3 configuration required: s3_endpoint, s3_access_key, s3_secret_key") + # Use local file storage as fallback + _LOGGER.info("S3 not configured, using local file storage at %s", settings.home_path) + local_client = LocalFileClient(settings.home_path or os.path.expanduser("~/.extralit")) + await local_client._ensure_base_dir() + shared_resources["s3_client"] = local_client + return local_client + # Use real S3 client session = aioboto3.Session( aws_access_key_id=settings.s3_access_key, aws_secret_access_key=settings.s3_secret_key, From cd3e3dd78060f161d8d094ec76d9edac94e4343f Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 8 Sep 2025 17:24:10 -0700 Subject: [PATCH 5/8] Update GitHub Actions workflow to use the latest extralit-hf-space image and remove MinIO service configuration --- .github/workflows/extralit.yml | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/.github/workflows/extralit.yml b/.github/workflows/extralit.yml index 7ad51606a..269143329 100644 --- a/.github/workflows/extralit.yml +++ b/.github/workflows/extralit.yml @@ -30,7 +30,7 @@ jobs: if: github.event.pull_request.draft == false services: extralit-container: - image: extralitdev/extralit-hf-space:develop + image: extralitdev/extralit-hf-space:latest ports: - 6900:6900 env: @@ -39,19 +39,6 @@ jobs: USERNAME: extralit PASSWORD: 12345678 API_KEY: extralit.apikey - EXTRALIT_S3_ENDPOINT: "http://minio:9000" - EXTRALIT_S3_ACCESS_KEY: "minioadmin" - EXTRALIT_S3_SECRET_KEY: "minioadmin" - minio: - image: lazybit/minio - volumes: - - /data:/data - env: - MINIO_ACCESS_KEY: minioadmin - MINIO_SECRET_KEY: minioadmin - options: --name=minio --health-cmd "curl http://localhost:9000/minio/health/live" --health-interval=30s --health-timeout=10s --health-retries=3 - ports: - - 9000:9000 runs-on: ubuntu-22.04 defaults: run: From ea2edcce7f88b0e9f4aafc711208449e64b0df2a Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Mon, 8 Sep 2025 17:36:31 -0700 Subject: [PATCH 6/8] Update GitHub Actions workflow to disable telemetry and remove local storage directory creation in extralit.yml; modify Dockerfile to install only postgresql dependencies. 
--- .../extralit-server.build-docker-images.yml | 2 +- .github/workflows/extralit.yml | 5 +- .kiro/steering/product.md | 22 -- .kiro/steering/structure.md | 225 ------------------ .kiro/steering/tech.md | 130 ---------- extralit-server/docker/server/Dockerfile | 2 +- 6 files changed, 3 insertions(+), 383 deletions(-) delete mode 100644 .kiro/steering/product.md delete mode 100644 .kiro/steering/structure.md delete mode 100644 .kiro/steering/tech.md diff --git a/.github/workflows/extralit-server.build-docker-images.yml b/.github/workflows/extralit-server.build-docker-images.yml index c69463a69..ee16416ec 100644 --- a/.github/workflows/extralit-server.build-docker-images.yml +++ b/.github/workflows/extralit-server.build-docker-images.yml @@ -124,6 +124,6 @@ jobs: with: username: ${{ env.DOCKER_USERNAME }} password: ${{ env.DOCKER_PASSWORD }} - repository: $${{ env.SERVER_DOCKER_IMAGE }} + repository: ${{ env.SERVER_DOCKER_IMAGE }} readme-filepath: extralit-server/docker/server/README.md diff --git a/.github/workflows/extralit.yml b/.github/workflows/extralit.yml index 269143329..6603a4d30 100644 --- a/.github/workflows/extralit.yml +++ b/.github/workflows/extralit.yml @@ -34,7 +34,7 @@ jobs: ports: - 6900:6900 env: - EXTRALIT_ENABLE_TELEMETRY: 0 + HF_HUB_DISABLE_TELEMETRY: 1 # Set credentials USERNAME: extralit PASSWORD: 12345678 @@ -82,9 +82,6 @@ jobs: # Stop log streaming kill $LOGS_PID || true - # Create a directory for local storage that the container can access - mkdir -p /tmp/extralit-files - chmod -R 777 /tmp/extralit-files - name: Set huggingface hub credentials run: | echo "HF_TOKEN_EXTRALIT_INTERNAL_TESTING=${{ secrets.HF_TOKEN_EXTRALIT_INTERNAL_TESTING }}" >> "$GITHUB_ENV" diff --git a/.kiro/steering/product.md b/.kiro/steering/product.md deleted file mode 100644 index 5931efee0..000000000 --- a/.kiro/steering/product.md +++ /dev/null @@ -1,22 +0,0 @@ -# Product Overview - -Extralit (EXTRAct LITerature) is a data extraction workflow platform designed for **LLM-assisted scientific data extraction** and **unstructured document intelligence** tasks. Built on top of Argilla, it extends capabilities with enhanced data extraction, validation, and human-in-the-loop workflows. 
- -## Core Value Proposition -- **Precision First**: Built for high data accuracy, ensuring reliable results -- **Human-in-the-Loop**: Seamlessly integrate human annotations to refine LLM outputs -- **Flexible & Scalable**: Available as Python SDK, CLI, and Web UI with multiple deployment options - -## Key Features -- **Schema-Driven Extraction**: Define structured schemas for context-aware, high-accuracy data extraction -- **Advanced PDF Processing**: AI-powered OCR detects complex table structures in both digital and scanned PDFs -- **Built-in Validation**: Automatically verify extracted data for accuracy -- **User-Friendly Interface**: Review, edit, and validate data with team-based consensus workflows -- **Data Flywheel**: Collect human annotations to monitor performance and build fine-tuning datasets - -## Target Use Cases -- Scientific literature data extraction -- Document intelligence tasks -- PDF processing and table extraction -- Research data validation and annotation -- Academic paper analysis and bibliography management \ No newline at end of file diff --git a/.kiro/steering/structure.md b/.kiro/steering/structure.md deleted file mode 100644 index 3b1319c1a..000000000 --- a/.kiro/steering/structure.md +++ /dev/null @@ -1,225 +0,0 @@ -# Project Structure - -## Repository Organization -This is a monorepo containing multiple related packages: - -``` -extralit/ -├── extralit-server/ # FastAPI backend server -├── extralit-frontend/ # Nuxt.js web UI -├── extralit/ # Python SDK and CLI -├── examples/ # Usage examples and deployments -└── .kiro/ # Kiro AI assistant configuration -``` - -## Backend Structure (extralit-server/) -``` -extralit-server/ -├── src/extralit_server/ -│ ├── api/ # FastAPI routes and handlers -│ │ ├── handlers/ # Request handlers by version -│ │ └── schemas/ # Pydantic models for API -│ ├── contexts/ # Business logic contexts -│ ├── models/ # SQLAlchemy database models -│ ├── jobs/ # Background job definitions -│ ├── cli/ # CLI commands -│ └── alembic/ # Database migrations -├── tests/ # Test suite -├── docker/ # Docker configurations -└── pyproject.toml # PDM configuration -``` - -### Key Backend Patterns -- **API Handlers**: Located in `api/handlers/v1/` - one file per resource -- **Database Models**: In `models/database.py` - SQLAlchemy models -- **Business Logic**: In `contexts/` - domain-specific logic -- **Background Jobs**: In `jobs/` - RQ job definitions -- **Migrations**: Use Alembic in `alembic/versions/` - -## Frontend Structure (extralit-frontend/) -``` -extralit-frontend/ -├── components/ -│ ├── base/ # Reusable UI components -│ └── features/ # Feature-specific components -├── pages/ # Nuxt.js pages (routes) -├── plugins/ # Vue plugins and extensions -├── assets/ # Static assets (SCSS, icons) -├── translation/ # i18n language files -├── v1/ # Domain and Infrastructure layers -│ ├── domain/ # Domain logic (entities, events, services, usecases) -│ │ ├── entities/ -│ │ ├── events/ -│ │ ├── services/ -│ │ └── usecases/ -│ └── infrastructure/ # Infrastructure implementations (events, repositories, services, storage, types) -│ ├── events/ -│ ├── repositories/ -│ ├── services/ -│ ├── storage/ -│ └── types/ -├── e2e/ # Playwright e2e tests -└── package.json # npm configuration -``` - -### Existing Auto-Imported Components - -, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , - - -### Key Frontend Patterns -- **Components**: Base components in `components/base/`, 
feature components in `components/features/` -- **Pages**: Nuxt.js file-based routing in `pages/` -- **Stores**: Pinia stores in `v1/store/` -- **Domain Logic**: Dependency injection in `v1/di/` -- **Axios**: @nuxt/axios makes API calls with `{proxy: true, browserBaseURL: "api"}` -- **Dependency Injection**: Use `useResolve` from `ts-injecty` for dependency resolution in use cases -- **View Models**: Use the simple function return pattern inspired by `useHomeViewModel.ts` - ```typescript - export const useMyViewModel = (props) => { - const dependency = useResolve(MyUseCase); - - const methodOne = () => { - // implementation - }; - - const methodTwo = async (param) => { - // implementation - }; - - return { - dependency, - methodOne, - methodTwo, - // ... all public methods and properties - }; - }; - ``` -- **Component Setup**: Components use `setup(props) { return useViewModelName(props); }` pattern -- **Styling**: SCSS in `assets/scss/` with component-scoped styles -- **Base Components**: BaseSimpleTable.vue already exists for tabular data display - -### Jest Testing Patterns -- **Test Files**: Place `.spec.js` files next to the component they test -- **Mock Setup**: Define mocks inline within `jest.mock()` calls to avoid hoisting issues: - ```javascript - // Mock dependencies inline to avoid hoisting issues - jest.mock("ts-injecty", () => ({ - useResolve: jest.fn(() => mockUseCase), - })); - - jest.mock("@nuxtjs/composition-api", () => ({ - ref: jest.fn(), - computed: jest.fn(), - watch: jest.fn(), - onMounted: jest.fn(), - })); - ``` -- **Mock Configuration**: Set up mocks in `beforeEach` by getting them from required modules: - ```javascript - beforeEach(() => { - jest.clearAllMocks(); - - const compositionApi = require("@nuxtjs/composition-api"); - mockRef = compositionApi.ref; - mockComputed = compositionApi.computed; - // Configure mock behavior... 
- }); - ``` -- **Component Stubs**: Use stubs in the mount options for base components: - ```javascript - wrapper = mount(ComponentName, { - propsData: { /* props */ }, - stubs: { - "BaseButton": { - template: '', - props: ["variant", "disabled", "loading"], - }, - "BaseIcon": true, - "BaseFlowModal": true, - }, - }); - ``` -- **Global Mocks**: Mock browser APIs and global functions: - ```javascript - beforeEach(() => { - // Mock window.confirm for modal dialogs - global.confirm = jest.fn(() => true); - // Mock other browser APIs as needed - global.alert = jest.fn(); - }); - - afterEach(() => { - jest.restoreAllMocks(); - }); - ``` -- **View Model Testing**: Test the public interface rather than internal implementation: - ```javascript - // Test computed properties return expected values - expect(computedFn()).toBe("expected-value"); - - // Test methods exist and are callable - expect(typeof viewModel.methodName).toBe("function"); - expect(viewModel.methodName).toBeDefined(); - - // Test reactive state objects are returned - expect(viewModel.property).toBe(mockRefObject); - ``` -- **Test Structure**: - - Use `beforeEach` to reset mock state between tests - - Use `afterEach` to clean up mocks and destroy wrappers - - Group related tests in `describe` blocks - - Test public interfaces, not internal implementation details -- **Props Testing**: Test component behavior with different prop combinations -- **Event Testing**: Verify component emits correct events with proper data -- **State Testing**: Test computed properties and reactive state changes -- **User Interaction**: Mock user actions and verify component responses -- **Error Handling**: Test error states and error recovery -- **Lifecycle Testing**: Test component mounting, updating, and destruction -- **Async Testing**: Use `async/await` for asynchronous operations -- **Mock Validation**: Ensure mocks match actual component interfaces - -## Client SDK Structure (extralit/) -``` -extralit/ -├── src/ -│ ├── extralit/ # Main SDK package -│ │ ├── cli/ # CLI commands -│ │ └── client/ # API client -│ └── extralit/ # Extralit-specific extensions -├── tests/ # Test suite -├── docs/ # Documentation -└── pyproject.toml # PDM configuration -``` - -## Examples and Deployments -``` -examples/ -├── custom_field/ # Custom field examples -├── document_extraction/ # Document processing examples -├── deployments/ -│ ├── docker/ # Docker Compose setups -│ └── k8s/ # Kubernetes manifests -└── webhooks/ # Webhook integration examples -``` - -## Configuration Files -- **Backend**: `extralit-server/pyproject.toml` (PDM), `.env.dev`, `.env.test` -- **Frontend**: `extralit-frontend/package.json` (npm), `nuxt.config.ts` -- **SDK**: `extralit/pyproject.toml` (PDM) -- **Docker**: `docker-compose.yaml` for local development -- **K8s**: `Tiltfile` for Kubernetes development - -## Development Workflow -1. **Backend changes**: Work in `extralit-server/src/extralit_server/` -2. **Frontend changes**: Work in `extralit-frontend/components/` or `extralit-frontend/pages/` -3. **SDK changes**: Work in `extralit/src/extralit/` -4. **Tests**: Each package has its own `tests/` directory -5. 
**Documentation**: Use `extralit/docs/` for SDK docs - -## File Naming Conventions -- **Python**: snake_case for files and modules -- **Vue/TypeScript**: PascalCase for components, camelCase for utilities -- **API endpoints**: kebab-case in URLs, snake_case in Python -- **Database**: snake_case for tables and columns -- **CSS classes**: kebab-case with BEM methodology where applicable diff --git a/.kiro/steering/tech.md b/.kiro/steering/tech.md deleted file mode 100644 index 1de5b217e..000000000 --- a/.kiro/steering/tech.md +++ /dev/null @@ -1,130 +0,0 @@ ---- -inclusion: always ---- - -# Technology Stack & Development Guidelines - -## Architecture Overview -Extralit is a monorepo with 3 main packages: -- **extralit-server/**: FastAPI backend with PostgreSQL/SQLAlchemy -- **extralit-frontend/**: Nuxt.js 2.17 (Vue.js 2.7) web UI -- **extralit/**: Python SDK and CLI - -## Code Style & Conventions - -### Python (Backend & SDK) -- **Naming**: snake_case for files, modules, functions, variables -- **Type Hints**: Always use type hints with Pydantic models -- **Async/Await**: Use async patterns for database and external API calls -- **Error Handling**: Use custom exceptions from `_exceptions` modules -- **Database**: SQLAlchemy 2.0 async patterns, Alembic migrations -- **API**: FastAPI with Pydantic schemas, dependency injection -- **Build**: PDM for dependency management, not pip - -### Frontend (Vue.js/Nuxt.js) -- **Naming**: PascalCase for components, camelCase for methods/props -- **Components**: Auto-imported from `~/components` directory -- **TypeScript**: Use `