diff --git a/extralit-server/.env.example b/extralit-server/.env.example new file mode 100644 index 000000000..57516092b --- /dev/null +++ b/extralit-server/.env.example @@ -0,0 +1,118 @@ +# ============================================================================== +# Extralit Server Configuration Example +# ============================================================================== +# Copy this file to .env and configure the values for your deployment. +# Lines starting with # are comments and will be ignored. +# ============================================================================== + +# ------------------------------------------------------------------------------ +# Client API Configuration +# ------------------------------------------------------------------------------ +# Use these when running the Extralit client SDK to connect to this server +# EXTRALIT_API_URL=http://localhost:6900 +# EXTRALIT_API_KEY=your-api-key-here + +# ------------------------------------------------------------------------------ +# Authentication & Security +# ------------------------------------------------------------------------------ +# Secret key for JWT token signing - CHANGE THIS IN PRODUCTION! 
+# Generate with: python -c "import secrets; print(secrets.token_urlsafe(32))" +# EXTRALIT_AUTH_SECRET_KEY=change-this-to-a-random-secret-key + +# Local users database file (for non-OAuth authentication) +EXTRALIT_LOCAL_AUTH_USERS_DB_FILE=.users.yml + +# ------------------------------------------------------------------------------ +# Database Configuration +# ------------------------------------------------------------------------------ +# SQLite (default, good for development) +EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False + +# PostgreSQL (recommended for production) +# EXTRALIT_DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/extralit + +# ------------------------------------------------------------------------------ +# Redis Configuration +# ------------------------------------------------------------------------------ +# Redis is used for caching and job queues (Celery/RQ) +EXTRALIT_REDIS_URL=redis://localhost:6379/0 + +# ------------------------------------------------------------------------------ +# S3-Compatible Object Storage (Optional) +# ------------------------------------------------------------------------------ +# Configure S3 or S3-compatible storage (MinIO, DigitalOcean Spaces, etc.) 
+# ENDPOINT, ACCESS_KEY and SECRET_KEY are all required if you want to use S3 storage; REGION and SECURE are optional +# EXTRALIT_S3_ENDPOINT=http://localhost:9000 +# EXTRALIT_S3_ACCESS_KEY=minioadmin +# EXTRALIT_S3_SECRET_KEY=minioadmin +# EXTRALIT_S3_REGION=us-east-1 +# EXTRALIT_S3_SECURE=false + +# MinIO-specific (alternative to EXTRALIT_S3_*) +# MINIO_ACCESS_KEY=minioadmin +# MINIO_SECRET_KEY=minioadmin + +# ------------------------------------------------------------------------------ +# Search Engine Configuration (Optional) +# ------------------------------------------------------------------------------ +# EXTRALIT_SEARCH_ENGINE=elasticsearch +# EXTRALIT_ELASTICSEARCH=http://localhost:9200 + +# ------------------------------------------------------------------------------ +# Marker PDF Processing Configuration +# ------------------------------------------------------------------------------ +# How to run Marker: "local" (in-process) or "modal" (remote API) +MARKER_RUN_MODE=local + +# Required when MARKER_RUN_MODE=modal +# MARKER_MODAL_BASE_URL=https://your-modal-deployment.modal.run +# MARKER_MODAL_TIMEOUT_SECS=600 + +# ------------------------------------------------------------------------------ +# Document Preprocessing Configuration +# ------------------------------------------------------------------------------ +PREPROCESSING_ENABLED=true +PREPROCESSING_ENABLE_ANALYSIS=true +PREPROCESSING_ROTATE_PAGES=true +PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 +PREPROCESSING_CLEAN=false +PREPROCESSING_QUIET=false + +# ------------------------------------------------------------------------------ +# Chat & Message Validation +# ------------------------------------------------------------------------------ +EXTRALIT_MIN_MESSAGE_LENGTH=1 +EXTRALIT_MAX_MESSAGE_LENGTH=20000 +EXTRALIT_MIN_ROLE_LENGTH=1 +EXTRALIT_MAX_ROLE_LENGTH=20 + +# ------------------------------------------------------------------------------ +# HuggingFace & Telemetry +# ------------------------------------------------------------------------------ +# 
Disable HuggingFace Hub telemetry collection +HF_HUB_DISABLE_TELEMETRY=true + +# ------------------------------------------------------------------------------ +# Weaviate Cloud Services (Optional) +# ------------------------------------------------------------------------------ +# WCS_HTTP_URL=https://your-cluster.weaviate.network +# WCS_GRPC_URL=grpc://your-cluster.weaviate.network:50051 +# WCS_API_KEY=your-wcs-api-key +# WCS_USERNAME=your-username +# WCS_PASSWORD=your-password + +# ------------------------------------------------------------------------------ +# LLM Service Configuration (Optional) +# ------------------------------------------------------------------------------ +# EXTRALIT_EXTRALIT_URL=http://localhost:8000 + +# ------------------------------------------------------------------------------ +# macOS-Specific Configuration +# ------------------------------------------------------------------------------ +# Disable Objective-C fork safety warnings (macOS only) +# OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES + +# ------------------------------------------------------------------------------ +# Alembic Database Migrations +# ------------------------------------------------------------------------------ +# ALEMBIC_CONFIG=alembic.ini diff --git a/extralit-server/src/extralit_server/api/schemas/v1/chat.py b/extralit-server/src/extralit_server/api/schemas/v1/chat.py index 201fa571e..3a65a9890 100644 --- a/extralit-server/src/extralit_server/api/schemas/v1/chat.py +++ b/extralit-server/src/extralit_server/api/schemas/v1/chat.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os - from pydantic import BaseModel, Field -MIN_MESSAGE_LENGTH = int(os.getenv("EXTRALIT_MIN_MESSAGE_LENGTH", 1)) -MAX_MESSAGE_LENGTH = int(os.getenv("EXTRALIT_MAX_MESSAGE_LENGTH", 20000)) +from extralit_server.config import settings + +MIN_MESSAGE_LENGTH = settings.EXTRALIT_MIN_MESSAGE_LENGTH +MAX_MESSAGE_LENGTH = settings.EXTRALIT_MAX_MESSAGE_LENGTH -MIN_ROLE_LENGTH = int(os.getenv("EXTRALIT_MIN_ROLE_LENGTH", 1)) -MAX_ROLE_LENGTH = int(os.getenv("EXTRALIT_MAX_ROLE_LENGTH", 20)) +MIN_ROLE_LENGTH = settings.EXTRALIT_MIN_ROLE_LENGTH +MAX_ROLE_LENGTH = settings.EXTRALIT_MAX_ROLE_LENGTH class ChatFieldValue(BaseModel): diff --git a/extralit-server/src/extralit_server/cli/database/users/migrate.py b/extralit-server/src/extralit_server/cli/database/users/migrate.py index 173a2ade9..5426d279d 100644 --- a/extralit-server/src/extralit_server/cli/database/users/migrate.py +++ b/extralit-server/src/extralit_server/cli/database/users/migrate.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import asyncio -import os from typing import TYPE_CHECKING, Optional import typer import yaml from pydantic import BaseModel, Field, constr +from extralit_server.config import settings from extralit_server.database import AsyncSessionLocal from extralit_server.models import User, UserRole @@ -107,7 +107,7 @@ def _user_workspace_names(self, user: dict) -> list[str]: def migrate(): """Migrate users defined in YAML file to database.""" - users_db_file: str = os.getenv("EXTRALIT_LOCAL_AUTH_USERS_DB_FILE", ".users.yml") + users_db_file: str = settings.EXTRALIT_LOCAL_AUTH_USERS_DB_FILE asyncio.run(UsersMigrator(users_db_file).migrate()) diff --git a/extralit-server/src/extralit_server/config.py b/extralit-server/src/extralit_server/config.py new file mode 100644 index 000000000..7489668a6 --- /dev/null +++ b/extralit-server/src/extralit_server/config.py @@ -0,0 +1,175 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Centralized configuration management for Extralit Server. + +This module provides a Pydantic-based settings class that loads configuration +from environment variables and .env files. All environment variable access +should go through the `settings` object to ensure type safety and validation. 
+ +Usage: + from extralit_server.config import settings + + # Access settings + db_url = settings.EXTRALIT_DATABASE_URL + api_key = settings.EXTRALIT_API_KEY.get_secret_value() # For SecretStr fields +""" + +from typing import Optional + +from pydantic import Field, HttpUrl, SecretStr, field_validator, model_validator +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """ + Application settings loaded from environment variables. + + Settings are loaded from: + 1. Environment variables + 2. .env file (if present) + 3. Default values defined in field declarations + + Most settings use the EXTRALIT_ prefix, but some third-party integrations + (Marker, MinIO, etc.) use their own naming conventions. + + For sensitive values (API keys, secrets), use the .get_secret_value() method + to access the underlying string value. + """ + + model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") + + # Client API Configuration + EXTRALIT_API_URL: Optional[HttpUrl] = Field( + default=None, description="URL of the Extralit API server for client connections" + ) + EXTRALIT_API_KEY: Optional[SecretStr] = Field( + default=None, description="API key for authenticating with Extralit server" + ) + + # Server Configuration + OBJC_DISABLE_INITIALIZE_FORK_SAFETY: Optional[str] = Field( + default=None, description="macOS-specific setting to disable Objective-C fork safety warnings" + ) + ALEMBIC_CONFIG: Optional[str] = Field(default=None, description="Path to Alembic configuration file") + EXTRALIT_AUTH_SECRET_KEY: Optional[SecretStr] = Field( + default=None, description="Secret key for JWT token signing and authentication" + ) + EXTRALIT_DATABASE_URL: str = Field( + default="sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False", + description="Database connection URL (supports SQLite and PostgreSQL)", + ) + HF_HUB_DISABLE_TELEMETRY: bool = Field(default=True, description="Disable HuggingFace Hub 
telemetry collection") + # S3 Storage Configuration + EXTRALIT_S3_ENDPOINT: Optional[HttpUrl] = Field( + default=None, description="S3-compatible storage endpoint URL (e.g., MinIO, AWS S3)" + ) + EXTRALIT_S3_ACCESS_KEY: Optional[str] = Field(default=None, description="S3 access key ID") + EXTRALIT_S3_SECRET_KEY: Optional[SecretStr] = Field(default=None, description="S3 secret access key") + EXTRALIT_S3_REGION: Optional[str] = Field(default=None, description="S3 bucket region") + EXTRALIT_S3_SECURE: bool = Field(default=False, description="Use HTTPS for S3 connections") + + # Search and Cache Configuration + EXTRALIT_EXTRALIT_URL: Optional[HttpUrl] = Field(default=None, description="URL for Extralit LLM serving endpoint") + EXTRALIT_SEARCH_ENGINE: Optional[str] = Field( + default=None, description="Search engine backend (elasticsearch or opensearch)" + ) + EXTRALIT_ELASTICSEARCH: Optional[HttpUrl] = Field(default=None, description="Elasticsearch/OpenSearch endpoint URL") + EXTRALIT_REDIS_URL: str = Field( + default="redis://localhost:6379/0", description="Redis connection URL for caching and job queues" + ) + # Document Preprocessing Configuration + PREPROCESSING_ENABLED: bool = Field(default=True, description="Enable document preprocessing pipeline") + PREPROCESSING_ENABLE_ANALYSIS: bool = Field(default=True, description="Enable document layout analysis") + PREPROCESSING_ROTATE_PAGES: bool = Field(default=True, description="Auto-rotate pages based on text orientation") + PREPROCESSING_ROTATE_PAGES_THRESHOLD: float = Field( + default=2.0, description="Confidence threshold for page rotation detection" + ) + PREPROCESSING_CLEAN: bool = Field(default=False, description="Clean up temporary files after preprocessing") + PREPROCESSING_QUIET: bool = Field(default=False, description="Suppress preprocessing log output") + + # External Service Configuration (MinIO, Weaviate) + MINIO_ACCESS_KEY: Optional[str] = Field(default=None, description="MinIO access key for object 
storage") + MINIO_SECRET_KEY: Optional[SecretStr] = Field(default=None, description="MinIO secret key") + WCS_HTTP_URL: Optional[str] = Field(default=None, description="Weaviate Cloud Services HTTP endpoint") + WCS_GRPC_URL: Optional[str] = Field(default=None, description="Weaviate Cloud Services gRPC endpoint") + WCS_API_KEY: Optional[SecretStr] = Field(default=None, description="Weaviate Cloud Services API key") + WCS_USERNAME: Optional[str] = Field(default=None, description="Weaviate Cloud Services username") + WCS_PASSWORD: Optional[SecretStr] = Field(default=None, description="Weaviate Cloud Services password") + + # Marker PDF Processing Configuration + MARKER_RUN_MODE: str = Field( + default="local", description="Marker execution mode: 'local' for in-process or 'modal' for remote API" + ) + MARKER_MODAL_BASE_URL: Optional[str] = Field( + default=None, description="Base URL for Modal-hosted Marker service (required when MARKER_RUN_MODE=modal)" + ) + MARKER_MODAL_TIMEOUT_SECS: int = Field(default=600, description="Timeout in seconds for Modal Marker API calls") + + # Chat and Message Validation + EXTRALIT_MIN_MESSAGE_LENGTH: int = Field(default=1, description="Minimum chat message length") + EXTRALIT_MAX_MESSAGE_LENGTH: int = Field(default=20000, description="Maximum chat message length") + EXTRALIT_MIN_ROLE_LENGTH: int = Field(default=1, description="Minimum chat role name length") + EXTRALIT_MAX_ROLE_LENGTH: int = Field(default=20, description="Maximum chat role name length") + + # Authentication Configuration + EXTRALIT_LOCAL_AUTH_USERS_DB_FILE: str = Field( + default=".users.yml", description="Path to local users database file for authentication" + ) + + @field_validator("MARKER_MODAL_BASE_URL") + @classmethod + def validate_marker_modal_url(cls, v: Optional[str], info) -> Optional[str]: + """Validate that MARKER_MODAL_BASE_URL is set when using Modal mode.""" + if info.data.get("MARKER_RUN_MODE", "").lower() == "modal" and not v: + raise ValueError( + 
"MARKER_MODAL_BASE_URL must be set when MARKER_RUN_MODE is 'modal'. " + "Please provide the URL of your Modal deployment endpoint." + ) + return v + + @model_validator(mode="after") + def validate_s3_config(self) -> "Settings": + """Validate that S3 configuration is complete when any S3 field is provided.""" + s3_fields = { + "EXTRALIT_S3_ENDPOINT": self.EXTRALIT_S3_ENDPOINT, + "EXTRALIT_S3_ACCESS_KEY": self.EXTRALIT_S3_ACCESS_KEY, + "EXTRALIT_S3_SECRET_KEY": self.EXTRALIT_S3_SECRET_KEY, + } + provided_fields = {k: v for k, v in s3_fields.items() if v is not None} + + # If any S3 field is provided, all required fields must be provided + if provided_fields and len(provided_fields) < 3: + missing = [k for k, v in s3_fields.items() if v is None] + raise ValueError( + f"Incomplete S3 configuration. When using S3 storage, all required fields must be set. " + f"Missing: {', '.join(missing)}" + ) + return self + + def mask_secrets(self) -> dict: + """Export settings with sensitive values masked for logging/debugging. + + Returns: + dict: Settings dictionary with SecretStr fields masked as '***' + """ + data = self.model_dump() + for key, value in data.items(): + if isinstance(getattr(self, key), SecretStr) and value: + data[key] = "***MASKED***" + return data + + +settings = Settings() diff --git a/extralit-server/src/extralit_server/integrations/modal/marker_client.py b/extralit-server/src/extralit_server/integrations/modal/marker_client.py index 514f111c7..f3fbcb999 100644 --- a/extralit-server/src/extralit_server/integrations/modal/marker_client.py +++ b/extralit-server/src/extralit_server/integrations/modal/marker_client.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os
 from pathlib import Path
 from typing import Any, Optional
 
@@ -21,17 +20,24 @@
 from dotenv import load_dotenv
 
 from extralit_server.api.handlers.v1.models import client
+from extralit_server.config import settings
 
 load_dotenv()  # loads variables from a .env file in the project root
 
-DEFAULT_TIMEOUT = int(os.getenv("MARKER_MODAL_TIMEOUT_SECS", "600"))
-
 
 def get_modal_base_url() -> str:
-    base_url = os.getenv("MARKER_MODAL_BASE_URL", "").rstrip("/")
+    """Get the Modal base URL from settings.
+
+    Returns:
+        str: The Modal base URL without trailing slash
+
+    Raises:
+        RuntimeError: If MARKER_MODAL_BASE_URL is not set
+    """
+    base_url = settings.MARKER_MODAL_BASE_URL
     if not base_url:
         raise RuntimeError("MARKER_MODAL_BASE_URL is not set. Set it to your Modal endpoint URL.")
-    return base_url
+    return base_url.rstrip("/")
 
 
 async def convert_document_via_modal(
@@ -67,7 +73,7 @@ async def convert_document_via_modal(
     data = {k: v for k, v in data.items() if v not in (None, "", "none", "null")}
 
     headers = extra_headers or {}
-    t = timeout if timeout is not None else DEFAULT_TIMEOUT
+    t = timeout if timeout is not None else settings.MARKER_MODAL_TIMEOUT_SECS
     try:
         resp = await client.post(url, files=files, data=data, headers=headers, timeout=t)
         resp.raise_for_status()
diff --git a/extralit-server/src/extralit_server/integrations/modal/marker_modal_deployment.py b/extralit-server/src/extralit_server/integrations/modal/marker_modal_deployment.py
new file mode 100644
index 000000000..f78bd3395
--- /dev/null
+++ b/extralit-server/src/extralit_server/integrations/modal/marker_modal_deployment.py
@@ -0,0 +1,335 @@
+# Copyright 2024-present, Extralit Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Modal deployment for Datalab Marker PDF conversion service.
+"""
+
+import os
+from typing import Optional
+
+import modal
+
+# Define the Modal app
+app = modal.App("datalab-marker-modal-demo")
+GPU_TYPE = "T4"
+MODEL_PATH_PREFIX = "/root/.cache/datalab/models"
+
+# Define the container image with all dependencies
+image = (
+    modal.Image.debian_slim(python_version="3.10")
+    .apt_install(["git", "wget"])
+    .env({"TORCH_DEVICE": "cuda"})
+    .pip_install(
+        [
+            "marker-pdf[full]",
+            "fastapi==0.104.1",
+            "uvicorn==0.24.0",
+            "python-multipart==0.0.6",
+            "torch>=2.2.2,<3.0.0",
+            "torchvision>=0.17.0",
+            "torchaudio>=2.2.0",
+        ]
+    )
+)
+
+# Create a persistent volume for model caching
+models_volume = modal.Volume.from_name("marker-models-modal-demo", create_if_missing=True)
+
+
+def setup_models_with_cache_check(logger, commit_volume=False):
+    """
+    Shared function to create models and handle cache checking/logging.
+
+    Args:
+        logger: Logger used for the progress messages.
+        commit_volume: When true, commit ``models_volume`` once the models
+            are loaded.
+
+    Returns:
+        The model dictionary returned by ``create_model_dict()``.
+    """
+    import gc
+    import os
+
+    from marker.models import create_model_dict
+
+    # Check if models exist in cache
+    models_dir_exists = os.path.exists(MODEL_PATH_PREFIX)
+    models_dir_contents = os.listdir(MODEL_PATH_PREFIX) if models_dir_exists else []
+
+    logger.info(f"Models cache directory exists: {models_dir_exists}")
+    logger.info(f"Models cache directory contents: {models_dir_contents}")
+
+    if models_dir_exists and models_dir_contents:
+        logger.info("Found existing models in volume cache, loading from cache...")
+    else:
+        logger.warning(
+            "No models found in volume cache. Models will be downloaded now (this may take several minutes)."
+        )
+
+    # Create/load models
+    models = create_model_dict()
+    logger.info(f"Successfully loaded {len(models)} models")
+
+    # Check what was downloaded/cached
+    if os.path.exists(MODEL_PATH_PREFIX):
+        contents = os.listdir(MODEL_PATH_PREFIX)
+        logger.info(f"Models in cache: {contents}")
+
+    # Commit volume if requested (for download function)
+    if commit_volume:
+        gc.collect()
+        logger.info("Attempting to commit volume...")
+        models_volume.commit()
+        logger.info("Volume committed successfully")
+
+    return models
+
+
+@app.function(
+    image=image,
+    volumes={MODEL_PATH_PREFIX: models_volume},
+    gpu=GPU_TYPE,
+    timeout=600,
+)
+def download_models():
+    """
+    Helper function to download models used in marker into a Modal volume.
+    """
+    import logging
+
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger(__name__)
+
+    logger.info("Downloading models to persistent volume...")
+    logger.info(f"Volume mounted at: {MODEL_PATH_PREFIX}")
+
+    try:
+        models = setup_models_with_cache_check(logger, commit_volume=True)
+        return f"Models downloaded successfully: {list(models.keys())}"
+    except Exception as e:
+        logger.error(f"Failed to download models: {e}")
+        raise
+
+
+@app.cls(
+    image=image,
+    gpu=GPU_TYPE,
+    memory=16384,
+    timeout=600,  # 10 minute timeout for large documents
+    volumes={MODEL_PATH_PREFIX: models_volume},
+    scaledown_window=300,
+)
+class MarkerModalDemoService:
+    """Modal service class: loads the Marker models on container start and serves them through a FastAPI app."""
+
+    @modal.enter()
+    def load_models(self):
+        """Load models once per container using @modal.enter() for efficiency."""
+        import logging
+        import traceback
+
+        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+        logger = logging.getLogger(__name__)
+
+        logger.info("Loading Marker models using @modal.enter()...")
+        try:
+            self.models = setup_models_with_cache_check(logger, commit_volume=True)
+        except Exception as e:
+            logger.error(f"Error loading models: {e}")
+            traceback.print_exc()
+            # Leave models unset: /health then reports "loading" and /convert answers 503.
+            self.models = None
+
+    @modal.asgi_app()
+    def marker_api(self):
+        """Build the FastAPI application exposing ``/health`` and ``/convert``."""
+        import base64
+        import io
+        import logging
+        import tempfile
+        import traceback
+        from contextlib import asynccontextmanager
+        from pathlib import Path
+
+        from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+        from fastapi.encoders import jsonable_encoder
+        from fastapi.responses import JSONResponse
+        from marker.config.parser import ConfigParser
+        from marker.converters.pdf import PdfConverter
+        from marker.settings import settings
+
+        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+        logger = logging.getLogger(__name__)
+
+        @asynccontextmanager
+        async def lifespan(app: FastAPI):
+            """Log application startup and shutdown."""
+            logger.info("Datalab Marker / Modal demo app starting up...")
+            yield
+            logger.info("Datalab Marker / Modal demo app shutting down...")
+
+        web_app = FastAPI(
+            title="Datalab Marker PDF Conversion Service - Modal Demo",
+            description="Convert PDFs and documents to markdown, JSON, or HTML using Marker, deployed on Modal",
+            version="1.0.0",
+            lifespan=lifespan,
+        )
+
+        @web_app.get("/health")
+        async def health_check():
+            """Report whether models are loaded and what the model cache directory contains."""
+            models_loaded = hasattr(self, "models") and self.models is not None
+            model_count = len(self.models) if models_loaded else 0
+
+            cache_exists = os.path.exists(MODEL_PATH_PREFIX)
+            cache_contents = os.listdir(MODEL_PATH_PREFIX) if cache_exists else []
+
+            return {
+                "status": "healthy" if models_loaded else "loading",
+                "models_loaded": models_loaded,
+                "model_count": model_count,
+                "cache_dir": MODEL_PATH_PREFIX,
+                "cache_exists": cache_exists,
+                "cache_contents": cache_contents[:10],
+            }
+
+        @web_app.post("/convert")
+        async def convert_document(
+            file: UploadFile = File(..., description="Document to convert"),
+            page_range: Optional[str] = Form(None),
+            force_ocr: bool = Form(False),
+            paginate_output: bool = Form(False),
+            output_format: str = Form("markdown"),
+            use_llm: bool = Form(False),
+        ):
+            """Convert uploaded document to specified format."""
+            if not hasattr(self, "models") or self.models is None:
+                logger.error("Models not available for conversion")
+                raise HTTPException(
+                    status_code=503, detail="Models not loaded yet. Please wait for model initialization."
+                )
+
+            allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
+            # Uploads may arrive without a filename; treat that as "no extension".
+            file_ext = Path(file.filename or "").suffix.lower()
+            if file_ext not in allowed_extensions:
+                raise HTTPException(
+                    status_code=400, detail=f"Unsupported file type: {file_ext}. Supported: {allowed_extensions}"
+                )
+
+            if output_format not in ["markdown", "json", "html", "chunks"]:
+                raise HTTPException(
+                    status_code=400, detail="Output format must be one of: markdown, json, html, chunks"
+                )
+
+            # Path of the on-disk copy of the upload; None until it has been created.
+            temp_path = None
+            try:
+                # Read and save file. The client-supplied filename is untrusted, so it
+                # is never used to build the path: a generated temp file keeps only the
+                # already-validated extension, which rules out path traversal and
+                # collisions between concurrent uploads of the same name.
+                file_content = await file.read()
+                with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
+                    temp_path = temp_file.name
+                    temp_file.write(file_content)
+
+                # Configure conversion parameters
+                config = {
+                    "filepath": temp_path,
+                    "page_range": page_range,
+                    "force_ocr": force_ocr,
+                    "paginate_output": paginate_output,
+                    "output_format": output_format,
+                    "use_llm": use_llm,
+                }
+
+                # Create converter
+                config_parser = ConfigParser(config)
+                config_dict = config_parser.generate_config_dict()
+                config_dict["pdftext_workers"] = 1
+
+                converter = PdfConverter(
+                    config=config_dict,
+                    artifact_dict=self.models,
+                    processor_list=config_parser.get_processors(),
+                    renderer=config_parser.get_renderer(),
+                    llm_service=config_parser.get_llm_service() if use_llm else None,
+                )
+
+                logger.info(f"Converting {file.filename} to {output_format}...")
+                rendered_output = converter(temp_path)
+
+                # Prepare response payload
+                json_content = None
+                html_content = None
+                markdown_content = None
+                encoded_images = {}
+
+                if output_format == "json":
+                    # Robust Pydantic serialization
+                    try:
+                        json_content = rendered_output.model_dump(mode="json")
+                    except Exception as e:
+                        logger.warning(f"model_dump(mode='json') failed ({e}); trying model_dump_json.")
+                        import json as pyjson
+
+                        json_content = pyjson.loads(rendered_output.model_dump_json())
+                else:
+                    from marker.output import text_from_rendered
+
+                    text, _, images = text_from_rendered(rendered_output)
+
+                    if output_format == "html":
+                        html_content = text
+                    else:
+                        markdown_content = text
+
+                    for img_name, img_obj in images.items():
+                        byte_stream = io.BytesIO()
+                        img_obj.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
+                        encoded_images[img_name] = base64.b64encode(byte_stream.getvalue()).decode("utf-8")
+
+                metadata = jsonable_encoder(getattr(rendered_output, "metadata", {}))
+
+                logger.info(f"Conversion completed for {file.filename}")
+
+                payload = {
+                    "success": True,
+                    "filename": file.filename,
+                    "output_format": output_format,
+                    "json": json_content,
+                    "html": html_content,
+                    "markdown": markdown_content,
+                    "images": encoded_images,
+                    "metadata": metadata,
+                    "page_count": len(metadata.get("page_stats", [])) if isinstance(metadata, dict) else None,
+                }
+                return JSONResponse(content=jsonable_encoder(payload))
+
+            except Exception as e:
+                logger.error(f"Conversion error for {file.filename}: {e!s}")
+                traceback.print_exc()
+                raise HTTPException(status_code=500, detail=f"Conversion failed: {e!s}") from e
+            finally:
+                # Runs on success and on failure, so the upload never lingers on disk.
+                if temp_path is not None:
+                    try:
+                        os.unlink(temp_path)
+                    except OSError as cleanup_error:
+                        logger.warning(f"Could not remove temporary file {temp_path}: {cleanup_error}")
+
+        return web_app
diff --git a/extralit-server/src/extralit_server/jobs/ocr_jobs.py b/extralit-server/src/extralit_server/jobs/ocr_jobs.py
index 53151e071..74352a479 100644
--- a/extralit-server/src/extralit_server/jobs/ocr_jobs.py
+++ b/extralit-server/src/extralit_server/jobs/ocr_jobs.py
@@ -15,7 +15,6 @@
 """OCR-related job functions for document processing."""
 
 import logging
-import os
 from pathlib import Path
 from pprint import pprint
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -26,6 +25,7 @@
 from rq.decorators import job
 
 from extralit_server.api.schemas.v1.document.layout import Block, Layout, Page
+from extralit_server.config import
settings from extralit_server.contexts.ocr.figures import extract_figure_bboxes from extralit_server.contexts.ocr.tables import extract_table_bboxes from extralit_server.contexts.ocr.text import extract_text_bboxes @@ -34,7 +34,7 @@ load_dotenv() # Switch between local Marker and Modal-remote Marker -MARKER_RUN_MODE = os.getenv("MARKER_RUN_MODE", "local").lower() +MARKER_RUN_MODE = settings.MARKER_RUN_MODE.lower() _LOGGER = logging.getLogger(__name__) @@ -89,7 +89,7 @@ async def async_marker_layout_job( _LOGGER.info(f"Starting Marker layout extraction for: {pdf_path} (mode={MARKER_RUN_MODE})") if MARKER_RUN_MODE == "modal": - _LOGGER.info(f"Using Modal endpoint: {os.getenv('MARKER_MODAL_BASE_URL')}") + _LOGGER.info(f"Using Modal endpoint: {settings.MARKER_MODAL_BASE_URL}") # Call Modal-hosted Marker with JSON output for layout parsing modal_resp = await convert_document_via_modal( pdf_path=pdf_path, diff --git a/extralit-server/src/extralit_server/security/authentication/oauth2/settings.py b/extralit-server/src/extralit_server/security/authentication/oauth2/settings.py index 7e395dbc6..6f5db2198 100644 --- a/extralit-server/src/extralit_server/security/authentication/oauth2/settings.py +++ b/extralit-server/src/extralit_server/security/authentication/oauth2/settings.py @@ -59,6 +59,7 @@ def __init__( if self.allow_http_redirect: # See https://stackoverflow.com/questions/27785375/testing-flask-oauthlib-locally-without-https + # Note: This is a runtime configuration for oauthlib, not a persistent setting os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1" @property diff --git a/extralit-server/src/extralit_server/telemetry/_helpers.py b/extralit-server/src/extralit_server/telemetry/_helpers.py index 4eac0a7a0..e6b1bb8eb 100644 --- a/extralit-server/src/extralit_server/telemetry/_helpers.py +++ b/extralit-server/src/extralit_server/telemetry/_helpers.py @@ -106,7 +106,7 @@ def _has_docker_cgroup() -> bool: def _is_quickstart_env(): # TODO: Any modification in the 
`quickstart.Dockerfile` file should be reflected here - + # Note: These are quickstart-specific env vars not in the main settings config for env_var in [ "OWNER_USERNAME", "OWNER_PASSWORD", diff --git a/extralit/docs/admin_guide/configuration.md b/extralit/docs/admin_guide/configuration.md new file mode 100644 index 000000000..de62557c3 --- /dev/null +++ b/extralit/docs/admin_guide/configuration.md @@ -0,0 +1,346 @@ +# Configuration Management + +Extralit Server uses a centralized configuration system based on Pydantic Settings, providing type safety, validation, and clear documentation for all configuration options. + +## Overview + +Starting from version 2.0, Extralit Server uses a **Pydantic-based settings model** located in `extralit_server.config`. This provides: + +- ✅ **Type Safety**: All settings are strongly typed with automatic validation +- ✅ **Documentation**: Every setting includes inline documentation +- ✅ **Validation**: Configuration errors are caught at startup, not runtime +- ✅ **IDE Support**: Autocomplete and type hints in modern IDEs +- ✅ **Secret Management**: Sensitive values are protected using `SecretStr` + +## Configuration Sources + +Settings are loaded in the following order (later sources override earlier ones): + +1. **Default values** defined in the Settings class +2. **.env file** (if present in the project root) +3. **Environment variables** (these always take priority over values from `.env`) + +## Quick Start + +### 1. Copy the Example Configuration + +```bash +cd extralit-server +cp .env.example .env +``` + +### 2. Edit Your Configuration + +Open `.env` and configure the values for your deployment: + +```bash +# Required for production +EXTRALIT_AUTH_SECRET_KEY=your-secret-key-here +EXTRALIT_DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/extralit + +# Optional but recommended +EXTRALIT_REDIS_URL=redis://localhost:6379/0 +``` + +### 3. 
Generate a Secret Key + +For production deployments, generate a secure secret key: + +```bash +python -c "import secrets; print(secrets.token_urlsafe(32))" +``` + +Use this value for `EXTRALIT_AUTH_SECRET_KEY`. + +## Configuration Categories + +### Authentication & Security + +| Setting | Type | Default | Description | +|---------|------|---------|-------------| +| `EXTRALIT_AUTH_SECRET_KEY` | SecretStr | None | Secret key for JWT token signing ⚠️ **Required for production** | +| `EXTRALIT_LOCAL_AUTH_USERS_DB_FILE` | str | `.users.yml` | Path to local users database file | + +!!! warning "Production Security" + Always set a strong `EXTRALIT_AUTH_SECRET_KEY` in production. Never use default values or commit secrets to version control. + +### Database Configuration + +Extralit supports both SQLite (for development) and PostgreSQL (for production). + +#### SQLite (Development) + +```bash +EXTRALIT_DATABASE_URL=sqlite+aiosqlite:///./extralit-dev.db?check_same_thread=False +``` + +#### PostgreSQL (Production Recommended) + +```bash +EXTRALIT_DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/extralit +``` + +### Redis Configuration + +Redis is used for caching and background job queues: + +```bash +EXTRALIT_REDIS_URL=redis://localhost:6379/0 +``` + +For Redis with authentication: + +```bash +EXTRALIT_REDIS_URL=redis://:password@localhost:6379/0 +``` + +### S3-Compatible Storage + +Configure S3 or S3-compatible storage (MinIO, DigitalOcean Spaces, AWS S3): + +```bash +EXTRALIT_S3_ENDPOINT=http://localhost:9000 +EXTRALIT_S3_ACCESS_KEY=your-access-key +EXTRALIT_S3_SECRET_KEY=your-secret-key +EXTRALIT_S3_REGION=us-east-1 +EXTRALIT_S3_SECURE=false # Set to true for HTTPS +``` + +!!! note "S3 Configuration Validation" + When using S3 storage, all three required fields (`ENDPOINT`, `ACCESS_KEY`, `SECRET_KEY`) must be set. The configuration will fail validation if any are missing. 
+ +### Marker PDF Processing + +Marker can run in two modes: + +#### Local Mode (Default) + +Runs Marker in-process. Requires `marker-pdf` to be installed: + +```bash +MARKER_RUN_MODE=local +``` + +#### Modal Mode (Remote API) + +Uses a remote Modal deployment for Marker processing: + +```bash +MARKER_RUN_MODE=modal +MARKER_MODAL_BASE_URL=https://your-modal-deployment.modal.run +MARKER_MODAL_TIMEOUT_SECS=600 +``` + +!!! note "Modal Configuration Validation" + When `MARKER_RUN_MODE=modal`, the `MARKER_MODAL_BASE_URL` must be set. The configuration will fail validation if it's missing. + +### Document Preprocessing + +Control document preprocessing behavior: + +```bash +PREPROCESSING_ENABLED=true +PREPROCESSING_ENABLE_ANALYSIS=true +PREPROCESSING_ROTATE_PAGES=true +PREPROCESSING_ROTATE_PAGES_THRESHOLD=2.0 +PREPROCESSING_CLEAN=false +PREPROCESSING_QUIET=false +``` + +| Setting | Type | Default | Description | +|---------|------|---------|-------------| +| `PREPROCESSING_ENABLED` | bool | true | Enable/disable preprocessing pipeline | +| `PREPROCESSING_ENABLE_ANALYSIS` | bool | true | Enable document layout analysis | +| `PREPROCESSING_ROTATE_PAGES` | bool | true | Auto-rotate pages based on text orientation | +| `PREPROCESSING_ROTATE_PAGES_THRESHOLD` | float | 2.0 | Confidence threshold for rotation | +| `PREPROCESSING_CLEAN` | bool | false | Clean up temporary files after processing | +| `PREPROCESSING_QUIET` | bool | false | Suppress preprocessing log output | + +### Search Engine Configuration + +Configure Elasticsearch or OpenSearch for full-text search: + +```bash +EXTRALIT_SEARCH_ENGINE=elasticsearch +EXTRALIT_ELASTICSEARCH=http://localhost:9200 +``` + +### Chat & Message Validation + +Configure validation limits for chat messages: + +```bash +EXTRALIT_MIN_MESSAGE_LENGTH=1 +EXTRALIT_MAX_MESSAGE_LENGTH=20000 +EXTRALIT_MIN_ROLE_LENGTH=1 +EXTRALIT_MAX_ROLE_LENGTH=20 +``` + +## Using Settings in Code + +### Importing Settings + +```python +from 
extralit_server.config import settings + +# Access configuration values +db_url = settings.EXTRALIT_DATABASE_URL +redis_url = settings.EXTRALIT_REDIS_URL +``` + +### Accessing Secret Values + +For fields defined as `SecretStr`, use `.get_secret_value()` to access the underlying string: + +```python +from extralit_server.config import settings + +# ❌ Wrong - this returns a SecretStr object +api_key = settings.EXTRALIT_API_KEY + +# ✅ Correct - this returns the actual string +api_key = settings.EXTRALIT_API_KEY.get_secret_value() +``` + +### Debugging Configuration + +To see all current settings (with secrets masked): + +```python +from extralit_server.config import settings + +# Export settings with secrets masked +config_dict = settings.mask_secrets() +print(config_dict) +``` + +## Environment Variable Naming + +Most Extralit settings use the `EXTRALIT_` prefix: + +- ✅ `EXTRALIT_DATABASE_URL` +- ✅ `EXTRALIT_REDIS_URL` +- ✅ `EXTRALIT_AUTH_SECRET_KEY` + +Some third-party integrations use their own naming: + +- `MARKER_RUN_MODE` - Marker-specific +- `MINIO_ACCESS_KEY` - MinIO-specific +- `WCS_HTTP_URL` - Weaviate-specific +- `HF_HUB_DISABLE_TELEMETRY` - HuggingFace-specific + +## Validation and Error Handling + +The configuration system validates settings at startup. If validation fails, you'll see a clear error message: + +``` +ValidationError: 1 validation error for Settings +MARKER_MODAL_BASE_URL + MARKER_MODAL_BASE_URL must be set when MARKER_RUN_MODE is 'modal'. + Please provide the URL of your Modal deployment endpoint. +``` + +### Common Validation Errors + +#### Missing Required Fields + +**Error**: `MARKER_MODAL_BASE_URL must be set when MARKER_RUN_MODE is 'modal'` + +**Solution**: Set the required environment variable: +```bash +export MARKER_MODAL_BASE_URL=https://your-modal-deployment.modal.run +``` + +#### Incomplete S3 Configuration + +**Error**: `Incomplete S3 configuration. 
Missing: EXTRALIT_S3_ACCESS_KEY, EXTRALIT_S3_SECRET_KEY` + +**Solution**: Provide all required S3 fields or remove all S3 configuration to use local storage. + +## Best Practices + +### Development + +1. **Use `.env` file**: Keep configuration in `.env` for easy local development +2. **Use SQLite**: Simplest database for local development +3. **Enable debug logging**: Set appropriate log levels for troubleshooting + +### Production + +1. **Use environment variables**: Set configuration via environment variables (not `.env` file) +2. **Use PostgreSQL**: More robust and performant than SQLite +3. **Set strong secrets**: Generate cryptographically secure secret keys +4. **Enable HTTPS**: Use `EXTRALIT_S3_SECURE=true` for S3 connections +5. **Use Redis**: Required for background jobs and caching +6. **Monitor configuration**: Regularly audit your configuration settings + +### Security + +1. **Never commit `.env`**: Add `.env` to `.gitignore` +2. **Rotate secrets regularly**: Change secret keys periodically +3. **Use secret management**: Consider using Vault, AWS Secrets Manager, etc. +4. **Limit access**: Restrict who can view/modify production configuration +5. **Audit logs**: Monitor who accesses configuration + +## Migration from Old Configuration + +If you're upgrading from an older version of Extralit that used direct `os.getenv()` calls: + +### Before (Old Pattern) + +```python +import os +db_url = os.getenv("EXTRALIT_DATABASE_URL") +``` + +### After (New Pattern) + +```python +from extralit_server.config import settings +db_url = settings.EXTRALIT_DATABASE_URL +``` + +### Benefits of New Pattern + +- ✅ Type safety with automatic validation +- ✅ Clear error messages for missing configuration +- ✅ IDE autocomplete and type hints +- ✅ Documentation available inline +- ✅ Protection against typos + +## Troubleshooting + +### Configuration Not Loading + +**Problem**: Settings don't seem to be loading from `.env` file + +**Solutions**: +1. 
Check that `.env` is in the correct directory (the `extralit-server/` project root, next to `.env.example`) +2. Verify `.env` file format (no quotes around values unless needed) +3. Check for typos in variable names +4. Ensure `.env` has proper line endings (Unix LF, not Windows CRLF) + +### Import Errors + +**Problem**: `ImportError: cannot import name 'settings' from 'extralit_server.config'` + +**Solutions**: +1. Ensure you're using the latest version of Extralit Server +2. Check that `config.py` exists in `extralit_server/` directory +3. Verify Python path is set correctly + +### Validation Errors at Startup + +**Problem**: Server fails to start with validation errors + +**Solutions**: +1. Read the error message carefully - it tells you exactly what's wrong +2. Check that all required fields are set +3. Verify field types (e.g., URLs should start with `http://` or `https://`) +4. Ensure boolean values use a spelling Pydantic recognizes (e.g. `true`/`false` or `1`/`0`) + +## Further Reading + +- [Pydantic Settings Documentation](https://docs.pydantic.dev/latest/concepts/pydantic_settings/) +- [Environment Variables Best Practices](https://12factor.net/config) +- [Extralit Deployment Guide](../admin_guide/deployment.md) diff --git a/extralit/docs/community/developer/configuration-system.md b/extralit/docs/community/developer/configuration-system.md new file mode 100644 index 000000000..feec18b61 --- /dev/null +++ b/extralit/docs/community/developer/configuration-system.md @@ -0,0 +1,462 @@ +# Configuration System (Developer Guide) + +This guide explains the Pydantic-based configuration system for Extralit Server developers. + +## Architecture + +The configuration system uses Pydantic Settings to provide type-safe, validated configuration management. 
+ +### Key Components + +``` +extralit-server/ +├── src/extralit_server/ +│ └── config.py # Central configuration module +├── .env.example # Configuration template +└── .env # Local configuration (gitignored) +``` + +## The Settings Class + +Located in `extralit_server/config.py`, the `Settings` class is a Pydantic model that defines all configuration options: + +```python +from pydantic import Field, SecretStr +from pydantic_settings import BaseSettings + +class Settings(BaseSettings): + """Application settings loaded from environment variables.""" + + EXTRALIT_DATABASE_URL: str = Field( + default="sqlite+aiosqlite:///./extralit-dev.db", + description="Database connection URL" + ) + + EXTRALIT_API_KEY: Optional[SecretStr] = Field( + default=None, + description="API key for authentication" + ) + +settings = Settings() # Global singleton instance +``` + +## Adding New Configuration Options + +### 1. Define the Field + +Add the field to the `Settings` class in `config.py`: + +```python +class Settings(BaseSettings): + # ... existing fields ... + + MY_NEW_FEATURE_ENABLED: bool = Field( + default=False, + description="Enable my awesome new feature" + ) + + MY_NEW_API_KEY: Optional[SecretStr] = Field( + default=None, + description="API key for my new service" + ) +``` + +### 2. Use Field() for Documentation + +Always use `Field()` with a `description` parameter: + +```python +# ❌ Bad - no documentation +MY_SETTING: str = "default" + +# ✅ Good - includes description +MY_SETTING: str = Field( + default="default", + description="What this setting does" +) +``` + +### 3. 
Choose the Right Type + +Use appropriate types for validation: + +```python +# Strings +MY_STRING: str = Field(default="value", description="...") + +# URLs (validated) +MY_URL: Optional[HttpUrl] = Field(default=None, description="...") + +# Secrets (protected from logging) +MY_SECRET: Optional[SecretStr] = Field(default=None, description="...") + +# Integers +MY_NUMBER: int = Field(default=100, description="...") + +# Floats +MY_THRESHOLD: float = Field(default=0.5, description="...") + +# Booleans +MY_FLAG: bool = Field(default=True, description="...") + +# Optional values +MY_OPTIONAL: Optional[str] = Field(default=None, description="...") +``` + +### 4. Add Validation (if needed) + +Use Pydantic validators for complex validation logic: + +```python +from pydantic import field_validator, model_validator + +class Settings(BaseSettings): + MY_MODE: str = Field(default="auto", description="...") + MY_URL: Optional[str] = Field(default=None, validate_default=True, description="...") + + @field_validator("MY_URL") + @classmethod + def validate_my_url(cls, v: Optional[str], info) -> Optional[str]: + """Validate MY_URL is set when MY_MODE requires it.""" + if info.data.get("MY_MODE") == "remote" and not v: + raise ValueError( + "MY_URL must be set when MY_MODE is 'remote'" + ) + return v + + @model_validator(mode="after") + def validate_complete_config(self) -> "Settings": + """Cross-field validation.""" + # Check relationships between multiple fields + return self +``` + +### 5. Update Documentation + +Add the new setting to: + +1. `.env.example` - with commented example +2. 
`docs/admin_guide/configuration.md` - with full documentation + +## Using Settings in Your Code + +### Basic Usage + +```python +from extralit_server.config import settings + +def my_function(): + db_url = settings.EXTRALIT_DATABASE_URL + timeout = settings.MARKER_MODAL_TIMEOUT_SECS +``` + +### Accessing Secrets + +For `SecretStr` fields, use `.get_secret_value()`: + +```python +from extralit_server.config import settings + +def authenticate(): + # ❌ Wrong - returns SecretStr object + api_key = settings.EXTRALIT_API_KEY + + # ✅ Correct - returns the actual string + if settings.EXTRALIT_API_KEY: + api_key = settings.EXTRALIT_API_KEY.get_secret_value() +``` + +### Conditional Logic + +```python +from extralit_server.config import settings + +if settings.MARKER_RUN_MODE == "modal": + # Use Modal API + url = settings.MARKER_MODAL_BASE_URL +else: + # Use local processing + pass +``` + +## Type Safety Benefits + +The Pydantic settings system provides strong type safety: + +### IDE Autocomplete + +Modern IDEs will autocomplete setting names and show types: + +```python +from extralit_server.config import settings + +settings. # IDE shows all available settings with types +``` + +### Type Checking + +Tools like `mypy` can catch errors at development time: + +```python +# mypy will catch this error +timeout: int = settings.EXTRALIT_DATABASE_URL # Type mismatch! 
+``` + +### Runtime Validation + +Invalid values are caught immediately: + +```bash +# This will fail at startup +export EXTRALIT_REDIS_URL="not-a-valid-url" +``` + +## Testing with Settings + +### Override Settings in Tests + +Use Pydantic's test utilities or monkeypatch: + +```python +import pytest +from extralit_server.config import Settings + +def test_with_custom_settings(monkeypatch): + # Method 1: Patch environment variables + monkeypatch.setenv("MARKER_RUN_MODE", "modal") + monkeypatch.setenv("MARKER_MODAL_BASE_URL", "https://test.example.com") + + # Reload settings + from extralit_server import config + config.settings = Settings() + + # Now test with the new settings + assert config.settings.MARKER_RUN_MODE == "modal" +``` + +```python +def test_with_settings_override(): + # Method 2: Create a test settings instance + test_settings = Settings( + MARKER_RUN_MODE="local", + EXTRALIT_DATABASE_URL="sqlite:///:memory:" + ) + + # Use test_settings in your test + assert test_settings.MARKER_RUN_MODE == "local" +``` + +### Testing Validation + +Test that validation works correctly: + +```python +import pytest +from pydantic import ValidationError +from extralit_server.config import Settings + +def test_marker_modal_validation(): + # Should raise error when modal mode without URL + with pytest.raises(ValidationError) as exc_info: + Settings( + MARKER_RUN_MODE="modal", + MARKER_MODAL_BASE_URL=None + ) + + assert "MARKER_MODAL_BASE_URL must be set" in str(exc_info.value) +``` + +## Migration Patterns + +### Replacing os.getenv() + +When you find code using `os.getenv()`, refactor it to use settings: + +```python +# Before +import os +timeout = int(os.getenv("TIMEOUT", "60")) + +# After +from extralit_server.config import settings +timeout = settings.MY_TIMEOUT # Already typed as int +``` + +### Replacing os.environ + +```python +# Before +import os +api_key = os.environ["API_KEY"] + +# After +from extralit_server.config import settings +api_key = 
settings.MY_API_KEY.get_secret_value() +``` + +### Handling Defaults + +```python +# Before +value = os.getenv("SETTING", "default") + +# After +# Define default in Settings class +MY_SETTING: str = Field(default="default", description="...") + +# Then use directly +value = settings.MY_SETTING +``` + +## Common Patterns + +### Feature Flags + +```python +class Settings(BaseSettings): + FEATURE_NEW_UI_ENABLED: bool = Field( + default=False, + description="Enable new UI features" + ) + +# Usage +from extralit_server.config import settings + +if settings.FEATURE_NEW_UI_ENABLED: + return render_new_ui() +else: + return render_old_ui() +``` + +### Environment-Specific Defaults + +```python +import os +from pydantic import Field + +class Settings(BaseSettings): + DEBUG: bool = Field( + default=os.getenv("ENV") == "development", + description="Enable debug mode" + ) +``` + +### Computed Properties + +```python +class Settings(BaseSettings): + REDIS_HOST: str = Field(default="localhost", description="...") + REDIS_PORT: int = Field(default=6379, description="...") + + @property + def redis_url(self) -> str: + """Computed Redis URL from host and port.""" + return f"redis://{self.REDIS_HOST}:{self.REDIS_PORT}/0" +``` + +## Security Best Practices + +### Always Use SecretStr for Secrets + +```python +# ❌ Bad - secret visible in logs +MY_SECRET: str = Field(default=None, description="...") + +# ✅ Good - secret protected +MY_SECRET: SecretStr = Field(default=None, description="...") +``` + +### Implement mask_secrets() + +The `Settings` class includes a `mask_secrets()` method for safe logging: + +```python +from extralit_server.config import settings + +# Safe to log - secrets are masked +config_dict = settings.mask_secrets() +logger.info(f"Configuration: {config_dict}") +``` + +### Validate Secret Strength + +```python +@field_validator("EXTRALIT_AUTH_SECRET_KEY") +@classmethod +def validate_secret_key_strength(cls, v: Optional[SecretStr]) -> Optional[SecretStr]: + if v: + 
secret = v.get_secret_value() + if len(secret) < 32: + raise ValueError("Secret key must be at least 32 characters") + return v +``` + +## Performance Considerations + +### Settings are Loaded Once + +The `settings` object is created once at import time: + +```python +# config.py +settings = Settings() # Loaded once + +# When you import +from extralit_server.config import settings # Reuses same instance +``` + +### Avoid Repeated Access in Loops + +```python +# ❌ Less efficient +for item in large_list: + if settings.MY_FLAG: # Settings access in loop + process(item) + +# ✅ More efficient +my_flag = settings.MY_FLAG # Access once +for item in large_list: + if my_flag: + process(item) +``` + +## Troubleshooting + +### Circular Import Issues + +If you get circular import errors: + +```python +# ❌ Can cause circular imports +from extralit_server.config import settings # At module level + +def my_function(): + pass + +# ✅ Import inside function if needed +def my_function(): + from extralit_server.config import settings + return settings.MY_VALUE +``` + +### Type Checker Complaints + +If mypy complains about optional values: + +```python +# ❌ mypy error: Optional[str] not compatible with str +url: str = settings.MY_OPTIONAL_URL + +# ✅ Handle the optional case +url = settings.MY_OPTIONAL_URL or "default" + +# ✅ Or use type narrowing +if settings.MY_OPTIONAL_URL: + url: str = settings.MY_OPTIONAL_URL # Now mypy knows it's not None +``` + +## References + +- [Pydantic Settings Documentation](https://docs.pydantic.dev/latest/concepts/pydantic_settings/) +- [Pydantic Validation](https://docs.pydantic.dev/latest/concepts/validators/) +- [Twelve-Factor App Config](https://12factor.net/config)