class Block(BaseModel):
    """One layout element on a page (for example a text, table or figure region)."""

    # Block category label; "unknown" when the producer supplies none.
    type: str = "unknown"
    # Bounding-box values; element types are deliberately left unconstrained (list[Any]).
    # NOTE(review): relies on pydantic copying mutable defaults per instance — confirm for the pinned version.
    bbox: list[Any] = []
    # Textual content of the block; empty string when absent.
    content: str = ""
    # Producer-assigned block identifier; empty string when absent.
    id: str = ""
    # Optional confidence score; None when the producer provides none.
    score: Optional[float] = None


class Page(BaseModel):
    """A single page together with the blocks found on it."""

    # Page index as assigned by the code that builds the layout.
    page: int
    blocks: list[Block]


class Layout(BaseModel):
    """Document-level layout: the ordered list of pages."""

    pages: list[Page]
class PDFOCRSettings(BaseSettings):
    """
    Environment-driven configuration for PDF OCR.

    Fields are populated from environment variables carrying the ``OCR_`` prefix
    (``env_prefix="OCR_"``), e.g. ``OCR_RUN_MODE`` or ``OCR_MARKER_MODAL_BASE_URL``.
    """

    model_config = SettingsConfigDict(env_prefix="OCR_")

    # Which OCR backend to use.
    # NOTE(review): ocr_jobs.py in this change branches on MARKER_RUN_MODE == "modal", a different
    # variable and value set than this field — confirm which one is intended to be authoritative.
    run_mode: Literal["marker", "local"] = "local"

    marker_modal_base_url: str | None = Field(default=None, description="Base URL for Modal-hosted Marker service")

    marker_modal_timeout_secs: int = Field(
        default=600, description="Timeout in seconds for requests to Modal-hosted Marker service"
    )


# Module-level instance, built once at import time.
settings = PDFOCRSettings()
load_dotenv()  # loads variables from a .env file in the project root

# Initialize settings (after load_dotenv() has run)
ocr_settings = PDFOCRSettings()


def get_modal_base_url() -> str:
    """
    Return the configured Modal Marker base URL with trailing slashes removed.

    Raises:
        RuntimeError: If ``ocr_settings.marker_modal_base_url`` is unset or empty.
    """
    base_url = ocr_settings.marker_modal_base_url
    if not base_url:
        raise RuntimeError("OCR_MARKER_MODAL_BASE_URL is not set. Set it to your Modal endpoint URL.")
    return base_url.rstrip("/")


async def convert_document_via_modal(
    pdf_path: Path,
    output_format: str = "json",
    page_range: Optional[str] = None,
    force_ocr: bool = False,
    paginate_output: bool = False,
    use_llm: bool = False,
    timeout: Optional[int] = None,
    extra_headers: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """
    Calls the Modal-hosted Marker /convert endpoint and returns the JSON response.

    Args:
        pdf_path: Path of the PDF to upload; its bytes are read fully into memory.
        output_format: Sent as the ``output_format`` form field.
        page_range: Sent as the ``page_range`` form field; omitted when None/empty.
        force_ocr: Sent as ``"true"``/``"false"``.
        paginate_output: Sent as ``"true"``/``"false"``.
        use_llm: Sent as ``"true"``/``"false"``.
        timeout: Request timeout in seconds; falls back to
            ``ocr_settings.marker_modal_timeout_secs`` when None.
        extra_headers: Optional headers added to the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the base URL is not configured, the endpoint answers with an
            HTTP error status, or the response body is not valid JSON.
        FileNotFoundError: If ``pdf_path`` is not an existing regular file.
    """
    # Resolve the URL first so a missing configuration is reported before any file access.
    url = f"{get_modal_base_url()}/convert"

    # is_file() also rejects directories, which exists() would let through to open().
    if not pdf_path.is_file():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    async with aiofiles.open(pdf_path, "rb") as f:
        file_bytes = await f.read()

    # httpx requires files as (name, file, content_type)
    files = {"file": (pdf_path.name, file_bytes, "application/pdf")}
    form_fields = {
        "output_format": output_format,
        "page_range": page_range,
        "force_ocr": str(bool(force_ocr)).lower(),
        "paginate_output": str(bool(paginate_output)).lower(),
        "use_llm": str(bool(use_llm)).lower(),
    }
    # Drop unset/placeholder values so they are not sent as form fields at all.
    data = {k: v for k, v in form_fields.items() if v not in (None, "", "none", "null")}

    headers = dict(extra_headers) if extra_headers else {}
    request_timeout = timeout if timeout is not None else ocr_settings.marker_modal_timeout_secs

    try:
        resp = await client.post(url, files=files, data=data, headers=headers, timeout=request_timeout)
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        # Read the body from the exception's own response object rather than the outer local.
        raise RuntimeError(f"Modal Marker conversion failed: {e}; body={e.response.text[:1000]}") from e

    try:
        return resp.json()
    except ValueError as e:
        # A 2xx answer with a non-JSON body (e.g. a proxy error page) is reported like any other failure.
        raise RuntimeError(f"Modal Marker returned a non-JSON response; body={resp.text[:1000]}") from e
# Define the Modal app
app = modal.App("datalab-marker-modal-demo")
GPU_TYPE = "T4"
MODEL_PATH_PREFIX = "/root/.cache/datalab/models"

# Define the container image with all dependencies
image = (
    modal.Image.debian_slim(python_version="3.10")
    .apt_install(["git", "wget"])
    .env({"TORCH_DEVICE": "cuda"})
    .pip_install(
        [
            "marker-pdf[full]",
            "fastapi==0.104.1",
            "uvicorn==0.24.0",
            "python-multipart==0.0.6",
            "torch>=2.2.2,<3.0.0",
            "torchvision>=0.17.0",
            "torchaudio>=2.2.0",
            # This module imports aiofiles at top level and uses it in /convert, so it is listed
            # explicitly instead of relying on a transitive dependency to provide it.
            "aiofiles",
        ]
    )
)

# Create a persistent volume for model caching
models_volume = modal.Volume.from_name("marker-models-modal-demo", create_if_missing=True)


def setup_models_with_cache_check(logger, commit_volume=False):
    """
    Shared function to create models and handle cache checking/logging.

    Args:
        logger: Logger used for all progress messages.
        commit_volume: When true, call ``models_volume.commit()`` after the models are created.

    Returns:
        The model dictionary returned by ``marker.models.create_model_dict()``.
    """
    import gc
    import os

    from marker.models import create_model_dict

    # Check if models exist in cache
    models_dir_exists = os.path.exists(MODEL_PATH_PREFIX)
    models_dir_contents = os.listdir(MODEL_PATH_PREFIX) if models_dir_exists else []

    logger.info(f"Models cache directory exists: {models_dir_exists}")
    logger.info(f"Models cache directory contents: {models_dir_contents}")

    if models_dir_exists and models_dir_contents:
        logger.info("Found existing models in volume cache, loading from cache...")
    else:
        logger.warning(
            "No models found in volume cache. Models will be downloaded now (this may take several minutes)."
        )

    # Create/load models
    models = create_model_dict()
    logger.info(f"Successfully loaded {len(models)} models")

    # Check what was downloaded/cached
    if os.path.exists(MODEL_PATH_PREFIX):
        contents = os.listdir(MODEL_PATH_PREFIX)
        logger.info(f"Models in cache: {contents}")

    # Commit volume if requested (for download function)
    if commit_volume:
        gc.collect()
        logger.info("Attempting to commit volume...")
        models_volume.commit()
        logger.info("Volume committed successfully")

    return models


@app.function(
    image=image,
    volumes={MODEL_PATH_PREFIX: models_volume},
    gpu=GPU_TYPE,
    timeout=600,
)
def download_models():
    """
    Helper function to download models used in marker into a Modal volume.

    Returns:
        A status string listing the keys of the loaded model dictionary.

    Raises:
        Exception: Any error from model setup is logged and re-raised unchanged.
    """
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    logger.info("Downloading models to persistent volume...")
    logger.info(f"Volume mounted at: {MODEL_PATH_PREFIX}")

    try:
        models = setup_models_with_cache_check(logger, commit_volume=True)
        return f"Models downloaded successfully: {list(models.keys())}"
    except Exception as e:
        logger.error(f"Failed to download models: {e}")
        raise


@app.cls(
    image=image,
    gpu=GPU_TYPE,
    memory=16384,
    timeout=600,  # 10 minute timeout for large documents
    volumes={MODEL_PATH_PREFIX: models_volume},
    scaledown_window=300,
)
class MarkerModalDemoService:
    """Modal service that loads Marker models on container entry and serves a FastAPI conversion API."""

    @modal.enter()
    def load_models(self):
        """Load models once per container using @modal.enter() for efficiency."""
        import logging
        import traceback

        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
        logger = logging.getLogger(__name__)

        logger.info("Loading Marker models using @modal.enter()...")
        try:
            self.models = setup_models_with_cache_check(logger, commit_volume=True)
        except Exception as e:
            # Deliberately non-fatal: the API still starts and /health and /convert report the missing models.
            logger.error(f"Error loading models: {e}")
            traceback.print_exc()
            self.models = None

    @modal.asgi_app()
    def marker_api(self):
        """Build and return the FastAPI application exposing ``GET /health`` and ``POST /convert``."""
        import base64
        import contextlib
        import io
        import logging
        import tempfile
        import traceback
        from contextlib import asynccontextmanager
        from pathlib import Path

        from fastapi import FastAPI, File, Form, HTTPException, UploadFile
        from fastapi.encoders import jsonable_encoder
        from fastapi.responses import JSONResponse
        from marker.config.parser import ConfigParser
        from marker.converters.pdf import PdfConverter
        from marker.settings import settings

        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
        logger = logging.getLogger(__name__)

        @asynccontextmanager
        async def lifespan(app: FastAPI):
            logger.info("Datalab Marker / Modal demo app starting up...")
            yield
            logger.info("Datalab Marker / Modal demo app shutting down...")

        web_app = FastAPI(
            title="Datalab Marker PDF Conversion Service - Modal Demo",
            description="Convert PDFs and documents to markdown, JSON, or HTML using Marker, deployed on Modal",
            version="1.0.0",
            lifespan=lifespan,
        )

        @web_app.get("/health")
        async def health_check():
            """Report whether models are loaded and what the model cache directory contains."""
            models_loaded = getattr(self, "models", None) is not None
            model_count = len(self.models) if models_loaded else 0

            cache_exists = os.path.exists(MODEL_PATH_PREFIX)
            cache_contents = os.listdir(MODEL_PATH_PREFIX) if cache_exists else []

            return {
                "status": "healthy" if models_loaded else "loading",
                "models_loaded": models_loaded,
                "model_count": model_count,
                "cache_dir": MODEL_PATH_PREFIX,
                "cache_exists": cache_exists,
                "cache_contents": cache_contents[:10],  # cap the listing size in the response
            }

        @web_app.post("/convert")
        async def convert_document(
            file: UploadFile = File(..., description="Document to convert"),
            page_range: Optional[str] = Form(None),
            force_ocr: bool = Form(False),
            paginate_output: bool = Form(False),
            output_format: str = Form("markdown"),
            use_llm: bool = Form(False),
        ):
            """
            Convert uploaded document to specified format.

            Responds with 503 when models are not loaded, 400 for an unsupported file
            extension or output format, and 500 when the conversion itself fails.
            """
            if getattr(self, "models", None) is None:
                logger.error("Models not available for conversion")
                raise HTTPException(
                    status_code=503, detail="Models not loaded yet. Please wait for model initialization."
                )

            # The client-supplied name is used only for its extension and for logging/echoing back;
            # it is never used as a filesystem path. A missing name is treated as "no extension".
            filename = file.filename or ""
            allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
            file_ext = Path(filename).suffix.lower()
            if file_ext not in allowed_extensions:
                raise HTTPException(
                    status_code=400, detail=f"Unsupported file type: {file_ext}. Supported: {allowed_extensions}"
                )

            if output_format not in ["markdown", "json", "html", "chunks"]:
                raise HTTPException(
                    status_code=400, detail="Output format must be one of: markdown, json, html, chunks"
                )

            temp_path = None
            try:
                # Read and save file. mkstemp yields a unique server-chosen path, so an uploaded name such as
                # "../../x.pdf" cannot escape the temp directory and concurrent uploads cannot collide.
                file_content = await file.read()
                fd, temp_path = tempfile.mkstemp(suffix=file_ext)
                os.close(fd)
                async with aiofiles.open(temp_path, "wb") as temp_file:
                    await temp_file.write(file_content)

                # Configure conversion parameters
                config = {
                    "filepath": temp_path,
                    "page_range": page_range,
                    "force_ocr": force_ocr,
                    "paginate_output": paginate_output,
                    "output_format": output_format,
                    "use_llm": use_llm,
                }

                # Create converter
                config_parser = ConfigParser(config)
                config_dict = config_parser.generate_config_dict()
                config_dict["pdftext_workers"] = 1

                converter = PdfConverter(
                    config=config_dict,
                    artifact_dict=self.models,
                    processor_list=config_parser.get_processors(),
                    renderer=config_parser.get_renderer(),
                    llm_service=config_parser.get_llm_service() if use_llm else None,
                )

                logger.info(f"Converting {filename} to {output_format}...")
                # NOTE(review): synchronous call inside an async handler; other requests on this
                # event loop wait while it runs.
                rendered_output = converter(temp_path)

                # Prepare response payload
                json_content = None
                html_content = None
                markdown_content = None
                encoded_images = {}

                if output_format == "json":
                    # Robust Pydantic serialization
                    try:
                        json_content = rendered_output.model_dump(mode="json")
                    except Exception as e:
                        logger.warning(f"model_dump(mode='json') failed ({e}); trying model_dump_json.")
                        import json as pyjson

                        json_content = pyjson.loads(rendered_output.model_dump_json())
                else:
                    from marker.output import text_from_rendered

                    text, _, images = text_from_rendered(rendered_output)

                    if output_format == "html":
                        html_content = text
                    else:
                        markdown_content = text

                    # Images are returned inline as base64 strings keyed by image name.
                    for img_name, img_obj in images.items():
                        byte_stream = io.BytesIO()
                        img_obj.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
                        encoded_images[img_name] = base64.b64encode(byte_stream.getvalue()).decode("utf-8")

                metadata = jsonable_encoder(getattr(rendered_output, "metadata", {}))

                logger.info(f"Conversion completed for {filename}")

                payload = {
                    "success": True,
                    "filename": filename,
                    "output_format": output_format,
                    "json": json_content,
                    "html": html_content,
                    "markdown": markdown_content,
                    "images": encoded_images,
                    "metadata": metadata,
                    "page_count": len(metadata.get("page_stats", [])) if isinstance(metadata, dict) else None,
                }
                return JSONResponse(content=jsonable_encoder(payload))

            except Exception as e:
                logger.error(f"Conversion error for {filename}: {e!s}")
                traceback.print_exc()
                raise HTTPException(status_code=500, detail=f"Conversion failed: {e!s}") from e
            finally:
                # Single cleanup point for both success and failure; a failed delete is best-effort.
                if temp_path is not None:
                    with contextlib.suppress(OSError):
                        os.unlink(temp_path)

        return web_app
from extralit_server.contexts.ocr.figures import extract_figure_bboxes from extralit_server.contexts.ocr.tables import extract_table_bboxes from extralit_server.contexts.ocr.text import extract_text_bboxes from extralit_server.jobs.queues import DEFAULT_QUEUE, REDIS_CONNECTION -if TYPE_CHECKING: - from marker.renderers.json import JSONOutput +load_dotenv() + +# Switch between local Marker and Modal-remote Marker +MARKER_RUN_MODE = os.getenv("MARKER_RUN_MODE", "local").lower() _LOGGER = logging.getLogger(__name__) -try: - from marker.config.parser import ConfigParser - from marker.converters.pdf import PdfConverter - from marker.models import create_model_dict -except ImportError as e: - _LOGGER.error(f"Marker dependencies not available: {e}") - raise ImportError("Marker not installed. Install with: pip install marker-pdf") from e +if MARKER_RUN_MODE == "local": + try: + from marker.config.parser import ConfigParser + from marker.converters.pdf import PdfConverter + from marker.models import create_model_dict + except ImportError as e: + _LOGGER.error(f"Marker dependencies not available: {e}") + raise ImportError("Marker not installed. Install with: pip install marker-pdf") from e +else: + # Modal mode: use HTTP client, no local Marker deps needed + from extralit_server.integrations.modal.marker_client import convert_document_via_modal + +if TYPE_CHECKING: + from marker.renderers.json import JSONOutput @job(queue=DEFAULT_QUEUE, connection=REDIS_CONNECTION, timeout=1800, retry=Retry(max=2, interval=[30, 60])) @@ -50,23 +62,8 @@ async def async_marker_layout_job( document_id: Optional[UUID] = None, ) -> dict[str, Any]: """ - Use Marker to extract layout (tables, figures, text blocks) without running OCR. - - This job uses Marker's layout detection capabilities to identify and extract - bounding boxes for different document elements without performing OCR. 
- - Args: - pdf_path: Path to the PDF file to process - pages: Optional comma-separated page numbers to process (0-indexed). If None, processes all pages - extract_text: Whether to extract text blocks in addition to tables/figures - document_id: Optional document ID for job tracking - - Returns: - Dictionary containing structured layout information: - - tables: List of table bounding boxes - - figures: List of figure bounding boxes - - text_blocks: List of text block bounding boxes (if extract_text=True) - - metadata: Job execution metadata + Use Marker to extract layout (tables, figures, text blocks). + If MARKER_RUN_MODE=modal, calls Modal endpoint; if =local, runs Marker in-process. """ current_job = get_current_job() if current_job is not None: @@ -77,6 +74,7 @@ async def async_marker_layout_job( "pages": pages, "extract_text": extract_text, "workflow_step": "marker_layout_extraction", + "run_mode": MARKER_RUN_MODE, } ) current_job.save_meta() @@ -85,38 +83,45 @@ async def async_marker_layout_job( pdf_path = Path(pdf_path) if not pdf_path.exists(): raise FileNotFoundError(f"PDF file not found: {pdf_path}") - - _LOGGER.info(f"Starting Marker layout extraction for: {pdf_path}") - if pdf_path.suffix.lower() != ".pdf": raise ValueError(f"File is not a PDF: {pdf_path}") - try: - # Step 1: Create configuration + _LOGGER.info(f"Starting Marker layout extraction for: {pdf_path} (mode={MARKER_RUN_MODE})") + + if MARKER_RUN_MODE == "modal": + _LOGGER.info(f"Using Modal endpoint: {os.getenv('MARKER_MODAL_BASE_URL')}") + # Call Modal-hosted Marker with JSON output for layout parsing + modal_resp = await convert_document_via_modal( + pdf_path=pdf_path, + output_format="json", + page_range=pages, + force_ocr=False, + paginate_output=False, + use_llm=False, + ) + if not modal_resp.get("success"): + raise RuntimeError(f"Modal conversion failed: {modal_resp}") + json_payload = modal_resp.get("json") or {} + layout_result = parse_marker_json_output(json_payload) + else: + # 
def create_marker_config(pages: Optional[str] = None) -> tuple[dict[str, Any], dict[str, Any]]:
    """
    Create optimized Marker configuration for layout detection only (no OCR).

    Args:
        pages: Optional page selection, passed through as Marker's ``page_range`` when not None.

    Returns:
        Tuple of (config_dict, model_dict) for Marker.
    """
    config_dict = {
        "output_format": "json",
        "force_ocr": False,
        "paginate_output": False,
        "extract_images": False,
    }
    if pages is not None:
        config_dict["page_range"] = pages
    # NOTE(review): create_model_dict() runs on every call; presumably expensive — confirm whether
    # callers should cache the result.
    model_dict = create_model_dict()
    return config_dict, model_dict


def _build_block(block_type: Any, bbox: Any, html: Any, block_id: Any) -> Block:
    """Create a Block, substituting the schema defaults for missing, None or empty raw values."""
    return Block(
        type=block_type or "unknown",
        bbox=bbox or [],
        content=(html or "").strip(),
        id=block_id or "",
        score=None,  # Marker doesn't provide confidence scores
    )


def parse_marker_output(result: "JSONOutput") -> Layout:
    """
    Parse Marker JSONOutput into a Layout Pydantic model.

    Args:
        result: Marker output object whose ``children`` are pages, each with ``children`` blocks.

    Returns:
        Layout with one Page per child of ``result`` (numbered by position), each holding its blocks.
    """
    pages = []
    for page_idx, page in enumerate(result.children or []):
        # Attributes that are absent *or* present-but-None both fall back to the defaults,
        # matching parse_marker_json_output.
        blocks = [
            _build_block(
                getattr(block, "block_type", None),
                getattr(block, "bbox", None),
                getattr(block, "html", None),
                getattr(block, "id", None),
            )
            for block in (page.children or [])
        ]
        pages.append(Page(page=page_idx, blocks=blocks))
    return Layout(pages=pages)


def parse_marker_json_output(result_json: dict[str, Any]) -> Layout:
    """
    Parse the JSON renderer payload returned by Modal (modal_resp['json']) into a Layout Pydantic model.

    Args:
        result_json: Dict with a ``children`` list of page dicts, each with a ``children`` list of block dicts.

    Returns:
        Layout with one Page per entry of ``children`` (numbered by position), each holding its blocks.
    """
    pages = []
    for page_idx, page in enumerate(result_json.get("children") or []):
        blocks = [
            _build_block(block.get("block_type"), block.get("bbox"), block.get("html"), block.get("id"))
            for block in (page.get("children") or [])
        ]
        pages.append(Page(page=page_idx, blocks=blocks))
    return Layout(pages=pages)