diff --git a/.gitignore b/.gitignore index fd0d2e5..18ece59 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,4 @@ cython_debug/ .idea/ /.vscode/settings.json /tests/testresources/pdfs/private/ +/.run/* diff --git a/CHANGELOG.md b/CHANGELOG.md index 68557d9..4a0484a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## [unreleased] +## 0.2.4 - 01.10.2025 ### 🚀 Features diff --git a/docs/source/_static/ShowFaceBoxes.png b/docs/source/_static/ShowFaceBoxes.png new file mode 100644 index 0000000..3589c82 Binary files /dev/null and b/docs/source/_static/ShowFaceBoxes.png differ diff --git a/docs/source/_static/ShowFaceCropped.png b/docs/source/_static/ShowFaceCropped.png new file mode 100644 index 0000000..6e0bd15 Binary files /dev/null and b/docs/source/_static/ShowFaceCropped.png differ diff --git a/docs/source/_static/ShowImageInvoice.png b/docs/source/_static/ShowImageInvoice.png new file mode 100644 index 0000000..7a1d36c Binary files /dev/null and b/docs/source/_static/ShowImageInvoice.png differ diff --git a/docs/source/_static/ShowSignatureBoxes.png b/docs/source/_static/ShowSignatureBoxes.png new file mode 100644 index 0000000..f1d04e5 Binary files /dev/null and b/docs/source/_static/ShowSignatureBoxes.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 2871cc5..ccd77ba 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,16 +6,25 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information +import os +import sys + +sys.path.insert(0, os.path.abspath("../scaledp")) + project = "ScaleDP" -copyright = "2024, StabRise" -author = "StabRise" -release = "0.1.0" +author = "Mykola Melnyk" +release = "0.2.4" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = ["sphinx.ext.autodoc", 
"myst_parser"] +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + templates_path = ["_templates"] exclude_patterns = [] @@ -46,7 +55,13 @@ "icon": "https://img.shields.io/badge/by-StabRise-orange.svg?style=flat&colorA=E1523D&colorB=007D8A", "type": "url", }, - ] + ], + "extra_footer": """ +
+ © Copyright 2025, StabRise +
+ """, } # -- Options for HTML output ------------------------------------------------- diff --git a/docs/source/detectors.md b/docs/source/detectors.md new file mode 100644 index 0000000..c7aa4a9 --- /dev/null +++ b/docs/source/detectors.md @@ -0,0 +1,24 @@ +Detectors +========= + +## Overview + +This section provides an overview of the various detectors available in ScaleDP for processing images and documents. These detectors are designed to identify and extract specific features such as text, objects, and layout structures from images. + +## Object Detection + +* [**Face Detector**](#FaceDetector) +* [**Signature Detector**](#SignatureDetector) + +## Text Detection + +* [**CraftTextDetector**](#CraftTextDetector) +* [**DBNetOnnxDetector**](#DBNetOnnxDetector) +* **YoloOnnxTextDetector** +* **DocTRTextDetector** + +## Base Detectors + +* **BaseDetector** +* [**YoloOnnxDetector**](#YoloOnnxDetector) + diff --git a/docs/source/image/data_to_image.md b/docs/source/image/data_to_image.md new file mode 100644 index 0000000..cc6233c --- /dev/null +++ b/docs/source/image/data_to_image.md @@ -0,0 +1,49 @@ +(DataToImage)= +# DataToImage + +## Overview + +`DataToImage` is a PySpark ML transformer that converts binary content (such as bytes from files or streams) into image objects. It is designed for use in Spark pipelines, enabling scalable and distributed image processing workflows. The transformer supports various image types and handles errors gracefully. 
+ +## Usage Example + +```python +from scaledp import DataToImage, PipelineModel + +image_example = files('resources/images/Invoice.png') + +df = spark.read.format("binaryFile") \ + .load(image_example) + +data_to_image = DataToImage( + inputCol="content", # Column with binary data + outputCol="image", # Output column for image objects + pathCol="path", # Optional: column with image paths + keepInputData=True, # Keep original data in output + propagateError=False, # Handle errors gracefully +) + +pipeline = PipelineModel(stages=[data_to_image]) +result = pipeline.transform(df) # df should have 'content' and optionally 'path' columns +result.show_image("image") +``` + + + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------| +| inputCol | str | Input column with binary content | "content" | +| outputCol | str | Output column for image objects | "image" | +| pathCol | str | Path column for image metadata | "path" | +| keepInputData | bool | Keep input data in output | False | +| imageType | Enum | Type of image (e.g., FILE, PIL) | ImageType.FILE | +| propagateError | bool | Propagate errors | False | + +## Notes +- Converts binary data to image objects using the specified image type. +- Handles errors gracefully; if `propagateError` is False, exceptions are logged and empty images are returned. +- Can be used as the first stage in image processing pipelines to ingest raw image data. +- Supports distributed processing with Spark. + diff --git a/docs/source/image/image_crop_boxes.md b/docs/source/image/image_crop_boxes.md new file mode 100644 index 0000000..1607afd --- /dev/null +++ b/docs/source/image/image_crop_boxes.md @@ -0,0 +1,63 @@ +(ImageCropBoxes)= +# ImageCropBoxes + +## Overview + +`ImageCropBoxes` is a PySpark ML transformer that crops images based on provided bounding boxes. 
It is designed to process images in Spark pipelines, supporting batch and distributed processing. The transformer can add padding to crops, limit the number of crops per image, and handle cases where no boxes are present. + +## Usage Example + +```python +from scaledp import FaceDetector, ImageCropBoxes, PipelineModel + +# Step 1: Detect faces in images +detector = FaceDetector( + inputCol="image", + outputCol="boxes", + keepInputData=True, + scoreThreshold=0.25, + padding=20, +) + +# Step 2: Crop images using detected face boxes +cropper = ImageCropBoxes( + inputCols=["image", "boxes"], + outputCol="cropped_image", + keepInputData=True, + padding=10, + limit=5, + noCrop=True, + autoRotate=False, # Automatically rotate crops if box height > width +) + +# Build and run the pipeline +pipeline = PipelineModel(stages=[detector, cropper]) +result = pipeline.transform(image_df) +result.show_image("cropped_image") +``` + + + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------| +| inputCols | list | Input columns: image and boxes | ["image", "boxes"] | +| outputCol | str | Output column for cropped images | "cropped_image"| +| keepInputData | bool | Keep input data in output | False | +| imageType | Enum | Type of image (e.g., FILE) | ImageType.FILE | +| numPartitions | int | Number of partitions for Spark | 0 | +| padding | int | Padding added to each crop | 0 | +| pageCol | str | Page column for repartitioning | "page" | +| propagateError | bool | Propagate errors | False | +| noCrop | bool | Raise error if no boxes to crop | True | +| limit | int | Limit number of crops per image | 0 (no limit) | +| autoRotate | bool | Auto rotate crop if box height > width | True | + +## Notes +- Crops are performed using bounding boxes from the `boxes` column. +- If `noCrop` is True and no boxes are present, an error is raised. 
+- If `limit` is set, only the first N boxes are used for cropping. +- If `autoRotate` is True, crops are rotated if the bounding box height is greater than its width. +- Supports distributed processing with Spark. +- Errors can be propagated or handled gracefully based on `propagateError`. diff --git a/docs/source/image/image_draw_boxes.md b/docs/source/image/image_draw_boxes.md new file mode 100644 index 0000000..0dc430a --- /dev/null +++ b/docs/source/image/image_draw_boxes.md @@ -0,0 +1,61 @@ +(ImageDrawBoxes)= +# ImageDrawBoxes + +## Overview + +`ImageDrawBoxes` is a PySpark ML transformer that draws bounding boxes and/or NER entity boxes on images. It supports both standard bounding boxes and named entity recognition (NER) outputs, allowing for flexible visualization of detected objects or entities. The transformer can be integrated into Spark pipelines for scalable image annotation tasks. + +## Usage Example + +```python +from scaledp import FaceDetector, ImageDrawBoxes, PipelineModel + +detector = FaceDetector( + inputCol="image", + outputCol="boxes", + keepInputData=True, + scoreThreshold=0.25, + padding=20, +) + +draw = ImageDrawBoxes( + inputCols=["image", "boxes"], + outputCol="image_with_boxes", + keepInputData=True, + filled=False, + color="green", + lineWidth=5, +) + +pipeline = PipelineModel(stages=[detector, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|---------------------| +| inputCols | list | Input columns: image and boxes/entities | ["image", "boxes"] | +| outputCol | str | Output column for annotated images | "image_with_boxes" | +| keepInputData | bool | Keep input data in output | False | +| imageType | Enum | Type of image (e.g., FILE) | ImageType.FILE | +| filled | bool | Fill rectangles | False | +| color | str | Box color (hex or name) | None 
(random) | +| lineWidth | int | Line width for boxes | 1 | +| textSize | int | Text size for labels | 12 | +| displayDataList | list | List of box/entity attributes to display as text | [] | +| numPartitions | int | Number of partitions for Spark | 0 | +| padding | int | Padding added to boxes | 0 | +| pageCol | str | Page column for repartitioning | "page" | +| whiteList | list | Only draw boxes/entities of these types | [] | +| blackList | list | Do not draw boxes/entities of these types | [] | + +## Notes +- Supports drawing both standard bounding boxes and NER entity boxes. +- Colors can be set manually or randomly assigned per entity/class. +- Text labels can be displayed using `displayDataList`. +- Handles rotated boxes and fills/outline options. +- Can be used in Spark pipelines for distributed image annotation. +- Errors are handled gracefully and logged. diff --git a/docs/source/image_processing.md b/docs/source/image_processing.md new file mode 100644 index 0000000..e7e152f --- /dev/null +++ b/docs/source/image_processing.md @@ -0,0 +1,13 @@ +# Image Processing + +This document provides an overview of various image processing transformers in ScaleDP. + + +## Available Image Processing Transformers + +* [**DataToImage**](#DataToImage): Converts raw data into image format for further + processing. +* [**ImageCropBoxes**](#ImageCropBoxes): Crops specified regions from images based on + bounding box coordinates. +* [**ImageDrawBoxes**](#ImageDrawBoxes): Draws bounding boxes on images to highlight detected + objects or regions of interest. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index b955fc4..4c39230 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -42,4 +42,11 @@ Benefits of using ScaleDP installation.md quickstart.md + image_processing.md + pdf_processing.md + detectors.md + ocr.md + show_utils.md + release_notes.md + diff --git a/docs/source/models/detectors/craft_text_detector.md b/docs/source/models/detectors/craft_text_detector.md new file mode 100644 index 0000000..33a0884 --- /dev/null +++ b/docs/source/models/detectors/craft_text_detector.md @@ -0,0 +1,81 @@ +(CraftTextDetector)= +# CraftTextDetector + +## Overview + +`CraftTextDetector` is a PySpark ML transformer for text detection in images using the CRAFT model. It supports distributed processing in Spark pipelines, batch inference, and optional refiner network postprocessing for improved accuracy. The detector outputs bounding boxes for detected text regions, with options for rotated boxes and threshold tuning. + +## Usage Example + +```python +from scaledp.models.detectors import CraftTextDetector +from scaledp import TesseractRecognizer, ImageDrawBoxes, PipelineModel + +detector = CraftTextDetector( + device="cpu", + keepInputData=True, + partitionMap=True, + numPartitions=1, + width=1600, + scoreThreshold=0.7, + textThreshold=0.4, + linkThreshold=0.4, + withRefiner=True, +) + +ocr = TesseractRecognizer( + inputCols=["image", "boxes"], + keepFormatting=False, + keepInputData=True, + lang=["eng", "spa"], + scoreThreshold=0.2, + scaleFactor=2.0, + partitionMap=True, + numPartitions=1, +) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "text"], + filled=False, + color="green", + lineWidth=5, + displayDataList=["score", "text", "angle"], +) + +pipeline = PipelineModel(stages=[detector, ocr, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + +## Parameters + +| Parameter | Type | Description | Default | 
+|-------------------|---------|--------------------------------------------------|-----------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.7 | +| textThreshold | float | Threshold for text region score | 0.4 | +| linkThreshold | float | Threshold for link affinity score | 0.4 | +| sizeThreshold | int | Minimum height for detected regions | -1 | +| width | int | Width for image resizing | 1280 | +| withRefiner | bool | Enable refiner network postprocessing | False | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| onlyRotated | bool | Return only rotated boxes | False | + +## Notes +- Supports optional refiner network for improved text box accuracy (`withRefiner`). +- Outputs bounding boxes for detected text regions, including rotated boxes if `onlyRotated` is True. +- Thresholds (`scoreThreshold`, `textThreshold`, `linkThreshold`) can be tuned for different document types. +- Can be integrated with OCR and visualization stages in Spark pipelines. +- Supports batch and distributed processing for scalable text detection. +- Errors are handled gracefully and can be propagated if desired. 
+ diff --git a/docs/source/models/detectors/dbnet_onnx_detector.md b/docs/source/models/detectors/dbnet_onnx_detector.md new file mode 100644 index 0000000..fa3d896 --- /dev/null +++ b/docs/source/models/detectors/dbnet_onnx_detector.md @@ -0,0 +1,72 @@ +(DBNetOnnxDetector)= +# DBNetOnnxDetector + +## Overview + +`DBNetOnnxDetector` is a PySpark ML transformer for text detection in images using the DBNet ONNX model. It supports distributed processing in Spark pipelines and can automatically download models from Hugging Face Hub. The detector outputs bounding boxes for detected text regions, with options for rotated boxes and merging overlapping results. + +## Usage Example + +```python +from scaledp.models.detectors import DBNetOnnxDetector +from scaledp import TesseractRecognizer, ImageDrawBoxes, PipelineModel + +detector = DBNetOnnxDetector( + model="StabRise/text_detection_dbnet_ml_v0.2", # Hugging Face model repo + keepInputData=True, + onlyRotated=False, + scoreThreshold=0.2, +) + +ocr = TesseractRecognizer( + inputCols=["image", "boxes"], + keepFormatting=False, + keepInputData=True, + lang=["eng", "spa"], + scoreThreshold=0.2, + scaleFactor=2.0, + partitionMap=True, + numPartitions=1, +) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "text"], + filled=False, + color="green", + lineWidth=5, + displayDataList=["score", "text", "angle"], +) + +pipeline = PipelineModel(stages=[detector, ocr, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| 
device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier or path | "StabRise/text_detection_dbnet_ml_v0.2" | + +## Notes +- Automatically downloads the ONNX model from Hugging Face Hub if not present locally. +- Outputs bounding boxes for detected text regions, including rotated boxes if `onlyRotated` is True. +- Merges overlapping boxes based on IOU, angle, and line proximity for cleaner results. +- Can be integrated with OCR and visualization stages in Spark pipelines. +- Supports batch and distributed processing for scalable text detection. +- Errors are handled gracefully and can be propagated if desired. + diff --git a/docs/source/models/detectors/face_detector.md b/docs/source/models/detectors/face_detector.md new file mode 100644 index 0000000..01bf1bc --- /dev/null +++ b/docs/source/models/detectors/face_detector.md @@ -0,0 +1,62 @@ +(FaceDetector)= +# FaceDetector + +## Overview + +`FaceDetector` is a face detection transformer based on the YOLO ONNX model. It is designed to efficiently detect faces in images using a pre-trained model from Hugging Face Hub. The detector is implemented as a PySpark ML transformer and can be integrated into Spark pipelines for scalable face detection tasks. 
+ +## Usage Example + +```python +from scaledp import FaceDetector, ImageDrawBoxes, PipelineModel + +detector = FaceDetector( + keepInputData=True, + partitionMap=True, + numPartitions=0, + scoreThreshold=0.25, + task="detect", + padding=20, + ) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "boxes"], + filled=False, + color="green", + lineWidth=5, + displayDataList=[], +) +# Transform the image dataframe through the OCR stage +pipeline = PipelineModel(stages=[detector, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + + + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| task | str | Detection task type | "detect" | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier | "StabRise/face_detection" | + +## Notes +- The detector uses the YOLO ONNX model from Hugging Face Hub for face detection. +- Supports batch processing and distributed inference with Spark. +- Additional parameters can be set using the corresponding setter methods. 
+ diff --git a/docs/source/models/detectors/signature_detector.md b/docs/source/models/detectors/signature_detector.md new file mode 100644 index 0000000..e9e867b --- /dev/null +++ b/docs/source/models/detectors/signature_detector.md @@ -0,0 +1,62 @@ +(SignatureDetector)= +# SignatureDetector + +## Overview + +`SignatureDetector` is a signature detection transformer based on the YOLO ONNX model. It efficiently detects signatures in images using a pre-trained model from Hugging Face Hub. The detector is implemented as a PySpark ML transformer and can be integrated into Spark pipelines for scalable signature detection tasks. + +## Usage Example + +```python +from scaledp import SignatureDetector, ImageDrawBoxes, PipelineModel + +detector = SignatureDetector( + keepInputData=True, + partitionMap=True, + numPartitions=0, + scoreThreshold=0.25, + task="detect", + padding=20, +) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "signatures"], + filled=False, + color="blue", + lineWidth=5, + displayDataList=[], +) +# Transform the image dataframe through the signature detection stage +pipeline = PipelineModel(stages=[detector, draw]) +result = pipeline.transform(image_df) +``` + + + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|--------------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for signatures | "signatures" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | 
+| propagateError | bool | Propagate errors | False | +| task | str | Detection task type | "detect" | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier | "StabRise/signature_detection" | +| padding | int | Padding percent to expand detected boxes | 0 | + +## Notes +- The detector uses the YOLO ONNX model from Hugging Face Hub for signature detection. +- Supports batch processing and distributed inference with Spark. +- Additional parameters can be set using the corresponding setter methods. + diff --git a/docs/source/models/detectors/yolo_onnx_detector.md b/docs/source/models/detectors/yolo_onnx_detector.md new file mode 100644 index 0000000..751e39e --- /dev/null +++ b/docs/source/models/detectors/yolo_onnx_detector.md @@ -0,0 +1,57 @@ +(YoloOnnxDetector)= +# YoloOnnxDetector + +## Overview + +`YoloOnnxDetector` is a generic object detector transformer based on the YOLO ONNX model. It provides efficient detection of objects in images using a pre-trained YOLO model, supporting batch and distributed inference in Spark pipelines. It is designed for extensibility and is used as a base for specialized detectors such as FaceDetector and SignatureDetector. +It does not require installing PyTorch; only ONNX Runtime is required. + + +## Inheritance + +- Inherits from [`BaseDetector`](./BaseDetector.md), which provides core Spark ML transformer functionality and schema handling. +- Mixes in `HasDevice` and `HasBatchSize` for device and batch configuration. 
+ +## Usage Example + +```python +from scaledp.models.detectors.YoloOnnxDetector import YoloOnnxDetector + +detector = YoloOnnxDetector( + model="StabRise/face_detection", # or any supported YOLO ONNX model + scoreThreshold=0.3, + padding=10, +) + +# Use in a Spark pipeline +detected_df = detector.transform(input_df) +``` + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| task | str | Detection task type | "detect" | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier | (required) | +| padding | int | Padding percent to expand detected boxes | 0 | + +## Notes +- The detector loads YOLO ONNX models from Hugging Face Hub or local path. +- Supports batch and distributed processing with Spark. +- Padding expands detected bounding boxes by a percentage. +- Used as a base for specialized detectors, e.g. +  [**Face Detector**](#FaceDetector) and [**Signature Detector**](#SignatureDetector). 
+ diff --git a/docs/source/models/recognizers/tesseract_recognizer.md b/docs/source/models/recognizers/tesseract_recognizer.md new file mode 100644 index 0000000..e2e3b64 --- /dev/null +++ b/docs/source/models/recognizers/tesseract_recognizer.md @@ -0,0 +1,67 @@ +(TesseractRecognizer)= +# TesseractRecognizer + +## Overview + +`TesseractRecognizer` is a PySpark ML transformer that runs Tesseract OCR on images. It supports multiple languages, Tesseract libraries (`tesserocr` and `pytesseract`), and advanced options such as line orientation detection, formatting, and rotated box handling. The transformer can be integrated into Spark pipelines for scalable and distributed text recognition tasks. + +## Usage Example + +```python +from scaledp import DocTRTextDetector, TesseractRecognizer, PipelineModel + +detector = DocTRTextDetector( + device="cpu", + keepInputData=True, + scoreThreshold=0.1, + partitionMap=True, + numPartitions=1, +) + +ocr = TesseractRecognizer( + keepFormatting=True, + tessLib="tesserocr", # or "pytesseract" + lang=["ukr", "eng"], + scoreThreshold=0.2, + partitionMap=True, + numPartitions=1, + tessDataPath="/usr/share/tesseract-ocr/5/tessdata/", + onlyRotated=True, +) + +pipeline = PipelineModel(stages=[detector, ocr]) +result = pipeline.transform(image_df) +for row in result.collect(): + print(row.text.text) # Recognized text +``` + +## Parameters + +| Parameter | Type | Description | Default | +|---------------------|---------|--------------------------------------------------|-----------------------------------------| +| inputCols | list | Input columns: image and boxes | ["image", "boxes"] | +| outputCol | str | Output column for recognized text | "text" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.5 | +| oem | int | OCR engine mode (see Tesseract OEM) | OEM.DEFAULT | +| lang | list | List of languages for OCR | 
["eng"] | +| lineTolerance | int | Tolerance for line grouping | 0 | +| keepFormatting | bool | Preserve text formatting | False | +| tessDataPath | str | Path to Tesseract data folder | "/usr/share/tesseract-ocr/5/tessdata/" | +| tessLib | int/str | Tesseract library to use (TESSEROCR/PYTESSERACT) | TessLib.PYTESSERACT | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions for Spark | 0 | +| pageCol | str | Page column for repartitioning | "page" | +| pathCol | str | Path column for image metadata | "path" | +| detectLineOrientation| bool | Detect and auto-orient text lines | True | +| onlyRotated | bool | Only return rotated boxes | True | +| oriModel | str | Model for line orientation detection | "StabRise/line_orientation_detection_v0.1" | + +## Notes +- Supports both `tesserocr` and `pytesseract` libraries for OCR. +- Can process multiple languages and preserve formatting if desired. +- Handles rotated boxes and auto-orients text lines for improved accuracy. +- Errors are handled gracefully and logged; exceptions are included in the output if any occur. +- Can be used in Spark pipelines for distributed OCR processing. + diff --git a/docs/source/ocr.md b/docs/source/ocr.md new file mode 100644 index 0000000..24e5a93 --- /dev/null +++ b/docs/source/ocr.md @@ -0,0 +1,34 @@ +OCR models +========== + +The OCR models in ScaleDP provide robust optical character recognition capabilities for extracting text from images and documents. These models leverage advanced deep learning techniques to deliver high accuracy and performance across various use cases. + +## Available OCR Engines + +End-to-end OCR solutions available in ScaleDP include: + +- **Tesseract OCR**: An open-source OCR engine that supports multiple languages and is widely used for text extraction tasks. +- **DocTR OCR**: A deep learning-based OCR model that offers superior accuracy, especially for complex documents and layouts. 
+- **Surya OCR**: A high-performance OCR model optimized for speed and accuracy, suitable for real-time applications. +- **EasyOCR**: A lightweight OCR model that provides fast text recognition with support for multiple languages. +- **LLMOcr**: An OCR engine that utilizes large language models to enhance text recognition capabilities. + +## Text Detectors + +Text detectors are used to identify and locate text regions within images. +In some cases it is useful to run one as a separate step in the OCR pipeline. + +See the following text detectors available in ScaleDP: + +* [**CraftTextDetector**](#CraftTextDetector) +* [**DBNetOnnxDetector**](#DBNetOnnxDetector) +* **YoloOnnxTextDetector** +* **DocTRTextDetector** + +## Text Recognizers + +Text recognizers can recognize text from images containing a single +line/word/character. +Available text recognizers in ScaleDP include: + +* [**TesseractRecognizer**](#TesseractRecognizer) diff --git a/docs/source/pdf/pdf_assembler.md b/docs/source/pdf/pdf_assembler.md new file mode 100644 index 0000000..9c7da6a --- /dev/null +++ b/docs/source/pdf/pdf_assembler.md @@ -0,0 +1,108 @@ +(PdfAssembler)= +# PdfAssembler + +## Overview + +`PdfAssembler` is a PySpark ML transformer that assembles single-page PDF documents into a single multi-page PDF. It supports both Spark and Pandas DataFrames, grouping pages by a specified column (e.g., file path) and merging them using PyMuPDF (fitz). This transformer is useful for reconstructing full documents from page-level PDF outputs in distributed pipelines. 
+ +## Usage Example + +```python +from scaledp.pdf import PdfAssembler +from pyspark.ml import PipelineModel + +pdf_assembler = PdfAssembler( + inputCol="pdf_with_text_layer", # Column with single-page PDFs + outputCol="assembled_pdf", # Output column for merged PDF + groupByCol="path", # Group pages by file path +) + +pipeline = PipelineModel(stages=[pdf_assembler]) +result = pipeline.transform(pdf_df) +for row in result.collect(): + with open("output.pdf", "wb") as f: + f.write(row.assembled_pdf.data) +``` + +## Parameters + +| Parameter | Type | Description | Default | +|--------------|------|--------------------------------------|------------------------| +| inputCol | str | Input column with single-page PDFs | "pdf" | +| outputCol | str | Output column for assembled PDF | "assembled_pdf" | +| groupByCol | str | Column to group pages by | "path" | + +## Notes +- Supports both Spark and Pandas DataFrames for flexible integration. +- Groups single-page PDFs by the specified column and merges them in order. +- Uses PyMuPDF (fitz) for PDF manipulation and merging. +- Handles errors gracefully; exceptions are included in the output if any occur. +- Can be used as the final stage in PDF processing pipelines to reconstruct full documents. + +## Complex Pipeline Example + +This example demonstrates a full pipeline for processing PDFs: converting pages to images, running OCR, adding a text layer, and assembling the final document. 
+ +```python +from scaledp.pdf import PdfDataToImage, PdfAddTextLayer, PdfAssembler, SingleImageToPdf +from scaledp import TesseractOcr +from pyspark.ml import PipelineModel + +# Step 1: Convert PDF pages to images +pdf_data_to_image = PdfDataToImage( + inputCol="content", + outputCol="image", + pageLimit=10, # Limit number of pages processed +) + +# Step 2: Run OCR on images +ocr = TesseractOcr( + inputCol="image", + outputCol="text", + keepInputData=True, + tessLib="tesserocr", # or "pytesseract" + lang=["eng", "spa"], + scoreThreshold=0.2, +) + +# Step 3: Convert images back to single-page PDFs +image_to_pdf = SingleImageToPdf( + inputCol="image", + outputCol="pdf", +) + +# Step 4: Add recognized text as a layer to each PDF page +pdf_text_layer = PdfAddTextLayer( + inputCols=["pdf", "text"], + outputCol="pdf_with_text_layer", +) + +# Step 5: Assemble all processed pages into a single PDF document +pdf_assembler = PdfAssembler( + inputCol="pdf_with_text_layer", + outputCol="assembled_pdf", + groupByCol="path", +) + +# Build and run the pipeline +pipeline = PipelineModel(stages=[ + pdf_data_to_image, + ocr, + image_to_pdf, + pdf_text_layer, + pdf_assembler, +]) +result = pipeline.transform(pdf_df) + +# Save the assembled PDF +for row in result.collect(): + with open("output.pdf", "wb") as f: + f.write(row.assembled_pdf.data) +``` + +This pipeline: +- Converts PDF pages to images +- Runs OCR to extract text +- Adds the text layer to each PDF page +- Assembles all processed pages into a single PDF document +- Works with both Spark and Pandas DataFrames diff --git a/docs/source/pdf/pdf_data_to_image.md b/docs/source/pdf/pdf_data_to_image.md new file mode 100644 index 0000000..af05de1 --- /dev/null +++ b/docs/source/pdf/pdf_data_to_image.md @@ -0,0 +1,49 @@ +(PdfDataToImage)= +# PdfDataToImage + +## Overview + +`PdfDataToImage` is a PySpark ML transformer that extracts images from PDF files, converting each page into an image. 
It supports both Spark and Pandas DataFrames, configurable resolution, page limits, and output image types. This transformer is useful for document digitization, OCR preprocessing, and distributed PDF-to-image conversion workflows. + +## Usage Example + +```python +from scaledp.pdf import PdfDataToImage +from pyspark.ml import PipelineModel + +pdf_to_image = PdfDataToImage( + inputCol="content", # Column with PDF binary data + outputCol="image", # Output column for images + pathCol="path", # Optional: column with PDF file paths + pageCol="page", # Output page number column + keepInputData=True, # Keep original data in output + imageType="FILE", # Output image type (e.g., FILE, PIL) + resolution=300, # DPI for image extraction + pageLimit=5, # Limit number of pages processed +) + +pipeline = PipelineModel(stages=[pdf_to_image]) +result = pipeline.transform(pdf_df) # pdf_df should have 'content' and optionally 'path' columns +result.show_image("image") +``` + +## Parameters + +| Parameter | Type | Description | Default | +|--------------|------|--------------------------------------|------------------------| +| inputCol | str | Input column with PDF binary data | "content" | +| outputCol | str | Output column for images | "image" | +| pathCol | str | Path column for PDF metadata | "path" | +| pageCol | str | Output page number column | "page" | +| keepInputData| bool | Keep input data in output | False | +| imageType | Enum | Output image type (e.g., FILE, PIL) | ImageType.FILE | +| resolution | int | DPI for image extraction | 300 | +| pageLimit | int | Limit number of pages processed | 0 (no limit) | + +## Notes +- Converts each PDF page to an image using the specified resolution and image type. +- Supports limiting the number of pages processed with `pageLimit`. +- Handles errors gracefully; if an exception occurs, an empty image with the error message is returned. +- Can be used as the first stage in document processing pipelines for OCR or image analysis. 
+- Supports distributed processing with Spark and Pandas DataFrames. + diff --git a/docs/source/pdf/pdf_data_to_text.md b/docs/source/pdf/pdf_data_to_text.md new file mode 100644 index 0000000..ab7d12b --- /dev/null +++ b/docs/source/pdf/pdf_data_to_text.md @@ -0,0 +1,45 @@ +(PdfDataToText)= +# PdfDataToText + +## Overview + +`PdfDataToText` is a PySpark ML transformer that extracts text and word-level bounding boxes from PDF files. It processes each page of a PDF, returning both the text content and the coordinates of each word, making it suitable for downstream tasks such as OCR, document analysis, and layout understanding. The transformer supports both Spark and Pandas DataFrames and handles errors gracefully. + +## Usage Example + +```python +from scaledp.pdf import PdfDataToText +from pyspark.ml import PipelineModel + +pdf_to_text = PdfDataToText( + inputCol="content", # Column with PDF binary data + outputCol="document", # Output column for extracted text and boxes + pathCol="path", # Optional: column with PDF file paths + pageCol="page", # Output page number column + keepInputData=True, # Keep original data in output +) + +pipeline = PipelineModel(stages=[pdf_to_text]) +result = pipeline.transform(pdf_df) # pdf_df should have 'content' and optionally 'path' columns +for row in result.collect(): + print(row.document.text) # Extracted text + print(row.document.bboxes) # List of word bounding boxes +``` + +## Parameters + +| Parameter | Type | Description | Default | +|----------------|------|--------------------------------------|--------------| +| inputCol | str | Input column with PDF binary data | "content" | +| outputCol | str | Output column for extracted text | "document" | +| pathCol | str | Path column for PDF metadata | "path" | +| pageCol | str | Output page number column | "page" | +| keepInputData | bool | Keep input data in output | False | + +## Notes +- Extracts text and word-level bounding boxes for each page in the PDF. 
+- Returns a `Document` object with `text`, `bboxes`, and metadata for each page. +- Handles errors gracefully; if an exception occurs, an empty document with the error message is returned. +- Can be used as the first stage in document analysis or OCR pipelines. +- Supports distributed processing with Spark and Pandas DataFrames. + diff --git a/docs/source/pdf_processing.md b/docs/source/pdf_processing.md new file mode 100644 index 0000000..60cb2c8 --- /dev/null +++ b/docs/source/pdf_processing.md @@ -0,0 +1,18 @@ +# Pdf Processing + +This document provides an overview of various PDF processing transformers in ScaledP. + +## Available PDF Processing Transformers + +* [**PdfDataToImage**](#PdfDataToImage): Converts PDF documents into images for further + processing. +* [**PdfDataToText**](#PdfDataToText): Extracts text content from PDF documents. +* **PdfDataToDocument**: Converts PDF documents into structured document format for + analysis and processing. +* **PdfDataToSingleImage**: Converts single page of a PDF document into a + single image. +* **SingleImageToPdf**: Converts a single image into PDF document format. +* [**PdfAssembler**](#PdfAssembler): Assembles multiple PDF documents into a + single PDF file. +* **PdfAddTextLayer**: Adds a text layer to PDF document. + diff --git a/docs/source/release_notes.md b/docs/source/release_notes.md new file mode 100644 index 0000000..43b093d --- /dev/null +++ b/docs/source/release_notes.md @@ -0,0 +1,20 @@ +Release Notes +============= + +This document outlines the release notes for the ScaledP project. It includes information about new features, bug fixes, and other changes made in each version. + + +## 0.2.4 - 01.10.2025 + +### 🚀 Features + +- Added