diff --git a/.gitignore b/.gitignore index fd0d2e5..18ece59 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,4 @@ cython_debug/ .idea/ /.vscode/settings.json /tests/testresources/pdfs/private/ +/.run/* diff --git a/CHANGELOG.md b/CHANGELOG.md index 68557d9..4a0484a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## [unreleased] +## 0.2.4 - 01.10.2025 ### 🚀 Features diff --git a/docs/source/_static/ShowFaceBoxes.png b/docs/source/_static/ShowFaceBoxes.png new file mode 100644 index 0000000..3589c82 Binary files /dev/null and b/docs/source/_static/ShowFaceBoxes.png differ diff --git a/docs/source/_static/ShowFaceCropped.png b/docs/source/_static/ShowFaceCropped.png new file mode 100644 index 0000000..6e0bd15 Binary files /dev/null and b/docs/source/_static/ShowFaceCropped.png differ diff --git a/docs/source/_static/ShowImageInvoice.png b/docs/source/_static/ShowImageInvoice.png new file mode 100644 index 0000000..7a1d36c Binary files /dev/null and b/docs/source/_static/ShowImageInvoice.png differ diff --git a/docs/source/_static/ShowSignatureBoxes.png b/docs/source/_static/ShowSignatureBoxes.png new file mode 100644 index 0000000..f1d04e5 Binary files /dev/null and b/docs/source/_static/ShowSignatureBoxes.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 2871cc5..ccd77ba 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,16 +6,25 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information +import os +import sys + +sys.path.insert(0, os.path.abspath("../scaledp")) + project = "ScaleDP" -copyright = "2024, StabRise" -author = "StabRise" -release = "0.1.0" +author = "Mykola Melnyk" +release = "0.2.4" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = ["sphinx.ext.autodoc", 
"myst_parser"] +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + templates_path = ["_templates"] exclude_patterns = [] @@ -46,7 +55,13 @@ "icon": "https://img.shields.io/badge/by-StabRise-orange.svg?style=flat&colorA=E1523D&colorB=007D8A", "type": "url", }, - ] + ], + "extra_footer": """ +

+ © Copyright 2025, StabRise +

+ """, } # -- Options for HTML output ------------------------------------------------- diff --git a/docs/source/detectors.md b/docs/source/detectors.md new file mode 100644 index 0000000..c7aa4a9 --- /dev/null +++ b/docs/source/detectors.md @@ -0,0 +1,24 @@ +Detectors +========= + +## Overview + +This section provides an overview of the various detectors available in ScaleDP for processing images and documents. These detectors are designed to identify and extract specific features such as text, objects, and layout structures from images. + +## Object Detection + +* [**Face Detector**](#FaceDetector) +* [**Signature Detector**](#SignatureDetector) + +## Text Detection + +* [**CraftTextDetector**](#CraftTextDetector) +* [**DBNetOnnxDetector**](#DBNetOnnxDetector) +* **YoloOnnxTextDetector** +* **DocTRTextDetector** + +## Base Detectors + +* **BaseDetector** +* [**YoloOnnxDetector**](#YoloOnnxDetector) + diff --git a/docs/source/image/data_to_image.md b/docs/source/image/data_to_image.md new file mode 100644 index 0000000..cc6233c --- /dev/null +++ b/docs/source/image/data_to_image.md @@ -0,0 +1,49 @@ +(DataToImage)= +# DataToImage + +## Overview + +`DataToImage` is a PySpark ML transformer that converts binary content (such as bytes from files or streams) into image objects. It is designed for use in Spark pipelines, enabling scalable and distributed image processing workflows. The transformer supports various image types and handles errors gracefully. 
+ +## Usage Example + +```python +from scaledp import DataToImage, PipelineModel + +image_example = files('resources/images/Invoice.png') + +df = spark.read.format("binaryFile") \ + .load(image_example) + +data_to_image = DataToImage( + inputCol="content", # Column with binary data + outputCol="image", # Output column for image objects + pathCol="path", # Optional: column with image paths + keepInputData=True, # Keep original data in output + propagateError=False, # Handle errors gracefully +) + +pipeline = PipelineModel(stages=[data_to_image]) +result = pipeline.transform(df) # df should have 'content' and optionally 'path' columns +result.show_image("image") +``` + +![ShowImageInvoice.png](../_static/ShowImageInvoice.png) + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------| +| inputCol | str | Input column with binary content | "content" | +| outputCol | str | Output column for image objects | "image" | +| pathCol | str | Path column for image metadata | "path" | +| keepInputData | bool | Keep input data in output | False | +| imageType | Enum | Type of image (e.g., FILE, PIL) | ImageType.FILE | +| propagateError | bool | Propagate errors | False | + +## Notes +- Converts binary data to image objects using the specified image type. +- Handles errors gracefully; if `propagateError` is False, exceptions are logged and empty images are returned. +- Can be used as the first stage in image processing pipelines to ingest raw image data. +- Supports distributed processing with Spark. + diff --git a/docs/source/image/image_crop_boxes.md b/docs/source/image/image_crop_boxes.md new file mode 100644 index 0000000..1607afd --- /dev/null +++ b/docs/source/image/image_crop_boxes.md @@ -0,0 +1,63 @@ +(ImageCropBoxes)= +# ImageCropBoxes + +## Overview + +`ImageCropBoxes` is a PySpark ML transformer that crops images based on provided bounding boxes. 
It is designed to process images in Spark pipelines, supporting batch and distributed processing. The transformer can add padding to crops, limit the number of crops per image, and handle cases where no boxes are present. + +## Usage Example + +```python +from scaledp import FaceDetector, ImageCropBoxes, PipelineModel + +# Step 1: Detect faces in images +detector = FaceDetector( + inputCol="image", + outputCol="boxes", + keepInputData=True, + scoreThreshold=0.25, + padding=20, +) + +# Step 2: Crop images using detected face boxes +cropper = ImageCropBoxes( + inputCols=["image", "boxes"], + outputCol="cropped_image", + keepInputData=True, + padding=10, + limit=5, + noCrop=True, + autoRotate=False, # Automatically rotate crops if box height > width +) + +# Build and run the pipeline +pipeline = PipelineModel(stages=[detector, cropper]) +result = pipeline.transform(image_df) +result.show_image("cropped_image") +``` + +![ShowFaceCropped.png](../_static/ShowFaceCropped.png) + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------| +| inputCols | list | Input columns: image and boxes | ["image", "boxes"] | +| outputCol | str | Output column for cropped images | "cropped_image"| +| keepInputData | bool | Keep input data in output | False | +| imageType | Enum | Type of image (e.g., FILE) | ImageType.FILE | +| numPartitions | int | Number of partitions for Spark | 0 | +| padding | int | Padding added to each crop | 0 | +| pageCol | str | Page column for repartitioning | "page" | +| propagateError | bool | Propagate errors | False | +| noCrop | bool | Raise error if no boxes to crop | True | +| limit | int | Limit number of crops per image | 0 (no limit) | +| autoRotate | bool | Auto rotate crop if box height > width | True | + +## Notes +- Crops are performed using bounding boxes from the `boxes` column. 
+- If `noCrop` is True and no boxes are present, an error is raised. +- If `limit` is set, only the first N boxes are used for cropping. +- If `autoRotate` is True, crops are rotated if the bounding box height is greater than its width. +- Supports distributed processing with Spark. +- Errors can be propagated or handled gracefully based on `propagateError`. diff --git a/docs/source/image/image_draw_boxes.md b/docs/source/image/image_draw_boxes.md new file mode 100644 index 0000000..0dc430a --- /dev/null +++ b/docs/source/image/image_draw_boxes.md @@ -0,0 +1,61 @@ +(ImageDrawBoxes)= +# ImageDrawBoxes + +## Overview + +`ImageDrawBoxes` is a PySpark ML transformer that draws bounding boxes and/or NER entity boxes on images. It supports both standard bounding boxes and named entity recognition (NER) outputs, allowing for flexible visualization of detected objects or entities. The transformer can be integrated into Spark pipelines for scalable image annotation tasks. + +## Usage Example + +```python +from scaledp import FaceDetector, ImageDrawBoxes, PipelineModel + +detector = FaceDetector( + inputCol="image", + outputCol="boxes", + keepInputData=True, + scoreThreshold=0.25, + padding=20, +) + +draw = ImageDrawBoxes( + inputCols=["image", "boxes"], + outputCol="image_with_boxes", + keepInputData=True, + filled=False, + color="green", + lineWidth=5, +) + +pipeline = PipelineModel(stages=[detector, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` +![ShowFaceBoxes.png](../_static/ShowFaceBoxes.png) + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|---------------------| +| inputCols | list | Input columns: image and boxes/entities | ["image", "boxes"] | +| outputCol | str | Output column for annotated images | "image_with_boxes" | +| keepInputData | bool | Keep input data in output | False | +| imageType | Enum | Type of image (e.g., 
FILE) | ImageType.FILE | +| filled | bool | Fill rectangles | False | +| color | str | Box color (hex or name) | None (random) | +| lineWidth | int | Line width for boxes | 1 | +| textSize | int | Text size for labels | 12 | +| displayDataList | list | List of box/entity attributes to display as text | [] | +| numPartitions | int | Number of partitions for Spark | 0 | +| padding | int | Padding added to boxes | 0 | +| pageCol | str | Page column for repartitioning | "page" | +| whiteList | list | Only draw boxes/entities of these types | [] | +| blackList | list | Do not draw boxes/entities of these types | [] | + +## Notes +- Supports drawing both standard bounding boxes and NER entity boxes. +- Colors can be set manually or randomly assigned per entity/class. +- Text labels can be displayed using `displayDataList`. +- Handles rotated boxes and fills/outline options. +- Can be used in Spark pipelines for distributed image annotation. +- Errors are handled gracefully and logged. diff --git a/docs/source/image_processing.md b/docs/source/image_processing.md new file mode 100644 index 0000000..e7e152f --- /dev/null +++ b/docs/source/image_processing.md @@ -0,0 +1,13 @@ +# Image Processing + +This document provides an overview of various image processing transformers in ScaleDP. + + +## Available Image Processing Transformers + +* [**DataToImage**](#DataToImage): Converts raw data into image format for further + processing. +* [**ImageCropBoxes**](#ImageCropBoxes): Crops specified regions from images based on + bounding box coordinates. +* [**ImageDrawBoxes**](#ImageDrawBoxes): Draws bounding boxes on images to highlight detected + objects or regions of interest. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index b955fc4..4c39230 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -42,4 +42,11 @@ Benefits of using ScaleDP installation.md quickstart.md + image_processing.md + pdf_processing.md + detectors.md + ocr.md + show_utils.md + release_notes.md + diff --git a/docs/source/models/detectors/craft_text_detector.md b/docs/source/models/detectors/craft_text_detector.md new file mode 100644 index 0000000..33a0884 --- /dev/null +++ b/docs/source/models/detectors/craft_text_detector.md @@ -0,0 +1,81 @@ +(CraftTextDetector)= +# CraftTextDetector + +## Overview + +`CraftTextDetector` is a PySpark ML transformer for text detection in images using the CRAFT model. It supports distributed processing in Spark pipelines, batch inference, and optional refiner network postprocessing for improved accuracy. The detector outputs bounding boxes for detected text regions, with options for rotated boxes and threshold tuning. + +## Usage Example + +```python +from scaledp.models.detectors import CraftTextDetector +from scaledp import TesseractRecognizer, ImageDrawBoxes, PipelineModel + +detector = CraftTextDetector( + device="cpu", + keepInputData=True, + partitionMap=True, + numPartitions=1, + width=1600, + scoreThreshold=0.7, + textThreshold=0.4, + linkThreshold=0.4, + withRefiner=True, +) + +ocr = TesseractRecognizer( + inputCols=["image", "boxes"], + keepFormatting=False, + keepInputData=True, + lang=["eng", "spa"], + scoreThreshold=0.2, + scaleFactor=2.0, + partitionMap=True, + numPartitions=1, +) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "text"], + filled=False, + color="green", + lineWidth=5, + displayDataList=["score", "text", "angle"], +) + +pipeline = PipelineModel(stages=[detector, ocr, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + +## Parameters + +| Parameter | Type | Description | Default | 
+|-------------------|---------|--------------------------------------------------|-----------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.7 | +| textThreshold | float | Threshold for text region score | 0.4 | +| linkThreshold | float | Threshold for link affinity score | 0.4 | +| sizeThreshold | int | Minimum height for detected regions | -1 | +| width | int | Width for image resizing | 1280 | +| withRefiner | bool | Enable refiner network postprocessing | False | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| onlyRotated | bool | Return only rotated boxes | False | + +## Notes +- Supports optional refiner network for improved text box accuracy (`withRefiner`). +- Outputs bounding boxes for detected text regions, including rotated boxes if `onlyRotated` is True. +- Thresholds (`scoreThreshold`, `textThreshold`, `linkThreshold`) can be tuned for different document types. +- Can be integrated with OCR and visualization stages in Spark pipelines. +- Supports batch and distributed processing for scalable text detection. +- Errors are handled gracefully and can be propagated if desired. 
+ diff --git a/docs/source/models/detectors/dbnet_onnx_detector.md b/docs/source/models/detectors/dbnet_onnx_detector.md new file mode 100644 index 0000000..fa3d896 --- /dev/null +++ b/docs/source/models/detectors/dbnet_onnx_detector.md @@ -0,0 +1,72 @@ +(DBNetOnnxDetector)= +# DBNetOnnxDetector + +## Overview + +`DBNetOnnxDetector` is a PySpark ML transformer for text detection in images using the DBNet ONNX model. It supports distributed processing in Spark pipelines and can automatically download models from Hugging Face Hub. The detector outputs bounding boxes for detected text regions, with options for rotated boxes and merging overlapping results. + +## Usage Example + +```python +from scaledp.models.detectors import DBNetOnnxDetector +from scaledp import TesseractRecognizer, ImageDrawBoxes, PipelineModel + +detector = DBNetOnnxDetector( + model="StabRise/text_detection_dbnet_ml_v0.2", # Hugging Face model repo + keepInputData=True, + onlyRotated=False, + scoreThreshold=0.2, +) + +ocr = TesseractRecognizer( + inputCols=["image", "boxes"], + keepFormatting=False, + keepInputData=True, + lang=["eng", "spa"], + scoreThreshold=0.2, + scaleFactor=2.0, + partitionMap=True, + numPartitions=1, +) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "text"], + filled=False, + color="green", + lineWidth=5, + displayDataList=["score", "text", "angle"], +) + +pipeline = PipelineModel(stages=[detector, ocr, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| 
device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier or path | "StabRise/text_detection_dbnet_ml_v0.2" | + +## Notes +- Automatically downloads the ONNX model from Hugging Face Hub if not present locally. +- Outputs bounding boxes for detected text regions, including rotated boxes if `onlyRotated` is True. +- Merges overlapping boxes based on IOU, angle, and line proximity for cleaner results. +- Can be integrated with OCR and visualization stages in Spark pipelines. +- Supports batch and distributed processing for scalable text detection. +- Errors are handled gracefully and can be propagated if desired. + diff --git a/docs/source/models/detectors/face_detector.md b/docs/source/models/detectors/face_detector.md new file mode 100644 index 0000000..01bf1bc --- /dev/null +++ b/docs/source/models/detectors/face_detector.md @@ -0,0 +1,62 @@ +(FaceDetector)= +# FaceDetector + +## Overview + +`FaceDetector` is a face detection transformer based on the YOLO ONNX model. It is designed to efficiently detect faces in images using a pre-trained model from Hugging Face Hub. The detector is implemented as a PySpark ML transformer and can be integrated into Spark pipelines for scalable face detection tasks. 
+ +## Usage Example + +```python +from scaledp import FaceDetector, ImageDrawBoxes, PipelineModel + +detector = FaceDetector( + keepInputData=True, + partitionMap=True, + numPartitions=0, + scoreThreshold=0.25, + task="detect", + padding=20, + ) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "boxes"], + filled=False, + color="green", + lineWidth=5, + displayDataList=[], +) +# Transform the image dataframe through the OCR stage +pipeline = PipelineModel(stages=[detector, draw]) +result = pipeline.transform(image_df) +result.show_image("image_with_boxes") +``` + +![ShowFaceBoxes.png](../../_static/ShowFaceBoxes.png) + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| task | str | Detection task type | "detect" | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier | "StabRise/face_detection" | + +## Notes +- The detector uses the YOLO ONNX model from Hugging Face Hub for face detection. +- Supports batch processing and distributed inference with Spark. +- Additional parameters can be set using the corresponding setter methods. 
+ diff --git a/docs/source/models/detectors/signature_detector.md b/docs/source/models/detectors/signature_detector.md new file mode 100644 index 0000000..e9e867b --- /dev/null +++ b/docs/source/models/detectors/signature_detector.md @@ -0,0 +1,62 @@ +(SignatureDetector)= +# SignatureDetector + +## Overview + +`SignatureDetector` is a signature detection transformer based on the YOLO ONNX model. It efficiently detects signatures in images using a pre-trained model from Hugging Face Hub. The detector is implemented as a PySpark ML transformer and can be integrated into Spark pipelines for scalable signature detection tasks. + +## Usage Example + +```python +from scaledp import SignatureDetector, ImageDrawBoxes, PipelineModel + +detector = SignatureDetector( + keepInputData=True, + partitionMap=True, + numPartitions=0, + scoreThreshold=0.25, + task="detect", + padding=20, +) + +draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "signatures"], + filled=False, + color="blue", + lineWidth=5, + displayDataList=[], +) +# Transform the image dataframe through the signature detection stage +pipeline = PipelineModel(stages=[detector, draw]) +result = pipeline.transform(image_df) +``` + +![ShowSignatureBoxes.png](../../_static/ShowSignatureBoxes.png) + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|--------------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for signatures | "signatures" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | 
Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| task | str | Detection task type | "detect" | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier | "StabRise/signature_detection" | +| padding | int | Padding percent to expand detected boxes | 0 | + +## Notes +- The detector uses the YOLO ONNX model from Hugging Face Hub for signature detection. +- Supports batch processing and distributed inference with Spark. +- Additional parameters can be set using the corresponding setter methods. + diff --git a/docs/source/models/detectors/yolo_onnx_detector.md b/docs/source/models/detectors/yolo_onnx_detector.md new file mode 100644 index 0000000..751e39e --- /dev/null +++ b/docs/source/models/detectors/yolo_onnx_detector.md @@ -0,0 +1,57 @@ +(YoloOnnxDetector)= +# YoloOnnxDetector + +## Overview + +`YoloOnnxDetector` is a generic object detector transformer based on the YOLO ONNX model. It provides efficient detection of objects in images using a pre-trained YOLO model, supporting batch and distributed inference in Spark pipelines. It is designed for extensibility and is used as a base for specialized detectors such as FaceDetector and SignatureDetector. +It does not require installing PyTorch; only ONNX Runtime is required. + + +## Inheritance + +- Inherits from [`BaseDetector`](./BaseDetector.md), which provides core Spark ML transformer functionality and schema handling. +- Mixes in `HasDevice` and `HasBatchSize` for device and batch configuration. 
+ +## Usage Example + +```python +from scaledp.models.detectors.YoloOnnxDetector import YoloOnnxDetector + +detector = YoloOnnxDetector( + model="StabRise/face_detection", # or any supported YOLO ONNX model + scoreThreshold=0.3, + padding=10, +) + +# Use in a Spark pipeline +detected_df = detector.transform(input_df) +``` + +## Parameters + +| Parameter | Type | Description | Default | +|-------------------|---------|--------------------------------------------------|-----------------------------| +| inputCol | str | Input image column | "image" | +| outputCol | str | Output column for boxes | "boxes" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.2 | +| device | Device | Inference device (CPU/GPU) | Device.CPU | +| batchSize | int | Batch size for inference | 2 | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions | 0 | +| pageCol | str | Page column | "page" | +| pathCol | str | Path column | "path" | +| propagateError | bool | Propagate errors | False | +| task | str | Detection task type | "detect" | +| onlyRotated | bool | Return only rotated boxes | False | +| model | str | Model identifier | (required) | +| padding | int | Padding percent to expand detected boxes | 0 | + +## Notes +- The detector loads YOLO ONNX models from Hugging Face Hub or local path. +- Supports batch and distributed processing with Spark. +- Padding expands detected bounding boxes by a percentage. +- Used as a base for specialized detectors (e.g., [**Face Detector**] + (#FaceDetector), [**Signature Detector**](#SignatureDetector)). 
+ diff --git a/docs/source/models/recognizers/tesseract_recognizer.md b/docs/source/models/recognizers/tesseract_recognizer.md new file mode 100644 index 0000000..e2e3b64 --- /dev/null +++ b/docs/source/models/recognizers/tesseract_recognizer.md @@ -0,0 +1,67 @@ +(TesseractRecognizer)= +# TesseractRecognizer + +## Overview + +`TesseractRecognizer` is a PySpark ML transformer that runs Tesseract OCR on images. It supports multiple languages, Tesseract libraries (`tesserocr` and `pytesseract`), and advanced options such as line orientation detection, formatting, and rotated box handling. The transformer can be integrated into Spark pipelines for scalable and distributed text recognition tasks. + +## Usage Example + +```python +from scaledp import DocTRTextDetector, TesseractRecognizer, PipelineModel + +detector = DocTRTextDetector( + device="cpu", + keepInputData=True, + scoreThreshold=0.1, + partitionMap=True, + numPartitions=1, +) + +ocr = TesseractRecognizer( + keepFormatting=True, + tessLib="tesserocr", # or "pytesseract" + lang=["ukr", "eng"], + scoreThreshold=0.2, + partitionMap=True, + numPartitions=1, + tessDataPath="/usr/share/tesseract-ocr/5/tessdata/", + onlyRotated=True, +) + +pipeline = PipelineModel(stages=[detector, ocr]) +result = pipeline.transform(image_df) +for row in result.collect(): + print(row.text.text) # Recognized text +``` + +## Parameters + +| Parameter | Type | Description | Default | +|---------------------|---------|--------------------------------------------------|-----------------------------------------| +| inputCols | list | Input columns: image and boxes | ["image", "boxes"] | +| outputCol | str | Output column for recognized text | "text" | +| keepInputData | bool | Keep input data in output | False | +| scaleFactor | float | Image resize factor | 1.0 | +| scoreThreshold | float | Minimum confidence score | 0.5 | +| oem | int | OCR engine mode (see Tesseract OEM) | OEM.DEFAULT | +| lang | list | List of languages for OCR | 
["eng"] | +| lineTolerance | int | Tolerance for line grouping | 0 | +| keepFormatting | bool | Preserve text formatting | False | +| tessDataPath | str | Path to Tesseract data folder | "/usr/share/tesseract-ocr/5/tessdata/" | +| tessLib | int/str | Tesseract library to use (TESSEROCR/PYTESSERACT) | TessLib.PYTESSERACT | +| partitionMap | bool | Use partitioned mapping | False | +| numPartitions | int | Number of partitions for Spark | 0 | +| pageCol | str | Page column for repartitioning | "page" | +| pathCol | str | Path column for image metadata | "path" | +| detectLineOrientation| bool | Detect and auto-orient text lines | True | +| onlyRotated | bool | Only return rotated boxes | True | +| oriModel | str | Model for line orientation detection | "StabRise/line_orientation_detection_v0.1" | + +## Notes +- Supports both `tesserocr` and `pytesseract` libraries for OCR. +- Can process multiple languages and preserve formatting if desired. +- Handles rotated boxes and auto-orients text lines for improved accuracy. +- Errors are handled gracefully and logged; exceptions are included in the output if any occur. +- Can be used in Spark pipelines for distributed OCR processing. + diff --git a/docs/source/ocr.md b/docs/source/ocr.md new file mode 100644 index 0000000..24e5a93 --- /dev/null +++ b/docs/source/ocr.md @@ -0,0 +1,34 @@ +OCR models +========== + +The OCR models in ScaledP provide robust optical character recognition capabilities for extracting text from images and documents. These models leverage advanced deep learning techniques to deliver high accuracy and performance across various use cases. + +## Available OCR Engines + +End-to-end OCR solutions available in ScaledP include: + +- **Tesseract OCR**: An open-source OCR engine that supports multiple languages and is widely used for text extraction tasks. +- **DocTR OCR**: A deep learning-based OCR model that offers superior accuracy, especially for complex documents and layouts. 
+- **Surya OCR**: A high-performance OCR model optimized for speed and accuracy, suitable for real-time applications. +- **EasyOCR**: A lightweight OCR model that provides fast text recognition with support for multiple languages. +- **LLMOcr**: An OCR engine that utilizes large language models to enhance text recognition capabilities. + +## Text Detectors + +Text detectors are used to identify and locate text regions within images. +In some cases it is useful to run text detection as a separate step in the OCR pipeline. + +See the following text detectors available in ScaleDP: + +* [**CraftTextDetector**](#CraftTextDetector) +* [**DBNetOnnxDetector**](#DBNetOnnxDetector) +* **YoloOnnxTextDetector** +* **DocTRTextDetector** + +## Text Recognizers + +Text recognizers can recognize text from images containing a single +line/word/character. +Available text recognizers in ScaleDP include: + +* [**TesseractRecognizer**](#TesseractRecognizer) diff --git a/docs/source/pdf/pdf_assembler.md b/docs/source/pdf/pdf_assembler.md new file mode 100644 index 0000000..9c7da6a --- /dev/null +++ b/docs/source/pdf/pdf_assembler.md @@ -0,0 +1,108 @@ +(PdfAssembler)= +# PdfAssembler + +## Overview + +`PdfAssembler` is a PySpark ML transformer that assembles single-page PDF documents into a single multi-page PDF. It supports both Spark and Pandas DataFrames, grouping pages by a specified column (e.g., file path) and merging them using PyMuPDF (fitz). This transformer is useful for reconstructing full documents from page-level PDF outputs in distributed pipelines. 
+ +## Usage Example + +```python +from scaledp.pdf import PdfAssembler +from pyspark.ml import PipelineModel + +pdf_assembler = PdfAssembler( + inputCol="pdf_with_text_layer", # Column with single-page PDFs + outputCol="assembled_pdf", # Output column for merged PDF + groupByCol="path", # Group pages by file path +) + +pipeline = PipelineModel(stages=[pdf_assembler]) +result = pipeline.transform(pdf_df) +for row in result.collect(): + with open("output.pdf", "wb") as f: + f.write(row.assembled_pdf.data) +``` + +## Parameters + +| Parameter | Type | Description | Default | +|--------------|------|--------------------------------------|------------------------| +| inputCol | str | Input column with single-page PDFs | "pdf" | +| outputCol | str | Output column for assembled PDF | "assembled_pdf" | +| groupByCol | str | Column to group pages by | "path" | + +## Notes +- Supports both Spark and Pandas DataFrames for flexible integration. +- Groups single-page PDFs by the specified column and merges them in order. +- Uses PyMuPDF (fitz) for PDF manipulation and merging. +- Handles errors gracefully; exceptions are included in the output if any occur. +- Can be used as the final stage in PDF processing pipelines to reconstruct full documents. + +## Complex Pipeline Example + +This example demonstrates a full pipeline for processing PDFs: converting pages to images, running OCR, adding a text layer, and assembling the final document. 
+ +```python +from scaledp.pdf import PdfDataToImage, PdfAddTextLayer, PdfAssembler, SingleImageToPdf +from scaledp import TesseractOcr +from pyspark.ml import PipelineModel + +# Step 1: Convert PDF pages to images +pdf_data_to_image = PdfDataToImage( + inputCol="content", + outputCol="image", + pageLimit=10, # Limit number of pages processed +) + +# Step 2: Run OCR on images +ocr = TesseractOcr( + inputCol="image", + outputCol="text", + keepInputData=True, + tessLib="tesserocr", # or "pytesseract" + lang=["eng", "spa"], + scoreThreshold=0.2, +) + +# Step 3: Convert images back to single-page PDFs +image_to_pdf = SingleImageToPdf( + inputCol="image", + outputCol="pdf", +) + +# Step 4: Add recognized text as a layer to each PDF page +pdf_text_layer = PdfAddTextLayer( + inputCols=["pdf", "text"], + outputCol="pdf_with_text_layer", +) + +# Step 5: Assemble all processed pages into a single PDF document +pdf_assembler = PdfAssembler( + inputCol="pdf_with_text_layer", + outputCol="assembled_pdf", + groupByCol="path", +) + +# Build and run the pipeline +pipeline = PipelineModel(stages=[ + pdf_data_to_image, + ocr, + image_to_pdf, + pdf_text_layer, + pdf_assembler, +]) +result = pipeline.transform(pdf_df) + +# Save the assembled PDF +for row in result.collect(): + with open("output.pdf", "wb") as f: + f.write(row.assembled_pdf.data) +``` + +This pipeline: +- Converts PDF pages to images +- Runs OCR to extract text +- Adds the text layer to each PDF page +- Assembles all processed pages into a single PDF document +- Works with both Spark and Pandas DataFrames diff --git a/docs/source/pdf/pdf_data_to_image.md b/docs/source/pdf/pdf_data_to_image.md new file mode 100644 index 0000000..af05de1 --- /dev/null +++ b/docs/source/pdf/pdf_data_to_image.md @@ -0,0 +1,49 @@ +(PdfDataToImage)= +# PdfDataToImage + +## Overview + +`PdfDataToImage` is a PySpark ML transformer that extracts images from PDF files, converting each page into an image. 
It supports both Spark and Pandas DataFrames, configurable resolution, page limits, and output image types. This transformer is useful for document digitization, OCR preprocessing, and distributed PDF-to-image conversion workflows. + +## Usage Example + +```python +from scaledp.pdf import PdfDataToImage +from pyspark.ml import PipelineModel + +pdf_to_image = PdfDataToImage( + inputCol="content", # Column with PDF binary data + outputCol="image", # Output column for images + pathCol="path", # Optional: column with PDF file paths + pageCol="page", # Output page number column + keepInputData=True, # Keep original data in output + imageType="FILE", # Output image type (e.g., FILE, PIL) + resolution=300, # DPI for image extraction + pageLimit=5, # Limit number of pages processed +) + +pipeline = PipelineModel(stages=[pdf_to_image]) +result = pipeline.transform(pdf_df) # pdf_df should have 'content' and optionally 'path' columns +result.show_image("image") +``` + +## Parameters + +| Parameter | Type | Description | Default | +|--------------|------|--------------------------------------|------------------------| +| inputCol | str | Input column with PDF binary data | "content" | +| outputCol | str | Output column for images | "image" | +| pathCol | str | Path column for PDF metadata | "path" | +| pageCol | str | Output page number column | "page" | +| keepInputData| bool | Keep input data in output | False | +| imageType | Enum | Output image type (e.g., FILE, PIL) | ImageType.FILE | +| resolution | int | DPI for image extraction | 300 | +| pageLimit | int | Limit number of pages processed | 0 (no limit) | + +## Notes +- Converts each PDF page to an image using the specified resolution and image type. +- Supports limiting the number of pages processed with `pageLimit`. +- Handles errors gracefully; if an exception occurs, an empty image with the error message is returned. +- Can be used as the first stage in document processing pipelines for OCR or image analysis. 
+- Supports distributed processing with Spark and Pandas DataFrames. + diff --git a/docs/source/pdf/pdf_data_to_text.md b/docs/source/pdf/pdf_data_to_text.md new file mode 100644 index 0000000..ab7d12b --- /dev/null +++ b/docs/source/pdf/pdf_data_to_text.md @@ -0,0 +1,45 @@ +(PdfDataToText)= +# PdfDataToText + +## Overview + +`PdfDataToText` is a PySpark ML transformer that extracts text and word-level bounding boxes from PDF files. It processes each page of a PDF, returning both the text content and the coordinates of each word, making it suitable for downstream tasks such as OCR, document analysis, and layout understanding. The transformer supports both Spark and Pandas DataFrames and handles errors gracefully. + +## Usage Example + +```python +from scaledp.pdf import PdfDataToText +from pyspark.ml import PipelineModel + +pdf_to_text = PdfDataToText( + inputCol="content", # Column with PDF binary data + outputCol="document", # Output column for extracted text and boxes + pathCol="path", # Optional: column with PDF file paths + pageCol="page", # Output page number column + keepInputData=True, # Keep original data in output +) + +pipeline = PipelineModel(stages=[pdf_to_text]) +result = pipeline.transform(pdf_df) # pdf_df should have 'content' and optionally 'path' columns +for row in result.collect(): + print(row.document.text) # Extracted text + print(row.document.bboxes) # List of word bounding boxes +``` + +## Parameters + +| Parameter | Type | Description | Default | +|----------------|------|--------------------------------------|--------------| +| inputCol | str | Input column with PDF binary data | "content" | +| outputCol | str | Output column for extracted text | "document" | +| pathCol | str | Path column for PDF metadata | "path" | +| pageCol | str | Output page number column | "page" | +| keepInputData | bool | Keep input data in output | False | + +## Notes +- Extracts text and word-level bounding boxes for each page in the PDF. 
+- Returns a `Document` object with `text`, `bboxes`, and metadata for each page. +- Handles errors gracefully; if an exception occurs, an empty document with the error message is returned. +- Can be used as the first stage in document analysis or OCR pipelines. +- Supports distributed processing with Spark and Pandas DataFrames. + diff --git a/docs/source/pdf_processing.md b/docs/source/pdf_processing.md new file mode 100644 index 0000000..60cb2c8 --- /dev/null +++ b/docs/source/pdf_processing.md @@ -0,0 +1,18 @@ +# Pdf Processing + +This document provides an overview of various PDF processing transformers in ScaledP. + +## Available PDF Processing Transformers + +* [**PdfDataToImage**](#PdfDataToImage): Converts PDF documents into images for further + processing. +* [**PdfDataToText**](#PdfDataToText): Extracts text content from PDF documents. +* **PdfDataToDocument**: Converts PDF documents into structured document format for + analysis and processing. +* **PdfDataToSingleImage**: Converts single page of a PDF document into a + single image. +* **SingleImageToPdf**: Converts a single image into PDF document format. +* [**PdfAssembler**](#PdfAssembler): Assembles multiple PDF documents into a + single PDF file. +* **PdfAddTextLayer**: Adds a text layer to PDF document. + diff --git a/docs/source/release_notes.md b/docs/source/release_notes.md new file mode 100644 index 0000000..43b093d --- /dev/null +++ b/docs/source/release_notes.md @@ -0,0 +1,20 @@ +Release Notes +============= + +This document outlines the release notes for the ScaledP project. It includes information about new features, bug fixes, and other changes made in each version. 
+ + +## 0.2.4 - 01.10.2025 + +### 🚀 Features + +- Added FaceDetector and SignatureDetector models +- Updated [ImageCropBoxes](#ImageCropBoxes) to support multiple boxes +- Added LineOrientation detector model to the [TesseractRecognizer](#TesseractRecognizer) +- Added [PdfAssembler](#PdfAssembler) transformer for assembling PDFs +- Added support for using subfields in show utils +- Added padding option to the [YoloOnnxDetector](#YoloOnnxDetector) + +### 🐛 Bug Fixes + +- Fixed borders in [show utils](#ShowUtils) diff --git a/docs/source/show_utils.md b/docs/source/show_utils.md new file mode 100644 index 0000000..21c2728 --- /dev/null +++ b/docs/source/show_utils.md @@ -0,0 +1,93 @@ +(ShowUtils)= +# Show Utils + +## Overview + +Show Utils provides visualization helpers for displaying images, text, PDFs, and named entities from Spark DataFrames in Jupyter/IPython environments. It is designed to work with ScaledP's data structures and transformers, making it easy to inspect and debug results in interactive notebooks. + +## Functions + +### show_image +Displays images from a DataFrame column. Automatically converts binary columns to images if needed. +- **Parameters:** + - `df`: Spark DataFrame + - `column`: Column name (default: auto-detect) + - `limit`: Number of images to show (default: 5) + - `width`: Display width in pixels (default: 600) + - `show_meta`: Show image metadata (default: True) + +![ShowImageInvoice.png](_static/ShowImageInvoice.png) + +### show_text +Displays text from a DataFrame column, with optional metadata and formatting. +- **Parameters:** + - `df`: Spark DataFrame + - `column`: Column name (default: auto-detect) + - `field`: Field in the text struct (default: "text") + - `limit`: Number of texts to show (default: 5) + - `width`: Display width in pixels (default: 800) + +### show_json +Displays JSON data from a DataFrame column, pretty-printed and syntax-highlighted. 
+- **Parameters:** + - `df`: Spark DataFrame + - `column`: Column name (default: auto-detect) + - `field`: Field in the struct (default: "json_data") + - `limit`: Number of items to show (default: 5) + - `width`: Display width in pixels (default: 800) + +### show_pdf +Displays PDF pages as images from a DataFrame column. Converts binary PDF data to images for visualization. +- **Parameters:** + - `df`: Spark DataFrame + - `column`: Column name (default: auto-detect) + - `limit`: Number of pages to show (default: 5) + - `width`: Display width in pixels (default: 600) + - `show_meta`: Show image metadata (default: True) + +### show_ner +Displays named entities from a DataFrame column in tabular format. +- **Parameters:** + - `df`: Spark DataFrame + - `column`: NER column (default: "ner") + - `limit`: Number of entities to show (default: 20) + - `truncate`: Truncate long text (default: False) + +### visualize_ner +Visualizes named entities inline within the original text, color-coded by entity type. +- **Parameters:** + - `df`: Spark DataFrame + - `column`: NER column (default: "ner") + - `text_column`: Text column (default: "text") + - `limit`: Number of rows to show (default: 20) + - `width`: Display width in pixels (optional) + +## Usage Example + +```python +from scaledp import * + +# Show images from a DataFrame +image_df.show_image(column="image", limit=3) + +# Show recognized text +text_df.show_text(column="document", field="text", limit=2) + +# Show PDF pages as images +pdf_df.show_pdf(column="content", limit=2) + +# Show named entities +ner_df.show_ner(column="ner", limit=10) + +# Visualize NER results inline +ner_df.visualize_ner(column="ner", text_column="text", limit=1) +``` + + +## Notes +- Designed for use in Jupyter/IPython environments; uses HTML and Jinja2 templates for rich output. +- Automatically detects column types and applies necessary conversions (e.g., binary to image). +- Handles errors gracefully and displays exceptions in metadata. 
+- Useful for debugging and inspecting results in interactive data science workflows. +- Available as methods on Spark DataFrames when ScaledP is installed. + diff --git a/scaledp/README.md b/scaledp/README.md index 974c514..8999102 100644 --- a/scaledp/README.md +++ b/scaledp/README.md @@ -30,7 +30,9 @@ ### Build documentation ```bash + pip install sphinx_book_theme myst_parser poetry run sphinx-build -M html source build + poetry run sphinx-apidoc -o source/ ../scaledp ``` ### Release diff --git a/scaledp/image/ImageCropBoxes.py b/scaledp/image/ImageCropBoxes.py index 3aeb2c0..e7a4453 100644 --- a/scaledp/image/ImageCropBoxes.py +++ b/scaledp/image/ImageCropBoxes.py @@ -59,6 +59,7 @@ class ImageCropBoxes( "Padding.", typeConverter=TypeConverters.toInt, ) + noCrop = Param( Params._dummy(), "noCrop", @@ -73,6 +74,13 @@ class ImageCropBoxes( typeConverter=TypeConverters.toInt, ) + autoRotate = Param( + Params._dummy(), + "autoRotate", + "Auto rotate cropped image if box height > box width.", + typeConverter=TypeConverters.toBoolean, + ) + defaultParams = MappingProxyType( { "inputCols": ["image", "boxes"], @@ -85,6 +93,7 @@ class ImageCropBoxes( "propagateError": False, "noCrop": True, "limit": 0, + "autoRotate": True, }, ) @@ -115,7 +124,7 @@ def transform_udf(self, image, data): box = b if not isinstance(box, Box): box = Box(**box.asDict()) - if box.width < box.height: + if self.getAutoRotate() and box.width < box.height: cropped_image = img.crop(box.bbox(self.getPadding())).rotate( -90, expand=True, diff --git a/scaledp/models/detectors/BaseDetector.py b/scaledp/models/detectors/BaseDetector.py index 8fbfa95..53eadb2 100644 --- a/scaledp/models/detectors/BaseDetector.py +++ b/scaledp/models/detectors/BaseDetector.py @@ -75,6 +75,7 @@ def get_params(self): return json.dumps({k.name: v for k, v in self.extractParamMap().items()}) def outputSchema(self): + """Output schema of the detector.""" return StructType( [ StructField("path", StringType(), True), @@ -92,6 
+93,9 @@ def outputSchema(self): ) def transform_udf(self, image, params=None): + """ + Run detector on a single image. + """ logging.info("Run Detector") if params is None: params = self.get_params() @@ -147,6 +151,9 @@ def transform_udf_pandas( images: pd.DataFrame, params: pd.Series, ) -> pd.DataFrame: + """ + Run detector on a batch of images. + """ params = json.loads(params[0]) resized_images = [] for _index, img in images.iterrows(): diff --git a/scaledp/models/detectors/CraftTextDetector.py b/scaledp/models/detectors/CraftTextDetector.py index 362c800..c7f7cac 100644 --- a/scaledp/models/detectors/CraftTextDetector.py +++ b/scaledp/models/detectors/CraftTextDetector.py @@ -12,6 +12,8 @@ class CraftTextDetector(BaseDetector, HasDevice, HasBatchSize): + """CRAFT text detector.""" + _craft_net = None _refine_net = None diff --git a/scaledp/models/detectors/FaceDetector.py b/scaledp/models/detectors/FaceDetector.py index 48f1776..fe4dd1b 100644 --- a/scaledp/models/detectors/FaceDetector.py +++ b/scaledp/models/detectors/FaceDetector.py @@ -5,6 +5,8 @@ class FaceDetector(YoloOnnxDetector): + """Face detector using YOLO ONNX model.""" + defaultParams = MappingProxyType( { "inputCol": "image", diff --git a/scaledp/models/detectors/YoloOnnxDetector.py b/scaledp/models/detectors/YoloOnnxDetector.py index 0580c02..165f45a 100644 --- a/scaledp/models/detectors/YoloOnnxDetector.py +++ b/scaledp/models/detectors/YoloOnnxDetector.py @@ -18,6 +18,8 @@ class YoloOnnxDetector(BaseDetector, HasDevice, HasBatchSize): + """YOLO ONNX object detector.""" + _model: ClassVar = {} task = Param( diff --git a/scaledp/models/detectors/YoloOnnxTextDetector.py b/scaledp/models/detectors/YoloOnnxTextDetector.py index b889e5d..51e6e0e 100644 --- a/scaledp/models/detectors/YoloOnnxTextDetector.py +++ b/scaledp/models/detectors/YoloOnnxTextDetector.py @@ -17,6 +17,8 @@ class YoloOnnxTextDetector(BaseDetector, HasDevice, HasBatchSize): + """YOLO ONNX text detector.""" + _model = None task = 
Param( diff --git a/scaledp/models/detectors/__init__.py b/scaledp/models/detectors/__init__.py index e69de29..f9b60ce 100644 --- a/scaledp/models/detectors/__init__.py +++ b/scaledp/models/detectors/__init__.py @@ -0,0 +1,17 @@ +from scaledp.models.detectors.BaseDetector import BaseDetector +from scaledp.models.detectors.CraftTextDetector import CraftTextDetector +from scaledp.models.detectors.DBNetOnnxDetector import DBNetOnnxDetector +from scaledp.models.detectors.FaceDetector import FaceDetector +from scaledp.models.detectors.SignatureDetector import SignatureDetector +from scaledp.models.detectors.YoloOnnxDetector import YoloOnnxDetector +from scaledp.models.detectors.YoloOnnxTextDetector import YoloOnnxTextDetector + +__all__ = [ + "FaceDetector", + "SignatureDetector", + "YoloOnnxDetector", + "BaseDetector", + "DBNetOnnxDetector", + "CraftTextDetector", + "YoloOnnxTextDetector", +] diff --git a/scaledp/models/detectors/craft/models.py b/scaledp/models/detectors/craft/models.py index dfe6606..f59f3f1 100644 --- a/scaledp/models/detectors/craft/models.py +++ b/scaledp/models/detectors/craft/models.py @@ -4,7 +4,6 @@ class Craftnet: def __init__(self, onnx_path=None): - onnx_path = "/home/mykola/PycharmProjects/scaledp/tests/model.quant.onnx" session_options = onnxruntime.SessionOptions() if onnx_path is None: diff --git a/scaledp/pdf/PdfDataToSingleImage.py b/scaledp/pdf/PdfDataToSingleImage.py index 986a20a..eac1bdf 100644 --- a/scaledp/pdf/PdfDataToSingleImage.py +++ b/scaledp/pdf/PdfDataToSingleImage.py @@ -65,7 +65,7 @@ def process(self, input, path, resolution=0): except Exception: return Image( path, - exception="Error in extration of image from pdf document", + exception="Error in extraction of image from pdf document", ) def getPageNumber(self):