Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
### 🚀 Features

- Added support for subfields in show utils and a padding option for the YoloOnnxDetector
- Added SignatureDetector
- Updated ImageCropBoxes to support multiple boxes
- Added LineOrientation detector model to the TesseractRecognizer
- Added FaceDetector, SignatureDetector
- Added PdfAssembler

### 🐛 Bug Fixes

- Fixed borders in show utils
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scaledp"
version = "0.2.4rc11"
version = "0.2.4rc13"
description = "ScaleDP is a library for processing documents using Apache Spark and LLMs"
authors = ["Mykola Melnyk <mykola@stabrise.com>"]
repository = "https://github.com/StabRise/scaledp"
Expand Down Expand Up @@ -48,6 +48,7 @@ onnxruntime = "1.22.0"




[tool.poetry.extras]
ml = ["transformers",
#"torch",
Expand Down
48 changes: 39 additions & 9 deletions scaledp/image/ImageCropBoxes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import udf
from pyspark.sql.functions import explode, udf
from pyspark.sql.types import ArrayType

from scaledp.params import (
AutoParamsMeta,
Expand Down Expand Up @@ -65,6 +66,13 @@ class ImageCropBoxes(
typeConverter=TypeConverters.toBoolean,
)

limit = Param(
Params._dummy(),
"limit",
"Limit of boxes for crop.",
typeConverter=TypeConverters.toInt,
)

defaultParams = MappingProxyType(
{
"inputCols": ["image", "boxes"],
Expand All @@ -76,6 +84,7 @@ class ImageCropBoxes(
"pageCol": "page",
"propagateError": False,
"noCrop": True,
"limit": 0,
},
)

Expand All @@ -98,20 +107,36 @@ def transform_udf(self, image, data):
)
img = image.to_pil()
results = []
for b in data.bboxes:
limit = self.getLimit()

bboxes = data.bboxes[:limit] if limit > 0 else data.bboxes

for b in bboxes:
box = b
if not isinstance(box, Box):
box = Box(**box.asDict())
if box.width > box.height:
results.append(
img.crop(box.bbox(self.getPadding())).rotate(-90, expand=True),
if box.width < box.height:
cropped_image = img.crop(box.bbox(self.getPadding())).rotate(
-90,
expand=True,
)
else:
results.append(img.crop(box.bbox(self.getPadding())))
cropped_image = img.crop(box.bbox(self.getPadding()))
results.append(
Image.from_pil(
cropped_image,
image.path,
image.imageType,
image.resolution,
),
)

if self.getNoCrop() and len(results) == 0:
raise ImageCropError("No boxes to crop")
if len(results) == 0:
results.append(img)
results.append(
Image.from_pil(img, image.path, image.imageType, image.resolution),
)

except Exception as e:
exception = traceback.format_exc()
Expand All @@ -120,7 +145,7 @@ def transform_udf(self, image, data):
if self.getPropagateError():
raise ImageCropError from e
return Image(image.path, image.imageType, data=bytes(), exception=exception)
return Image.from_pil(results[0], image.path, image.imageType, image.resolution)
return results

def _transform(self, dataset):
out_col = self.getOutputCol()
Expand All @@ -133,7 +158,12 @@ def _transform(self, dataset):
)
result = dataset.withColumn(
out_col,
udf(self.transform_udf, Image.get_schema())(image_col, box_col),
explode(
udf(self.transform_udf, ArrayType(Image.get_schema()))(
image_col,
box_col,
),
),
)

if not self.getKeepInputData():
Expand Down
13 changes: 9 additions & 4 deletions scaledp/utils/templates/base.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div style="border-radius: 8px; margin: 10px; padding: 10px; width: {{ width }}px; background: #e5edf5; border: 0; min-width: 600px;">
<div style="display: grid; grid-template-columns: 3fr 1fr; grid-gap: 20px; width:{{ width }}px;">
<div style="border-radius: 8px; margin: 10px; padding: 10px;line-height:0px;
width: {{ width }}px; background: #e5edf5; border: 0; min-width: 600px;">
<div style="display: grid; grid-template-columns: 3fr 1fr; grid-gap: 20px;">
<div style="padding: 10px 0px 10px 20px;">
<table>
{% for key, value in metadata.items() %}
Expand All @@ -13,8 +14,12 @@
</table>

</div>
<a href="https://stabrise.com/scaledp/"><img src="https://raw.githubusercontent.com/StabRise/ScaleDP/refs/heads/master/images/scaledp.webp"
style="width: 200px; margin:10px 20px 0 0px;justify-self: end;"/></a>
<a href="https://stabrise.com/scaledp/" style="font-size: 10px;
text-align: center; justify-self:
end;"><img
src="https://raw.githubusercontent.com/StabRise/ScaleDP/refs/heads/master/images/scaledp.webp"
style="width: 200px; margin:10px 20px 0 0px;"/>https://stabrise
.com/scaledp/</a>
</div>
{% if metadata.Exception %}
<p><pre style="border-radius: 8px;overflow:auto;background-color: rgb(255, 221, 221);padding: 1em;">{{metadata.Exception}}</pre></p>
Expand Down
3 changes: 2 additions & 1 deletion scaledp/utils/templates/image.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{% extends "base.html" %}

{% block body %}
<img src="data:image/png;base64,{{image}}" style="width: 100%; align: center;border-radius: 8px;"/>
<img src="data:image/png;base64,{{image}}" style="width: 100%; align:
center;border-radius: 6px; padding: 0px; margin: 0px"/>
{% endblock %}
3 changes: 2 additions & 1 deletion scaledp/utils/templates/json.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{% extends "base.html" %}

{% block body %}
<div style="background-color: white;padding: 1em;border-radius: 8px;">{{text|safe}}</div>
<div style="background-color: white;padding: 1em;border-radius: 6px;
">{{text|safe}}</div>
{% endblock %}
3 changes: 2 additions & 1 deletion scaledp/utils/templates/ner.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{% extends "base.html" %}

{% block body %}
<div style="border-radius: 8px;background-color: white;font-family: monospace; overflow:auto; padding: 1em;">{{ ner|safe }}</div>
<div style="border-radius: 6px;background-color: white;font-family:
monospace; overflow:auto; padding: 1em;">{{ ner|safe }}</div>
{% endblock %}
3 changes: 2 additions & 1 deletion scaledp/utils/templates/text.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{% extends "base.html" %}

{% block body %}
<p><pre style="border-radius: 8px;overflow:auto;background-color: white;padding: 1em;">{{text}}</pre></p>
<p><pre style="border-radius: 6px;overflow:auto;background-color: white;
padding: 1em;">{{text}}</pre></p>
{% endblock %}
50 changes: 50 additions & 0 deletions tests/image/test_image_crop_boxes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import tempfile

from pyspark.ml.pipeline import PipelineModel

from scaledp.enums import PSM
from scaledp.image.ImageCropBoxes import ImageCropBoxes
from scaledp.models.recognizers.TesseractOcr import TesseractOcr


def test_image_crop_boxes_ocr(image_df):
    """Run OCR followed by ImageCropBoxes and check the cropped output.

    The crop stage reads the ``image`` and ``text`` columns and is configured
    with ``limit=2``; the test expects exactly two collected rows, each
    carrying a ``cropped_image`` field.
    NOTE(review): presumably one row is produced per cropped box for a
    single-image input -- confirm against the ``image_df`` fixture.
    """
    # OCR stage: keeps the input image so the crop stage can read it.
    ocr = TesseractOcr(
        keepInputData=True,
        scoreThreshold=0.5,
        psm=PSM.SPARSE_TEXT.value,
        scaleFactor=2.0,
    )

    # Crop stage: takes boxes from the OCR output column, at most two of them.
    crop = ImageCropBoxes(
        inputCols=["image", "text"],
        limit=2,
    )

    # Create the pipeline with the OCR and ImageCropBoxes stages.
    pipeline = PipelineModel(stages=[ocr, crop])

    # Run the pipeline on the input image dataframe.
    result = pipeline.transform(image_df).collect()

    # Verify the pipeline result.
    assert len(result) == 2
    assert hasattr(result[0], "cropped_image")

    # Save the first cropped image for manual inspection. delete=False keeps
    # the file after the context manager closes it, so no explicit close()
    # is needed inside the block.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp:
        temp.write(result[0].cropped_image.data)

    # Print the path to the temporary file.
    print("file://" + temp.name)

    # Verify the OCR stage output.
    ocr_result = result[0].text
    assert len(ocr_result) > 0

    # Verify the crop stage output.
    cropped_image = result[0].cropped_image
    assert cropped_image.exception == ""
2 changes: 2 additions & 0 deletions tests/models/detectors/test_layout_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def layout_detector():
)


@pytest.mark.skip("Requires PaddleOCR which may not be installed in all environments.")
def test_layout_detector_with_drawn_boxes(image_df):
"""Test LayoutDetector with drawn boxes on the original image."""
detector = LayoutDetector(
Expand Down Expand Up @@ -92,6 +93,7 @@ def test_layout_detector_with_drawn_boxes(image_df):
assert "Error in object detection" in str(e) or "PaddleOCR" in str(e)


@pytest.mark.skip("Requires PaddleOCR which may not be installed in all environments.")
def test_layout_detector_with_custom_layout_types():
"""Test LayoutDetector with custom layout types."""
detector = LayoutDetector(
Expand Down
2 changes: 1 addition & 1 deletion tests/pdf/test_pdf_assembler.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import tempfile

from models.detectors.DBNetOnnxDetector import DBNetOnnxDetector
from pyspark.ml import PipelineModel
from pyspark.sql import DataFrame

from scaledp import ImageDrawBoxes, TesseractRecognizer, TessLib
from scaledp.models.detectors.DBNetOnnxDetector import DBNetOnnxDetector
from scaledp.models.recognizers.TesseractOcr import TesseractOcr
from scaledp.pdf import PdfAddTextLayer, PdfAssembler, PdfDataToImage, SingleImageToPdf
from scaledp.pipeline.PandasPipeline import PandasPipeline
Expand Down