StabRise · mykolamelnykml · Oct 31, 2025 · Oct 31, 2025 · Oct 31, 2025 · Oct 31, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,12 @@
 ### 🚀 Features
 
 - Added possibility to use subfields in show utils and added padding option to the YoloOnnxDetector
+- Added SignatureDetector
+- Updated ImageCropBoxes to support multiple boxes
 - Added LineOrientation detector model to the TesseractRecognizer
 - Added FaceDetector, SignatureDetector
 - Added PdfAssembler
+
+### 🐛 Bug Fixes
+
+- Fixed borders in show utils
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scaledp"
-version = "0.2.4rc11"
+version = "0.2.4rc13"
 description = "ScaleDP is a library for processing documents using Apache Spark and LLMs"
 authors = ["Mykola Melnyk <mykola@stabrise.com>"]
 repository = "https://github.com/StabRise/scaledp"
@@ -48,6 +48,7 @@ onnxruntime = "1.22.0"
 
 
 
+
 [tool.poetry.extras]
 ml = ["transformers",
     #"torch",

diff --git a/scaledp/image/ImageCropBoxes.py b/scaledp/image/ImageCropBoxes.py
@@ -6,7 +6,8 @@
 from pyspark import keyword_only
 from pyspark.ml import Transformer
 from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
-from pyspark.sql.functions import udf
+from pyspark.sql.functions import explode, udf
+from pyspark.sql.types import ArrayType
 
 from scaledp.params import (
     AutoParamsMeta,
@@ -65,6 +66,13 @@ class ImageCropBoxes(
         typeConverter=TypeConverters.toBoolean,
     )
 
+    limit = Param(
+        Params._dummy(),
+        "limit",
+        "Limit of boxes for crop.",
+        typeConverter=TypeConverters.toInt,
+    )
+
     defaultParams = MappingProxyType(
         {
             "inputCols": ["image", "boxes"],
@@ -76,6 +84,7 @@ class ImageCropBoxes(
             "pageCol": "page",
             "propagateError": False,
             "noCrop": True,
+            "limit": 0,
         },
     )
 
@@ -98,20 +107,36 @@ def transform_udf(self, image, data):
                 )
             img = image.to_pil()
             results = []
-            for b in data.bboxes:
+            limit = self.getLimit()
+
+            bboxes = data.bboxes[:limit] if limit > 0 else data.bboxes
+
+            for b in bboxes:
                 box = b
                 if not isinstance(box, Box):
                     box = Box(**box.asDict())
-                if box.width > box.height:
-                    results.append(
-                        img.crop(box.bbox(self.getPadding())).rotate(-90, expand=True),
+                if box.width < box.height:
+                    cropped_image = img.crop(box.bbox(self.getPadding())).rotate(
+                        -90,
+                        expand=True,
                     )
                 else:
-                    results.append(img.crop(box.bbox(self.getPadding())))
+                    cropped_image = img.crop(box.bbox(self.getPadding()))
+                results.append(
+                    Image.from_pil(
+                        cropped_image,
+                        image.path,
+                        image.imageType,
+                        image.resolution,
+                    ),
+                )
+
             if self.getNoCrop() and len(results) == 0:
                 raise ImageCropError("No boxes to crop")
             if len(results) == 0:
-                results.append(img)
+                results.append(
+                    Image.from_pil(img, image.path, image.imageType, image.resolution),
+                )
 
         except Exception as e:
             exception = traceback.format_exc()
@@ -120,7 +145,7 @@ def transform_udf(self, image, data):
             if self.getPropagateError():
                 raise ImageCropError from e
             return Image(image.path, image.imageType, data=bytes(), exception=exception)
-        return Image.from_pil(results[0], image.path, image.imageType, image.resolution)
+        return results
 
     def _transform(self, dataset):
         out_col = self.getOutputCol()
@@ -133,7 +158,12 @@ def _transform(self, dataset):
             )
         result = dataset.withColumn(
             out_col,
-            udf(self.transform_udf, Image.get_schema())(image_col, box_col),
+            explode(
+                udf(self.transform_udf, ArrayType(Image.get_schema()))(
+                    image_col,
+                    box_col,
+                ),
+            ),
         )
 
         if not self.getKeepInputData():

diff --git a/scaledp/utils/templates/base.html b/scaledp/utils/templates/base.html
@@ -1,5 +1,6 @@
-<div style="border-radius: 8px; margin: 10px; padding: 10px; width: {{ width }}px; background: #e5edf5; border: 0; min-width: 600px;">
-    <div style="display: grid; grid-template-columns: 3fr 1fr; grid-gap: 20px; width:{{ width }}px;">
+<div style="border-radius: 8px; margin: 10px; padding: 10px;line-height:0px;
+        width: {{ width }}px; background: #e5edf5; border: 0; min-width: 600px;">
+    <div style="display: grid; grid-template-columns: 3fr 1fr; grid-gap: 20px;">
         <div style="padding: 10px 0px 10px 20px;">
             <table>
                 {% for key, value in metadata.items() %}
@@ -13,8 +14,12 @@
             </table>
 
         </div>
-        <a href="https://stabrise.com/scaledp/"><img src="https://raw.githubusercontent.com/StabRise/ScaleDP/refs/heads/master/images/scaledp.webp"
-                style="width: 200px; margin:10px 20px 0 0px;justify-self: end;"/></a>
+        <a href="https://stabrise.com/scaledp/" style="font-size: 10px;
+        text-align: center; justify-self:
+                end;"><img
+                src="https://raw.githubusercontent.com/StabRise/ScaleDP/refs/heads/master/images/scaledp.webp"
+                style="width: 200px; margin:10px 20px 0 0px;"
+            />https://stabrise.com/scaledp/</a>
     </div>
         {% if metadata.Exception %}
             <p><pre style="border-radius: 8px;overflow:auto;background-color: rgb(255, 221, 221);padding: 1em;">{{metadata.Exception}}</pre></p>

diff --git a/scaledp/utils/templates/image.html b/scaledp/utils/templates/image.html
@@ -1,5 +1,6 @@
 {% extends "base.html" %}
 
 {% block body %}
-    <img src="data:image/png;base64,{{image}}" style="width: 100%; align: center;border-radius: 8px;"/>
+    <img src="data:image/png;base64,{{image}}" style="width: 100%; align:
+    center;border-radius: 6px; padding: 0px; margin: 0px"/>
 {% endblock %}
diff --git a/scaledp/utils/templates/json.html b/scaledp/utils/templates/json.html
@@ -1,5 +1,6 @@
 {% extends "base.html" %}
 
 {% block body %}
-    <div style="background-color: white;padding: 1em;border-radius: 8px;">{{text|safe}}</div>
+    <div style="background-color: white;padding: 1em;border-radius: 6px;
+">{{text|safe}}</div>
 {% endblock %}
diff --git a/scaledp/utils/templates/ner.html b/scaledp/utils/templates/ner.html
@@ -1,5 +1,6 @@
 {% extends "base.html" %}
 
 {% block body %}
-    <div style="border-radius: 8px;background-color: white;font-family: monospace; overflow:auto; padding: 1em;">{{ ner|safe }}</div>
+    <div style="border-radius: 6px;background-color: white;font-family:
+    monospace; overflow:auto; padding: 1em;">{{ ner|safe }}</div>
 {% endblock %}
diff --git a/scaledp/utils/templates/text.html b/scaledp/utils/templates/text.html
@@ -1,5 +1,6 @@
 {% extends "base.html" %}
 
 {% block body %}
-    <p><pre style="border-radius: 8px;overflow:auto;background-color: white;padding: 1em;">{{text}}</pre></p>
+    <p><pre style="border-radius: 6px;overflow:auto;background-color: white;
+    padding: 1em;">{{text}}</pre></p>
 {% endblock %}
diff --git a/tests/image/test_image_crop_boxes.py b/tests/image/test_image_crop_boxes.py
@@ -0,0 +1,50 @@
+import tempfile
+
+from pyspark.ml.pipeline import PipelineModel
+
+from scaledp.enums import PSM
+from scaledp.image.ImageCropBoxes import ImageCropBoxes
+from scaledp.models.recognizers.TesseractOcr import TesseractOcr
+
+
+def test_image_crop_boxes_ocr(image_df):
+
+    # Initialize the OCR stage with specific parameters
+    ocr = TesseractOcr(
+        keepInputData=True,
+        scoreThreshold=0.5,
+        psm=PSM.SPARSE_TEXT.value,
+        scaleFactor=2.0,
+    )
+
+    # Initialize the ImageCropBoxes stage
+    crop = ImageCropBoxes(
+        inputCols=["image", "text"],
+        limit=2,
+    )
+
+    # Create the pipeline with the OCR and ImageCropBoxes stages
+    pipeline = PipelineModel(stages=[ocr, crop])
+
+    # Run the pipeline on the input image dataframe
+    result = pipeline.transform(image_df).collect()
+
+    # Verify the pipeline result
+    assert len(result) == 2
+    assert hasattr(result[0], "cropped_image")
+
+    # Save the output image to a temporary file for verification
+    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp:
+        temp.write(result[0].cropped_image.data)
+        temp.close()
+
+        # Print the path to the temporary file
+        print("file://" + temp.name)
+
+    # Verify the OCR stage output
+    ocr_result = result[0].text
+    assert len(ocr_result) > 0
+
+    # Verify the crop stage output
+    cropped_image = result[0].cropped_image
+    assert cropped_image.exception == ""
diff --git a/tests/models/detectors/test_layout_detector.py b/tests/models/detectors/test_layout_detector.py
@@ -36,6 +36,7 @@ def layout_detector():
     )
 
 
+@pytest.mark.skip("Requires PaddleOCR which may not be installed in all environments.")
 def test_layout_detector_with_drawn_boxes(image_df):
     """Test LayoutDetector with drawn boxes on the original image."""
     detector = LayoutDetector(
@@ -92,6 +93,7 @@ def test_layout_detector_with_drawn_boxes(image_df):
         assert "Error in object detection" in str(e) or "PaddleOCR" in str(e)
 
 
+@pytest.mark.skip("Requires PaddleOCR which may not be installed in all environments.")
 def test_layout_detector_with_custom_layout_types():
     """Test LayoutDetector with custom layout types."""
     detector = LayoutDetector(

diff --git a/tests/pdf/test_pdf_assembler.py b/tests/pdf/test_pdf_assembler.py
@@ -1,10 +1,10 @@
 import tempfile
 
-from models.detectors.DBNetOnnxDetector import DBNetOnnxDetector
 from pyspark.ml import PipelineModel
 from pyspark.sql import DataFrame
 
 from scaledp import ImageDrawBoxes, TesseractRecognizer, TessLib
+from scaledp.models.detectors.DBNetOnnxDetector import DBNetOnnxDetector
 from scaledp.models.recognizers.TesseractOcr import TesseractOcr
 from scaledp.pdf import PdfAddTextLayer, PdfAssembler, PdfDataToImage, SingleImageToPdf
 from scaledp.pipeline.PandasPipeline import PandasPipeline