diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bd9bac..68557d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ ### 🚀 Features - Added possibility to use subfields in show utils and added padding option to the YoloOnnxDetector +- Added SignatureDetector +- Updated ImageCropBoxes to support multiple boxes - Added LineOrientation detector model to the TesseractRecognizer - Added FaceDetector, SignatureDetector - Added PdfAssembler + +### 🐛 Bug Fixes + +- Fixed borders in show utils diff --git a/pyproject.toml b/pyproject.toml index dc67e5c..e5388d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scaledp" -version = "0.2.4rc11" +version = "0.2.4rc13" description = "ScaleDP is a library for processing documents using Apache Spark and LLMs" authors = ["Mykola Melnyk "] repository = "https://github.com/StabRise/scaledp" @@ -48,6 +48,7 @@ onnxruntime = "1.22.0" + [tool.poetry.extras] ml = ["transformers", #"torch", diff --git a/scaledp/image/ImageCropBoxes.py b/scaledp/image/ImageCropBoxes.py index 58d540a..3aeb2c0 100644 --- a/scaledp/image/ImageCropBoxes.py +++ b/scaledp/image/ImageCropBoxes.py @@ -6,7 +6,8 @@ from pyspark import keyword_only from pyspark.ml import Transformer from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable -from pyspark.sql.functions import udf +from pyspark.sql.functions import explode, udf +from pyspark.sql.types import ArrayType from scaledp.params import ( AutoParamsMeta, @@ -65,6 +66,13 @@ class ImageCropBoxes( typeConverter=TypeConverters.toBoolean, ) + limit = Param( + Params._dummy(), + "limit", + "Limit of boxes for crop.", + typeConverter=TypeConverters.toInt, + ) + defaultParams = MappingProxyType( { "inputCols": ["image", "boxes"], @@ -76,6 +84,7 @@ class ImageCropBoxes( "pageCol": "page", "propagateError": False, "noCrop": True, + "limit": 0, }, ) @@ -98,20 +107,36 @@ def transform_udf(self, image, data): ) img = image.to_pil() results = [] - for b in 
data.bboxes: + limit = self.getLimit() + + bboxes = data.bboxes[:limit] if limit > 0 else data.bboxes + + for b in bboxes: box = b if not isinstance(box, Box): box = Box(**box.asDict()) - if box.width > box.height: - results.append( - img.crop(box.bbox(self.getPadding())).rotate(-90, expand=True), + if box.width < box.height: + cropped_image = img.crop(box.bbox(self.getPadding())).rotate( + -90, + expand=True, ) else: - results.append(img.crop(box.bbox(self.getPadding()))) + cropped_image = img.crop(box.bbox(self.getPadding())) + results.append( + Image.from_pil( + cropped_image, + image.path, + image.imageType, + image.resolution, + ), + ) + if self.getNoCrop() and len(results) == 0: raise ImageCropError("No boxes to crop") if len(results) == 0: - results.append(img) + results.append( + Image.from_pil(img, image.path, image.imageType, image.resolution), + ) except Exception as e: exception = traceback.format_exc() @@ -120,7 +145,7 @@ def transform_udf(self, image, data): if self.getPropagateError(): raise ImageCropError from e return Image(image.path, image.imageType, data=bytes(), exception=exception) - return Image.from_pil(results[0], image.path, image.imageType, image.resolution) + return results def _transform(self, dataset): out_col = self.getOutputCol() @@ -133,7 +158,12 @@ def _transform(self, dataset): ) result = dataset.withColumn( out_col, - udf(self.transform_udf, Image.get_schema())(image_col, box_col), + explode( + udf(self.transform_udf, ArrayType(Image.get_schema()))( + image_col, + box_col, + ), + ), ) if not self.getKeepInputData(): diff --git a/scaledp/utils/templates/base.html b/scaledp/utils/templates/base.html index fcb9606..36b1052 100644 --- a/scaledp/utils/templates/base.html +++ b/scaledp/utils/templates/base.html @@ -1,5 +1,6 @@ -
-
+
+
{% for key, value in metadata.items() %} @@ -13,8 +14,12 @@
- + https://stabrise + .com/scaledp/
{% if metadata.Exception %}

{{metadata.Exception}}

diff --git a/scaledp/utils/templates/image.html b/scaledp/utils/templates/image.html index 4a7ba4d..92aaef9 100644 --- a/scaledp/utils/templates/image.html +++ b/scaledp/utils/templates/image.html @@ -1,5 +1,6 @@ {% extends "base.html" %} {% block body %} - + {% endblock %} diff --git a/scaledp/utils/templates/json.html b/scaledp/utils/templates/json.html index 93ed9a9..a528662 100644 --- a/scaledp/utils/templates/json.html +++ b/scaledp/utils/templates/json.html @@ -1,5 +1,6 @@ {% extends "base.html" %} {% block body %} -
{{text|safe}}
+
{{text|safe}}
{% endblock %} diff --git a/scaledp/utils/templates/ner.html b/scaledp/utils/templates/ner.html index 0874bbb..dd6f33b 100644 --- a/scaledp/utils/templates/ner.html +++ b/scaledp/utils/templates/ner.html @@ -1,5 +1,6 @@ {% extends "base.html" %} {% block body %} -
{{ ner|safe }}
+
{{ ner|safe }}
{% endblock %} diff --git a/scaledp/utils/templates/text.html b/scaledp/utils/templates/text.html index f1e21d9..a53c811 100644 --- a/scaledp/utils/templates/text.html +++ b/scaledp/utils/templates/text.html @@ -1,5 +1,6 @@ {% extends "base.html" %} {% block body %} -

{{text}}

+

{{text}}

{% endblock %} diff --git a/tests/image/test_image_crop_boxes.py new file mode 100644 index 0000000..b0d4113 --- /dev/null +++ b/tests/image/test_image_crop_boxes.py @@ -0,0 +1,50 @@ +import tempfile + +from pyspark.ml.pipeline import PipelineModel + +from scaledp.enums import PSM +from scaledp.image.ImageCropBoxes import ImageCropBoxes +from scaledp.models.recognizers.TesseractOcr import TesseractOcr + + +def test_image_crop_boxes_ocr(image_df): + + # Initialize the OCR stage with specific parameters + ocr = TesseractOcr( + keepInputData=True, + scoreThreshold=0.5, + psm=PSM.SPARSE_TEXT.value, + scaleFactor=2.0, + ) + + # Initialize the ImageCropBoxes stage + crop = ImageCropBoxes( + inputCols=["image", "text"], + limit=2, + ) + + # Create the pipeline with the OCR and ImageCropBoxes stages + pipeline = PipelineModel(stages=[ocr, crop]) + + # Run the pipeline on the input image dataframe + result = pipeline.transform(image_df).collect() + + # Verify the pipeline result + assert len(result) == 2 + assert hasattr(result[0], "cropped_image") + + # Save the output image to a temporary file for verification + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp: + temp.write(result[0].cropped_image.data) + temp.close() + + # Print the path to the temporary file + print("file://" + temp.name) + + # Verify the OCR stage output + ocr_result = result[0].text + assert len(ocr_result) > 0 + + # Verify the crop stage output + cropped_image = result[0].cropped_image + assert cropped_image.exception == "" diff --git a/tests/models/detectors/test_layout_detector.py index 243e260..0805878 100644 --- a/tests/models/detectors/test_layout_detector.py +++ b/tests/models/detectors/test_layout_detector.py @@ -36,6 +36,7 @@ def layout_detector(): ) +@pytest.mark.skip("Requires PaddleOCR which may not be installed in all environments.") def 
test_layout_detector_with_drawn_boxes(image_df): """Test LayoutDetector with drawn boxes on the original image.""" detector = LayoutDetector( @@ -92,6 +93,7 @@ def test_layout_detector_with_drawn_boxes(image_df): assert "Error in object detection" in str(e) or "PaddleOCR" in str(e) +@pytest.mark.skip("Requires PaddleOCR which may not be installed in all environments.") def test_layout_detector_with_custom_layout_types(): """Test LayoutDetector with custom layout types.""" detector = LayoutDetector( diff --git a/tests/pdf/test_pdf_assembler.py b/tests/pdf/test_pdf_assembler.py index b34df1a..c903fbe 100644 --- a/tests/pdf/test_pdf_assembler.py +++ b/tests/pdf/test_pdf_assembler.py @@ -1,10 +1,10 @@ import tempfile -from models.detectors.DBNetOnnxDetector import DBNetOnnxDetector from pyspark.ml import PipelineModel from pyspark.sql import DataFrame from scaledp import ImageDrawBoxes, TesseractRecognizer, TessLib +from scaledp.models.detectors.DBNetOnnxDetector import DBNetOnnxDetector from scaledp.models.recognizers.TesseractOcr import TesseractOcr from scaledp.pdf import PdfAddTextLayer, PdfAssembler, PdfDataToImage, SingleImageToPdf from scaledp.pipeline.PandasPipeline import PandasPipeline