Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ jobs:
#----------------------------------------------
- name: Install dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction -E ml
run: |
poetry install --no-interaction -E ml

#----------------------------------------------
# run test suite
Expand All @@ -121,7 +122,8 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_BASE_URL: 'https://generativelanguage.googleapis.com/v1beta/'
run:
run: |
poetry install --no-interaction # ensure the root package is installed
poetry run pytest --cov-report=xml --cov=scaledp tests/

#----------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ RUN mkdir -p /etc/apt/sources.list.d
RUN mv wtf-bookworm.sources /etc/apt/sources.list.d/

RUN apt-get update && apt-get install --no-install-recommends --yes \
tesseract-ocr openjdk-8-jdk
tesseract-ocr tesseract-ocr-spa openjdk-8-jdk

EXPOSE 8888

Expand Down
2,619 changes: 239 additions & 2,380 deletions poetry.lock

Large diffs are not rendered by default.

22 changes: 17 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scaledp"
version = "0.2.3rc11"
version = "0.2.3rc46"
description = "ScaleDP is a library for processing documents using Apache Spark and LLMs"
authors = ["Mykola Melnyk <mykola@stabrise.com>"]
repository = "https://github.com/StabRise/scaledp"
Expand Down Expand Up @@ -31,14 +31,17 @@ torch = [
#{version = "==2.2.0", platform = "darwin", optional = true },
{version = ">=2.4.1", source = "pytorch_cpu", optional = true}
]
dspy = {version = "2.5.43", optional = true}
#dspy = {version = "2.5.43", optional = true}
levenshtein = "^0.27.1"
pydantic = ">=1.8.0"
huggingface-hub = "^0.28.1"
tenacity = "^9.0.0"
openai = "^1.58.0"
tenacity = ">=8.2.3"
openai = ">=1.58.0"
sparkdantic = "^2.0.0"
img2pdf = "^0.6.1"
pycrafter = "^0.0.7"
shapely = "^2.1.1"
pyclipper = "^1.3.0.post6"


[tool.poetry.extras]
Expand Down Expand Up @@ -70,6 +73,7 @@ black = "^24.10.0"
ultralytics = "^8.3.40"
pre-commit = "^3.7.1"
ruff = "^0.5.0"
craft-text-detector-updated = "^0.4.7"

[build-system]
#requires = ["poetry-core<2.0.0"]
Expand Down Expand Up @@ -145,7 +149,14 @@ lint.ignore = [
"N802",
"SLF001", # call protected method
"PLE0604",
"RET504",
"ANN204"
]
exclude = [
"scaledp/models/detectors/yolo/*.py",
"scaledp/models/detectors/paddle_onnx/*.py",
]


[tool.ruff.lint.per-file-ignores]
"tests/*" = [
Expand All @@ -157,9 +168,10 @@ lint.ignore = [
]



[tool.ruff.lint.pydocstyle]
convention = "pep257"
ignore-decorators = ["typing.overload"]

[tool.ruff.lint.pylint]
allow-magic-value-types = ["int", "str", "float", "bytes"]
allow-magic-value-types = ["int", "str", "float", "bytes"]
2 changes: 2 additions & 0 deletions scaledp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from scaledp.image.ImageDrawBoxes import ImageDrawBoxes
from scaledp.models.detectors.DocTRTextDetector import DocTRTextDetector
from scaledp.models.detectors.YoloDetector import YoloDetector
from scaledp.models.detectors.YoloOnnxDetector import YoloOnnxDetector
from scaledp.models.extractors.DSPyExtractor import DSPyExtractor
from scaledp.models.extractors.LLMExtractor import LLMExtractor
from scaledp.models.extractors.LLMVisualExtractor import LLMVisualExtractor
Expand Down Expand Up @@ -210,6 +211,7 @@ def ScaleDPSession(
"EasyOcr",
"DocTROcr",
"YoloDetector",
"YoloOnnxDetector",
"ImageCropBoxes",
"DSPyExtractor",
"TesseractRecognizer",
Expand Down
2 changes: 1 addition & 1 deletion scaledp/image/DataToImage.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,5 +93,5 @@ def _transform(self, dataset):
),
)
if not self.getKeepInputData():
result = result.drop(input_col)
result = result.drop(self.getInputCol())
return result
79 changes: 58 additions & 21 deletions scaledp/image/ImageDrawBoxes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import math
import random
import traceback
from types import MappingProxyType
Expand Down Expand Up @@ -127,7 +128,7 @@ def getDisplayText(self, box):
text.append(val)
return ":".join(text)

def transform_udf(self, image, data):
def transform_udf(self, image, *data_list: Any):

def get_color():
return "#{:06x}".format(random.randint(0, 0xFFFFFF))
Expand All @@ -146,10 +147,12 @@ def get_color():
img1 = ImageDraw.Draw(img)
fill = self.getColor() if self.getFilled() else None

if hasattr(data, "entities"):
self.draw_ner_boxes(data, fill, get_color, img1)
else:
self.draw_boxes(data, fill, img1)
for data in data_list:

if hasattr(data, "entities"):
self.draw_ner_boxes(data, fill, get_color, img1)
else:
self.draw_boxes(data, fill, img1)

except Exception:
exception = traceback.format_exc()
Expand All @@ -164,13 +167,7 @@ def draw_boxes(self, data, fill, img1):
box = b
if not isinstance(box, Box):
box = Box(**box.asDict())
img1.rounded_rectangle(
box.shape(self.getPadding()),
outline=color,
radius=4,
fill=fill,
width=self.getLineWidth(),
)
self.draw_box(box, color, fill, img1)
text = self.getDisplayText(box)
if text:
img1.text(
Expand All @@ -183,6 +180,51 @@ def draw_boxes(self, data, fill, img1):
font_size=self.getTextSize(),
)

def draw_box(self, box, color, fill, img1):
    """Outline a single box on the given drawing context.

    When ``box.angle`` equals 0 the box is drawn as a rounded rectangle
    (corner radius 4) over ``box.shape(padding)``. For any other angle the
    ``width`` x ``height`` rectangle is rotated by ``box.angle`` degrees
    around its own centre and drawn as a polygon; the padding setting is
    not applied in that branch.

    :param box: object exposing ``x``, ``y``, ``width``, ``height``,
        ``angle`` and ``shape(padding)``.
    :param color: outline colour passed straight to the drawing call.
    :param fill: fill colour, or ``None`` for no fill.
    :param img1: drawing context providing ``rounded_rectangle`` and
        ``polygon``.
    """
    if box.angle == 0:
        # Axis-aligned case: let the box compute its own padded shape.
        img1.rounded_rectangle(
            box.shape(self.getPadding()),
            outline=color,
            radius=4,
            fill=fill,
            width=self.getLineWidth(),
        )
        return

    # Rotated case: build the four corners relative to the box centre,
    # ordered top-left, top-right, bottom-right, bottom-left.
    pivot_x = box.x + box.width / 2
    pivot_y = box.y + box.height / 2
    corners = (
        (-box.width / 2, -box.height / 2),
        (box.width / 2, -box.height / 2),
        (box.width / 2, box.height / 2),
        (-box.width / 2, box.height / 2),
    )

    # The angle is constant for the whole box, so evaluate the trig once.
    theta = math.radians(box.angle)
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)

    # Rotate each corner about the origin, then shift it onto the centre.
    vertices = [
        (
            dx * cos_t - dy * sin_t + pivot_x,
            dx * sin_t + dy * cos_t + pivot_y,
        )
        for dx, dy in corners
    ]

    img1.polygon(
        vertices,
        outline=color,
        fill=fill,
        width=self.getLineWidth(),
    )

def draw_ner_boxes(self, data, fill, get_color, img1):
black_list = self.getBlackList()
white_list = self.getWhiteList()
Expand All @@ -209,13 +251,8 @@ def draw_ner_boxes(self, data, fill, get_color, img1):
if not isinstance(box, Box):
box = Box(**box.asDict())
text = self.getDisplayText(ner)
img1.rounded_rectangle(
box.shape(self.getPadding()),
outline=color,
radius=4,
fill=fill,
width=self.getLineWidth(),
)
self.draw_box(box, color, fill, img1)

if text:
tbox = list(
img1.textbbox(
Expand Down Expand Up @@ -254,7 +291,7 @@ def _preprocessing(self, dataset):
def _transform(self, dataset):
out_col = self.getOutputCol()
image_col = self._validate(self.getInputCols()[0], dataset)
box_col = self._validate(self.getInputCols()[1], dataset)
box_cols = [self._validate(col, dataset) for col in self.getInputCols()[1:]]

dataset = self._preprocessing(dataset)

Expand All @@ -264,7 +301,7 @@ def _transform(self, dataset):
)
result = dataset.withColumn(
out_col,
udf(self.transform_udf, Image.get_schema())(image_col, box_col),
udf(self.transform_udf, Image.get_schema())(image_col, *box_cols),
)

if not self.getKeepInputData():
Expand Down
13 changes: 11 additions & 2 deletions scaledp/models/detectors/BaseDetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ class BaseDetector(
typeConverter=TypeConverters.toFloat,
)

# Boolean Param named "onlyRotated"; per its description it requests that
# only rotated boxes be returned.
# NOTE(review): where this flag is read is not visible in this hunk —
# confirm against the concrete detector implementations.
onlyRotated = Param(
Params._dummy(),
"onlyRotated",
"Return only rotated boxes.",
typeConverter=TypeConverters.toBoolean,
)

def get_params(self):
    """Return the extracted parameter map as a JSON string.

    Each entry of ``self.extractParamMap()`` is keyed by the parameter's
    ``name`` attribute; values are passed to ``json.dumps`` unchanged.
    """
    named_values = {}
    for param, value in self.extractParamMap().items():
        named_values[param.name] = value
    return json.dumps(named_values)

Expand Down Expand Up @@ -99,8 +106,10 @@ def transform_udf(self, image, params=None):
exception=image.exception,
)
try:
logging.info("Convert image")
image_pil = image.to_pil()
scale_factor = self.getScaleFactor()
logging.info("Resize image")
if scale_factor != 1.0:
resized_image = image_pil.resize(
(
Expand All @@ -110,7 +119,7 @@ def transform_udf(self, image, params=None):
)
else:
resized_image = image_pil

logging.info("Call detector on image")
result = self.call_detector([(resized_image, image.path)], params)
except Exception as e:
exception = traceback.format_exc()
Expand Down Expand Up @@ -190,7 +199,7 @@ def _transform(self, dataset):
)

if not self.getKeepInputData():
result = result.drop(input_col)
result = result.drop(self.getInputCol())
return result

def setScaleFactor(self, value):
Expand Down
Loading