diff --git a/.gitignore b/.gitignore index 64f3f48..fd0d2e5 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,6 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ \ No newline at end of file +.idea/ +/.vscode/settings.json +/tests/testresources/pdfs/private/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8bd9bac --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,8 @@ +## [unreleased] + +### ๐Ÿš€ Features + +- Added possibility to use subfields in show utils and added padding option to the YoloOnnxDetector +- Added LineOrientation detector model to the TesseractRecognizer +- Added FaceDetector, SignatureDetector +- Added PdfAssembler diff --git a/cliff.toml b/cliff.toml new file mode 100644 index 0000000..b278c49 --- /dev/null +++ b/cliff.toml @@ -0,0 +1,92 @@ +# git-cliff ~ configuration file +# https://git-cliff.org/docs/configuration + + +[changelog] +# A Tera template to be rendered for each release in the changelog. +# See https://keats.github.io/tera/docs/#introduction +body = """ +{% if version %}\ + ## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }} +{% else %}\ + ## [unreleased] +{% endif %}\ +{% for group, commits in commits | group_by(attribute="group") %} + ### {{ group | striptags | trim | upper_first }} + {% for commit in commits %} + - {% if commit.scope %}*({{ commit.scope }})* {% endif %}\ + {% if commit.breaking %}[**breaking**] {% endif %}\ + {{ commit.message | upper_first }}\ + {% endfor %} +{% endfor %} +""" +# Remove leading and trailing whitespaces from the changelog's body. +trim = true +# Render body even when there are no releases to process. +render_always = true +# An array of regex based postprocessors to modify the changelog. 
+postprocessors = [ + # Replace the placeholder with a URL. + #{ pattern = '', replace = "https://github.com/orhun/git-cliff" }, +] +# render body even when there are no releases to process +# render_always = true +# output file path +# output = "test.md" + +[git] +# Parse commits according to the conventional commits specification. +# See https://www.conventionalcommits.org +conventional_commits = true +# Exclude commits that do not match the conventional commits specification. +filter_unconventional = true +# Require all commits to be conventional. +# Takes precedence over filter_unconventional. +require_conventional = false +# Split commits on newlines, treating each line as an individual commit. +split_commits = false +# An array of regex based parsers to modify commit messages prior to further processing. +commit_preprocessors = [ + # Replace issue numbers with link templates to be updated in `changelog.postprocessors`. + #{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](/issues/${2}))"}, + # Check spelling of the commit message using https://github.com/crate-ci/typos. + # If the spelling is incorrect, it will be fixed automatically. + #{ pattern = '.*', replace_command = 'typos --write-changes -' }, +] +# Prevent commits that are breaking from being excluded by commit parsers. +protect_breaking_commits = false +# An array of regex based parsers for extracting data from the commit message. +# Assigns commits to groups. +# Optionally sets the commit's scope and can decide to exclude commits from further processing. 
+commit_parsers = [ + { message = "^feat", group = "๐Ÿš€ Features" }, + { message = "^fix", group = "๐Ÿ› Bug Fixes" }, + { message = "^doc", group = "๐Ÿ“š Documentation" }, + { message = "^perf", group = "โšก Performance" }, + { message = "^refactor", group = "๐Ÿšœ Refactor" }, + { message = "^style", group = "๐ŸŽจ Styling" }, + { message = "^test", group = "๐Ÿงช Testing" }, + { message = "^chore\\(release\\): prepare for", skip = true }, + { message = "^chore\\(deps.*\\)", skip = true }, + { message = "^chore\\(pr\\)", skip = true }, + { message = "^chore\\(pull\\)", skip = true }, + { message = "^chore|^ci", group = "โš™๏ธ Miscellaneous Tasks" }, + { body = ".*security", group = "๐Ÿ›ก๏ธ Security" }, + { message = "^revert", group = "โ—€๏ธ Revert" }, + { message = ".*", group = "๐Ÿ’ผ Other" }, +] +# Exclude commits that are not matched by any commit parser. +filter_commits = false +# An array of link parsers for extracting external references, and turning them into URLs, using regex. +link_parsers = [] +# Include only the tags that belong to the current branch. +use_branch_tags = false +# Order releases topologically instead of chronologically. +topo_order = false +# Order commits topologically instead of chronologically. +topo_order_commits = true +# Order of commits in each group/release within the changelog. 
+# Allowed values: newest, oldest +sort_commits = "oldest" +# Process submodules commits +recurse_submodules = false diff --git a/poetry.lock b/poetry.lock index 4ac458f..6a0b811 100644 --- a/poetry.lock +++ b/poetry.lock @@ -526,7 +526,7 @@ files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -markers = {main = "sys_platform == \"win32\" or platform_system == \"Windows\""} +markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\""} [[package]] name = "coloredlogs" @@ -1140,6 +1140,26 @@ tqdm = "*" [package.extras] test = ["build", "mypy", "pytest", "pytest-xdist", "ruff", "twine", "types-requests", "types-setuptools"] +[[package]] +name = "git-cliff" +version = "2.10.1" +description = "" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +files = [ + {file = "git_cliff-2.10.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:49e5808150d526ad6b728313b77636fe962c7ee6729409f2d42aa6cbe323506b"}, + {file = "git_cliff-2.10.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:973962f2486d33ddbb624aa2d2e2d370e03721a164c471601736afce75e0935d"}, + {file = "git_cliff-2.10.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:704ff6cea09fd90dd524a8235143897a2d1c8ef9fb045070275f4cf6b0040616"}, + {file = "git_cliff-2.10.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d72c0bf314014f30ec4287a2dee9f1a14f4c67f73b75991684391e2a98d9b9b"}, + {file = "git_cliff-2.10.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:49157fdb81384d4282f918a5a533418c943aae1cfc3cde572adb95cfabcb55a8"}, + {file = "git_cliff-2.10.1-py3-none-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36e3537450fb2c295fa8b1b6400c72a166022241de294c24c7bee3ae43284a78"}, + {file = 
"git_cliff-2.10.1-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:56f54211a51a8ce3208f3871ee09899607ec539d62aba900b23a53b0be442591"}, + {file = "git_cliff-2.10.1-py3-none-win32.whl", hash = "sha256:c121069d52136889c7e2f8a93ed878f5fdffe5707f76a0badc098c5b6b71fc97"}, + {file = "git_cliff-2.10.1-py3-none-win_amd64.whl", hash = "sha256:893f595bfbea536668eaeb7959025982a577d37ccfab4f6cbc0b9d6e265da93f"}, + {file = "git_cliff-2.10.1.tar.gz", hash = "sha256:2f288e732584e2aff65e86990a12ffeb58898931db96f9b219e016335492da97"}, +] + [[package]] name = "h11" version = "0.14.0" @@ -1191,6 +1211,28 @@ files = [ [package.dependencies] numpy = ">=1.19.3" +[[package]] +name = "hf-xet" +version = "1.1.10" +description = "Fast transfer of large files with the Hugging Face Hub." +optional = false +python-versions = ">=3.8" +groups = ["main", "test"] +markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\"" +files = [ + {file = "hf_xet-1.1.10-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:686083aca1a6669bc85c21c0563551cbcdaa5cf7876a91f3d074a030b577231d"}, + {file = "hf_xet-1.1.10-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:71081925383b66b24eedff3013f8e6bbd41215c3338be4b94ba75fd75b21513b"}, + {file = "hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b6bceb6361c80c1cc42b5a7b4e3efd90e64630bcf11224dcac50ef30a47e435"}, + {file = "hf_xet-1.1.10-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eae7c1fc8a664e54753ffc235e11427ca61f4b0477d757cc4eb9ae374b69f09c"}, + {file = "hf_xet-1.1.10-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0a0005fd08f002180f7a12d4e13b22be277725bc23ed0529f8add5c7a6309c06"}, + {file = "hf_xet-1.1.10-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f900481cf6e362a6c549c61ff77468bd59d6dd082f3170a36acfef2eb6a6793f"}, + {file = "hf_xet-1.1.10-cp37-abi3-win_amd64.whl", hash = 
"sha256:5f54b19cc347c13235ae7ee98b330c26dd65ef1df47e5316ffb1e87713ca7045"}, + {file = "hf_xet-1.1.10.tar.gz", hash = "sha256:408aef343800a2102374a883f283ff29068055c111f003ff840733d3b715bb97"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "httpcore" version = "1.0.7" @@ -1240,19 +1282,20 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "0.28.1" +version = "0.35.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" groups = ["main", "test"] files = [ - {file = "huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7"}, - {file = "huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae"}, + {file = "huggingface_hub-0.35.3-py3-none-any.whl", hash = "sha256:0e3a01829c19d86d03793e4577816fe3bdfc1602ac62c7fb220d593d351224ba"}, + {file = "huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a"}, ] [package.dependencies] filelock = "*" fsspec = ">=2023.5.0" +hf-xet = {version = ">=1.1.3,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""} packaging = ">=20.9" pyyaml = ">=5.1" requests = "*" @@ -1260,16 +1303,19 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] 
+all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "ty", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "ty", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] +hf-xet = ["hf-xet (>=1.1.2,<2.0.0)"] inference = ["aiohttp"] -quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"] +mcp = ["aiohttp", "mcp (>=1.8.0)", 
"typer"] +oauth = ["authlib (>=1.3.2)", "fastapi", "httpx", "itsdangerous"] +quality = ["libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "ruff (>=0.9.0)", "ty"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures (<16.0)", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] torch = ["safetensors[torch]", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] @@ -2799,6 +2845,174 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.1.3.1" +description = "CUBLAS native runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, + {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.1.105" +description = "CUDA 
profiling tools runtime libs." +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, + {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.1.105" +description = "NVRTC native runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, + {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.1.105" +description = "CUDA Runtime native Libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, + {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.1.0.70" +description = "cuDNN runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and 
platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, + {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, +] + +[package.dependencies] +nvidia-cublas-cu12 = "*" + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.0.2.54" +description = "CUFFT native runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, + {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.2.106" +description = "CURAND native runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, + {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.4.5.107" +description = "CUDA solver native runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = 
"nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, + {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, +] + +[package.dependencies] +nvidia-cublas-cu12 = "*" +nvidia-cusparse-cu12 = "*" +nvidia-nvjitlink-cu12 = "*" + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.1.0.106" +description = "CUSPARSE native runtime libraries" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, + {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, +] + +[package.dependencies] +nvidia-nvjitlink-cu12 = "*" + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.20.5" +description = "NVIDIA Collective Communication Library (NCCL) Runtime" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, + {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.4.127" +description = "Nvidia JIT LTO Library" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == 
\"darwin\"" +files = [ + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.1.105" +description = "NVIDIA Tools Extension" +optional = false +python-versions = ">=3" +groups = ["main", "test"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, + {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, +] + [[package]] name = "onnxruntime" version = "1.22.0" @@ -3385,7 +3599,7 @@ description = "Run a subprocess in a pseudo terminal" optional = false python-versions = "*" groups = ["test"] -markers = "os_name != \"nt\" or sys_platform != \"win32\" and sys_platform != \"emscripten\"" +markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\" or os_name != \"nt\"" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -4986,7 +5200,7 @@ files = [ {file = "setuptools-78.1.0-py3-none-any.whl", hash = "sha256:3e386e96793c8702ae83d17b853fb93d3e09ef82ec62722e61da5cd22376dcd8"}, {file = "setuptools-78.1.0.tar.gz", hash = 
"sha256:18fd474d4a82a5f83dac888df697af65afa82dec7323d09c3e37d1f14288da54"}, ] -markers = {main = "python_version >= \"3.12\""} +markers = {main = "sys_platform == \"darwin\" or python_version >= \"3.12\""} [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] @@ -5384,6 +5598,62 @@ files = [ {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] +[[package]] +name = "torch" +version = "2.4.1" +description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" +optional = false +python-versions = ">=3.8.0" +groups = ["main", "test"] +markers = "sys_platform == \"darwin\"" +files = [ + {file = "torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:362f82e23a4cd46341daabb76fba08f04cd646df9bfaf5da50af97cb60ca4971"}, + {file = "torch-2.4.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:e8ac1985c3ff0f60d85b991954cfc2cc25f79c84545aead422763148ed2759e3"}, + {file = "torch-2.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:91e326e2ccfb1496e3bee58f70ef605aeb27bd26be07ba64f37dcaac3d070ada"}, + {file = "torch-2.4.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:d36a8ef100f5bff3e9c3cea934b9e0d7ea277cb8210c7152d34a9a6c5830eadd"}, + {file = "torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0b5f88afdfa05a335d80351e3cea57d38e578c8689f751d35e0ff36bce872113"}, + {file = "torch-2.4.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:ef503165f2341942bfdf2bd520152f19540d0c0e34961232f134dc59ad435be8"}, + {file = "torch-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:092e7c2280c860eff762ac08c4bdcd53d701677851670695e0c22d6d345b269c"}, + {file = "torch-2.4.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ddddbd8b066e743934a4200b3d54267a46db02106876d21cf31f7da7a96f98ea"}, + {file = "torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl", hash = 
"sha256:fdc4fe11db3eb93c1115d3e973a27ac7c1a8318af8934ffa36b0370efe28e042"}, + {file = "torch-2.4.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:18835374f599207a9e82c262153c20ddf42ea49bc76b6eadad8e5f49729f6e4d"}, + {file = "torch-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:ebea70ff30544fc021d441ce6b219a88b67524f01170b1c538d7d3ebb5e7f56c"}, + {file = "torch-2.4.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:72b484d5b6cec1a735bf3fa5a1c4883d01748698c5e9cfdbeb4ffab7c7987e0d"}, + {file = "torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c99e1db4bf0c5347107845d715b4aa1097e601bdc36343d758963055e9599d93"}, + {file = "torch-2.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b57f07e92858db78c5b72857b4f0b33a65b00dc5d68e7948a8494b0314efb880"}, + {file = "torch-2.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:f18197f3f7c15cde2115892b64f17c80dbf01ed72b008020e7da339902742cf6"}, + {file = "torch-2.4.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:5fc1d4d7ed265ef853579caf272686d1ed87cebdcd04f2a498f800ffc53dab71"}, + {file = "torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:40f6d3fe3bae74efcf08cb7f8295eaddd8a838ce89e9d26929d4edd6d5e4329d"}, + {file = "torch-2.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c9299c16c9743001ecef515536ac45900247f4338ecdf70746f2461f9e4831db"}, + {file = "torch-2.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bce130f2cd2d52ba4e2c6ada461808de7e5eccbac692525337cfb4c19421846"}, + {file = "torch-2.4.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:a38de2803ee6050309aac032676536c3d3b6a9804248537e38e098d0e14817ec"}, +] + +[package.dependencies] +filelock = "*" +fsspec = "*" +jinja2 = "*" +networkx = "*" +nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", 
markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +setuptools = "*" +sympy = "*" +triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} +typing-extensions = ">=4.8.0" + +[package.extras] +opt-einsum = ["opt-einsum (>=3.3)"] +optree = ["optree (>=0.11.0)"] + [[package]] name = "torch" version = "2.6.0+cpu" @@ -5391,6 +5661,7 @@ description = "Tensors and Dynamic neural networks in Python with strong GPU acc optional = false python-versions = ">=3.9.0" groups = ["main", "test"] +markers = "sys_platform != \"darwin\"" files = [ {file = "torch-2.6.0+cpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102"}, {file = "torch-2.6.0+cpu-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9"}, @@ -5429,6 +5700,46 @@ type = "legacy" url = "https://download.pytorch.org/whl/cpu" 
reference = "pytorch_cpu" +[[package]] +name = "torchvision" +version = "0.19.1" +description = "image and video datasets and models for torch deep learning" +optional = false +python-versions = ">=3.8" +groups = ["main", "test"] +markers = "sys_platform == \"darwin\"" +files = [ + {file = "torchvision-0.19.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:54e8513099e6f586356c70f809d34f391af71ad182fe071cc328a28af2c40608"}, + {file = "torchvision-0.19.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:20a1f5e02bfdad7714e55fa3fa698347c11d829fa65e11e5a84df07d93350eed"}, + {file = "torchvision-0.19.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:7b063116164be52fc6deb4762de7f8c90bfa3a65f8d5caf17f8e2d5aadc75a04"}, + {file = "torchvision-0.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:f40b6acabfa886da1bc3768f47679c61feee6bde90deb979d9f300df8c8a0145"}, + {file = "torchvision-0.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:40514282b4896d62765b8e26d7091c32e17c35817d00ec4be2362ea3ba3d1787"}, + {file = "torchvision-0.19.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:5a91be061ae5d6d5b95e833b93e57ca4d3c56c5a57444dd15da2e3e7fba96050"}, + {file = "torchvision-0.19.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d71a6a6fe3a5281ca3487d4c56ad4aad20ff70f82f1d7c79bcb6e7b0c2af00c8"}, + {file = "torchvision-0.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:70dea324174f5e9981b68e4b7cd524512c106ba64aedef560a86a0bbf2fbf62c"}, + {file = "torchvision-0.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:27ece277ff0f6cdc7fed0627279c632dcb2e58187da771eca24b0fbcf3f8590d"}, + {file = "torchvision-0.19.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:c659ff92a61f188a1a7baef2850f3c0b6c85685447453c03d0e645ba8f1dcc1c"}, + {file = "torchvision-0.19.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:c07bf43c2a145d792ecd9d0503d6c73577147ece508d45600d8aac77e4cdfcf9"}, + {file = "torchvision-0.19.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:b4283d283675556bb0eae31d29996f53861b17cbdcdf3509e6bc050414ac9289"}, + {file = "torchvision-0.19.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4e4f5b24ea6b087b02ed492ab1e21bba3352c4577e2def14248cfc60732338"}, + {file = "torchvision-0.19.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:9281d63ead929bb19143731154cd1d8bf0b5e9873dff8578a40e90a6bec3c6fa"}, + {file = "torchvision-0.19.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:4d10bc9083c4d5fadd7edd7b729700a7be48dab4f62278df3bc73fa48e48a155"}, + {file = "torchvision-0.19.1-cp38-cp38-win_amd64.whl", hash = "sha256:ccf085ef1824fb9e16f1901285bf89c298c62dfd93267a39e8ee42c71255242f"}, + {file = "torchvision-0.19.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:731f434d91586769e255b5d70ed1a4457e0a1394a95f4aacf0e1e7e21f80c098"}, + {file = "torchvision-0.19.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:febe4f14d4afcb47cc861d8be7760ab6a123cd0817f97faf5771488cb6aa90f4"}, + {file = "torchvision-0.19.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e328309b8670a2e889b2fe76a1c2744a099c11c984da9a822357bd9debd699a5"}, + {file = "torchvision-0.19.1-cp39-cp39-win_amd64.whl", hash = "sha256:6616f12e00a22e7f3fedbd0fccb0804c05e8fe22871668f10eae65cf3f283614"}, +] + +[package.dependencies] +numpy = "*" +pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" +torch = "2.4.1" + +[package.extras] +gdown = ["gdown (>=4.7.3)"] +scipy = ["scipy"] + [[package]] name = "torchvision" version = "0.21.0+cpu" @@ -5436,6 +5747,7 @@ description = "image and video datasets and models for torch deep learning" optional = false python-versions = ">=3.9" groups = ["main", "test"] +markers = "sys_platform != \"darwin\"" files = [ {file = "torchvision-0.21.0+cpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:4ed0a1be50676a7c589ba83b62c9dc0267a87e852b8cd9b7d6db27ab36c6d552"}, {file = "torchvision-0.21.0+cpu-cp310-cp310-win_amd64.whl", hash = "sha256:554ca0f5948ac89911299f8bfb6f23936d867387ea213ab235adc2814b510d0c"}, @@ -5594,6 +5906,30 
@@ torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] +[[package]] +name = "triton" +version = "3.0.0" +description = "A language and compiler for custom Deep Learning operations" +optional = false +python-versions = "*" +groups = ["main", "test"] +markers = "python_version <= \"3.12\" and platform_system == \"Linux\" and platform_machine == \"x86_64\" and sys_platform == \"darwin\"" +files = [ + {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, + {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, + {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, + {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, + {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, +] + +[package.dependencies] +filelock = "*" + +[package.extras] +build = ["cmake (>=3.20)", "lit"] +tests = ["autopep8", "flake8", "isort", "llnl-hatchet", "numpy", "pytest", "scipy (>=1.7.1)"] +tutorials = ["matplotlib", "pandas", "tabulate"] + [[package]] name = "types-python-dateutil" version = "2.9.0.20241206" @@ -5670,8 +6006,8 @@ requests = ">=2.23.0" scipy = ">=1.4.1" seaborn = ">=0.11.0" torch = [ - {version = ">=1.8.0,<2.4.0 || >2.4.0", markers = "sys_platform == \"win32\""}, {version = ">=1.8.0", markers = "sys_platform != \"win32\""}, + {version = ">=1.8.0,<2.4.0 || >2.4.0", markers = "sys_platform == \"win32\""}, ] torchvision = ">=0.9.0" tqdm = 
">=4.64.0" @@ -5910,11 +6246,11 @@ files = [ [extras] llm = [] -ml = ["torch", "torchvision", "transformers"] +ml = ["transformers"] ocr = ["easyocr", "python-doctr", "surya-ocr"] paddle = [] [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "dbc223fa004895653ea3ee28ab16deef00cc87824450ddc9149056d6bd549ff0" +content-hash = "8dd2983084085f5f23e928b2644fde1da1ae8dcd8a783ade3fd3f450ccf2db65" diff --git a/pyproject.toml b/pyproject.toml index 8a13f38..dc67e5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scaledp" -version = "0.2.4rc5" +version = "0.2.4rc11" description = "ScaleDP is a library for processing documents using Apache Spark and LLMs" authors = ["Mykola Melnyk "] repository = "https://github.com/StabRise/scaledp" @@ -20,21 +20,24 @@ pytesseract = "0.3.13" pytest = "^7.4.4" PyMuPDF = "1.24.11" numpy = "^1.26.4" -pyarrow = "17.0.0" #18.1.0 +pyarrow = ">=17.0.0" #18.1.0 filelock = "*" surya-ocr = {version = "0.8.1", optional = true} easyocr = {version = "1.7.2", optional = true} python-doctr = {version = "0.10.0", optional = true} transformers = {version = "^4.36.0", optional = true} -torchvision= {version= ">=0.18.0", source = "pytorch_cpu", optional = true} +torchvision= [ + {version = ">=0.18.0", markers="sys_platform == 'darwin'", optional = true }, + {version= ">=0.18.0", markers="sys_platform != 'darwin'", source = "pytorch_cpu", optional = true} + ] torch = [ - #{version = "==2.2.0", platform = "darwin", optional = true }, - {version = ">=2.4.1", source = "pytorch_cpu", optional = true} + {version = "==2.4.1", markers="sys_platform == 'darwin'", optional = true }, + {version = ">=2.4.1", markers="sys_platform != 'darwin'", source = "pytorch_cpu", optional = true} ] #dspy = {version = "2.5.43", optional = true} levenshtein = "^0.27.1" pydantic = ">=1.8.0" -huggingface-hub = "^0.28.1" +huggingface-hub = "^0.35.3" tenacity = ">=8.2.3" openai = ">=1.58.0" sparkdantic = "^2.0.0" @@ -46,7 +49,10 
@@ onnxruntime = "1.22.0" [tool.poetry.extras] -ml = ["transformers", "torch", "torchvision"] +ml = ["transformers", + #"torch", + #"torchvision" +] ocr = ["easyocr", "python-doctr", "surya-ocr"] llm = ["dspy"] paddle = ["paddleocr", "paddlepaddle",] @@ -67,8 +73,12 @@ pytest-resource-path = "1.3.0" coverage = "7.6.4" pytest-cov="5.0.0" transformers = "^4.36.0" -torchvision= {version= ">=0.18.0", source = "pytorch_cpu"} -torch = {version= ">=2.4.1", source = "pytorch_cpu"} +#torchvision= {version= ">=0.18.0", source = "pytorch_cpu"} +#torch = {version= ">=2.4.1", source = "pytorch_cpu"} +torch = [ + {version = "==2.4.1", markers="sys_platform == 'darwin'", optional = true }, + {version = ">=2.4.1", markers="sys_platform != 'darwin'", source = "pytorch_cpu", optional = true} +] python-doctr = "0.10.0" surya-ocr = "0.8.1" black = "^24.10.0" @@ -77,6 +87,10 @@ pre-commit = "^3.7.1" ruff = "^0.5.0" craft-text-detector-updated = "^0.4.7" + +[tool.poetry.group.dev.dependencies] +git-cliff = "^2.10.1" + [build-system] #requires = ["poetry-core<2.0.0"] requires = ["poetry-core>=1.0.0"] diff --git a/scaledp/README.md b/scaledp/README.md index 74db88f..974c514 100644 --- a/scaledp/README.md +++ b/scaledp/README.md @@ -37,9 +37,13 @@ ```bash poetry version patch - poetry publish --build ``` +### Publish + +```bash +poetry publish --build +``` ## Pre-commit @@ -53,3 +57,12 @@ To run pre-commit on all files: pre-commit run --all-files ``` +## Update changelogs + +```bash + poetry run git cliff --unreleased -o +``` + +## Deps + +crafter diff --git a/scaledp/__init__.py b/scaledp/__init__.py index 74644e5..192ebdb 100644 --- a/scaledp/__init__.py +++ b/scaledp/__init__.py @@ -13,6 +13,7 @@ from scaledp.image.ImageCropBoxes import ImageCropBoxes from scaledp.image.ImageDrawBoxes import ImageDrawBoxes from scaledp.models.detectors.DocTRTextDetector import DocTRTextDetector +from scaledp.models.detectors.FaceDetector import FaceDetector from scaledp.models.detectors.LayoutDetector 
import LayoutDetector from scaledp.models.detectors.SignatureDetector import SignatureDetector from scaledp.models.detectors.YoloDetector import YoloDetector @@ -28,7 +29,12 @@ from scaledp.models.recognizers.SuryaOcr import SuryaOcr from scaledp.models.recognizers.TesseractOcr import TesseractOcr from scaledp.models.recognizers.TesseractRecognizer import TesseractRecognizer +from scaledp.pdf.PdfAddTextLayer import PdfAddTextLayer +from scaledp.pdf.PdfAssembler import PdfAssembler +from scaledp.pdf.PdfDataToDocument import PdfDataToDocument from scaledp.pdf.PdfDataToImage import PdfDataToImage +from scaledp.pdf.PdfDataToSingleImage import PdfDataToSingleImage +from scaledp.pdf.SingleImageToPdf import SingleImageToPdf from scaledp.text.TextToDocument import TextToDocument from scaledp.utils.show_utils import ( show_image, @@ -216,6 +222,7 @@ def ScaleDPSession( "YoloDetector", "YoloOnnxDetector", "SignatureDetector", + "FaceDetector", "ImageCropBoxes", "DSPyExtractor", "TesseractRecognizer", @@ -224,6 +231,11 @@ def ScaleDPSession( "LLMExtractor", "LLMOcr", "LLMNer", + "PdfDataToDocument", + "PdfDataToSingleImage", + "PdfAddTextLayer", + "PdfAssembler", + "SingleImageToPdf", "__version__", "files", *dir(enums), diff --git a/scaledp/image/DataToImage.py b/scaledp/image/DataToImage.py index ede7374..75ead2e 100644 --- a/scaledp/image/DataToImage.py +++ b/scaledp/image/DataToImage.py @@ -80,7 +80,10 @@ def transform_udf(self, input, path, resolution): def _transform(self, dataset): out_col = self.getOutputCol() input_col = self._validate(self.getInputCol(), dataset) - path_col = self._validate(self.getPathCol(), dataset) + try: + path_col = self._validate(self.getPathCol(), dataset) + except Exception: + path_col = lit("memory") resolution = ( dataset["resolution"] if "resolution" in dataset.columns else lit(0) ) diff --git a/scaledp/models/detectors/BaseDetector.py b/scaledp/models/detectors/BaseDetector.py index e96eefe..8fbfa95 100644 --- 
a/scaledp/models/detectors/BaseDetector.py +++ b/scaledp/models/detectors/BaseDetector.py @@ -122,7 +122,6 @@ def transform_udf(self, image, params=None): logging.info("Call detector on image") result = self.call_detector([(resized_image, image.path)], params) except Exception as e: - raise e exception = traceback.format_exc() exception = ( f"{self.uid}: Error in object detection: {exception}, {image.exception}" diff --git a/scaledp/models/detectors/FaceDetector.py b/scaledp/models/detectors/FaceDetector.py new file mode 100644 index 0000000..48f1776 --- /dev/null +++ b/scaledp/models/detectors/FaceDetector.py @@ -0,0 +1,26 @@ +from types import MappingProxyType + +from scaledp.enums import Device +from scaledp.models.detectors.YoloOnnxDetector import YoloOnnxDetector + + +class FaceDetector(YoloOnnxDetector): + defaultParams = MappingProxyType( + { + "inputCol": "image", + "outputCol": "boxes", + "keepInputData": False, + "scaleFactor": 1.0, + "scoreThreshold": 0.2, + "device": Device.CPU, + "batchSize": 2, + "partitionMap": False, + "numPartitions": 0, + "pageCol": "page", + "pathCol": "path", + "propagateError": False, + "task": "detect", + "onlyRotated": False, + "model": "StabRise/face_detection", + }, + ) diff --git a/scaledp/models/detectors/SignatureDetector.py b/scaledp/models/detectors/SignatureDetector.py index 8c8613e..5d4c8eb 100644 --- a/scaledp/models/detectors/SignatureDetector.py +++ b/scaledp/models/detectors/SignatureDetector.py @@ -1,5 +1,26 @@ -from scaledp import YoloOnnxDetector +from types import MappingProxyType + +from scaledp.enums import Device +from scaledp.models.detectors.YoloOnnxDetector import YoloOnnxDetector class SignatureDetector(YoloOnnxDetector): - pass + defaultParams = MappingProxyType( + { + "inputCol": "image", + "outputCol": "signatures", + "keepInputData": False, + "scaleFactor": 1.0, + "scoreThreshold": 0.2, + "device": Device.CPU, + "batchSize": 2, + "partitionMap": False, + "numPartitions": 0, + "pageCol": "page", + 
"pathCol": "path", + "propagateError": False, + "task": "detect", + "onlyRotated": False, + "model": "StabRise/signature_detection", + }, + ) diff --git a/scaledp/models/detectors/YoloOnnxDetector.py b/scaledp/models/detectors/YoloOnnxDetector.py index 90ab708..0580c02 100644 --- a/scaledp/models/detectors/YoloOnnxDetector.py +++ b/scaledp/models/detectors/YoloOnnxDetector.py @@ -27,6 +27,14 @@ class YoloOnnxDetector(BaseDetector, HasDevice, HasBatchSize): typeConverter=TypeConverters.toString, ) + # Add padding param: integer percent to expand detected boxes + padding = Param( + Params._dummy(), + "padding", + "Padding percent to expand detected boxes (integer).", + typeConverter=TypeConverters.toInt, + ) + defaultParams = MappingProxyType( { "inputCol": "image", @@ -43,6 +51,7 @@ class YoloOnnxDetector(BaseDetector, HasDevice, HasBatchSize): "propagateError": False, "task": "detect", "onlyRotated": False, + "padding": 0, # default padding percent }, ) @@ -88,9 +97,30 @@ def call_detector(cls, images, params): # Convert PIL to NumPy (RGB) image_np = np.array(image) raw_boxes, scores, class_ids = detector.detect_objects(image_np) - + # Expand boxes by padding percent if provided + pad_percent = int(params.get("padding", 0)) if params is not None else 0 + h_img, w_img = image_np.shape[:2] for box in raw_boxes: - boxes.append(Box.from_bbox(box)) + # Assume box format is [x1, y1, x2, y2] + if pad_percent and len(box) >= 4: + x1, y1, x2, y2 = ( + float(box[0]), + float(box[1]), + float(box[2]), + float(box[3]), + ) + w = x2 - x1 + h = y2 - y1 + dx = (pad_percent / 100.0) * w + dy = (pad_percent / 100.0) * h + x1_new = max(0.0, x1 - dx) + y1_new = max(0.0, y1 - dy) + x2_new = min(float(w_img - 1), x2 + dx) + y2_new = min(float(h_img - 1), y2 + dy) + expanded_box = [x1_new, y1_new, x2_new, y2_new] + else: + expanded_box = box + boxes.append(Box.from_bbox(expanded_box)) results_final.append( DetectorOutput(path=image_path, type="yolo", bboxes=boxes), ) diff --git 
a/scaledp/models/detectors/yolo/yolo.py b/scaledp/models/detectors/yolo/yolo.py index dcc2027..7542c1e 100644 --- a/scaledp/models/detectors/yolo/yolo.py +++ b/scaledp/models/detectors/yolo/yolo.py @@ -132,9 +132,6 @@ def prepare_input(self, image): self.img_height, self.img_width = image.shape[:2] input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - print(input_img.shape) - print(self.input_shape) - print(f"Input width: {self.input_width}, Input height: {self.input_height}") # Rescale image with padding instead of simple resize input_img = self.rescale_image_with_padding( diff --git a/scaledp/params.py b/scaledp/params.py index d508686..ea1bca8 100644 --- a/scaledp/params.py +++ b/scaledp/params.py @@ -525,6 +525,14 @@ def _validate(self, column_name: str, dataset: Any) -> Any: Validate input schema. """ if column_name not in dataset.columns: + if len(column_name.split(".")) > 1: + root_col = column_name.split(".")[0] + if root_col not in dataset.columns: + raise ValueError( + f"Missing input column in transformer {self.uid}: " + f"Column '{root_col}' is not present.", + ) + return dataset[column_name] raise ValueError( f"Missing input column in transformer {self.uid}: " f"Column '{column_name}' is not present.", diff --git a/scaledp/pdf/PdfAddTextLayer.py b/scaledp/pdf/PdfAddTextLayer.py new file mode 100644 index 0000000..14b9f07 --- /dev/null +++ b/scaledp/pdf/PdfAddTextLayer.py @@ -0,0 +1,174 @@ +import io +import logging +import traceback +from types import MappingProxyType +from typing import Any + +import fitz +from pyspark import keyword_only +from pyspark.ml import Transformer +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql.functions import udf + +from scaledp.params import HasColumnValidator, HasInputCols, HasOutputCol, HasResolution +from scaledp.schemas.PdfDocument import PdfDocument + + +class PdfAddTextLayer( + Transformer, + DefaultParamsReadable, + DefaultParamsWritable, + HasOutputCol, + HasInputCols, + 
HasColumnValidator, + HasResolution, +): + """Add text layer to PDF document using text from Document schema.""" + + DEFAULT_PARAMS = MappingProxyType( + { + "inputCols": ["pdf", "text"], + "outputCol": "pdf_with_text", + "resolution": 300, # DPI for coordinate transformation + }, + ) + + @keyword_only + def __init__(self, **kwargs: Any) -> None: + super(PdfAddTextLayer, self).__init__() + self._setDefault(**self.DEFAULT_PARAMS) + self._set(**kwargs) + + def transform_udf(self, pdf_doc, text_doc): + """Transform PDF and text documents to create PDF with text layer.""" + try: + # Check for exceptions in input documents + if pdf_doc.exception != "": + return PdfDocument( + path=pdf_doc.path, + data=bytes(), + width=pdf_doc.width, + height=pdf_doc.height, + exception=pdf_doc.exception, + ) + + if text_doc.exception != "": + return PdfDocument( + path=pdf_doc.path, + data=bytes(), + width=pdf_doc.width, + height=pdf_doc.height, + exception=text_doc.exception, + ) + + if not pdf_doc.data: + return PdfDocument( + path=pdf_doc.path, + data=bytes(), + width=pdf_doc.width, + height=pdf_doc.height, + exception="PDF document has no data", + ) + + # Open the PDF document + pdf_document = fitz.open(stream=pdf_doc.data, filetype="pdf") + + if len(pdf_document) == 0: + pdf_document.close() + return PdfDocument( + path=pdf_doc.path, + data=bytes(), + width=pdf_doc.width, + height=pdf_doc.height, + exception="PDF document has no pages", + ) + + # Get the first page (assuming single page PDF as per requirement) + page = pdf_document[0] + + # Calculate scale factor from image coordinates to PDF coordinates + pdf_dpi = 72.0 # PDF native DPI + scale_factor = pdf_dpi / self.getResolution() + + # Add text layer using bounding boxes from Document + if text_doc.bboxes: + for bbox in text_doc.bboxes: + # Convert image coordinates to PDF coordinates + # Image coordinates: origin at top-left, y increases downward + # PDF coordinates: origin at bottom-left, y increases upward + + pdf_x = 
bbox.x * scale_factor + pdf_y = (bbox.y - 0.2 * bbox.height) * scale_factor # Flip Y axis + pdf_height = bbox.height * scale_factor + + # Insert text at the specified position using the correct PyMuPDF method + page.insert_text( + point=fitz.Point( + pdf_x, + pdf_y + pdf_height, + ), # Bottom-left of text + text=bbox.text, + fontsize=max(8, pdf_height * 0.8), # Scale font size + # with bbox height + color=(0, 0, 0), # Black text + overlay=False, + ) + + # Save the modified PDF to bytes + output_buffer = io.BytesIO() + pdf_document.save(output_buffer) + pdf_bytes = output_buffer.getvalue() + pdf_document.close() + + return PdfDocument( + path=pdf_doc.path, + data=pdf_bytes, + width=pdf_doc.width, + height=pdf_doc.height, + exception="", + ) + + except Exception: + exception = traceback.format_exc() + exception = f"PdfAddTextLayer: {exception}" + logging.warning(exception) + return PdfDocument( + path=pdf_doc.path if pdf_doc else "", + data=bytes(), + width=pdf_doc.width if pdf_doc else None, + height=pdf_doc.height if pdf_doc else None, + exception=exception, + ) + + def _transform(self, dataset): + """Transform the dataset by adding text layer to PDF documents.""" + output_col = self.getOutputCol() + input_cols = self.getInputCols() + + # Validate that we have exactly 2 input columns + if len(input_cols) != 2: + raise ValueError( + f"PdfAddTextLayer requires exactly 2 input columns " + f"(PDF and text), got {len(input_cols)}", + ) + + pdf_col, text_col = input_cols + + # Validate input columns exist + if pdf_col not in dataset.columns: + raise ValueError( + f"PDF input column '{pdf_col}' is not present in the DataFrame.", + ) + + if text_col not in dataset.columns: + raise ValueError( + f"Text input column '{text_col}' is not present in the DataFrame.", + ) + + pdf_column = dataset[pdf_col] + text_column = dataset[text_col] + + return dataset.withColumn( + output_col, + udf(self.transform_udf, PdfDocument.get_schema())(pdf_column, text_column), + ) diff --git 
a/scaledp/pdf/PdfAssembler.py b/scaledp/pdf/PdfAssembler.py new file mode 100644 index 0000000..55fea7d --- /dev/null +++ b/scaledp/pdf/PdfAssembler.py @@ -0,0 +1,218 @@ +import logging +import traceback +from types import MappingProxyType +from typing import Any, List + +import fitz +import pandas as pd +from pyspark import keyword_only +from pyspark.ml import Transformer +from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable +from pyspark.sql.functions import udf + +from scaledp.params import ( + HasInputCol, + HasOutputCol, + Param, + Params, + TypeConverters, +) +from scaledp.schemas.PdfDocument import PdfDocument + + +class HasGroupByCol(Params): + """ + Mixin for param groupByCol: column name to group by. + """ + + groupByCol: "Param[str]" = Param( + Params._dummy(), + "groupByCol", + "column name to group by.", + typeConverter=TypeConverters.toString, + ) + + def __init__(self) -> None: + super(HasGroupByCol, self).__init__() + + def getGroupByCol(self) -> str: + """ + Gets the value of groupByCol or its default value. + """ + return self.getOrDefault(self.groupByCol) + + def setGroupByCol(self, value): + """ + Sets the value of :py:attr:`groupByCol`. + """ + return self._set(groupByCol=value) + + +class PdfAssembler( + Transformer, + HasInputCol, + HasOutputCol, + HasGroupByCol, + DefaultParamsReadable, + DefaultParamsWritable, +): + """ + Assembles single-page PDFs into a single PDF document. + + Takes a column containing single-page PDF documents, groups them by origin, + and creates a single PDF using PyMuPDF (fitz). 
+ """ + + DEFAULT_PARAMS = MappingProxyType( + { + "inputCol": "pdf", + "outputCol": "assembled_pdf", + "groupByCol": "path", + }, + ) + + @keyword_only + def __init__(self, **kwargs: Any) -> None: + super(PdfAssembler, self).__init__() + self._setDefault(**self.DEFAULT_PARAMS) + self._set(**kwargs) + + def convert_to_pdf(self, pdfs: List[PdfDocument]) -> PdfDocument: + """ + Convert a list of single-page PDF documents into a single PDF. + + Args: + pdfs: List of PdfDocument objects representing single pages + + Returns: + PdfDocument: A single PDF document containing all pages + """ + try: + if not pdfs or len(pdfs) == 0: + return PdfDocument( + path="", + data=bytes(), + exception="No PDFs to assemble", + ) + + # Filter out invalid PDFs and sort by page if available + valid_pdfs = [] + for pdf_page in pdfs: + if ( + pdf_page.data is not None + and len(pdf_page.data) > 0 + and pdf_page.exception == "" + ): + valid_pdfs.append(pdf_page) + + if not valid_pdfs: + return PdfDocument( + path=pdfs[0].path if pdfs else "", + data=bytes(), + exception="No valid PDFs to assemble", + ) + + # Create new PDF document + pdf = fitz.open() + + for pdf_page in valid_pdfs: + try: + # Open the single page PDF + page_doc = fitz.open("pdf", pdf_page.data) + # Insert all pages from this document (should be just one) + pdf.insert_pdf(page_doc) + page_doc.close() + except Exception as e: + logging.warning(f"Failed to insert page from {pdf_page.path}: {e}") + continue + + # Write the assembled PDF to bytes + pdf_bytes = pdf.write() + pdf.close() + + return PdfDocument( + path=valid_pdfs[0].path, + data=pdf_bytes, + exception="", + ) + + except Exception: + exception = traceback.format_exc() + exception = f"PdfAssembler: {exception}" + logging.warning(exception) + return PdfDocument( + path=pdfs[0].path if pdfs and len(pdfs) > 0 else "", + data=bytes(), + exception=exception, + ) + + def _transform(self, dataset): + """ + Transform the dataset by grouping single-page PDFs and assembling 
them. + """ + output_col = self.getOutputCol() + input_col = self.getInputCol() + group_by_col = self.getGroupByCol() + + if input_col not in dataset.columns: + raise ValueError( + f"Input column '{input_col}' is not present in the DataFrame.", + ) + + if group_by_col not in dataset.columns: + raise ValueError( + f"Group by column '{group_by_col}' is not present in the DataFrame.", + ) + + # Check if we're working with pandas DataFrame (PandasPipeline) + if isinstance(dataset, pd.DataFrame): + # Pandas DataFrame approach + # Sort by path and page number if available + sort_columns = [] + if "path" in dataset.columns: + sort_columns.append("path") + if "page_number" in dataset.columns: + sort_columns.append("page_number") + + if sort_columns: + dataset = dataset.sort_values(sort_columns).reset_index(drop=True) + + # Group by the specified column and collect PDFs for each group + grouped = dataset.groupby(group_by_col)[input_col].apply(list).reset_index() + grouped.columns = [group_by_col, "pdfs"] + + # Apply the conversion function to each group + assembled_pdfs = [] + for _, row in grouped.iterrows(): + assembled_pdf = self.convert_to_pdf(row["pdfs"]) + assembled_pdfs.append(assembled_pdf) + + # Create result DataFrame + result = grouped[[group_by_col]].copy() + result[output_col] = assembled_pdfs + + else: + # Spark DataFrame approach (original implementation) + # Sort by path and page number if available, then group by the specified column + sorted_dataset = dataset + if "path" in dataset.columns: + sorted_dataset = dataset.orderBy("path") + if "page_number" in dataset.columns: + sorted_dataset = sorted_dataset.orderBy("page_number") + + # Group by the specified column and collect all PDFs for each group + from pyspark.sql.functions import collect_list + + grouped_dataset = sorted_dataset.groupBy(dataset[group_by_col]).agg( + collect_list(dataset[input_col]).alias("pdfs"), + ) + + # Apply the UDF to assemble PDFs for each group + result_dataset = 
grouped_dataset.withColumn( + output_col, + udf(self.convert_to_pdf, PdfDocument.get_schema())("pdfs"), + ) + + return result_dataset + + return result diff --git a/scaledp/pdf/PdfDataToImage.py b/scaledp/pdf/PdfDataToImage.py index 12841ef..c087165 100644 --- a/scaledp/pdf/PdfDataToImage.py +++ b/scaledp/pdf/PdfDataToImage.py @@ -8,7 +8,7 @@ from pyspark.ml import Transformer from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable from pyspark.pandas import DataFrame -from pyspark.sql.functions import udf +from pyspark.sql.functions import lit, udf from pyspark.sql.types import ArrayType, Row from scaledp.enums import ImageType @@ -110,7 +110,10 @@ def transform_udf(self, input: Row, path: Row) -> list[Image]: def _transform(self, dataset: DataFrame) -> DataFrame: out_col = self.getOutputCol() input_col = self._validate(self.getInputCol(), dataset) - path_col = dataset[self.getPathCol()] + try: + path_col = self._validate(self.getPathCol(), dataset) + except Exception: + path_col = lit("memory") df_1 = dataset.withColumn( "temp_data", diff --git a/scaledp/pdf/__init__.py b/scaledp/pdf/__init__.py index 8b13789..46f8837 100644 --- a/scaledp/pdf/__init__.py +++ b/scaledp/pdf/__init__.py @@ -1 +1,17 @@ +from scaledp.pdf.PdfAddTextLayer import PdfAddTextLayer +from scaledp.pdf.PdfAssembler import PdfAssembler +from scaledp.pdf.PdfDataToDocument import PdfDataToDocument +from scaledp.pdf.PdfDataToImage import PdfDataToImage +from scaledp.pdf.PdfDataToSingleImage import PdfDataToSingleImage +from scaledp.pdf.PdfDataToText import PdfDataToText +from scaledp.pdf.SingleImageToPdf import SingleImageToPdf +__all__ = [ + "PdfDataToImage", + "PdfDataToSingleImage", + "PdfAddTextLayer", + "PdfAssembler", + "PdfDataToDocument", + "PdfDataToText", + "SingleImageToPdf", +] diff --git a/scaledp/pipeline/PandasPipeline.py b/scaledp/pipeline/PandasPipeline.py index 974abb1..f76f62a 100644 --- a/scaledp/pipeline/PandasPipeline.py +++ 
b/scaledp/pipeline/PandasPipeline.py @@ -149,7 +149,7 @@ def __init__(self, stages) -> None: self.setStages(stages) def fromFile(self, filename: str) -> Any: - with Path.open(filename, "rb") as f: + with Path(filename).open("rb") as f: data = f.read() data = DatasetPd({"content": [data], "path": [filename], "resolution": [0]}) diff --git a/scaledp/utils/show_utils.py b/scaledp/utils/show_utils.py index aeb9dac..bcda3dc 100644 --- a/scaledp/utils/show_utils.py +++ b/scaledp/utils/show_utils.py @@ -184,6 +184,9 @@ def show_pdf( if column_type == "binary": df = PdfDataToImage(inputCol=column).transform(df) column = "image" + elif "struct" in column_type and "data" in column_type: + df = PdfDataToImage(inputCol="data").transform(df.select(f"{column}.data")) + column = "image" else: raise ValueError("Column must be binary") for id_, row in enumerate(df.limit(limit).select(column).collect()): diff --git a/tests/conftest.py b/tests/conftest.py index 70440f3..4f69427 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ def image_file(resource_path_root): @pytest.fixture def image_rotated_text_file(resource_path_root): - return (resource_path_root / "images/RotatedText1.png").absolute().as_posix() + return (resource_path_root / "images/RotatedText.png").absolute().as_posix() @pytest.fixture @@ -115,15 +115,22 @@ def image_pdf_df(spark_session, resource_path_root): @pytest.fixture -def signatures_pdf_df(spark_session, resource_path_root): +def signatures_pdf_file(resource_path_root): + return ( + (resource_path_root / "pdfs" / "SampleWithSignatures.pdf").absolute().as_posix() + ) + + +@pytest.fixture +def signatures_pdf_df(spark_session, signatures_pdf_file): return spark_session.read.format("binaryFile").load( - (resource_path_root / "pdfs" / "signatures.pdf").absolute().as_posix(), + signatures_pdf_file, ) @pytest.fixture -def signatures_pdf_file(spark_session, resource_path_root): - return (resource_path_root / "pdfs" / 
"signatures.pdf").absolute().as_posix() +def face_pdf_file(spark_session, resource_path_root): + return (resource_path_root / "pdfs" / "SampleWithFace.pdf").absolute().as_posix() @pytest.fixture @@ -190,6 +197,17 @@ def image_signature_df(spark_session, resource_path_root): return bin_to_image.transform(df) +@pytest.fixture +def image_face_df(spark_session, resource_path_root): + df = spark_session.read.format("binaryFile").load( + (resource_path_root / "images" / "document_with_face.png") + .absolute() + .as_posix(), + ) + bin_to_image = DataToImage().setImageType(ImageType.WEBP.value) + return bin_to_image.transform(df) + + @pytest.fixture def receipt_json(receipt_json_path: Path) -> Path: return receipt_json_path.open("r").read() diff --git a/tests/image/test_data_to_image.py b/tests/image/test_data_to_image.py index 0ef8756..8aca14a 100644 --- a/tests/image/test_data_to_image.py +++ b/tests/image/test_data_to_image.py @@ -12,6 +12,33 @@ def test_data_to_image(raw_image_df): assert result[0].image.path == result[0].path assert result[0].image.exception == "" + to_image = DataToImage() + result = to_image.transform(raw_image_df) + result1 = ( + DataToImage(inputCol="image.data", outputCol="image1") + .transform(result) + .collect() + ) + + assert len(result1) == 1 + # present image field + assert hasattr(result1[0], "image") + # image has right path field + assert result1[0].image.path == result1[0].path + assert result1[0].image.exception == "" + + +def test_data_to_image_without_path(raw_image_df): + to_image = DataToImage() + result = to_image.transform(raw_image_df.drop("path")).collect() + + assert len(result) == 1 + # present image field + assert hasattr(result[0], "image") + # image has right path field + assert result[0].image.path == "memory" + assert result[0].image.exception == "" + def test_wrong_data_to_image(binary_pdf_df): to_image = DataToImage() diff --git a/tests/models/detectors/test_face_detector.py 
b/tests/models/detectors/test_face_detector.py new file mode 100644 index 0000000..66f9867 --- /dev/null +++ b/tests/models/detectors/test_face_detector.py @@ -0,0 +1,90 @@ +import tempfile + +from pyspark.ml import PipelineModel + +from scaledp import ImageDrawBoxes, PdfDataToImage +from scaledp.enums import Device +from scaledp.models.detectors.FaceDetector import FaceDetector +from scaledp.pipeline.PandasPipeline import PandasPipeline + + +def test_face_detector(image_face_df): + + detector = FaceDetector( + device=Device.CPU, + keepInputData=True, + partitionMap=True, + numPartitions=0, + scoreThreshold=0.25, + task="detect", + padding=20, + ) + + draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "boxes"], + filled=False, + color="green", + lineWidth=5, + displayDataList=[], + ) + # Transform the image dataframe through the OCR stage + pipeline = PipelineModel(stages=[detector, draw]) + result = pipeline.transform(image_face_df) + + data = result.select("image_with_boxes", "boxes").collect() + + # Verify the pipeline result + assert len(data) == 1, "Expected exactly one result" + + # # Check that exceptions is empty + assert data[0].boxes.exception == "" + + # Save the output image to a temporary file for verification + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp: + temp.write(data[0].image_with_boxes.data) + temp.close() + + # Print the path to the temporary file + print("file://" + temp.name) + + +def test_face_pdf_detector_pandas(face_pdf_file, patch_spark): + + pdf = PdfDataToImage( + inputCol="content", + outputCol="image", + pageLimit=1, + ) + + detector = FaceDetector( + device=Device.CPU, + keepInputData=True, + partitionMap=False, + numPartitions=0, + scoreThreshold=0.25, + task="detect", + ) + + draw = ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "boxes"], + filled=False, + color="green", + lineWidth=5, + displayDataList=["score", "angle"], + ) + # Transform the image dataframe through the OCR 
stage + pipeline = PandasPipeline(stages=[pdf, detector, draw]) + data = pipeline.fromFile(face_pdf_file) + + # Verify the pipeline result + assert len(data) == 1, "Expected exactly one result" + + # Save the output image to a temporary file for verification + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp: + temp.write(data["image_with_boxes"][0].data) + temp.close() + + # Print the path to the temporary file + print("file://" + temp.name) diff --git a/tests/models/detectors/test_layout_detector.py b/tests/models/detectors/test_layout_detector.py index 8b982d6..243e260 100644 --- a/tests/models/detectors/test_layout_detector.py +++ b/tests/models/detectors/test_layout_detector.py @@ -36,16 +36,6 @@ def layout_detector(): ) -def test_layout_detector_initialization(layout_detector): - """Test that LayoutDetector initializes correctly.""" - assert layout_detector.getInputCol() == "image" - assert layout_detector.getOutputCol() == "layout_boxes" - assert layout_detector.getScoreThreshold() == 0.5 - assert layout_detector.getDevice() == Device.CPU - assert layout_detector.getWhiteList() == [] - assert layout_detector.getModel() == "PP-DocLayout_plus-L" - - def test_layout_detector_with_drawn_boxes(image_df): """Test LayoutDetector with drawn boxes on the original image.""" detector = LayoutDetector( @@ -114,15 +104,3 @@ def test_layout_detector_with_custom_layout_types(): assert detector.getWhiteList() == ["text", "table"] assert detector.getModel() == "PP-DocLayout-M" - - -def test_layout_detector_output_schema(layout_detector): - """Test that the output schema is correct.""" - schema = layout_detector.outputSchema() - - # Check that the schema has the expected fields - field_names = [field.name for field in schema.fields] - expected_fields = ["path", "type", "bboxes", "exception"] - - for field in expected_fields: - assert field in field_names diff --git a/tests/models/detectors/test_signature_detector.py 
b/tests/models/detectors/test_signature_detector.py index da3f966..9432c6b 100644 --- a/tests/models/detectors/test_signature_detector.py +++ b/tests/models/detectors/test_signature_detector.py @@ -1,7 +1,5 @@ import tempfile -import pyspark -from pipeline.PandasPipeline import PandasPipeline, pathSparkFunctions from pyspark.ml import PipelineModel from scaledp import ( @@ -10,7 +8,7 @@ SignatureDetector, ) from scaledp.enums import Device -from scaledp.pdf.PdfDataToSingleImage import PdfDataToSingleImage +from scaledp.pipeline.PandasPipeline import PandasPipeline def test_signature_detector(image_signature_df): @@ -20,14 +18,12 @@ def test_signature_detector(image_signature_df): keepInputData=True, partitionMap=True, numPartitions=0, - scoreThreshold=0.25, task="detect", - model="/home/mykola/PycharmProjects/scaledp-models/detection/document/signature/detector_yolo_1cls.onnx", ) draw = ImageDrawBoxes( keepInputData=True, - inputCols=["image", "boxes"], + inputCols=["image", "signatures"], filled=False, color="green", lineWidth=5, @@ -43,7 +39,7 @@ def test_signature_detector(image_signature_df): assert len(data) == 1, "Expected exactly one result" # # Check that exceptions is empty - assert data[0].boxes.exception == "" + assert data[0].signatures.exception == "" # Save the output image to a temporary file for verification with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp: @@ -56,28 +52,23 @@ def test_signature_detector(image_signature_df): def test_signature_pdf_detector(signatures_pdf_df): - pdf = PdfDataToSingleImage(outputCol="image", keepInputData=True) - - detector = SignatureDetector( - device=Device.CPU, - keepInputData=True, - partitionMap=False, - numPartitions=0, - scoreThreshold=0.25, - task="detect", - model="/home/mykola/PycharmProjects/scaledp-models/detection/document/signature/detector_yolo_1cls.onnx", + pipeline = PipelineModel( + stages=[ + PdfDataToImage(outputCol="image"), + SignatureDetector( + device=Device.CPU, + 
keepInputData=True, + outputCol="signatures", + scoreThreshold=0.20, + ), + ImageDrawBoxes( + keepInputData=True, + inputCols=["image", "signatures"], + filled=True, + color="black", + ), + ], ) - - draw = ImageDrawBoxes( - keepInputData=True, - inputCols=["image", "boxes"], - filled=False, - color="green", - lineWidth=5, - displayDataList=["score", "angle"], - ) - # Transform the image dataframe through the OCR stage - pipeline = PipelineModel(stages=[pdf, detector, draw]) result = pipeline.transform(signatures_pdf_df) data = result.collect() @@ -86,7 +77,7 @@ def test_signature_pdf_detector(signatures_pdf_df): assert len(data) == 1, "Expected exactly one result" # # Check that exceptions is empty - assert data[0].boxes.exception == "" + assert data[0].signatures.exception == "" # Save the output image to a temporary file for verification with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp: @@ -97,11 +88,7 @@ def test_signature_pdf_detector(signatures_pdf_df): print("file://" + temp.name) -def test_signature_pdf_detector_pandas(signatures_pdf_file): - pathSparkFunctions(pyspark) - - # pdf = PdfDataToSingleImage(inputCol="content", outputCol="image", - # keepInputData=True) +def test_signature_pdf_detector_pandas(signatures_pdf_file, patch_spark): pdf = PdfDataToImage( inputCol="content", @@ -121,7 +108,7 @@ def test_signature_pdf_detector_pandas(signatures_pdf_file): draw = ImageDrawBoxes( keepInputData=True, - inputCols=["image", "boxes"], + inputCols=["image", "signatures"], filled=False, color="green", lineWidth=5, diff --git a/tests/pdf/test_single_image_to_pdf.py b/tests/pdf/test_single_image_to_pdf.py index 2ce87da..b6e5523 100644 --- a/tests/pdf/test_single_image_to_pdf.py +++ b/tests/pdf/test_single_image_to_pdf.py @@ -41,3 +41,23 @@ def test_image_to_pdf(image_df): # Write PDF data to temporary file temp.write(result[0].pdf.data) + + +def test_image_to_pdf_show(image_df): + """ + Test function to convert image DataFrame to PDF format. 
+ + Args: + image_df: DataFrame containing image data to be converted + + Side Effects: + - Creates a temporary PDF file with the converted image + - Prints the local file path of the created PDF + """ + # Initialize the PDF converter + image_to_pdf = SingleImageToPdf() + + # Transform the image DataFrame to PDF format and cache for performance + result_df = image_to_pdf.transform(image_df).cache() + + result_df.show_pdf(column="pdf", limit=1) diff --git a/tests/pytest.ini b/tests/pytest.ini index 9e1fe95..7284a33 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -5,3 +5,4 @@ spark_options = addopts = --log-cli-level=INFO -s env = PYARROW_IGNORE_TIMEZONE = 1 + OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES diff --git a/tests/testresources/images/document_with_face.png b/tests/testresources/images/document_with_face.png new file mode 100644 index 0000000..2d3f77e Binary files /dev/null and b/tests/testresources/images/document_with_face.png differ diff --git a/tests/testresources/pdfs/SampleWithFace.pdf b/tests/testresources/pdfs/SampleWithFace.pdf new file mode 100644 index 0000000..56e359e Binary files /dev/null and b/tests/testresources/pdfs/SampleWithFace.pdf differ diff --git a/tests/testresources/pdfs/SampleWithQRCode.pdf b/tests/testresources/pdfs/SampleWithQRCode.pdf new file mode 100644 index 0000000..98db21f Binary files /dev/null and b/tests/testresources/pdfs/SampleWithQRCode.pdf differ diff --git a/tests/testresources/pdfs/SampleWithRotatedText.pdf b/tests/testresources/pdfs/SampleWithRotatedText.pdf new file mode 100644 index 0000000..ab7b0b3 Binary files /dev/null and b/tests/testresources/pdfs/SampleWithRotatedText.pdf differ diff --git a/tests/testresources/pdfs/signatures.pdf b/tests/testresources/pdfs/SampleWithSignatures.pdf similarity index 91% rename from tests/testresources/pdfs/signatures.pdf rename to tests/testresources/pdfs/SampleWithSignatures.pdf index f59c38e..669ded0 100644 Binary files a/tests/testresources/pdfs/signatures.pdf and 
b/tests/testresources/pdfs/SampleWithSignatures.pdf differ