diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..0275714 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,29 @@ +name: tox + +on: + release: + types: + - published + +jobs: + publish: + name: Publish to PyPI registry + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/pdfss + permissions: + id-token: write + steps: + - name: Switch to using Python 3.11 by default + uses: actions/setup-python@v2 + with: + python-version: 3.11 + - name: Install build + run: python3 -m pip install --user build + - name: Check out src from Git + uses: actions/checkout@v2 + - name: Build dists + run: python3 -m build + - name: Publish to pypi.org + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/tox.yml b/.github/workflows/tox.yml index 5c0458f..95e50c0 100644 --- a/.github/workflows/tox.yml +++ b/.github/workflows/tox.yml @@ -1,18 +1,12 @@ name: tox on: - create: - tags: - - "**" push: branches: - - "main" + - "main" pull_request: - release: - types: - - published schedule: - - cron: 0 0 1 * * # every month + - cron: 0 0 1 * * # every month jobs: build: @@ -22,106 +16,34 @@ jobs: fail-fast: false matrix: include: - - tox_env: check-manifest - - tox_env: lint - - tox_env: py39 - - tox_env: py310 - - tox_env: py311 + - tox_env: lint + - tox_env: py39 + - tox_env: py310 + - tox_env: py311 steps: - - uses: actions/checkout@v2 - - name: Find python version - id: py_ver - shell: python - if: ${{ contains(matrix.tox_env, 'py') }} - run: | - v = '${{ matrix.tox_env }}'.split('-')[0].lstrip('py') - print('::set-output name=version::{}.{}'.format(v[0],v[1:])) - - name: Install a default Python - uses: actions/setup-python@v2 - if: ${{ ! contains(matrix.tox_env, 'py') }} - with: - python-version: 3.11 - - name: Set up Python version - uses: actions/setup-python@v2 - if: ${{ contains(matrix.tox_env, 'py') }} - with: - python-version: ${{ steps.py_ver.outputs.version }} - - name: Install tox - run: | - pip install tox - - name: Run tox -e ${{ matrix.tox_env }} - run: | - echo "${{ matrix.PREFIX }} tox -e ${{ matrix.tox_env }}" - ${{ matrix.PREFIX }} tox -e ${{ matrix.tox_env }} - - publish: - name: Publish to PyPI registry - needs: - - build - runs-on: ubuntu-latest - - env: - PY_COLORS: 1 - - steps: - - name: Switch to using Python 3.11 by default - uses: actions/setup-python@v2 - with: - python-version: 3.11 - - name: Install build - run: python3 -m pip install --user build - - name: Check out src from Git - uses: actions/checkout@v2 - with: - # Get shallow Git history (default) for release events - # but have a complete clone for any other workflows. - # Both options fetch tags but since we're going to remove - # one from HEAD in non-create-tag workflows, we need full - # history for them. - fetch-depth: >- - ${{ - ( - ( - github.event_name == 'create' && - github.event.ref_type == 'tag' - ) || - github.event_name == 'release' - ) && - 1 || 0 - }} - - name: Drop Git tags from HEAD for non-tag-create and non-release events - if: >- - ( - github.event_name != 'create' || - github.event.ref_type != 'tag' - ) && - github.event_name != 'release' - run: >- - git tag --points-at HEAD - | - xargs git tag --delete - - name: Build dists - run: python3 -m build - - name: Publish to test.pypi.org - if: >- - ( - github.event_name == 'push' && - github.ref == format( - 'refs/heads/{0}', github.event.repository.default_branch - ) - ) || - ( - github.event_name == 'create' && - github.event.ref_type == 'tag' - ) - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.testpypi_password }} - repository_url: https://test.pypi.org/legacy/ - - name: Publish to pypi.org - if: >- # "create" workflows run separately from "push" & "pull_request" - github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.pypi_password }} + - uses: actions/checkout@v2 + - name: Find python version + id: py_ver + shell: python + if: ${{ contains(matrix.tox_env, 'py') }} + run: | + v = '${{ matrix.tox_env }}'.split('-')[0].lstrip('py') + print('::set-output name=version::{0}.{1}'.format(v[0],v[1:])) + - name: Install a default Python + uses: actions/setup-python@v2 + if: ${{ ! contains(matrix.tox_env, 'py') }} + with: + python-version: 3.11 + - name: Set up Python version + uses: actions/setup-python@v2 + if: ${{ contains(matrix.tox_env, 'py') }} + with: + python-version: ${{ steps.py_ver.outputs.version }} + - name: Install tox + run: | + pip install tox + - name: Run tox -e ${{ matrix.tox_env }} + run: | + echo "${{ matrix.PREFIX }} tox -e ${{ matrix.tox_env }}" + ${{ matrix.PREFIX }} tox -e ${{ matrix.tox_env }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4555099 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +# +# Should more or less follow lint and typing settings as found is tox.ini +# +# Once pre-commit package is installed in your environnement, install hooks +# with `pre-commit install` +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-added-large-files +- repo: local + hooks: + - id: ruff-check + name: ruff-check + entry: ruff check + language: system + types: [python] + - id: ruff-format + name: ruff + entry: ruff format --check --diff + language: system + types: [python] diff --git a/COPYING b/COPYING index ecbd12b..36caec9 100644 --- a/COPYING +++ b/COPYING @@ -224,4 +224,3 @@ The hypothetical commands `show w' and `show c' should show the appropriate part You should also get your employer (if you work as a programmer) or school, if any, to sign a “copyright disclaimer” for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . - diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5de8860..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,13 +0,0 @@ -include .flake8 -include requirements.txt -include pytest.ini -include mypy.ini -include tox.ini -include COPYING -include pdfss/py.typed -include pdfss/*.py - -recursive-include test *.py *.yml -recursive-include test/data *.pdf - -prune .tox diff --git a/mypy.ini b/mypy.ini index b63cb2d..e393049 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,9 @@ [mypy] -files = pdfss +files = . +exclude = test/data show_error_codes = true +warn_unused_ignores = true +strict = true [mypy-pdfminer.*] ignore_missing_imports = True diff --git a/pdfss/__init__.py b/pdfss/__init__.py index 2071756..453cc10 100644 --- a/pdfss/__init__.py +++ b/pdfss/__init__.py @@ -63,9 +63,9 @@ .. autofunction:: py_dump .. autofunction:: dump_pdf_structure -""" # noqa +""" -from __future__ import generator_stop +from __future__ import annotations import logging import re @@ -73,11 +73,12 @@ import zlib from bisect import bisect from collections import defaultdict +from collections.abc import Callable, Iterable, Iterator from dataclasses import dataclass from datetime import date from functools import partial from io import BytesIO, TextIOWrapper -from typing import IO, List, Optional, Tuple, Union +from typing import IO, Any, BinaryIO from pdfminer import settings from pdfminer.converter import PDFPageAggregator @@ -86,10 +87,12 @@ LAParams, LTAnno, LTChar, + LTComponent, LTContainer, LTCurve, LTFigure, LTImage, + LTItem, LTLine, LTPage, LTRect, @@ -99,6 +102,7 @@ ) from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage +from typing_extensions import Self settings.STRICT = True @@ -118,14 +122,14 @@ orig_decompress = zlib.decompress -def hacked_decompress(data): +def hacked_decompress(data: bytes) -> bytes: try: return orig_decompress(data) except zlib.error: return decompress_corrupted(data) -def decompress_corrupted(data): +def decompress_corrupted(data: bytes) -> bytes: d = zlib.decompressobj(zlib.MAX_WBITS | 32) f = BytesIO(data) result_str = b"" @@ -156,7 +160,9 @@ def pdf2text(stream: IO[bytes]) -> TextIOWrapper: return TextIOWrapper(bytes_stream, "utf-8") -def iter_pdf_ltpages(stream, pages=None): +def iter_pdf_ltpages( + stream: BinaryIO, pages: list[int] | None = None +) -> Iterator[LTPage]: """Return a generator on :class:!`pdfminer.layout.LTPage` of each page in the given PDF `stream`. @@ -212,7 +218,7 @@ def c_amount_float(value: str) -> float: return round(c_str_float(value) * factor, 6) -def c_amount_float_unit(value: str) -> Tuple[float, str]: +def c_amount_float_unit(value: str) -> tuple[float, str]: """ >>> c_amount_float_unit('25 028,80 €/mois') (25028.8, 'mois') @@ -221,7 +227,7 @@ def c_amount_float_unit(value: str) -> Tuple[float, str]: return (c_amount_float(amount_str), unit.strip()) -def c_percent_float(value: str) -> Union[int, float]: +def c_percent_float(value: str) -> int | float: """ >>> c_percent_float('20,00%') 20.0 @@ -229,7 +235,7 @@ def c_percent_float(value: str) -> Union[int, float]: return c_str_float(value.replace("%", "")) -def c_str_period(value: str) -> Tuple[date, date]: +def c_str_period(value: str) -> tuple[date, date]: """ >>> c_str_period('du 01/05/2018 au 31/05/2018') (datetime.date(2018, 5, 1), datetime.date(2018, 5, 31)) @@ -239,7 +245,7 @@ def c_str_period(value: str) -> Tuple[date, date]: return (c_dmy_date(from_date_str), c_dmy_date(to_date_str)) -def c_str_float_unit(value: str) -> Tuple[Union[int, float], str]: +def c_str_float_unit(value: str) -> tuple[int | float, str]: """ >>> c_str_float_unit('25 028 kWh') (25028, 'kWh') @@ -250,7 +256,7 @@ def c_str_float_unit(value: str) -> Tuple[Union[int, float], str]: return c_str_float(float_str.strip()), unit.strip() -def c_str_float(value: str) -> Union[int, float]: +def c_str_float(value: str) -> int | float: """ >>> c_str_float('25 028,80') 25028.8 @@ -288,9 +294,9 @@ class LineInfo: def default_line_grouper( - font_size_diff_factor=0.15, - min_y_diff=1.1, -): + font_size_diff_factor: float = 0.15, + min_y_diff: float = 1.1, +) -> Callable[[LineInfo, LineInfo], bool]: """Return a line grouper function suitable for `group_line` argument of :func:`relayout`, configured with arguments. @@ -306,7 +312,7 @@ def default_line_grouper( """ - def default_group_line(linfo, latest_linfo): + def default_group_line(linfo: LineInfo, latest_linfo: LineInfo) -> bool: """Default line grouping function, merging lines if font size are compatible and Y coordinate diff is below some factor of font size, considering bold font variant. @@ -319,7 +325,7 @@ def default_group_line(linfo, latest_linfo): if ( linfo.font_name.endswith("-bold") and not latest_linfo.font_name.endswith("-bold") - ) or ( # noqa + ) or ( latest_linfo.font_name.endswith("-bold") and not linfo.font_name.endswith("-bold") ): @@ -329,15 +335,14 @@ def default_group_line(linfo, latest_linfo): # take care allowed_y_diff may be 0, 1.1 found empirically allowed_y_diff = max(allowed_y_diff, min_y_diff) - if diff < allowed_diff and (latest_linfo.y0 - linfo.y0) <= allowed_y_diff: - return True - - return False + return diff < allowed_diff and (latest_linfo.y0 - linfo.y0) <= allowed_y_diff return default_group_line -def default_text_merger(width_factor=1.4): +def default_text_merger( + width_factor: float = 1.4, +) -> Callable[[TextBlock, LTChar], bool]: """Return a text merger function suitable for `merge_text` argument of :func:`relayout`, configured with arguments. @@ -347,17 +352,16 @@ def default_text_merger(width_factor=1.4): """ - def default_merge_text(block, ltchar): + def default_merge_text(block: TextBlock, ltchar: LTChar) -> bool: width = ltchar.width * width_factor - if (ltchar.x0 - block.x1) <= width: - return True - - return False + return (ltchar.x0 - block.x1) <= width return default_merge_text -def default_iter_text(ltobj, skip_classes=None): +def default_iter_text( + ltobj: LTComponent, skip_classes: tuple[type[LTComponent], ...] | None = None +) -> Iterator[LTChar | LTAnno]: if skip_classes is not None and isinstance(ltobj, skip_classes): return @@ -373,14 +377,16 @@ def default_iter_text(ltobj, skip_classes=None): def relayout( - ltobj, - skip_classes=DEFAULT_SKIP_CLASSES, - skip_text=None, - iter_text=default_iter_text, - ltchar_filter=None, - merge_text=default_text_merger(), # noqa - group_line=default_line_grouper(), # noqa -): + ltobj: LTPage, + skip_classes: tuple[type[LTComponent], ...] | None = DEFAULT_SKIP_CLASSES, + skip_text: set[str] | None = None, + iter_text: Callable[ + [LTComponent, tuple[type[LTComponent], ...] | None], Iterator[LTChar | LTAnno] + ] = default_iter_text, + ltchar_filter: Callable[[LTChar], bool] | None = None, + merge_text: Callable[[TextBlock, LTChar], bool] = default_text_merger(), # noqa + group_line: Callable[[LineInfo, LineInfo], bool] = default_line_grouper(), # noqa +) -> list[LinesGroup]: """Return a list of :class:LinesGroup for given PDFMiner `ltobj` instance. :param skip_classes: tuple of PDFMiner classes that should be skipped (not @@ -407,19 +413,23 @@ def relayout( """ - def iter_ltchar_index_items(items): + def iter_ltchar_index_items( + items: Iterable[tuple[float, list[LTChar]]], + ) -> Iterator[LTChar]: for _, ltchars in sorted(items): - for ltchar in ltchars: - yield ltchar + yield from ltchars # Collect ltchar instances - ltline_index = defaultdict(partial(defaultdict, list)) + ltline_index: dict[tuple[float, str, float], dict[float, list[LTChar]]] = ( + defaultdict(partial(defaultdict, list)) + ) latest_is_anno = False # Hack for page containing a figure wrapping content while we want it # skipped if ( len(ltobj._objs) == 1 and isinstance(ltobj._objs[0], LTFigure) + and skip_classes is not None and LTFigure in skip_classes ): ltobj._objs = ltobj._objs[0]._objs @@ -429,13 +439,13 @@ def iter_ltchar_index_items(items): continue # remember ltchar was preceeded by a LTAnno - lttext.add_space_left = latest_is_anno + lttext.add_space_left = latest_is_anno # type: ignore[attr-defined] latest_is_anno = False if ltchar_filter is not None and not ltchar_filter(lttext): continue - key = (lttext.y0, lttext.fontname.lower(), lttext.fontsize) + key = (lttext.y0, lttext.fontname.lower(), lttext.fontsize) # type: ignore[attr-defined] ltchar_index = ltline_index[key] ltchar_index[lttext.x0].append(lttext) @@ -471,7 +481,7 @@ def iter_ltchar_index_items(items): line.append(ltchar) # Search for column groups - group_index = {} + group_index: dict[float, list[LinesGroup]] = {} previous_line_group = None for line in lines: group = _line_group(line, group_index, previous_line_group) @@ -484,7 +494,11 @@ def iter_ltchar_index_items(items): ) -def _line_group(line, group_index, previous_line_group): +def _line_group( + line: Line, + group_index: dict[float, list[LinesGroup]], + previous_line_group: LinesGroup | None, +) -> LinesGroup: """Return :class:LinesGroup in which `line` should be added, given `group_index` (groups indexed per their x start index, i.e. {x0: [LinesGroup]}) and `previous_line_group` (the group in which line above the current one has @@ -512,12 +526,9 @@ def _line_group(line, group_index, previous_line_group): # create a new group if there are too much vertical spacing # between the previous line and the current line - if (group[-1].y0 - line.y0) > (line.font_size * 2): - group = LinesGroup() - group_index[start_index].append(group) - # or if previous line overlap x coordinate - elif ( - previous_line_group[-1].blocks[-1].x1 > line.blocks[0].x0 + if (group[-1].y0 - line.y0) > (line.font_size * 2) or ( + previous_line_group is not None + and previous_line_group[-1].blocks[-1].x1 > line.blocks[0].x0 and previous_line_group[-1].blocks[0].x0 < line.blocks[-1].x1 ): group = LinesGroup() @@ -526,15 +537,15 @@ def _line_group(line, group_index, previous_line_group): return group -def _dump_ltchar_index(ltchar_index): +def _dump_ltchar_index(ltchar_index: dict[float, list[LTChar]]) -> str: """Return string representation of :func:relayout ltchar_index data structure, for debugging purpose. """ - def ltchar_text(ltchar, i): + def ltchar_text(ltchar: LTChar, i: int) -> str: text = ltchar.get_text() - if i > 0 and ltchar.add_space_left: + if i > 0 and ltchar.add_space_left: # type: ignore[attr-defined] text = " " + text return text @@ -545,59 +556,63 @@ def ltchar_text(ltchar, i): ) -def _dump_ltline_index(ltline_index): +def _dump_ltline_index( + ltline_index: dict[tuple[float, str, float], dict[float, list[LTChar]]], +) -> str: """Return string representation of :func:relayout ltline_index data structure, for debugging purpose. """ res = [] for key, ltchar_index in sorted(ltline_index.items(), reverse=True): - res.append("{}: {}".format(key, _dump_ltchar_index(ltchar_index))) + res.append(f"{key}: {_dump_ltchar_index(ltchar_index)}") return "\n".join(res) -class LinesGroup(list): - """A list of :class:`Line` logically grouped.""" - - class Line: """A logical line, holding a list of text blocks.""" - def __init__(self, font_name, font_size, y0, merge_text): + def __init__( + self, + font_name: str, + font_size: float, + y0: float, + merge_text: Callable[[TextBlock, LTChar], bool], + ): self.font_name = font_name self.font_size = font_size # ordered list of ltchar.x0, use index to get matching ltline from # :attr:`blocks` - self._block_index = [] + self._block_index: list[float] = [] # slave list of block - self.blocks = [] + self.blocks: list[TextBlock] = [] self.y0 = y0 self.merge_text = merge_text - def __repr__(self): + def __repr__(self) -> str: blocks_str = [] for block in self.blocks: blocks_str.append(repr(block)) return "[{}: {}]".format(self.font_size, ", ".join(blocks_str)) - def __str__(self): + def __str__(self) -> str: blocks_str = [] for block in self.blocks: blocks_str.append(str(block)) return "[{}]".format(", ".join(blocks_str)) - def insert_blank_at(self, index): + def insert_blank_at(self, index: int) -> None: self.blocks.insert(0, TextBlock("", 0, 0, 0)) self._block_index.insert(0, 0) - def append(self, ltchar): + def append(self, ltchar: LTChar) -> None: if ltchar.width == 0: # some chars (picto) have width = 0, set it relative to font size # arbitrarily, it's still better than 0. 10 division factor was # found empirically. - assert ltchar.fontsize - ltchar.width = ltchar.fontsize / 10 + assert ltchar.fontsize # type: ignore[attr-defined] + ltchar.width = ltchar.fontsize / 10 # type: ignore[attr-defined] ltchar.x1 = ltchar.x0 + ltchar.width index = bisect(self._block_index, ltchar.x1) @@ -605,18 +620,22 @@ def append(self, ltchar): if index > 0 and self.merge_text(self.blocks[index - 1], ltchar): block = self.blocks[index - 1] text = ltchar.get_text() - if ltchar.add_space_left: + if ltchar.add_space_left: # type: ignore[attr-defined] text = " " + text - block.append(text, ltchar.x0, ltchar.x1, ltchar.fontsize) + block.append(text, ltchar.x0, ltchar.x1, ltchar.fontsize) # type: ignore[attr-defined] self._block_index[index - 1] = ltchar.x1 else: - block = TextBlock(ltchar.get_text(), ltchar.x0, ltchar.x1, ltchar.fontsize) + block = TextBlock(ltchar.get_text(), ltchar.x0, ltchar.x1, ltchar.fontsize) # type: ignore[attr-defined] self.blocks.insert(index, block) self._block_index.insert(index, ltchar.x1) assert len(self.blocks) == len(self._block_index) +class LinesGroup(list[Line]): + """A list of :class:`Line` logically grouped.""" + + class TextBlock: """A logical group of text. @@ -626,19 +645,19 @@ class TextBlock: :attr latest_x0: the left coordinate of the latest char in the block """ - def __init__(self, text, x0, x1, font_size): + def __init__(self, text: str, x0: float, x1: float, font_size: float): self.text = text self.x0 = x0 self.x1 = x1 self.latest_x0 = x0 - def __repr__(self): - return "<{!r} ({}, {})]>".format(self.text, self.x0, self.x1) + def __repr__(self) -> str: + return f"<{self.text!r} ({self.x0}, {self.x1})]>" - def __str__(self): - return "<{!r}>".format(self.text) + def __str__(self) -> str: + return f"<{self.text!r}>" - def append(self, text, x0, x1, font_size): + def append(self, text: str, x0: float, x1: float, font_size: float) -> None: assert self.x0 <= x0, (self.x0, x0, self.text, text) assert self.x1 <= x1, (self.x1, x1, self.text, text) self.x1 = x1 @@ -649,7 +668,9 @@ def append(self, text, x0, x1, font_size): # Dump PDF data structures ############################################# -def dump_pdf_structure(filepath, pages=None, file=sys.stdout): +def dump_pdf_structure( + filepath: str, pages: list[int] | None = None, file: IO[str] = sys.stdout +) -> None: """Print PDFMiner's structure extracted from the given PDF file, to help debugging or building scrapers. @@ -661,18 +682,23 @@ def dump_pdf_structure(filepath, pages=None, file=sys.stdout): """ with open(filepath, "rb") as stream: for i, page in enumerate(iter_pdf_ltpages(stream, pages=pages)): - print("{} page {}".format("*" * 80, i + 1)) + print("{} page {}".format("*" * 80, i + 1)) # noqa: T201 objstack = [("", o) for o in reversed(page._objs)] while objstack: prefix, b = objstack.pop() if type(b) in [LTTextBox, LTTextLine, LTTextBoxHorizontal]: print(prefix, b, file=file) - objstack += ((prefix + " ", o) for o in reversed(b._objs)) + objstack += ((prefix + " ", o) for o in reversed(b._objs)) # type: ignore[attr-defined] else: print(prefix, b, file=file) -def py_dump(filepath, out=sys.stdout, pages=None, skip_classes=DEFAULT_SKIP_CLASSES): +def py_dump( + filepath: str, + out: IO[str] = sys.stdout, + pages: list[int] | None = None, + skip_classes: tuple[type[LTComponent], ...] = DEFAULT_SKIP_CLASSES, +) -> None: """Dump PDF `filepath` file as an importable python structure in `out` stream. :param filepath: path to the PDF file. @@ -690,11 +716,16 @@ def py_dump(filepath, out=sys.stdout, pages=None, skip_classes=DEFAULT_SKIP_CLAS with open(filepath, "rb") as input_stream: for i, page in enumerate(iter_pdf_ltpages(input_stream, pages=pages)): - print("\npage{} = ".format(i + 1), file=out, end="") + print(f"\npage{i + 1} = ", file=out, end="") py_dump_ltobj(page, out=out, skip_classes=skip_classes) -def py_dump_ltobj(ltobj, out=sys.stdout, skip_classes=None, indent=0): +def py_dump_ltobj( + ltobj: LTComponent, + out: IO[str] = sys.stdout, + skip_classes: tuple[type[LTComponent], ...] | None = None, + indent: int = 0, +) -> None: """Dump PDFMiner `ltobj` object as an importable python structure in `out` stream. @@ -741,22 +772,27 @@ class ltobj: **You should not use this directly**. """ - def __init__(self, __class__, __dict__, objs=None): - self.__class__ = __class__ + def __init__( + self, + __class__: type[LTItem], + __dict__: dict[str, Any], + objs: list[Self] | None = None, + ): + self.__class__ = __class__ # type: ignore[assignment] self.__dict__ = __dict__ if objs is not None: self._objs = objs if "x0" in __dict__: # bbox necessary for repr() but not exported - self.bbox = (self.x0, self.y0, self.x1, self.y1) + self.bbox = (self.x0, self.y0, self.x1, self.y1) # type: ignore[attr-defined] -def _clean_ltobj_dict(__dict__): +def _clean_ltobj_dict(__dict__: dict[str, Any]) -> dict[str, Any]: """Return a dictionary from an ltobj's __dict__, removing entries that should not be exported and rounding float for better readability. """ - def round_value(v): + def round_value(v: Any) -> Any: if isinstance(v, float): return round(v, 2) if isinstance(v, tuple): @@ -770,9 +806,16 @@ def round_value(v): } -def _ltchar_record_fontsize_init(self, matrix, font, fontsize, *args, **kwargs): +def _ltchar_record_fontsize_init( + self: LTChar, + matrix: tuple[float, float, float, float, float, float], + font: Any, + fontsize: float, + *args: Any, + **kwargs: Any, +) -> None: ltchar_init(self, matrix, font, fontsize, *args, **kwargs) - self.fontsize = fontsize + self.fontsize = fontsize # type: ignore[attr-defined] ltchar_init = LTChar.__init__ @@ -782,7 +825,7 @@ def _ltchar_record_fontsize_init(self, matrix, font, fontsize, *args, **kwargs): ######################################################################## if __name__ == "__main__": - pages: Optional[List[int]] = None + pages: list[int] | None = None if len(sys.argv) >= 3: pages = [int(arg) for arg in sys.argv[2:]] py_dump(sys.argv[1], pages=pages) diff --git a/pyproject.toml b/pyproject.toml index 6939360..417dd30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3 :: Only", "Operating System :: OS Independent", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", @@ -22,6 +24,7 @@ classifiers = [ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ + "typing-extensions", "pdfminer.six", ] @@ -43,13 +46,3 @@ source = "vcs" [tool.hatch.version.raw-options] local_scheme = "no-local-version" - -[tool.black] -target-version = ['py39'] -include = '\.pyi?$' -exclude = '(\.tox|\.git)' - -[tool.isort] -profile = "black" -multi_line_output = 3 -known_first_party = ["pdfss"] diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..5559a51 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,37 @@ +target-version = "py39" + +[lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-debugger + "T10", + # flake8-logging + "G", + # flake8-comprehension + "C4", + # flake8-simplify + "SIM", + # flake8-print + "T20", + # individual rules + "RUF100", # unused-noqa + # imports + "I", +] + +# For error codes, see https://docs.astral.sh/ruff/rules/#error-e +ignore = [ +# line too long + "E501", +] + +[lint.isort] +known-first-party = ["pdfss"] +section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] diff --git a/test/data/broken_euro.py b/test/data/broken_euro.py index 54d6239..1fb608b 100644 --- a/test/data/broken_euro.py +++ b/test/data/broken_euro.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c1_10036338943_p1.py b/test/data/edf_c1_10036338943_p1.py index edc01d4..ccde39b 100644 --- a/test/data/edf_c1_10036338943_p1.py +++ b/test/data/edf_c1_10036338943_p1.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c1_10074973936_p5.py b/test/data/edf_c1_10074973936_p5.py index 9b3b98f..174de4d 100644 --- a/test/data/edf_c1_10074973936_p5.py +++ b/test/data/edf_c1_10074973936_p5.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c1_10080595767_p1.py b/test/data/edf_c1_10080595767_p1.py index ce73416..53499e7 100644 --- a/test/data/edf_c1_10080595767_p1.py +++ b/test/data/edf_c1_10080595767_p1.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c2_10067224248_p5.py b/test/data/edf_c2_10067224248_p5.py index a07c5ed..7f99254 100644 --- a/test/data/edf_c2_10067224248_p5.py +++ b/test/data/edf_c2_10067224248_p5.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c2_10073292263_p1.py b/test/data/edf_c2_10073292263_p1.py index df2fbcd..5a40a89 100644 --- a/test/data/edf_c2_10073292263_p1.py +++ b/test/data/edf_c2_10073292263_p1.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c2_10073292263_p18.py b/test/data/edf_c2_10073292263_p18.py index 4353048..ad12d89 100644 --- a/test/data/edf_c2_10073292263_p18.py +++ b/test/data/edf_c2_10073292263_p18.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c2_10073292263_p27.py b/test/data/edf_c2_10073292263_p27.py index d4db9ce..fd224f7 100644 --- a/test/data/edf_c2_10073292263_p27.py +++ b/test/data/edf_c2_10073292263_p27.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/edf_c2_10073292263_p30.py b/test/data/edf_c2_10073292263_p30.py index 6d9a565..f506e6a 100644 --- a/test/data/edf_c2_10073292263_p30.py +++ b/test/data/edf_c2_10073292263_p30.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/text_merge1.py b/test/data/text_merge1.py index 7c97785..053ece8 100644 --- a/test/data/text_merge1.py +++ b/test/data/text_merge1.py @@ -1,6 +1,12 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import ( + LTAnno, + LTChar, + LTPage, + LTTextBoxHorizontal, + LTTextLineHorizontal, +) +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/data/text_merge2.py b/test/data/text_merge2.py index 4ac484c..627afcf 100644 --- a/test/data/text_merge2.py +++ b/test/data/text_merge2.py @@ -1,6 +1,6 @@ -from pdfminer.layout import * -from pdfss import ltobj +from pdfminer.layout import LTAnno, LTChar, LTPage, LTTextLineHorizontal +from pdfss import ltobj page1 = ltobj( LTPage, diff --git a/test/test_pdfss.py b/test/test_pdfss.py index d981ad4..2aa2c9f 100644 --- a/test/test_pdfss.py +++ b/test/test_pdfss.py @@ -1,26 +1,28 @@ +import unittest from io import StringIO from os.path import abspath, dirname, join -import unittest +from typing import Any -import pdfss +from pdfminer.layout import LTChar +import pdfss HERE = abspath(dirname(__file__)) -def datafile(*filename): +def datafile(*filename: str) -> str: return join(HERE, "data", *filename) -def read_py(code): - exec_globals = {} - exec_locals = {} +def read_py(code: str) -> Any: + exec_globals: dict[str, Any] = {} + exec_locals: dict[str, Any] = {} exec(code, exec_globals, exec_locals) return exec_locals class PDF2TextTC(unittest.TestCase): - def test(self): + def test(self) -> None: filepath = datafile("Lentilles.pdf") with open(filepath, "rb") as stream: text_stream = pdfss.pdf2text(stream) @@ -32,16 +34,13 @@ def test(self): ) -def _relayout(filename): - def ltchar_filter(ltchar): +def _relayout(filename: str) -> list[list[list[str]]]: + def ltchar_filter(ltchar: LTChar) -> bool: if ltchar.x0 < 12: return False if ltchar.fontname == "PictosSIMM": return False - if ltchar.fontsize > 100: - return False - - return True + return not ltchar.fontsize > 100 # type: ignore[attr-defined] filepath = datafile(filename) with open(filepath) as stream: @@ -71,7 +70,7 @@ def ltchar_filter(ltchar): "5,50 %", }, ): - group_result = [] + group_result: list[list[str]] = [] result.append(group_result) for line in group: @@ -83,7 +82,7 @@ def ltchar_filter(ltchar): class RelayoutTC(unittest.TestCase): maxDiff = None - def test_edf_c1_10074973936_p5(self): + def test_edf_c1_10074973936_p5(self) -> None: result = _relayout("edf_c1_10074973936_p5.py") self.assertEqual( result, @@ -91,10 +90,7 @@ def test_edf_c1_10074973936_p5(self): [["5 / 20"]], [ ["BALON OVALE"], - [ - "Détail de votre facturation par site du 02/04/2018 " - "n° 1007497" - ], + ["Détail de votre facturation par site du 02/04/2018 n° 1007497"], ["Données contrat", "Données Point de Livraison"], ["Contrat électricité Prix Fixe", "LE POULAYER 31560 NAILLOUX"], [ @@ -257,7 +253,7 @@ def test_edf_c1_10074973936_p5(self): "100", ], [ - "Acheminement : Tarif HTA5 à Pointe Fixe Longue " "Utilisation", + "Acheminement : Tarif HTA5 à Pointe Fixe Longue Utilisation", "Opérateur Heures pleines hiver", "100", ], @@ -277,7 +273,7 @@ def test_edf_c1_10074973936_p5(self): ], ) - def test_edf_c1_10080595767_p1(self): + def test_edf_c1_10080595767_p1(self) -> None: result = _relayout("edf_c1_10080595767_p1.py") self.assertEqual( result, @@ -341,7 +337,7 @@ def test_edf_c1_10080595767_p1(self): ], ) - def test_edf_c1_10036338943_p1(self): + def test_edf_c1_10036338943_p1(self) -> None: result = _relayout("edf_c1_10036338943_p1.py") self.assertEqual( result, @@ -387,7 +383,7 @@ def test_edf_c1_10036338943_p1(self): ], ) - def test_edf_c2_10073292263_p1(self): + def test_edf_c2_10073292263_p1(self) -> None: result = _relayout("edf_c2_10073292263_p1.py") self.assertEqual( result, @@ -435,7 +431,7 @@ def test_edf_c2_10073292263_p1(self): ["Montant TVA (payée sur les débits)", "112 070,63 €"], ["Facture TTC", "672 423,80 €"], ["Montant restant dû avant facture", "1 084 117,12 €"], - ["Des montants dûs antérieurs n'ont pas été totalement " "réglés."], + ["Des montants dûs antérieurs n'ont pas été totalement réglés."], ["Montant total à payer (TTC)", "1 756 540,92 €"], ["à régler avant le 28/03/2018"], [ @@ -458,12 +454,12 @@ def test_edf_c2_10073292263_p1(self): ["Paiement par Prélèvement automatique"], ["Vous serez prélevé d'un montant de", "1 756 540,92 €"], ["à partir du :", "28/03/2018"], - ["sur le compte bancaire : FR XX XXXXX XXXXX " "00002009999 XX"], + ["sur le compte bancaire : FR XX XXXXX XXXXX 00002009999 XX"], ], ], ) - def test_edf_c2_10073292263_p18(self): + def test_edf_c2_10073292263_p18(self) -> None: result = _relayout("edf_c2_10073292263_p18.py") self.assertEqual( result, @@ -493,7 +489,7 @@ def test_edf_c2_10073292263_p18(self): ], ["Pertes Joule : 1,000", "Opérateur Heures pleines hiver", "528"], [ - "Acheminement : Tarif HTA5 à Pointe Fixe Longue " "Utilisation", + "Acheminement : Tarif HTA5 à Pointe Fixe Longue Utilisation", "Opérateur Heures creuses hiver", "528", ], @@ -577,7 +573,7 @@ def test_edf_c2_10073292263_p18(self): ], ) - def test_edf_c2_10073292263_p30(self): + def test_edf_c2_10073292263_p30(self) -> None: result = _relayout("edf_c2_10073292263_p30.py") self.assertEqual( result, @@ -609,7 +605,7 @@ def test_edf_c2_10073292263_p30(self): ], ["Pertes Joule : 1,000", "Opérateur Heures pleines hiver", "585"], [ - "Acheminement : Tarif HTA5 à Pointe Fixe " "Longue Utilisation", + "Acheminement : Tarif HTA5 à Pointe Fixe Longue Utilisation", "Opérateur Heures creuses hiver", "585", ], @@ -671,7 +667,7 @@ def test_edf_c2_10073292263_p30(self): ], ) - def test_edf_c2_10067224248_p5(self): + def test_edf_c2_10067224248_p5(self) -> None: result = _relayout("edf_c2_10067224248_p5.py") self.assertEqual( result, @@ -719,7 +715,7 @@ def test_edf_c2_10067224248_p5(self): [ ["Identifiant de comptage : 021539003020"], ["Type de compteur : Compteur HTA SAPHIR"], - ["Acheminement : Tarif HTA5 à Pointe Fixe " "Longue Utilisation"], + ["Acheminement : Tarif HTA5 à Pointe Fixe Longue Utilisation"], ], [["Pertes Joule : 1,000"]], [["386536", "385656"]], @@ -782,7 +778,7 @@ def test_edf_c2_10067224248_p5(self): "608", ], ], - [["Index de fin de période relevés (en gras) ou estimés en " "kWh"]], + [["Index de fin de période relevés (en gras) ou estimés en kWh"]], [["Période", "Index de début", "Index de fin"]], [ [ @@ -910,7 +906,7 @@ def test_edf_c2_10067224248_p5(self): ], ) - def test_text_block_separation(self): + def test_text_block_separation(self) -> None: result = _relayout("edf_c2_10073292263_p27.py") self.assertEqual( result, @@ -1029,21 +1025,16 @@ def test_text_block_separation(self): ], ) - def test_text_merge1(self): + def test_text_merge1(self) -> None: result = _relayout("text_merge1.py") self.assertEqual( result, [ - [ - [ - "Taxe Communale sur la Consommation Finale " - "d'Electricité (TCCFE)" - ] - ], + [["Taxe Communale sur la Consommation Finale d'Electricité (TCCFE)"]], ], ) - def test_text_merge2(self): + def test_text_merge2(self) -> None: result = _relayout("text_merge2.py") self.assertEqual( result, @@ -1052,7 +1043,7 @@ def test_text_merge2(self): ], ) - def test_euro_fix(self): + def test_euro_fix(self) -> None: result = _relayout("broken_euro.py") self.assertEqual( result, @@ -1063,7 +1054,7 @@ def test_euro_fix(self): class DumpPDFStructureTC(unittest.TestCase): - def test(self): + def test(self) -> None: filepath = datafile("Lentilles.pdf") output = StringIO() pdfss.dump_pdf_structure(filepath, file=output) @@ -1077,7 +1068,7 @@ def test(self): class PyDumpTC(unittest.TestCase): - def test(self): + def test(self) -> None: filepath = datafile("Lentilles.pdf") out = StringIO() pdfss.py_dump(filepath, out=out) diff --git a/tox.ini b/tox.ini index 8d612ba..46eea2f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,6 @@ [tox] isolated_build = true -envlist = - check-manifest,lint,py3 +envlist = lint,py3 [testenv] extras = @@ -12,26 +11,9 @@ commands = {envpython} -m pytest {posargs:--cov pdfss --cov-report term --cov-fail-under 88} [testenv:lint] -deps = - black - flake8 - flake8-bugbear - flake8-builtins - flake8-comprehensions - flake8-debugger - flake8-logging-format - flake8-rst-docstrings - flake8-pep3101 - #flake8-print - flake8-string-format -skip_install = true -commands = - black --check --diff {toxinidir} - flake8 - -[testenv:check-manifest] skip_install = true deps = - check-manifest + pre-commit + ruff commands = - {envpython} -m check_manifest {toxinidir} + pre-commit run --all-files --show-diff-on-failure