diff --git a/README.md b/README.md index 7c78b5a..2996f16 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ import os import asyncio import sys -from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OCRPreset +from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OcrPreset single_file = "docs/document.png" @@ -116,7 +116,7 @@ options = AnyparserOption( model="ocr", format="markdown", ocr_language=[OcrLanguage.JAPANESE], - ocr_preset=OCRPreset.SCAN, + ocr_preset=OcrPreset.SCAN, ) parser = Anyparser(options) @@ -226,7 +226,7 @@ The `Anyparser` class utilizes the `AnyparserOption` dataclass for flexible conf from dataclasses import dataclass from typing import List, Literal, Optional, Union -from anyparser_core import OcrLanguage, OCRPreset +from anyparser_core import OcrLanguage, OcrPreset @dataclass class AnyparserOption: @@ -255,7 +255,7 @@ class AnyparserOption: # OCR Configuration ocr_language: Optional[List[OcrLanguage]] = None # Languages for OCR processing - ocr_preset: Optional[OCRPreset] = None # Preset configuration for OCR + ocr_preset: Optional[OcrPreset] = None # Preset configuration for OCR # Crawler Configuration max_depth: Optional[int] = None # Maximum crawl depth @@ -278,7 +278,7 @@ class AnyparserOption: | `files` | `Optional[Union[str, List[str]]]` | `None` | Input files to process | | `url` | `Optional[str]` | `None` | URL for crawler model | | `ocr_language` | `Optional[List[OcrLanguage]]` | `None` | Languages for OCR processing | -| `ocr_preset` | `Optional[OCRPreset]` | `None` | Preset configuration for OCR | +| `ocr_preset` | `Optional[OcrPreset]` | `None` | Preset configuration for OCR | | `max_depth` | `Optional[int]` | `None` | Maximum crawl depth for crawler model | | `max_executions` | `Optional[int]` | `None` | Maximum number of pages to crawl | | `strategy` | `Optional[str]` | `None` | Crawling strategy: `"LIFO"` or `"FIFO"` | @@ -288,19 +288,19 @@ class AnyparserOption: The following OCR presets 
are available for optimized document processing: -- `OCRPreset.DOCUMENT` - General document processing -- `OCRPreset.HANDWRITING` - Handwritten text recognition -- `OCRPreset.SCAN` - Scanned document processing -- `OCRPreset.RECEIPT` - Receipt processing -- `OCRPreset.MAGAZINE` - Magazine/article processing -- `OCRPreset.INVOICE` - Invoice processing -- `OCRPreset.BUSINESS_CARD` - Business card processing -- `OCRPreset.PASSPORT` - Passport document processing -- `OCRPreset.DRIVER_LICENSE` - Driver's license processing -- `OCRPreset.IDENTITY_CARD` - ID card processing -- `OCRPreset.LICENSE_PLATE` - License plate recognition -- `OCRPreset.MEDICAL_REPORT` - Medical document processing -- `OCRPreset.BANK_STATEMENT` - Bank statement processing +- `OcrPreset.DOCUMENT` - General document processing +- `OcrPreset.HANDWRITING` - Handwritten text recognition +- `OcrPreset.SCAN` - Scanned document processing +- `OcrPreset.RECEIPT` - Receipt processing +- `OcrPreset.MAGAZINE` - Magazine/article processing +- `OcrPreset.INVOICE` - Invoice processing +- `OcrPreset.BUSINESS_CARD` - Business card processing +- `OcrPreset.PASSPORT` - Passport document processing +- `OcrPreset.DRIVER_LICENSE` - Driver's license processing +- `OcrPreset.IDENTITY_CARD` - ID card processing +- `OcrPreset.LICENSE_PLATE` - License plate recognition +- `OcrPreset.MEDICAL_REPORT` - Medical document processing +- `OcrPreset.BANK_STATEMENT` - Bank statement processing **Model Types for AI Data Pipelines:** diff --git a/anyparser_core/__init__.py b/anyparser_core/__init__.py index 6e813d2..b562427 100644 --- a/anyparser_core/__init__.py +++ b/anyparser_core/__init__.py @@ -1,4 +1,4 @@ -from .config.hardcoded import OcrLanguage, OCRPreset +from .config.hardcoded import OcrLanguage, OcrPreset from .form import build_form from .options import AnyparserOption, AnyparserParsedOption, UploadedFile from .parser import ( @@ -15,8 +15,8 @@ AnyparserUrl, ) from .validator import validate_and_parse, validate_option, 
validate_path +from .version import __version__ -__version__ = "1.0.1" __all__ = [ "Anyparser", "AnyparserCrawlDirective", @@ -35,6 +35,6 @@ "validate_option", "build_form", "Anyparser", - "OCRPreset", + "OcrPreset", "OcrLanguage", ] diff --git a/anyparser_core/config/hardcoded.py b/anyparser_core/config/hardcoded.py index c571d7d..4c245ec 100644 --- a/anyparser_core/config/hardcoded.py +++ b/anyparser_core/config/hardcoded.py @@ -21,7 +21,7 @@ ] -class OCRPreset(Enum): +class OcrPreset(Enum): """Enumeration of supported OCR presets for document processing.""" DOCUMENT = "document" diff --git a/anyparser_core/form.py b/anyparser_core/form.py index 85da8a0..9e24132 100644 --- a/anyparser_core/form.py +++ b/anyparser_core/form.py @@ -54,18 +54,18 @@ def add_field(name: str, value: Any) -> None: if parsed.model == "ocr": if parsed.ocr_language: add_field( - "ocrLanguage", ",".join([lang.value for lang in parsed.ocr_language]) + "ocr_language", ",".join([lang.value for lang in parsed.ocr_language]) ) if parsed.ocr_preset: - add_field("ocrPreset", parsed.ocr_preset.value) + add_field("ocr_preset", parsed.ocr_preset.value) if parsed.model == "crawler": add_field("url", parsed.url) - add_field("maxDepth", parsed.max_depth) - add_field("maxExecutions", parsed.max_executions) + add_field("max_depth", parsed.max_depth) + add_field("max_executions", parsed.max_executions) add_field("strategy", parsed.strategy) - add_field("traversalScope", parsed.traversal_scope) + add_field("traversal_scope", parsed.traversal_scope) else: # Add files to the form for file in parsed.files: diff --git a/anyparser_core/options.py b/anyparser_core/options.py index 7aa182e..43e8b5f 100644 --- a/anyparser_core/options.py +++ b/anyparser_core/options.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, field from typing import List, Literal, Optional, TypedDict, Union -from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset +from anyparser_core.config.hardcoded import OcrLanguage, 
OcrPreset # Type aliases for better readability AnyparserFormatType = Literal["json", "markdown", "html"] @@ -26,7 +26,7 @@ class AnyparserOption: table: Optional[bool] = None files: Optional[Union[str, List[str]]] = None ocr_language: Optional[List[OcrLanguage]] = None - ocr_preset: Optional[OCRPreset] = None + ocr_preset: Optional[OcrPreset] = None url: Optional[str] = None max_depth: Optional[int] = None max_executions: Optional[int] = None @@ -54,7 +54,7 @@ class AnyparserParsedOption: image: Optional[bool] = None table: Optional[bool] = None ocr_language: Optional[List[OcrLanguage]] = None - ocr_preset: Optional[OCRPreset] = None + ocr_preset: Optional[OcrPreset] = None url: Optional[str] = None max_depth: Optional[int] = None max_executions: Optional[int] = None @@ -72,7 +72,7 @@ class DefaultOptions(TypedDict): image: Optional[bool] table: Optional[bool] ocr_language: Optional[List[OcrLanguage]] - ocr_preset: Optional[OCRPreset] + ocr_preset: Optional[OcrPreset] url: Optional[str] max_depth: Optional[int] max_executions: Optional[int] diff --git a/anyparser_core/parser.py b/anyparser_core/parser.py index 77ed5c5..fc4ce57 100644 --- a/anyparser_core/parser.py +++ b/anyparser_core/parser.py @@ -10,6 +10,7 @@ from .options import AnyparserOption from .request import async_request from .validator import validate_and_parse +from .version import __version__ @dataclass @@ -81,6 +82,7 @@ class AnyparserUrl: images: List[AnyparserImageReference] = field(default_factory=list) text: Optional[str] = field(default=None) + @dataclass class AnyparserPdfPage: """Represents a parsed PDF page with extracted content.""" @@ -152,7 +154,8 @@ async def parse( # Set up the headers, using the same boundary headers: Dict[str, str] = { - "Content-Type": f"multipart/form-data; boundary={boundary}" + "Content-Type": f"multipart/form-data; boundary={boundary}", + "User-Agent": f"anyparser_core@{__version__}", } if parsed.api_key: diff --git a/anyparser_core/validator/path.py 
b/anyparser_core/validator/path.py index b664e7e..b669d70 100644 --- a/anyparser_core/validator/path.py +++ b/anyparser_core/validator/path.py @@ -17,10 +17,7 @@ async def validate_path(file_paths: Union[str, List[str]]) -> PathValidationResu Validates file paths exist and are accessible """ if not file_paths or (isinstance(file_paths, str) and not file_paths.strip()): - return InvalidPathValidationResult( - error=FileNotFoundError("No files provided") - ) - + return InvalidPathValidationResult(error=FileNotFoundError("No files provided")) if isinstance(file_paths, (str, Path)): files = [file_paths] diff --git a/anyparser_core/version.py b/anyparser_core/version.py new file mode 100644 index 0000000..7863915 --- /dev/null +++ b/anyparser_core/version.py @@ -0,0 +1 @@ +__version__ = "1.0.2" diff --git a/changelogs/v1.0.2-changelog.md b/changelogs/v1.0.2-changelog.md new file mode 100644 index 0000000..1776d54 --- /dev/null +++ b/changelogs/v1.0.2-changelog.md @@ -0,0 +1,26 @@ +# Release anyparser-core@1.0.2 + +## Changes + +**User Agent** + +- Added a User-Agent header. +- Moved the version literal `__version__` to a separate file to prevent circular referencing. + +**Rename "OCRPreset" to "OcrPreset"** + +This pull request refactors the OCRPreset class to OcrPreset across the codebase for consistency in naming conventions. + +- Renamed OCRPreset to `OcrPreset` in files like `README.md`, `anyparser_core/__init__.py`, and examples. +- Updated variable names and documentation to reflect the new class name. +- Modified test files to use the updated class. + +This rename does not change runtime behavior and aims for consistency and improved readability, but it is a breaking change for any code that imports `OCRPreset` (see below). + +## Breaking Changes + +The class `OCRPreset` has been renamed to `OcrPreset` to maintain consistency in naming conventions. + +## Migration Guide + +Search and replace all instances of `OCRPreset` with `OcrPreset` in your codebase. 
diff --git a/examples/03_one_liner.py b/examples/03_one_liner.py index ff451d4..9ad04b1 100644 --- a/examples/03_one_liner.py +++ b/examples/03_one_liner.py @@ -4,17 +4,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from anyparser_core import Anyparser - -multiple_files = ["docs/sample.docx", "docs/sample.pdf"] +# ------------------------------------------------------------------------------ -result = asyncio.run(Anyparser().parse(multiple_files)) - -for item in result: - print("-" * 100) - print("File:", item.original_filename) - print("Checksum:", item.checksum) - print("Total characters:", item.total_characters) - print("Markdown:", item.markdown) +from anyparser_core import Anyparser -print("-" * 100) +print(asyncio.run(Anyparser().parse(["docs/sample.docx", "docs/sample.pdf"]))) diff --git a/examples/04_ocr.py b/examples/04_ocr.py index db7cc9c..b8cddb8 100644 --- a/examples/04_ocr.py +++ b/examples/04_ocr.py @@ -4,7 +4,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OCRPreset +from anyparser_core import Anyparser, AnyparserOption, OcrLanguage, OcrPreset single_file = "docs/document.png" @@ -14,7 +14,7 @@ model="ocr", format="markdown", ocr_language=[OcrLanguage.JAPANESE], - ocr_preset=OCRPreset.SCAN, + ocr_preset=OcrPreset.SCAN, ) parser = Anyparser(options) diff --git a/pyproject.toml b/pyproject.toml index 86405d9..e31c049 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [project] name = "anyparser-core" -version = "1.0.1" +version = "1.0.2" description = "Anyparser SDK for Python" readme = "README.md" requires-python = ">=3.9" diff --git a/tests/test_form.py b/tests/test_form.py index e2425c6..6977fc0 100644 --- a/tests/test_form.py +++ b/tests/test_form.py @@ -5,7 +5,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from 
anyparser_core import OcrLanguage, OCRPreset +from anyparser_core import OcrLanguage, OcrPreset from anyparser_core.form import build_form from anyparser_core.options import AnyparserParsedOption, UploadedFile @@ -55,15 +55,15 @@ def test_build_form_with_ocr_options(basic_parsed_option): """Test form building with OCR options""" basic_parsed_option.model = "ocr" basic_parsed_option.ocr_language = [OcrLanguage.ENGLISH, OcrLanguage.SPANISH] - basic_parsed_option.ocr_preset = OCRPreset.DOCUMENT + basic_parsed_option.ocr_preset = OcrPreset.DOCUMENT boundary = "boundary123" form_data = build_form(basic_parsed_option, boundary) form_str = form_data.decode("utf-8") - assert 'Content-Disposition: form-data; name="ocrLanguage"' in form_str + assert 'Content-Disposition: form-data; name="ocr_language"' in form_str assert "eng,spa" in form_str - assert 'Content-Disposition: form-data; name="ocrPreset"' in form_str + assert 'Content-Disposition: form-data; name="ocr_preset"' in form_str assert "document" in form_str @@ -132,12 +132,12 @@ def test_build_form_ocr(): format="json", model="ocr", ocr_language=[OcrLanguage.JAPANESE], - ocr_preset=OCRPreset.SCAN, + ocr_preset=OcrPreset.SCAN, files=[], ) form_data = build_form(option, "boundary") - assert b'name="ocrLanguage"' in form_data - assert b'name="ocrPreset"' in form_data + assert b'name="ocr_language"' in form_data + assert b'name="ocr_preset"' in form_data def test_build_form_crawler(): @@ -155,10 +155,10 @@ def test_build_form_crawler(): ) form_data = build_form(option, "boundary") assert b'name="url"' in form_data - assert b'name="maxDepth"' in form_data - assert b'name="maxExecutions"' in form_data + assert b'name="max_depth"' in form_data + assert b'name="max_executions"' in form_data assert b'name="strategy"' in form_data - assert b'name="traversalScope"' in form_data + assert b'name="traversal_scope"' in form_data def test_build_form_with_files(tmp_path): diff --git a/tests/test_options.py b/tests/test_options.py 
index d04d65b..4466de6 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -5,7 +5,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset +from anyparser_core.config.hardcoded import OcrLanguage, OcrPreset from anyparser_core.options import ( AnyparserOption, AnyparserParsedOption, @@ -61,7 +61,7 @@ def test_build_options_with_custom_options(mock_api_key, monkeypatch): image=False, table=False, ocr_language=[OcrLanguage.ENGLISH], - ocr_preset=OCRPreset.DOCUMENT, + ocr_preset=OcrPreset.DOCUMENT, ) options = build_options(custom_options) @@ -72,7 +72,7 @@ def test_build_options_with_custom_options(mock_api_key, monkeypatch): assert options["image"] is False assert options["table"] is False assert options["ocr_language"] == [OcrLanguage.ENGLISH] - assert options["ocr_preset"] == OCRPreset.DOCUMENT + assert options["ocr_preset"] == OcrPreset.DOCUMENT def test_anyparser_parsed_file(): @@ -94,7 +94,7 @@ def test_anyparser_parsed_option(): image=False, table=True, ocr_language=[OcrLanguage.ENGLISH], - ocr_preset=OCRPreset.HANDWRITING, + ocr_preset=OcrPreset.HANDWRITING, ) assert len(parsed_option.files) == 1 @@ -105,7 +105,7 @@ def test_anyparser_parsed_option(): assert parsed_option.image is False assert parsed_option.table is True assert parsed_option.ocr_language == [OcrLanguage.ENGLISH] - assert parsed_option.ocr_preset == OCRPreset.HANDWRITING + assert parsed_option.ocr_preset == OcrPreset.HANDWRITING def test_anyparser_option_validation(): @@ -117,7 +117,7 @@ def test_anyparser_option_validation(): # Test invalid OCR preset with pytest.raises(ValueError): - options = build_options(AnyparserOption(ocr_preset=OCRPreset("invalid"))) + options = build_options(AnyparserOption(ocr_preset=OcrPreset("invalid"))) validate_option(options) # Test missing API URL diff --git a/tests/test_validator_main.py b/tests/test_validator_main.py index 84ac8b8..c1e398d 100644 --- 
a/tests/test_validator_main.py +++ b/tests/test_validator_main.py @@ -7,7 +7,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset +from anyparser_core.config.hardcoded import OcrLanguage, OcrPreset from anyparser_core.options import AnyparserOption, AnyparserParsedOption from anyparser_core.validator.main import validate_and_parse from anyparser_core.validator.url import InvalidUrlError @@ -154,13 +154,13 @@ async def test_validate_and_parse_with_ocr_options( api_url="https://api.example.com", api_key="test-key", ocr_language=[OcrLanguage.ENGLISH, OcrLanguage.SPANISH], - ocr_preset=OCRPreset.DOCUMENT, + ocr_preset=OcrPreset.DOCUMENT, ) result = await validate_and_parse(sample_file, options) assert result.ocr_language == [OcrLanguage.ENGLISH, OcrLanguage.SPANISH] - assert result.ocr_preset == OCRPreset.DOCUMENT + assert result.ocr_preset == OcrPreset.DOCUMENT @pytest.mark.asyncio diff --git a/tests/test_validator_option.py b/tests/test_validator_option.py index 87ea4e2..eea015c 100644 --- a/tests/test_validator_option.py +++ b/tests/test_validator_option.py @@ -5,7 +5,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from anyparser_core.config.hardcoded import OcrLanguage, OCRPreset +from anyparser_core.config.hardcoded import OcrLanguage, OcrPreset from anyparser_core.validator.option import validate_option @@ -41,7 +41,7 @@ def test_validate_option_valid_options(): "api_url": "https://api.example.com", "api_key": "test-key", "ocr_language": [OcrLanguage.ENGLISH, OcrLanguage.SPANISH], - "ocr_preset": OCRPreset.DOCUMENT, + "ocr_preset": OcrPreset.DOCUMENT, } # Should not raise any exceptions diff --git a/tests/test_version.py b/tests/test_version.py index 535da70..d335616 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -8,4 +8,4 @@ def test_version(): """Test version is a string.""" assert isinstance(__version__, 
str) - assert __version__ == "1.0.1" + assert __version__ == "1.0.2"