diff --git a/scripts/codacy_issues.py b/scripts/codacy_issues.py index 6881203..8816d62 100644 --- a/scripts/codacy_issues.py +++ b/scripts/codacy_issues.py @@ -63,13 +63,14 @@ def get_level_priority(level: str | None) -> int | None: def normalize_provider(value: str) -> str | None: - """Normalize provider short code. - - Args: - value: Provider identifier. - + """ + Normalize a provider identifier to a supported short code. + + Parameters: + value (str): Provider identifier to normalize (expected 'gh', 'gl', or 'bb'). + Returns: - Provider code if valid, otherwise None. + str | None: The provider code ('gh', 'gl', or 'bb') if valid, `None` otherwise. """ return value if value in ("gh", "gl", "bb") else None @@ -113,12 +114,26 @@ def assert_valid_choice(name: str, value: str, choices: list[str]) -> str: def encode_segment(value: str) -> str: - """URL-encode a path segment.""" + """ + URL-encode a URL path segment so it is safe for inclusion in a path. + + Returns: + encoded (str): The percent-encoded representation of the input string. + """ return urllib.parse.quote(value, safe="") def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: - """Build a Codacy API URL from a path and query parameters.""" + """ + Constructs a full Codacy API URL using the configured base origin and base path. + + Parameters: + pathname (str): Pathname to append to the base path (should begin with a forward slash). + query (dict[str, str] | None): Optional mapping of query parameter names to values; values are URL-encoded. + + Returns: + url (str): The complete URL including query string if `query` is provided. + """ # Ensure we keep origin and base path url = f"{BASE_URL.scheme}://{BASE_URL.netloc}{BASE_PATH}{pathname}" if query: @@ -127,16 +142,17 @@ def build_codacy_url(pathname: str, query: dict[str, str] | None = None) -> str: def assert_codacy_url(url: str) -> str: - """Ensure the URL targets the Codacy API origin and analysis path. 
- - Args: - url: URL to validate. - + """ + Validate that `url` targets the configured Codacy API origin and begins with the `/analysis/` path. + + Parameters: + url (str): The full URL to validate. + Returns: - The original URL when valid. - + str: The original URL when it is confirmed to target the configured Codacy API origin and start with the `/analysis/` path. + Raises: - ValueError: If the URL is not within the expected origin/path. + ValueError: If the URL does not use the configured Codacy API origin or does not start with the expected `/analysis/` path. """ # Basic safety: must be same origin and start with /api/v3/analysis/ parsed = urllib.parse.urlparse(url) @@ -149,7 +165,18 @@ def assert_codacy_url(url: str) -> str: def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str: - """Build a repository issues API URL.""" + """ + Constructs the Codacy API URL to search repository issues for a given provider, organization, repository, and result limit. + + Parameters: + provider (str): Provider code (e.g., "gh", "gl", "bb"). + org (str): Organization or owner name. + repo (str): Repository name. + limit (int): Maximum number of results to request. + + Returns: + str: A Codacy API URL for the repository issues search endpoint with the `limit` query parameter set. + """ return build_codacy_url( f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" f"/repositories/{encode_segment(repo)}/issues/search", @@ -160,7 +187,20 @@ def build_repo_issues_url(provider: str, org: str, repo: str, limit: int) -> str def build_pr_issues_url( provider: str, org: str, repo: str, pr: str, limit: int, status: str ) -> str: - """Build a pull request issues API URL.""" + """ + Constructs the Codacy API URL for fetching issues of a pull request. + + Parameters: + provider (str): Provider code (e.g., "gh", "gl", "bb"). + org (str): Organization or owner name. + repo (str): Repository name. + pr (str): Pull request identifier. 
+ limit (int): Maximum number of issues to request. + status (str): Issue status filter (e.g., "all", "open", "closed"). + + Returns: + str: The Codacy API URL for the pull-request issues endpoint including `status` and `limit` query parameters. + """ return build_codacy_url( f"/analysis/organizations/{encode_segment(provider)}/{encode_segment(org)}" f"/repositories/{encode_segment(repo)}/pull-requests/{encode_segment(pr)}/issues", @@ -169,7 +209,12 @@ def build_pr_issues_url( def get_git_origin_url() -> str | None: - """Return the git origin URL if available.""" + """ + Get the Git remote "origin" URL for the current repository, or None when it cannot be determined. + + Returns: + origin_url (str | None): The remote URL configured for 'origin' if the current directory is inside a Git work tree and the origin URL is available; `None` if not inside a Git repository, if the origin is not set, or on error. + """ # git repo check try: result = subprocess.run( @@ -203,7 +248,18 @@ class GitRemoteInfo: def parse_git_remote(url: str) -> GitRemoteInfo | None: - """Parse a git remote URL into provider/org/repo info.""" + """ + Extract provider, organization, and repository from a Git remote URL. + + Accepts HTTPS (https://host/org/repo[.git]) and SSH (git@host:org/repo[.git]) remote formats. + Provider is one of: "gh" for GitHub, "gl" for GitLab, "bb" for Bitbucket, or "unknown" for other hosts. + + Parameters: + url (str): Git remote URL to parse. + + Returns: + GitRemoteInfo | None: Parsed GitRemoteInfo with fields `provider`, `org`, and `repo`, or `None` if the URL could not be parsed. + """ # HTTPS m = re.match(r"^https?://([^/]+)/([^/]+)/([^/]+?)(?:\.git)?$", url) # SSH @@ -216,6 +272,16 @@ def parse_git_remote(url: str) -> GitRemoteInfo | None: host, org, repo = m.group(1), m.group(2), m.group(3) def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: + """ + Check whether a hostname is equal to a base domain or is a subdomain of that base domain. 
+ + Parameters: + hostname (str): Hostname to test (e.g., "api.example.com"). + base_domain (str): Base domain to compare against (e.g., "example.com"). + + Returns: + `True` if `hostname` equals `base_domain` or ends with `.` followed by `base_domain`, `False` otherwise. + """ return hostname == base_domain or hostname.endswith("." + base_domain) if is_same_or_subdomain(host, "github.com"): @@ -233,15 +299,19 @@ def is_same_or_subdomain(hostname: str, base_domain: str) -> bool: def fetch_json( url: str, method: str = "GET", body: dict[str, Any] | None = None ) -> dict[str, Any]: - """Fetch JSON from the Codacy API. - - Args: - url: Codacy API URL. - method: HTTP method. - body: Optional JSON body for non-GET requests. - + """ + Fetch and return a JSON object from a validated Codacy API URL. + + Parameters: + url (str): Codacy API URL; must target the configured Codacy origin and start with the /analysis/ path. + method (str): HTTP method to use (e.g., "GET", "POST"). + body (dict[str, Any] | None): Optional JSON body for non-GET requests. + Returns: - Parsed JSON dictionary. + dict[str, Any]: The parsed JSON response as a dictionary. + + Raises: + RuntimeError: On HTTP errors, network errors, invalid JSON, or when the JSON root value is not an object. """ safe_url = assert_codacy_url(url) @@ -286,7 +356,18 @@ def fetch_json( # API # ================================ def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[str, Any]: - """Fetch issues for a repository.""" + """ + Request Codacy for issues belonging to a repository. + + Parameters: + provider (str): Provider code ('gh', 'gl', 'bb') indicating GitHub, GitLab, or Bitbucket. + org (str): Organization or owner name. + repo (str): Repository name. + limit (int): Maximum number of issues to return. + + Returns: + dict[str, Any]: Parsed JSON response from the Codacy API containing issue data. 
+ """ url = build_repo_issues_url(provider, org, repo, limit) return fetch_json(url, method="POST", body={}) @@ -294,7 +375,20 @@ def fetch_repo_issues(provider: str, org: str, repo: str, limit: int) -> dict[st def fetch_pr_issues( provider: str, org: str, repo: str, pr: str, limit: int, status: str = "all" ) -> dict[str, Any]: - """Fetch issues for a pull request.""" + """ + Retrieve Codacy issues for a specific pull request. + + Parameters: + provider (str): Provider code ("gh", "gl", "bb"). + org (str): Organization or user name. + repo (str): Repository name. + pr (str): Pull request number or identifier. + limit (int): Maximum number of issues to request. + status (str): Issue status filter (for example "all", "open", "closed"). + + Returns: + dict: Parsed JSON response from the Codacy API. + """ url = build_pr_issues_url(provider, org, repo, pr, limit, status) return fetch_json(url, method="GET") @@ -303,17 +397,21 @@ def fetch_pr_issues( # AI Output Formatter # ================================ def format_for_ai(raw_issues: list[dict[str, Any]], min_level: str) -> list[str]: - """Format raw Codacy issues for AI output. - - Args: - raw_issues: Issue dictionaries from Codacy API. - min_level: Minimum severity level to include. - + """ + Format Codacy issue records into compact AI-friendly lines filtered by minimum severity. + + Each returned string has the form: + " | : | | | ". + + Parameters: + raw_issues: List of issue objects returned by the Codacy API (each item may be an issue or contain a `commitIssue` key). + min_level: Minimum severity level to include; must be one of the values in LEVELS. + Returns: - Formatted issue strings. - + A list of formatted issue strings matching the format above, including only issues whose severity is at or above `min_level`. + Raises: - ValueError: If min_level is invalid. + ValueError: If `min_level` is not a valid severity level. 
""" min_priority = get_level_priority(min_level) if min_priority is None: @@ -377,13 +475,17 @@ def apply_git_defaults(args: argparse.Namespace) -> None: def resolve_segments(args: argparse.Namespace) -> tuple[str, str, str | None]: - """Validate and return org/repo/pr segments. - - Args: - args: Parsed CLI arguments. - + """ + Validate CLI org, repo, and optional pr segments and return them. + + Parameters: + args (argparse.Namespace): Parsed CLI arguments with attributes `org`, `repo`, and optional `pr`. + Returns: - Tuple of (org, repo, pr). + tuple[str, str, str | None]: A tuple (org, repo, pr) where `pr` is None if not supplied. + + Raises: + ValueError: If any segment is empty or contains invalid characters. """ segment_pattern = re.compile(r"^[A-Za-z0-9_.-]+$") org = assert_valid_segment("org", args.org, segment_pattern) @@ -402,7 +504,21 @@ def build_payload( min_level: str, issues: list[str], ) -> dict[str, object]: - """Build the output payload for JSON serialization.""" + """ + Create a JSON-serializable payload describing the fetched issues and their scope. + + The returned dictionary contains: + - scope: "pull_request" when `pr` is set, otherwise "repository". + - organization: organization/owner name. + - repository: repository name. + - pullRequest: pull request identifier string when present, otherwise `None`. + - minLevel: the minimum severity level used to filter issues. + - total: the number of issues in `issues`. + - issues: list of formatted issue strings. + + Returns: + dict[str, object]: Payload ready for JSON serialization with the keys described above. + """ return { "scope": "pull_request" if pr else "repository", "organization": org, @@ -415,7 +531,14 @@ def build_payload( def main() -> int: - """Run the Codacy issues fetcher.""" + """ + Run the CLI: parse arguments, fetch Codacy issues (repository or pull request), format them for AI consumption, and write a JSON payload to stdout. 
+ + Writes error messages to stderr when validation or fetching fails and prints the final JSON payload to stdout. + + Returns: + int: 0 on success, 1 on error. + """ args = parse_args(sys.argv[1:]) # --- Git auto-detect --- @@ -474,4 +597,4 @@ def main() -> int: raise SystemExit(main()) except Exception as e: print(str(e), file=sys.stderr) - raise SystemExit(1) from e + raise SystemExit(1) from e \ No newline at end of file diff --git a/src/exstruct/__init__.py b/src/exstruct/__init__.py index dee7580..6ad3744 100644 --- a/src/exstruct/__init__.py +++ b/src/exstruct/__init__.py @@ -90,28 +90,14 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> WorkbookData: """ - Extract an Excel workbook into WorkbookData. - - Args: - file_path: Path to .xlsx/.xlsm/.xls. - mode: "light" / "standard" / "verbose" - - light: cells + table detection only (no COM, shapes/charts empty). Print areas via openpyxl. - - standard: texted shapes + arrows + charts (COM if available), print areas included. Shape/chart size is kept but hidden by default in output. - - verbose: all shapes (including textless) with size, charts with size, and colors_map. - + Extracts an Excel workbook into a WorkbookData structure. + + Parameters: + file_path (str | Path): Path to the workbook file (.xlsx, .xlsm, .xls). + mode (ExtractionMode): Extraction detail level. "light" includes cells and table detection only (no COM, shapes/charts empty; print areas via openpyxl). "standard" includes texted shapes, arrows, charts (COM if available) and print areas. "verbose" also includes shape/chart sizes, cell link map, colors map, and formulas map. + Returns: - WorkbookData containing sheets, rows, shapes, charts, and print areas. - - Raises: - ValueError: If an invalid mode is provided. 
- - Examples: - Extract with hyperlinks (verbose) and inspect table candidates: - - >>> from exstruct import extract - >>> wb = extract("input.xlsx", mode="verbose") - >>> wb.sheets["Sheet1"].table_candidates - ['A1:B5'] + WorkbookData: Parsed workbook representation containing sheets, rows, shapes, charts, and print areas. """ include_links = True if mode == "verbose" else False include_colors_map = True if mode == "verbose" else None @@ -397,4 +383,4 @@ def process_excel( print_areas_dir=print_areas_dir, auto_page_breaks_dir=auto_page_breaks_dir, stream=stream, - ) + ) \ No newline at end of file diff --git a/src/exstruct/core/backends/base.py b/src/exstruct/core/backends/base.py index 0cf283c..7ffbdde 100644 --- a/src/exstruct/core/backends/base.py +++ b/src/exstruct/core/backends/base.py @@ -42,4 +42,9 @@ def extract_merged_cells(self) -> MergedCellData: """Extract merged cell ranges from the workbook.""" def extract_formulas_map(self) -> WorkbookFormulasMap | None: - """Extract formulas map from the workbook.""" + """ + Retrieve the workbook's formulas organized by worksheet. + + Returns: + WorkbookFormulasMap | None: A mapping of worksheet identifiers to their formulas, or `None` if the backend cannot provide a formulas map. + """ \ No newline at end of file diff --git a/src/exstruct/core/backends/com_backend.py b/src/exstruct/core/backends/com_backend.py index 81ec58c..0c1348f 100644 --- a/src/exstruct/core/backends/com_backend.py +++ b/src/exstruct/core/backends/com_backend.py @@ -63,14 +63,15 @@ def extract_print_areas(self) -> PrintAreaData: def extract_colors_map( self, *, include_default_background: bool, ignore_colors: set[str] | None ) -> WorkbookColorsMap | None: - """Extract colors_map via COM; logs and skips on failure. - - Args: - include_default_background: Whether to include default backgrounds. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract a workbook colors map using the Excel COM API. 
+ + Parameters: + include_default_background (bool): Include the workbook's default background color in the resulting map. + ignore_colors (set[str] | None): Optional set of color keys to exclude from the map. + Returns: - WorkbookColorsMap or None when extraction fails. + WorkbookColorsMap | None: A mapping of workbook color definitions when extraction succeeds, or `None` if COM extraction fails. """ try: return extract_sheet_colors_map_com( @@ -86,10 +87,11 @@ def extract_colors_map( return None def extract_formulas_map(self) -> WorkbookFormulasMap | None: - """Extract formulas_map via COM; logs and skips on failure. - + """ + Extracts the workbook's formulas map using COM. + Returns: - WorkbookFormulasMap or None when extraction fails. + WorkbookFormulasMap or None: The extracted formulas map, or `None` if extraction failed. """ try: return extract_sheet_formulas_map_com(self.workbook) @@ -101,10 +103,13 @@ def extract_formulas_map(self) -> WorkbookFormulasMap | None: return None def extract_auto_page_breaks(self) -> PrintAreaData: - """Compute auto page-break rectangles per sheet using Excel COM. - + """ + Compute auto page-break rectangles for each worksheet using Excel COM. + + For each sheet, determine the sheet's print area (PageSetup.PrintArea or the used range) and split it into sub-rectangles along Excel's horizontal and vertical page breaks; parts that reference a different sheet are ignored. If extraction for a sheet fails, the sheet is skipped and a warning is logged. + Returns: - Mapping of sheet name to auto page-break areas. + Mapping from sheet name to a list of PrintArea entries. Each PrintArea describes a rectangular region with `r1` and `r2` as 1-based row indices and `c1` and `c2` as 0-based column indices. 
""" results: PrintAreaData = {} for sheet in self.workbook.sheets: @@ -247,4 +252,4 @@ def _split_csv_respecting_quotes(raw: str) -> list[str]: i += 1 if buf: parts.append("".join(buf).strip()) - return [p for p in parts if p] + return [p for p in parts if p] \ No newline at end of file diff --git a/src/exstruct/core/backends/openpyxl_backend.py b/src/exstruct/core/backends/openpyxl_backend.py index d67ae59..acaba4a 100644 --- a/src/exstruct/core/backends/openpyxl_backend.py +++ b/src/exstruct/core/backends/openpyxl_backend.py @@ -102,10 +102,11 @@ def extract_merged_cells(self) -> MergedCellData: return {} def extract_formulas_map(self) -> WorkbookFormulasMap | None: - """Extract formulas_map using openpyxl. - + """ + Extract a mapping of workbook formulas for each sheet. + Returns: - WorkbookFormulasMap or None when extraction fails. + WorkbookFormulasMap | None: A mapping from sheet name to its formulas, or `None` if extraction fails. """ try: return extract_sheet_formulas_map(self.file_path) @@ -116,13 +117,14 @@ def extract_formulas_map(self) -> WorkbookFormulasMap | None: return None def detect_tables(self, sheet_name: str) -> list[str]: - """Detect table candidates for a single sheet. - - Args: - sheet_name: Target worksheet name. - + """ + Detects table candidate ranges within the specified worksheet. + + Parameters: + sheet_name (str): Name of the worksheet to analyze for table candidates. + Returns: - List of table candidate ranges. + list[str]: Detected table candidate ranges as A1-style range strings; empty list if none are found or detection fails. 
""" try: return detect_tables_openpyxl(self.file_path, sheet_name) @@ -204,4 +206,4 @@ def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None: bounds = parse_range_zero_based(range_str) if bounds is None: return None - return (bounds.r1, bounds.c1, bounds.r2, bounds.c2) + return (bounds.r1, bounds.c1, bounds.r2, bounds.c2) \ No newline at end of file diff --git a/src/exstruct/core/cells.py b/src/exstruct/core/cells.py index 1024888..20e9385 100644 --- a/src/exstruct/core/cells.py +++ b/src/exstruct/core/cells.py @@ -56,13 +56,14 @@ class WorkbookColorsMap: sheets: dict[str, SheetColorsMap] def get_sheet(self, sheet_name: str) -> SheetColorsMap | None: - """Return the colors map for a sheet if available. - - Args: - sheet_name: Target worksheet name. - + """ + Retrieve the SheetColorsMap for a worksheet by name. + + Parameters: + sheet_name (str): Name of the worksheet to retrieve. + Returns: - SheetColorsMap for the sheet, or None if missing. + SheetColorsMap | None: The sheet's color map if present, `None` otherwise. """ return self.sheets.get(sheet_name) @@ -82,13 +83,14 @@ class WorkbookFormulasMap: sheets: dict[str, SheetFormulasMap] def get_sheet(self, sheet_name: str) -> SheetFormulasMap | None: - """Return the formulas map for a sheet if available. - - Args: - sheet_name: Target worksheet name. - + """ + Retrieve the formulas map for a worksheet. + + Parameters: + sheet_name (str): Name of the worksheet to look up. + Returns: - SheetFormulasMap for the sheet, or None if missing. + SheetFormulasMap | None: The sheet's formulas map if present, `None` if the worksheet is not found. """ return self.sheets.get(sheet_name) @@ -129,13 +131,14 @@ def extract_sheet_colors_map( def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: - """Extract formula strings for each worksheet. - - Args: - file_path: Excel workbook path. - + """ + Extract normalized formula strings from every worksheet in the workbook. 
+ + Parameters: + file_path (Path): Path to the Excel workbook to read. + Returns: - WorkbookFormulasMap containing per-sheet formula maps. + WorkbookFormulasMap: Mapping of sheet names to SheetFormulasMap objects. Each SheetFormulasMap contains a mapping from normalized formula strings (each beginning with "=") to a list of cell coordinates (row, column) where that formula occurs. """ sheets: dict[str, SheetFormulasMap] = {} with openpyxl_workbook(file_path, data_only=False, read_only=False) as wb: @@ -146,13 +149,14 @@ def extract_sheet_formulas_map(file_path: Path) -> WorkbookFormulasMap: def extract_sheet_formulas_map_com(workbook: xw.Book) -> WorkbookFormulasMap: - """Extract formula strings for each worksheet via COM. - - Args: - workbook: xlwings workbook instance. - + """ + Collects and normalizes formulas from every worksheet in an xlwings workbook into per-sheet mappings. + + Parameters: + workbook: xlwings Book instance whose sheets will be scanned for formulas. + Returns: - WorkbookFormulasMap containing per-sheet formula maps. + WorkbookFormulasMap: maps sheet names to SheetFormulasMap objects. Each SheetFormulasMap.formulas_map maps a normalized formula string (consistent representation, e.g., beginning with "=") to a list of (row, column) tuples representing cell locations using Excel 1-based indices. """ sheets: dict[str, SheetFormulasMap] = {} for sheet in workbook.sheets: @@ -189,16 +193,16 @@ def extract_sheet_colors_map_com( include_default_background: bool, ignore_colors: set[str] | None, ) -> WorkbookColorsMap: - """Extract background colors for each worksheet via COM display formats. - - Args: - workbook: xlwings workbook instance. - include_default_background: Whether to include default (white) backgrounds - within the used range. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract per-sheet background color maps using the workbook's COM/display-format interfaces. 
+ + Parameters: + workbook (xw.Book): xlwings workbook whose sheets will be inspected. + include_default_background (bool): If true, include default background colors (e.g., white) for cells inside each sheet's used range. + ignore_colors (set[str] | None): Optional set of normalized color keys to exclude from results. + Returns: - WorkbookColorsMap containing per-sheet color maps. + WorkbookColorsMap: Mapping of sheet names to SheetColorsMap containing detected background color positions for each worksheet. """ _prepare_workbook_for_display_format(workbook) sheets: dict[str, SheetColorsMap] = {} @@ -214,15 +218,16 @@ def extract_sheet_colors_map_com( def _extract_sheet_colors( ws: Worksheet, include_default_background: bool, ignore_colors: set[str] | None ) -> SheetColorsMap: - """Extract background colors for a single worksheet. - - Args: - ws: Target worksheet. - include_default_background: Whether to include default (white) backgrounds. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract the background color locations present on a single worksheet. + + Parameters: + ws (Worksheet): Worksheet to scan. + include_default_background (bool): If true, treat cells with the workbook default/background color as having a color key. + ignore_colors (set[str] | None): Optional set of color keys to ignore (keys are normalized before comparison). + Returns: - SheetColorsMap for the worksheet. + SheetColorsMap: Mapping from normalized color key to a list of cell coordinates where that color appears. Coordinates are tuples (row, col) where `row` is 1-based and `col` is 0-based. """ min_row, min_col, max_row, max_col = _get_used_range_bounds(ws) colors_map: dict[str, list[tuple[int, int]]] = {} @@ -247,13 +252,14 @@ def _extract_sheet_colors( def _extract_sheet_formulas(ws: Worksheet) -> SheetFormulasMap: - """Extract formula strings for a single worksheet. - - Args: - ws: Target worksheet. 
- + """ + Collect normalized formula strings from a worksheet and group their cell coordinates. + + Parameters: + ws (Worksheet): Worksheet to scan for formulas. + Returns: - SheetFormulasMap for the worksheet. + SheetFormulasMap: container with the sheet's name and a mapping from each normalized formula string (prefixed with "=") to a list of cell coordinates as (row, zero-based-column). """ min_row, min_col, max_row, max_col = _get_used_range_bounds(ws) formulas_map: dict[str, list[tuple[int, int]]] = {} @@ -297,13 +303,14 @@ def _normalize_formula_value(value: object) -> str | None: def _normalize_formula_from_com(value: object) -> str | None: - """Normalize a formula string returned by COM. - - Args: - value: Raw COM formula value. - + """ + Normalize a COM-returned cell formula into a string that begins with '='. + + Parameters: + value (object): Raw value returned from COM for a cell's formula. + Returns: - Formula string with leading "=", or None when not a formula. + str | None: The input string if it is non-empty and starts with '=', `None` otherwise. """ if value is None or not isinstance(value, str): return None @@ -318,15 +325,16 @@ def _normalize_formula_from_com(value: object) -> str | None: def _extract_sheet_colors_com( sheet: xw.Sheet, include_default_background: bool, ignore_colors: set[str] | None ) -> SheetColorsMap: - """Extract background colors for a single worksheet via COM. - - Args: - sheet: Target worksheet. - include_default_background: Whether to include default (white) backgrounds. - ignore_colors: Optional set of color keys to ignore. - + """ + Extract per-sheet background color mapping using COM/DisplayFormat. + + Parameters: + sheet (xw.Sheet): xlwings sheet object to inspect. + include_default_background (bool): If True, include cells whose background is the workbook default color. + ignore_colors (set[str] | None): Optional set of normalized color keys to exclude from the result. + Returns: - SheetColorsMap for the worksheet. 
+ SheetColorsMap: Mapping from normalized color key (hex/theme/index canonical form) to a list of cell coordinates where that color appears. Each coordinate is a tuple (row, col) where `row` is the worksheet row number (1-based) and `col` is the zero-based column index. """ colors_map: dict[str, list[tuple[int, int]]] = {} used = sheet.used_range @@ -1715,4 +1723,4 @@ def _coerce_numeric_preserve_format(val: str) -> int | float | str: return float(quantized) except (InvalidOperation, Exception): return val - return val + return val \ No newline at end of file diff --git a/src/exstruct/core/integrate.py b/src/exstruct/core/integrate.py index ccd8131..402dddf 100644 --- a/src/exstruct/core/integrate.py +++ b/src/exstruct/core/integrate.py @@ -21,28 +21,29 @@ def extract_workbook( # noqa: C901 include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: - """Extract workbook and return WorkbookData. - - Falls back to cells+tables if Excel COM is unavailable. - - Args: - file_path: Workbook path. - mode: Extraction mode. - include_cell_links: Whether to include cell hyperlinks; None uses mode defaults. - include_print_areas: Whether to include print areas; None defaults to True. - include_auto_page_breaks: Whether to include auto page breaks. - include_colors_map: Whether to include colors map; None uses mode defaults. - include_default_background: Whether to include default background color. - ignore_colors: Optional set of color keys to ignore. - include_formulas_map: Whether to include formulas map; None uses mode defaults. - include_merged_cells: Whether to include merged cell ranges; None uses mode defaults. - include_merged_values_in_rows: Whether to keep merged values in rows. - + """ + Extract a workbook into a structured WorkbookData representation. + + May fall back to cells+tables extraction if Excel COM automation is unavailable. + + Parameters: + file_path (str | Path): Path to the workbook file. 
+ mode (Literal['light', 'standard', 'verbose']): Extraction mode that controls detail level. + include_cell_links (bool | None): Include cell hyperlinks; `None` uses mode defaults. + include_print_areas (bool | None): Include print areas; `None` defaults to True. + include_auto_page_breaks (bool): Include automatic page break information. + include_colors_map (bool | None): Include a colors map; `None` uses mode defaults. + include_default_background (bool): Include default background color when present. + ignore_colors (set[str] | None): Set of color keys to ignore during color mapping. + include_formulas_map (bool | None): Include a map of cell formulas; `None` uses mode defaults. + include_merged_cells (bool | None): Include merged cell ranges; `None` uses mode defaults. + include_merged_values_in_rows (bool): Preserve merged cell values in row-wise output. + Returns: - Extracted WorkbookData. - + WorkbookData: The extracted workbook representation. + Raises: - ValueError: If mode is unsupported. + ValueError: If `mode` is not one of "light", "standard", or "verbose". """ inputs = resolve_extraction_inputs( file_path, @@ -58,4 +59,4 @@ def extract_workbook( # noqa: C901 include_merged_values_in_rows=include_merged_values_in_rows, ) result = run_extraction_pipeline(inputs) - return result.workbook + return result.workbook \ No newline at end of file diff --git a/src/exstruct/core/pipeline.py b/src/exstruct/core/pipeline.py index ff46dbe..4fc316c 100644 --- a/src/exstruct/core/pipeline.py +++ b/src/exstruct/core/pipeline.py @@ -272,13 +272,14 @@ def resolve_extraction_inputs( def build_pipeline_plan(inputs: ExtractionInputs) -> PipelinePlan: - """Build a pipeline plan based on resolved inputs. - - Args: - inputs: Resolved pipeline inputs. - + """ + Builds a pipeline plan describing which pre-COM and COM extraction steps to run for the given resolved inputs. 
+ + Parameters: + inputs (ExtractionInputs): Resolved extraction configuration (including mode and COM/formulas flags). + Returns: - PipelinePlan containing pre-COM/COM steps and COM usage flag. + PipelinePlan: Plan containing ordered `pre_com_steps`, ordered `com_steps`, and `use_com` set to `True` when the pipeline should use COM (when `mode` is not "light" or `use_com_for_formulas` is `True`). """ return PipelinePlan( pre_com_steps=build_pre_com_pipeline(inputs), @@ -500,11 +501,12 @@ def step_extract_cells( def step_extract_print_areas_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: - """Extract print areas via openpyxl. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. + """ + Extract print areas from the workbook and populate artifacts.print_area_data. + + Parameters: + inputs (ExtractionInputs): Pipeline inputs containing the file path and extraction options. + artifacts (ExtractionArtifacts): Mutable artifact container; `artifacts.print_area_data` will be set to the extracted print area mapping. """ backend = OpenpyxlBackend(inputs.file_path) artifacts.print_area_data = backend.extract_print_areas() @@ -513,11 +515,14 @@ def step_extract_print_areas_openpyxl( def step_extract_formulas_map_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: - """Extract formulas_map via openpyxl; logs and skips on failure. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. + """ + Populate artifacts.formulas_map_data by extracting workbook formulas using openpyxl. + + Attempts to extract a WorkbookFormulasMap from the file at inputs.file_path and stores it on artifacts.formulas_map_data. If extraction fails, a warning is logged and artifacts.formulas_map_data is left unchanged. + + Parameters: + inputs (ExtractionInputs): Resolved pipeline inputs (provides file_path). + artifacts (ExtractionArtifacts): Mutable container to receive the extracted formulas map. 
""" backend = OpenpyxlBackend(inputs.file_path) try: @@ -532,11 +537,11 @@ def step_extract_formulas_map_openpyxl( def step_extract_colors_map_openpyxl( inputs: ExtractionInputs, artifacts: ExtractionArtifacts ) -> None: - """Extract colors_map via openpyxl; logs and skips on failure. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. + """ + Extract the workbook colors map using openpyxl and store it on the artifacts. + + Sets artifacts.colors_map_data to the colors map extracted from inputs.file_path, + respecting inputs.include_default_background and inputs.ignore_colors. """ backend = OpenpyxlBackend(inputs.file_path) artifacts.colors_map_data = backend.extract_colors_map( @@ -605,12 +610,13 @@ def step_extract_print_areas_com( def step_extract_auto_page_breaks_com( inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book ) -> None: - """Extract auto page breaks via COM. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. - workbook: xlwings workbook instance. + """ + Extract auto page break information from a COM workbook and store it in the artifacts. + + Parameters: + inputs (ExtractionInputs): Pipeline inputs that may influence extraction behavior. + artifacts (ExtractionArtifacts): Mutable artifact container; updated with extracted data. + workbook (xw.Book): xlwings COM workbook used to read auto page break settings. """ artifacts.auto_page_break_data = ComBackend(workbook).extract_auto_page_breaks() @@ -618,12 +624,14 @@ def step_extract_auto_page_breaks_com( def step_extract_formulas_map_com( inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book ) -> None: - """Extract formulas_map via COM; logs and skips on failure. - - Args: - inputs: Pipeline inputs. - artifacts: Artifact container to update. - workbook: xlwings workbook instance. + """ + Extract the workbook's formulas map via COM and store it into the artifacts. 
+ + On success assigns the extracted WorkbookFormulasMap to artifacts.formulas_map_data. + On failure leaves artifacts.formulas_map_data unchanged and logs a warning. + + Parameters: + workbook (xlwings.Book): COM workbook to extract formulas from. """ try: artifacts.formulas_map_data = ComBackend(workbook).extract_formulas_map() @@ -663,14 +671,15 @@ def step_extract_colors_map_com( def _resolve_sheet_colors_map( colors_map_data: WorkbookColorsMap | None, sheet_name: str ) -> dict[str, list[tuple[int, int]]]: - """Resolve colors_map for a single sheet. - - Args: - colors_map_data: Optional workbook colors map container. - sheet_name: Target sheet name. - + """ + Resolve the colors map for a given sheet. + + Parameters: + colors_map_data (WorkbookColorsMap | None): Optional workbook-level colors map container. + sheet_name (str): Name of the sheet to resolve. + Returns: - colors_map dictionary for the sheet, or empty dict if unavailable. + dict[str, list[tuple[int, int]]]: Mapping of color keys to lists of (start_col, end_col) intervals for the sheet; empty dict if no colors map is available for the workbook or sheet. """ if not colors_map_data: return {} @@ -683,14 +692,15 @@ def _resolve_sheet_colors_map( def _resolve_sheet_formulas_map( formulas_map_data: WorkbookFormulasMap | None, sheet_name: str ) -> dict[str, list[tuple[int, int]]]: - """Resolve formulas_map for a single sheet. - - Args: - formulas_map_data: Optional workbook formulas map container. - sheet_name: Target sheet name. - + """ + Get the formulas map for a named sheet from a workbook formulas container. + + Parameters: + formulas_map_data: Optional workbook formulas map container; may be None. + sheet_name: Name of the sheet to resolve formulas for. + Returns: - formulas_map dictionary for the sheet, or empty dict if unavailable. + A mapping for the sheet (str -> list of (row, column) tuples) representing formula locations, or an empty dict if no data is available. 
""" if not formulas_map_data: return {} @@ -704,14 +714,18 @@ def _filter_rows_excluding_merged_values( rows: list[CellRow], merged_cells: list[MergedCellRange], ) -> list[CellRow]: - """Remove merged-cell values from rows. - - Args: - rows: Extracted rows. - merged_cells: Merged cell ranges. - + """ + Filter out cell values that originate from merged-cell ranges. + + Parameters: + rows (list[CellRow]): Extracted rows to filter. + merged_cells (list[MergedCellRange]): Merged cell ranges to exclude values from. + Returns: - Filtered rows with merged-cell values removed. + list[CellRow]: Rows where any cell whose column index falls inside a merged range has been removed. + - Rows with no remaining cells are omitted. + - Cell entries with non-integer column keys are preserved. + - `links` are retained only for cells that remain; if a row has no links after filtering, `links` is set to None. """ if not rows or not merged_cells: return rows @@ -816,23 +830,26 @@ def collect_sheet_raw_data( formulas_map_data: WorkbookFormulasMap | None = None, colors_map_data: WorkbookColorsMap | None = None, ) -> dict[str, SheetRawData]: - """Collect per-sheet raw data from extraction artifacts. - - Args: - cell_data: Extracted cell rows per sheet. - shape_data: Extracted shapes per sheet. - chart_data: Extracted charts per sheet. - merged_cell_data: Extracted merged cells per sheet. - workbook: xlwings workbook instance. - mode: Extraction mode. - print_area_data: Optional print area data per sheet. - auto_page_break_data: Optional auto page-break data per sheet. - formulas_map_data: Optional formulas map data. - colors_map_data: Optional colors map data. - include_merged_values_in_rows: Whether to keep merged values in rows. - + """ + Collect per-sheet raw extraction data and assemble SheetRawData for each sheet. 
+ + For each sheet in cell_data this returns a SheetRawData containing rows (optionally excluding values contributed by merged cells), shapes, charts (omitted in "light" mode), detected table candidates, print/auto-print areas, per-sheet formulas map, per-sheet colors map, and merged cell ranges. + + Parameters: + cell_data (CellData): Extracted cell rows keyed by sheet name. + shape_data (ShapeData): Extracted shapes keyed by sheet name. + chart_data (ChartData): Extracted charts keyed by sheet name. + merged_cell_data (MergedCellData): Merged cell ranges keyed by sheet name. + workbook (xw.Book): xlwings workbook used to resolve sheets and detect tables. + mode (ExtractionMode): Extraction mode; when "light", charts are omitted. + include_merged_values_in_rows (bool): If False, remove values that originate from merged cells when building row data. + print_area_data (PrintAreaData | None): Optional print areas keyed by sheet name. + auto_page_break_data (PrintAreaData | None): Optional auto page-break areas keyed by sheet name. + formulas_map_data (WorkbookFormulasMap | None): Optional per-sheet formulas map to include in SheetRawData. + colors_map_data (WorkbookColorsMap | None): Optional per-sheet colors map to include in SheetRawData. + Returns: - Mapping of sheet name to raw sheet data. + dict[str, SheetRawData]: Mapping from sheet name to the assembled SheetRawData. """ result: dict[str, SheetRawData] = {} for sheet_name, rows in cell_data.items(): @@ -861,13 +878,14 @@ def collect_sheet_raw_data( def run_extraction_pipeline(inputs: ExtractionInputs) -> PipelineResult: - """Run the full extraction pipeline and return the result. - - Args: - inputs: Resolved pipeline inputs. - + """ + Execute the configured extraction pipeline and produce the extraction result. + + Parameters: + inputs (ExtractionInputs): Resolved pipeline inputs controlling which extraction steps run. + Returns: - PipelineResult with workbook data, artifacts, and execution state. 
+ PipelineResult: Contains the constructed workbook data, collected artifacts, and pipeline execution state (including COM attempt/success and any fallback reason). """ plan = build_pipeline_plan(inputs) artifacts = run_pipeline(plan.pre_com_steps, inputs, ExtractionArtifacts()) @@ -941,15 +959,16 @@ def build_cells_tables_workbook( artifacts: ExtractionArtifacts, reason: str, ) -> WorkbookData: - """Build a WorkbookData containing cells + table_candidates (fallback). - - Args: - inputs: Pipeline inputs. - artifacts: Collected artifacts from extraction steps. - reason: Reason to log for fallback. - + """ + Builds a WorkbookData from available cell rows and detected table candidates to use as a fallback when COM-based extraction is not used or has failed. + + Parameters: + inputs (ExtractionInputs): Resolved extraction inputs that control which extra maps and merged-value handling to include. + artifacts (ExtractionArtifacts): Collected artifacts produced by pre-COM extraction steps; cell rows and any existing maps are consumed from here. + reason (str): Short description of why the fallback is being used (logged for debugging). + Returns: - WorkbookData constructed from cells and detected tables. + WorkbookData: A workbook composed from the available per-sheet cell rows, detected table candidates, merged-cell information, and any resolved formulas and colors maps. Shapes and charts are empty in this fallback path; formulas and colors maps are extracted from artifacts or from the Openpyxl backend when requested and not already present. 
""" logger.debug("Building fallback workbook: %s", reason) backend = OpenpyxlBackend(inputs.file_path) @@ -995,4 +1014,4 @@ def build_cells_tables_workbook( merged_cells=merged_cells, ) raw = WorkbookRawData(book_name=inputs.file_path.name, sheets=sheets) - return build_workbook_data(raw) + return build_workbook_data(raw) \ No newline at end of file diff --git a/src/exstruct/core/workbook.py b/src/exstruct/core/workbook.py index 3f33822..199eca6 100644 --- a/src/exstruct/core/workbook.py +++ b/src/exstruct/core/workbook.py @@ -19,15 +19,16 @@ def openpyxl_workbook( file_path: Path, *, data_only: bool, read_only: bool ) -> Iterator[Any]: - """Open an openpyxl workbook and ensure it is closed. - - Args: - file_path: Workbook path. - data_only: Whether to read formula results. - read_only: Whether to open in read-only mode. - + """ + Open an openpyxl Workbook for temporary use and ensure it is closed on exit. + + Parameters: + file_path (Path): Path to the workbook file. + data_only (bool): If True, read stored cell values instead of formulas. + read_only (bool): If True, open the workbook in optimized read-only mode. + Yields: - openpyxl workbook instance. + openpyxl.workbook.workbook.Workbook: The opened workbook instance. """ with warnings.catch_warnings(): warnings.filterwarnings( @@ -113,4 +114,4 @@ def _find_open_workbook(file_path: Path) -> xw.Book | None: except Exception as exc: logger.debug("Failed to inspect open Excel workbooks. (%r)", exc) return None - return None + return None \ No newline at end of file diff --git a/src/exstruct/engine.py b/src/exstruct/engine.py index 1d5b3b9..ef16a89 100644 --- a/src/exstruct/engine.py +++ b/src/exstruct/engine.py @@ -261,6 +261,24 @@ def _include_auto_print_areas(self) -> bool: def _filter_sheet( self, sheet: SheetData, include_auto_override: bool | None = None ) -> SheetData: + """ + Return a filtered copy of a SheetData according to the engine's output filters and resolved size/print-area flags. 
+ + Parameters: + sheet: The original SheetData to filter. + include_auto_override: If not None, overrides the engine's automatic decision for including auto page-break areas; if None, the engine's auto rule is used. + + Returns: + A new SheetData where: + - rows are kept only if include_rows is enabled; otherwise an empty list. + - shapes are kept only if include_shapes is enabled; when kept and shape-size inclusion is disabled, each shape's width and height are cleared. + - charts are kept only if include_charts is enabled; when kept and chart-size inclusion is disabled, each chart's width and height are cleared. + - table_candidates are kept only if include_tables is enabled; otherwise an empty list. + - colors_map and formulas_map are preserved as-is. + - print_areas are kept only if print areas are included by the engine; otherwise an empty list. + - auto_print_areas are kept only if auto page-break areas are included (after applying include_auto_override); otherwise an empty list. + - merged_cells are kept only if include_merged_cells is enabled; otherwise set to None. + """ include_shape_size, include_chart_size = self._resolve_size_flags() include_print_areas = self._include_print_areas() include_auto_print_areas = ( @@ -335,15 +353,15 @@ def extract( self, file_path: str | Path, *, mode: ExtractionMode | None = None ) -> WorkbookData: """ - Extract a workbook and return normalized workbook data. - - Args: - file_path: Path to the .xlsx/.xlsm/.xls file to extract. - mode: Extraction mode; defaults to the engine's StructOptions.mode. - - light: COM-free; cells, table candidates, and print areas only. - - standard: Shapes with text/arrows plus charts; print areas included; - size fields retained but hidden from default output. - - verbose: All shapes (with size) and charts (with size). + Produce a normalized WorkbookData extracted from the given workbook file. + + Parameters: + file_path (str | Path): Path to the .xlsx/.xlsm/.xls file to extract. 
+ mode (ExtractionMode | None): Extraction mode to use; if None the engine's configured mode is used. + Modes: "light", "standard", "verbose". + + Returns: + WorkbookData: Normalized workbook data extracted from the file. """ chosen_mode = mode or self.options.mode include_auto_page_breaks = ( @@ -575,4 +593,4 @@ def process( export_pdf(normalized_file_path, pdf_path) if image: images_dir = pdf_path.parent / f"{pdf_path.stem}_images" - export_sheet_images(normalized_file_path, images_dir, dpi=dpi) + export_sheet_images(normalized_file_path, images_dir, dpi=dpi) \ No newline at end of file diff --git a/src/exstruct/render/__init__.py b/src/exstruct/render/__init__.py index ad6cc8a..e30dbab 100644 --- a/src/exstruct/render/__init__.py +++ b/src/exstruct/render/__init__.py @@ -79,7 +79,15 @@ def _require_pdfium() -> ModuleType: def export_sheet_images( excel_path: str | Path, output_dir: str | Path, dpi: int = 144 ) -> list[Path]: - """Export each sheet as PNG (via PDF then pypdfium2 rasterization) and return paths in sheet order.""" + """ + Export each worksheet in the given Excel workbook to PNG files and return the image paths in workbook order. + + Returns: + paths (list[Path]): Paths to the generated PNG files, ordered by the corresponding worksheets. + + Raises: + RenderError: If export or rendering fails. + """ normalized_excel_path = Path(excel_path) normalized_output_dir = Path(output_dir) normalized_output_dir.mkdir(parents=True, exist_ok=True) @@ -106,6 +114,17 @@ def export_sheet_images( def _sanitize_sheet_filename(name: str) -> str: + """ + Create a filesystem-safe filename derived from an Excel sheet name. + + Replaces characters that are not allowed in filenames (\\/:*?"<>|) with underscores, trims surrounding whitespace, and returns "sheet" if the result is empty. + + Parameters: + name (str): Original sheet name. + + Returns: + safe_name (str): Filename-safe string derived from `name`. 
+ """ return "".join("_" if c in '\\/:*?"<>|' else c for c in name).strip() or "sheet" @@ -122,11 +141,27 @@ class _SheetApiProtocol(Protocol): def ExportAsFixedFormat( # noqa: N802 self, file_format: int, output_path: str, *args: object, **kwargs: object - ) -> None: ... + ) -> None: """ + Export the sheet or workbook to a fixed-format file (for example, PDF or XPS). + + Parameters: + file_format (int): Excel XlFixedFormatType enum value indicating the output format (e.g., the constant for PDF). + output_path (str): Filesystem path where the fixed-format file will be written. + *args (object): Additional positional arguments forwarded to the underlying Excel COM ExportAsFixedFormat call. + **kwargs (object): Additional keyword arguments forwarded to the underlying Excel COM ExportAsFixedFormat call. + """ + ... def _iter_sheet_apis(wb: xw.Book) -> list[tuple[int, str, _SheetApiProtocol]]: - """Return sheet index, name, and COM api handle in order.""" + """ + Enumerate workbook sheets and return each sheet's zero-based index, display name, and COM API handle in workbook order. + + If direct COM access to Worksheets is unavailable, falls back to iterating wb.sheets to build the same list. + + Returns: + List[tuple[int, str, _SheetApiProtocol]]: Tuples of (zero-based sheet index, sheet name, sheet COM API handle) in workbook order. + """ try: ws_collection = getattr(getattr(wb, "api", None), "Worksheets", None) if ws_collection is None: @@ -152,9 +187,10 @@ def _iter_sheet_apis(wb: xw.Book) -> list[tuple[int, str, _SheetApiProtocol]]: def _build_sheet_export_plan( wb: xw.Book, ) -> list[tuple[str, _SheetApiProtocol, str | None]]: - """Return export plan rows for sheets and their print areas. - - Each item is (sheet_name, sheet_api, print_area). + """ + Build an ordered export plan mapping each worksheet to its print areas. + + Each returned tuple is (sheet_name, sheet_api, print_area). 
The list preserves workbook sheet order; for sheets with no defined print areas `print_area` is `None`, and for sheets with multiple print areas there is one tuple per area. """ plan: list[tuple[str, _SheetApiProtocol, str | None]] = [] for _, sheet_name, sheet_api in _iter_sheet_apis(wb): @@ -168,7 +204,17 @@ def _build_sheet_export_plan( def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: - """Return print areas for a sheet API, split into individual ranges.""" + """ + Extract the sheet's print-area ranges as a list of strings. + + Retrieves the PageSetup.PrintArea value from the provided sheet API, splits it by commas while respecting single-quoted sections, and returns each range as a separate string. If the sheet has no print area or the property is inaccessible, an empty list is returned. + + Parameters: + sheet_api (_SheetApiProtocol): Excel sheet API object exposing a `PageSetup.PrintArea` attribute. + + Returns: + list[str]: List of print-area range strings in the order they appear, or an empty list if none are defined or on access failure. + """ try: page_setup = getattr(sheet_api, "PageSetup", None) if page_setup is None: @@ -182,7 +228,18 @@ def _extract_print_areas(sheet_api: _SheetApiProtocol) -> list[str]: def _split_csv_respecting_quotes(raw: str) -> list[str]: - """Split a CSV-like string while keeping commas inside single quotes intact.""" + """ + Split a comma-separated string into parts while treating single-quoted sections as atomic. + + This function splits raw on commas that are not inside single quotes. Text enclosed in single quotes is preserved (including internal commas). Two consecutive single quotes inside a quoted section are treated as an escaped single-quote pair. Leading and trailing whitespace is trimmed from each part and empty parts are removed. + + Parameters: + raw (str): The input CSV-like string that may contain single-quoted segments. 
+ + Returns: + list[str]: A list of non-empty tokens obtained from splitting `raw` by unquoted commas, + with surrounding whitespace removed and quoted segments preserved. + """ parts: list[str] = [] buf: list[str] = [] in_quote = False @@ -216,7 +273,18 @@ def _rename_pages_for_print_area( base_index: int, safe_name: str, ) -> list[Path]: - """Rename multi-page outputs to unique prefixes for print areas.""" + """ + Rename the given image files so each gets a unique numeric prefix based on a base index and a safe sheet name. + + Parameters: + paths (list[Path]): Existing image files for a single sheet or print area (may include per-page suffixes). + output_dir (Path): Directory where renamed files will reside. + base_index (int): Zero-based starting index used to compute the numeric prefix for each output file. + safe_name (str): Filesystem-safe base name to use after the numeric prefix. + + Returns: + list[Path]: Paths to the renamed files in the same order as input, each named "{index:02d}_{safe_name}.png". + """ renamed: list[Path] = [] for path in paths: page_index = _page_index_from_suffix(path.stem) @@ -229,7 +297,17 @@ def _rename_pages_for_print_area( def _page_index_from_suffix(stem: str) -> int: - """Extract zero-based page index from a _pNN suffix when present.""" + """ + Extracts a zero-based page index from a filename stem ending with a "_pNN" numeric suffix. + + If the stem ends with "_p" followed by digits, returns that number minus one. If the suffix is missing, non-numeric, or less than 1, returns 0. + + Parameters: + stem (str): Filename stem to parse. + + Returns: + int: Zero-based page index derived from the "_pNN" suffix, or 0 when no valid suffix is present. + """ if "_p" not in stem: return 0 base, suffix = stem.rsplit("_p", 1) @@ -249,13 +327,16 @@ def _export_sheet_pdf( ignore_print_areas: bool, print_area: str | None = None, ) -> None: - """Export a sheet to PDF via Excel COM. 
- + """ + Export the given worksheet to a PDF file, optionally applying a temporary print area. + + If `print_area` is provided, it is applied to the sheet's PageSetup.PrintArea before exporting and restored afterwards. The function attempts to call ExportAsFixedFormat with an IgnorePrintAreas keyword; if that call fails due to an unexpected COM signature, it retries with a minimal argument set. + Args: - sheet_api: Target worksheet COM api. - pdf_path: Output PDF path. - ignore_print_areas: Whether to ignore print areas. - print_area: Optional print area string to apply for this export. + sheet_api: COM-like worksheet API exposing `PageSetup` and `ExportAsFixedFormat`. + pdf_path (Path): Filesystem path to write the PDF to. + ignore_print_areas (bool): If True, request that Excel ignore sheet print areas during export. + print_area (str | None): Optional print area string to apply for this export; if None, the sheet's current print area is left unchanged. """ original_print_area: object | None = None page_setup = None @@ -282,7 +363,18 @@ def _export_sheet_pdf( def _ensure_pdfium(use_subprocess: bool) -> ModuleType | None: - """Return pdfium module when needed, or None for subprocess rendering.""" + """ + Ensure the pypdfium2 dependency is available and return the pdfium module for in-process rendering. + + Parameters: + use_subprocess (bool): When True, confirm pypdfium2 is installed for subprocess rendering but do not keep the module in-process; when False, import and return the pdfium module for direct use. + + Returns: + ModuleType | None: The imported `pdfium` module when `use_subprocess` is False, or `None` when `use_subprocess` is True. + + Raises: + MissingDependencyError: If pypdfium2 (and required extras) is not installed. 
+ """ if use_subprocess: _require_pdfium() return None @@ -297,7 +389,20 @@ def _export_sheet_images_with_app( use_subprocess: bool, pdfium: ModuleType | None, ) -> list[Path]: - """Export sheet images using Excel COM and PDF rendering.""" + """ + Export each worksheet of an Excel workbook to PNG images by exporting sheets to per-sheet PDFs and rendering those PDFs. + + Parameters: + excel_path (Path): Path to the source Excel workbook. + output_dir (Path): Directory where generated PNGs will be written. + temp_dir (Path): Temporary directory for per-sheet intermediate PDF files. + dpi (int): Dots per inch used when rasterizing PDF pages. + use_subprocess (bool): If True, render PDF pages in a subprocess; otherwise render in-process. + pdfium (ModuleType | None): In-process pypdfium2 module when rendering in-process, or None when subprocess rendering is used. + + Returns: + list[Path]: Paths to generated PNG images in the order corresponding to the workbook's sheets and print-area splits. + """ written: list[Path] = [] app: xw.App | None = None wb: xw.Book | None = None @@ -364,7 +469,15 @@ def _render_sheet_images( dpi: int, use_subprocess: bool, ) -> list[Path]: - """Render sheet PDF to PNGs using the configured renderer.""" + """ + Render a sheet PDF to one or more PNG files using either a subprocess or in-process renderer. + + Returns: + paths (list[Path]): Paths to the generated PNG files in output order. + + Raises: + RenderError: If in-process rendering is requested but the `pypdfium2` module (`pdfium`) is not provided. + """ if use_subprocess: return _render_pdf_pages_subprocess( sheet_pdf, @@ -391,14 +504,34 @@ def _normalize_multipage_paths( base_index: int, safe_name: str, ) -> list[Path]: - """Normalize multi-page outputs to unique prefixes when needed.""" + """ + Assign distinct, ordered filenames for multi-page sheet outputs. + + If `paths` contains a single file, the list is returned unchanged. 
If `paths` contains multiple files, each file is given a unique, numbered filename in `output_dir` using `base_index` and `safe_name` so pages are ordered and do not collide. + + Parameters: + paths (list[Path]): Existing file paths for a sheet's rendered pages. + output_dir (Path): Directory containing or intended to contain the output files. + base_index (int): Zero-based starting index used to compute numeric prefixes for filenames. + safe_name (str): Filesystem-safe base name included in the generated filenames. + + Returns: + list[Path]: Paths to the resulting files in `output_dir`. When multiple input paths are provided, returned paths reflect the new, uniquely prefixed filenames. + """ if len(paths) <= 1: return paths return _rename_pages_for_print_area(paths, output_dir, base_index, safe_name) def _use_render_subprocess() -> bool: - """Return True when PDF->PNG rendering should run in a subprocess.""" + """ + Decide whether PDF-to-PNG rendering should be performed in a subprocess. + + Reads the environment variable EXSTRUCT_RENDER_SUBPROCESS and compares its value case-insensitively. Subprocess rendering is disabled when the variable is set to "0" or "false"; if the variable is unset or set to any other value, subprocess rendering is enabled. + + Returns: + `True` if subprocess rendering is enabled, `False` otherwise. 
+ """ return os.getenv("EXSTRUCT_RENDER_SUBPROCESS", "1").lower() not in {"0", "false"} @@ -492,4 +625,4 @@ def _render_pdf_pages_worker( queue.put({"error": str(exc)}) -__all__ = ["export_pdf", "export_sheet_images"] +__all__ = ["export_pdf", "export_sheet_images"] \ No newline at end of file diff --git a/tests/backends/test_auto_page_breaks.py b/tests/backends/test_auto_page_breaks.py index dd472d5..8c0e9f2 100644 --- a/tests/backends/test_auto_page_breaks.py +++ b/tests/backends/test_auto_page_breaks.py @@ -16,6 +16,13 @@ def test_extract_passes_auto_page_break_flag( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: + """ + Verify that extract_workbook is invoked with include_auto_page_breaks set to True. + + Creates a fake extractor that captures the include_auto_page_breaks argument, replaces + exstruct.engine.extract_workbook with it, runs ExStructEngine.extract against a dummy + workbook path configured to export auto page breaks, and asserts the captured flag is True. + """ called: dict[str, object] = {} def fake_extract( @@ -31,6 +38,20 @@ def fake_extract( include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: + """ + Test stub for workbook extraction that records the auto page breaks flag. + + This fake extractor captures the value of `include_auto_page_breaks` in the outer + `called` mapping and returns a minimal `WorkbookData` with `book_name` set to + the provided path's filename and an empty `sheets` mapping. + + Parameters: + path (Path): Filesystem path used to derive the returned `WorkbookData.book_name`. + include_auto_page_breaks (bool): Flag whose value is written to `called["include_auto_page_breaks"]`. + + Returns: + WorkbookData: A minimal workbook data object with `book_name` set to `path.name` and no sheets. 
+ """ called["include_auto_page_breaks"] = include_auto_page_breaks return WorkbookData(book_name=path.name, sheets={}) @@ -82,4 +103,4 @@ class _DummyWorkbook: sheets = [_FailingSheet()] backend = ComBackend(_DummyWorkbook()) - assert backend.extract_auto_page_breaks() == {} + assert backend.extract_auto_page_breaks() == {} \ No newline at end of file diff --git a/tests/backends/test_backends.py b/tests/backends/test_backends.py index cbebc72..0046896 100644 --- a/tests/backends/test_backends.py +++ b/tests/backends/test_backends.py @@ -79,6 +79,12 @@ def test_openpyxl_backend_extract_formulas_map_returns_none_on_failure( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: def fake_formulas_map(file_path: Path) -> object: + """ + Test helper that always raises a RuntimeError to simulate a failure when extracting a formulas map. + + Raises: + RuntimeError: with message "boom". + """ raise RuntimeError("boom") monkeypatch.setattr( @@ -120,6 +126,15 @@ def test_com_backend_extract_formulas_map_returns_none_on_failure( monkeypatch: MonkeyPatch, ) -> None: def fake_formulas_map(workbook: object) -> object: + """ + Test stub that simulates a failure by always raising a RuntimeError. + + Parameters: + workbook (object): Workbook-like object (ignored); present to match the real function's signature. + + Raises: + RuntimeError: Always raised with message "boom". + """ raise RuntimeError("boom") monkeypatch.setattr( @@ -157,6 +172,11 @@ class _DummyWorkbook: def test_openpyxl_backend_extract_print_areas(tmp_path: Path) -> None: + """ + Verifies that OpenpyxlBackend.extract_print_areas reads an openpyxl workbook's print area and returns the corresponding zero-based ranges keyed by sheet name. + + Creates an in-memory workbook with a single sheet named "Sheet1", sets its print area to "A1:B2", saves and loads it via OpenpyxlBackend, then asserts the sheet is present, has at least one area, and that the first area's r1 and c1 are 1 and 0 respectively. 
+ """ wb = Workbook() ws = wb.active ws.title = "Sheet1" @@ -178,6 +198,11 @@ def test_openpyxl_backend_extract_print_areas(tmp_path: Path) -> None: def test_openpyxl_backend_extract_print_areas_returns_empty_on_error( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: + """ + Ensure OpenpyxlBackend.extract_print_areas returns an empty dict when the workbook loader raises an error. + + Verifies that the backend handles errors from the underlying workbook opening function by returning an empty mapping of print areas. + """ def _raise(*_args: object, **_kwargs: object) -> None: raise RuntimeError("boom") @@ -226,31 +251,72 @@ def test_com_backend_parse_print_area_range_invalid() -> None: class _Location: def __init__(self, row: int | None = None, col: int | None = None) -> None: + """ + Initialize the location with row and column values. + + Parameters: + row (int | None): Row index or None. + col (int | None): Column index or None. + """ self.Row = row self.Column = col class _BreakItem: def __init__(self, row: int | None = None, col: int | None = None) -> None: + """ + Initialize the break item with an optional sheet location. + + Parameters: + row (int | None): Row index (1-based) for the location, or None if unspecified. + col (int | None): Column index (1-based) for the location, or None if unspecified. + """ self.Location = _Location(row=row, col=col) class _Breaks: def __init__(self, items: list[_BreakItem]) -> None: + """ + Initialize the Breaks collection from a list of break items. + + Parameters: + items (list[_BreakItem]): Sequence of `_BreakItem` instances representing page break entries; ordering corresponds to 1-based access via `Item`. + """ self._items = items self.Count = len(items) def Item(self, index: int) -> _BreakItem: + """ + Return the break item at the given 1-based position. + + Parameters: + index (int): 1-based position of the break to retrieve. + + Returns: + _BreakItem: The break item at the specified position. 
+ """ return self._items[index - 1] class _RangeRows: def __init__(self, count: int) -> None: + """ + Initialize the fake rows object with a fixed count. + + Parameters: + count (int): Value the fake rows object reports via its `Count` attribute. + """ self.Count = count class _RangeCols: def __init__(self, count: int) -> None: + """ + Initialize the fake columns object with a fixed count. + + Parameters: + count (int): Value the fake columns object reports via its `Count` attribute. + """ self.Count = count @@ -271,6 +337,16 @@ class _PageSetup: class _SheetApi: def __init__(self) -> None: + """ + Initialize a fake sheet API used by COM backend tests with default page and range state. + + Creates default attributes: + - DisplayPageBreaks set to False. + - PageSetup populated with a default PrintArea. + - UsedRange populated with a default Address. + - HPageBreaks containing one horizontal break at row 2. + - VPageBreaks containing one vertical break at column 2. + """ self.DisplayPageBreaks = False self.PageSetup = _PageSetup() self.UsedRange = _UsedRange() @@ -278,6 +354,15 @@ def __init__(self) -> None: self.VPageBreaks = _Breaks([_BreakItem(col=2)]) def Range(self, _addr: str) -> _Range: + """ + Create and return a Range wrapper for the given Excel-style address. + + Parameters: + _addr (str): Excel-style address or range string (e.g., "A1", "A1:B2", or "Sheet1!A1:B2"). + + Returns: + _Range: An object representing the requested worksheet range. + """ return _Range() @@ -285,11 +370,21 @@ class _Sheet: name = "Sheet1" def __init__(self) -> None: + """ + Initialize a mock sheet and attach its API. + + Sets the `api` attribute to a new `_SheetApi` instance used by tests to simulate a sheet's COM-like API. + """ self.api = _SheetApi() class _DummyWorkbook: def __init__(self) -> None: + """ + Initialize a dummy workbook containing a single default sheet. 
+ + The instance provides a `sheets` attribute set to a list with one `_Sheet` object. + """ self.sheets = [_Sheet()] @@ -302,6 +397,15 @@ def test_com_backend_extract_auto_page_breaks_success() -> None: class _RestoreErrorSheetApi: def __init__(self) -> None: + """ + Initialize a mock sheet API with default page, range, and break attributes. + + Creates: + - `_display`: boolean flag for DisplayPageBreaks (defaults to False). + - `PageSetup`: a default page setup object. + - `UsedRange`: a default used-range object. + - `HPageBreaks` and `VPageBreaks`: horizontal and vertical break collections, initialized empty. + """ self._display = False self.PageSetup = _PageSetup() self.UsedRange = _UsedRange() @@ -310,15 +414,39 @@ def __init__(self) -> None: @property def DisplayPageBreaks(self) -> bool: + """ + Get whether displaying page breaks is enabled on the sheet. + + Returns: + `True` if page break display is enabled, `False` otherwise. + """ return self._display @DisplayPageBreaks.setter def DisplayPageBreaks(self, value: bool) -> None: + """ + Set the sheet's DisplayPageBreaks flag. + + Parameters: + value (bool): True to enable display of automatic page breaks. Passing False will trigger a restore failure. + + Raises: + RuntimeError: If `value` is False (restore failed). + """ if value is False: raise RuntimeError("restore failed") self._display = value def Range(self, _addr: str) -> _Range: + """ + Create and return a Range wrapper for the given Excel-style address. + + Parameters: + _addr (str): Excel-style address or range string (e.g., "A1", "A1:B2", or "Sheet1!A1:B2"). + + Returns: + _Range: An object representing the requested worksheet range. + """ return _Range() @@ -326,15 +454,25 @@ class _RestoreErrorSheet: name = "Sheet1" def __init__(self) -> None: + """ + Create a sheet object whose underlying API simulates an error when restoring DisplayPageBreaks. 
+ + This constructor assigns an instance of _RestoreErrorSheetApi to the `api` attribute so tests can exercise code paths that handle failures when restoring page-break state. + """ self.api = _RestoreErrorSheetApi() class _RestoreErrorWorkbook: def __init__(self) -> None: + """ + Create a mock workbook containing a single sheet that raises an error when restoring DisplayPageBreaks. + + The instance exposes a `sheets` attribute set to a list with one _RestoreErrorSheet(), which is used to simulate failures during page-break restoration in tests. + """ self.sheets = [_RestoreErrorSheet()] def test_com_backend_extract_auto_page_breaks_restore_error() -> None: backend = ComBackend(_RestoreErrorWorkbook()) areas = backend.extract_auto_page_breaks() - assert "Sheet1" in areas + assert "Sheet1" in areas \ No newline at end of file diff --git a/tests/backends/test_print_areas_openpyxl.py b/tests/backends/test_print_areas_openpyxl.py index 90b58b2..31362e3 100644 --- a/tests/backends/test_print_areas_openpyxl.py +++ b/tests/backends/test_print_areas_openpyxl.py @@ -14,6 +14,12 @@ def _make_book_with_print_area(path: Path) -> None: + """ + Create a simple Excel workbook with a single sheet named "Sheet1", set its print area to "A1:B2", write "x" to cell A1, save it to the given path, and close the file. + + Parameters: + path (Path): Filesystem path where the workbook will be saved. + """ wb = Workbook() ws = wb.active ws.title = "Sheet1" @@ -61,6 +67,12 @@ class _DefinedArea: class _DefinedNames: def get(self, _name: str) -> _DefinedArea: + """ + Create a default defined area object. + + Returns: + _DefinedArea: A new, empty/default defined-area instance. 
+ """ return _DefinedArea() class _DummyWorkbook: @@ -109,4 +121,4 @@ def test_append_print_areas_skips_invalid_ranges() -> None: areas: PrintAreaData = {} _append_print_areas(areas, "Sheet1", "A1:B2,INVALID") assert "Sheet1" in areas - assert len(areas["Sheet1"]) == 1 + assert len(areas["Sheet1"]) == 1 \ No newline at end of file diff --git a/tests/com/test_render_smoke.py b/tests/com/test_render_smoke.py index d85ec52..6557a98 100644 --- a/tests/com/test_render_smoke.py +++ b/tests/com/test_render_smoke.py @@ -37,6 +37,11 @@ def test_render_smoke_pdf_and_png(tmp_path: Path) -> None: def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: + """ + Verify that processing a workbook with multiple print ranges across four sheets produces an images directory containing exactly four PNG files. + + Uses the test asset 'assets/multiple_print_ranges_4sheets.xlsx', runs process_excel with image output enabled, and asserts the generated images directory exists and contains four .png images. + """ xlsx = ( Path(__file__).resolve().parents[1] / "assets" @@ -55,4 +60,4 @@ def test_render_multiple_print_ranges_images(tmp_path: Path) -> None: images_dir = out_json.parent / f"{out_json.stem}_images" images = list(images_dir.glob("*.png")) assert images_dir.exists() - assert len(images) == 4 + assert len(images) == 4 \ No newline at end of file diff --git a/tests/core/test_cells_utils.py b/tests/core/test_cells_utils.py index 69bf9da..9a03248 100644 --- a/tests/core/test_cells_utils.py +++ b/tests/core/test_cells_utils.py @@ -71,6 +71,11 @@ def test_detect_tables_openpyxl_respects_table_params( def test_normalize_formula_value_prefers_array_text() -> None: + """ + Verify that _normalize_formula_value prefers an array-like object's text and treats an empty string as no formula. + + Asserts that an object with a `text` attribute is converted to a formula string prefixed with '=' (e.g., "=SUM(A1:A3)"), and that an empty string is normalized to None. 
+ """ class _ArrayFormulaLike: text = "SUM(A1:A3)" @@ -143,6 +148,16 @@ class _DummySheet: used_range = _DummyUsedRange() def range(self, _start: object, _end: object) -> _DummyRange: + """ + Return a new _DummyRange representing a requested cell range. + + Parameters: + _start (object): Start coordinate or cell reference for the range request (ignored by this dummy implementation). + _end (object): End coordinate or cell reference for the range request (ignored by this dummy implementation). + + Returns: + _DummyRange: A fresh _DummyRange instance corresponding to the requested range. + """ return _DummyRange() class _DummyWorkbook: @@ -154,4 +169,4 @@ class _DummyWorkbook: assert sheet.formulas_map == { "=A1": [(1, 0)], "=SUM(A1)": [(2, 0)], - } + } \ No newline at end of file diff --git a/tests/core/test_mode_output.py b/tests/core/test_mode_output.py index fbcfdaa..202a87a 100644 --- a/tests/core/test_mode_output.py +++ b/tests/core/test_mode_output.py @@ -30,6 +30,11 @@ def _make_basic_book(path: Path) -> None: def _ensure_excel() -> None: + """ + Ensure Excel COM is available for tests and skip the current test if it is not. + + If the SKIP_COM_TESTS environment variable is set, this function skips the test. Otherwise it tries to start a hidden xlwings App and quits it; if starting the App fails, the function skips the test due to unavailable Excel COM. 
+ """ if os.getenv("SKIP_COM_TESTS"): pytest.skip("SKIP_COM_TESTS is set; skipping Excel-dependent test.") try: @@ -190,4 +195,4 @@ def test_CLI_defaults_to_stdout(tmp_path: Path) -> None: ] result = subprocess.run(cmd, capture_output=True, text=True) assert result.returncode == 0 - assert '"book_name": "book.xlsx"' in result.stdout + assert '"book_name": "book.xlsx"' in result.stdout \ No newline at end of file diff --git a/tests/core/test_pipeline.py b/tests/core/test_pipeline.py index 9596a76..d12dde3 100644 --- a/tests/core/test_pipeline.py +++ b/tests/core/test_pipeline.py @@ -223,6 +223,13 @@ def test_resolve_extraction_inputs_warns_on_xls_formulas( calls: list[str] = [] def _warn_once(key: str, message: str) -> None: + """ + Record a warning key in the shared `calls` list while ignoring the message. + + Parameters: + key (str): Identifier for the warning; appended to the enclosing test's `calls` list. + message (str): Ignored placeholder kept for compatibility with expected callback signature. + """ calls.append(key) _ = message @@ -393,6 +400,16 @@ def _fake( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Provide a placeholder colors map for testing that is always empty. + + Parameters: + include_default_background (bool): Accepted for signature compatibility; has no effect on the returned value. + ignore_colors (set[str] | None): Accepted for signature compatibility; has no effect on the returned value. + + Returns: + WorkbookColorsMap: An empty colors map with no sheets. + """ _ = _backend _ = include_default_background _ = ignore_colors @@ -427,6 +444,11 @@ def _fake_com( include_default_background: bool, ignore_colors: set[str] | None, ) -> None: + """ + No-op placeholder that simulates a COM backend extraction step without producing any side effects. + + This function accepts a COM backend and related flags but intentionally performs no operations; it is used in tests as a stub implementation. 
+ """ _ = _backend _ = include_default_background _ = ignore_colors @@ -438,6 +460,16 @@ def _fake_openpyxl( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Return an empty WorkbookColorsMap regardless of inputs. + + Parameters: + include_default_background (bool): Ignored; present for signature compatibility. + ignore_colors (set[str] | None): Ignored; present for signature compatibility. + + Returns: + WorkbookColorsMap: A colors map with no sheets. + """ _ = _backend _ = include_default_background _ = ignore_colors @@ -468,6 +500,12 @@ def test_step_extract_auto_page_breaks_com_sets_data( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: + """ + Return a stub mapping of sheet names to print areas containing a single 1x1 print area for "Sheet1". + + Returns: + dict[str, list[PrintArea]]: Mapping where "Sheet1" maps to a list with one PrintArea covering row 1, column 0 to row 1, column 0. + """ return {"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} monkeypatch.setattr(ComBackend, "extract_auto_page_breaks", _fake) @@ -502,12 +540,29 @@ def _fake_colors( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Return a fake workbook colors map used by tests. + + Parameters: + _backend (OpenpyxlBackend): Ignored backend parameter retained for signature compatibility. + include_default_background (bool): Whether the default background color would be included (ignored). + ignore_colors (set[str] | None): Set of color names to ignore (ignored). + + Returns: + object: A preconstructed colors map object used by tests. + """ _ = _backend _ = include_default_background _ = ignore_colors return colors_map def _fake_formulas(_: OpenpyxlBackend) -> object: + """ + Return the pre-captured formulas_map object. + + Returns: + The pre-captured `formulas_map` object. 
+ """ return formulas_map monkeypatch.setattr(OpenpyxlBackend, "extract_colors_map", _fake_colors) @@ -539,6 +594,12 @@ def test_step_extract_formulas_map_openpyxl_skips_on_failure( tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" ) -> None: def _raise(_: OpenpyxlBackend) -> object: + """ + Always raises a RuntimeError with the message "boom". + + Raises: + RuntimeError: always raised with message "boom". + """ raise RuntimeError("boom") monkeypatch.setattr(OpenpyxlBackend, "extract_formulas_map", _raise) @@ -569,6 +630,12 @@ def test_step_extract_formulas_map_com_skips_on_failure( tmp_path: Path, monkeypatch: MonkeyPatch, caplog: "pytest.LogCaptureFixture" ) -> None: def _raise(_: ComBackend) -> object: + """ + Always raises a RuntimeError with message "boom". + + Raises: + RuntimeError: Always raised by this helper. + """ raise RuntimeError("boom") monkeypatch.setattr(ComBackend, "extract_formulas_map", _raise) @@ -631,6 +698,16 @@ def test_step_extract_shapes_com_sets_data( shapes_data = {"Sheet1": [object()]} def _fake(_: object, *, mode: str) -> dict[str, list[object]]: + """ + Provide a stub that supplies the `shapes_data` mapping captured from the enclosing test. + + Parameters: + _ (object): Placeholder positional argument; ignored. + mode (str): Mode selector; ignored. + + Returns: + dict[str, list[object]]: Mapping of sheet names to lists of shape objects from `shapes_data`. + """ _ = mode return shapes_data @@ -660,11 +737,26 @@ def test_step_extract_charts_com_sets_data( charts = [object()] def _fake(_: object, *, mode: str) -> list[object]: + """ + Return the captured charts list. + + Parameters: + mode (str): Ignored; accepted for compatibility with callers. + + Returns: + list[object]: The charts list captured from the enclosing scope. + """ _ = mode return charts class _Sheet: def __init__(self, name: str) -> None: + """ + Initialize the instance with a display name. + + Parameters: + name (str): The name to assign to the instance. 
+ """ self.name = name class _Workbook: @@ -694,6 +786,11 @@ def test_step_extract_print_areas_com_skips_when_present( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _raise(_: ComBackend) -> object: + """ + Raise a RuntimeError indicating this code path must not be invoked. + + This function always raises RuntimeError("should not be called"). + """ raise RuntimeError("should not be called") monkeypatch.setattr(ComBackend, "extract_print_areas", _raise) @@ -721,6 +818,12 @@ def test_step_extract_print_areas_com_sets_data( tmp_path: Path, monkeypatch: MonkeyPatch ) -> None: def _fake(_: ComBackend) -> dict[str, list[PrintArea]]: + """ + Return a stub mapping of sheet names to print areas containing a single 1x1 print area for "Sheet1". + + Returns: + dict[str, list[PrintArea]]: Mapping where "Sheet1" maps to a list with one PrintArea covering row 1, column 0 to row 1, column 0. + """ return {"Sheet1": [PrintArea(r1=1, c1=0, r2=1, c2=0)]} monkeypatch.setattr(ComBackend, "extract_print_areas", _fake) @@ -754,6 +857,16 @@ def _fake_com( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Return a colors map object suitable for use as a COM backend response. + + Parameters: + include_default_background (bool): If true, the returned colors map should include the default background color. + ignore_colors (set[str] | None): Optional set of color identifiers to exclude from the returned map; `None` means no colors are excluded. + + Returns: + object: A colors map representing workbook-level color mappings. + """ _ = _backend _ = include_default_background _ = ignore_colors @@ -765,6 +878,12 @@ def _raise( include_default_background: bool, ignore_colors: set[str] | None, ) -> object: + """ + Placeholder backend sentinel that always raises a RuntimeError when invoked. + + Raises: + RuntimeError: Always raised with message "should not be called". 
+ """ _ = _backend _ = include_default_background _ = ignore_colors @@ -795,6 +914,16 @@ def test_run_com_pipeline_executes_steps(tmp_path: Path) -> None: calls: list[str] = [] def _step(_: ExtractionInputs, artifacts: ExtractionArtifacts, __: object) -> None: + """ + Test pipeline step that simulates shape extraction. + + Sets artifacts.shape_data to a mapping for "Sheet1" containing a single Shape and records invocation by appending "called" to the outer `calls` list. + + Parameters: + _ (ExtractionInputs): Unused extraction inputs placeholder. + artifacts (ExtractionArtifacts): Artifacts object to populate with shape data. + __ (object): Unused context placeholder. + """ calls.append("called") artifacts.shape_data = {"Sheet1": [Shape(id=1, text="", l=0, t=0)]} @@ -823,29 +952,87 @@ def test_run_extraction_pipeline_com_success( ) -> None: class _Sheet: def __init__(self, name: str) -> None: + """ + Initialize the instance with a display name. + + Parameters: + name (str): The name to assign to the instance. + """ self.name = name class _Sheets: def __init__(self) -> None: + """ + Initialize the object with a single default sheet named "Sheet1". + + Creates the internal mapping `self._sheets` and populates it with one `_Sheet` instance keyed by "Sheet1". + """ self._sheets = {"Sheet1": _Sheet("Sheet1")} def __getitem__(self, name: str) -> _Sheet: + """ + Access a worksheet by its name. + + Parameters: + name (str): The name of the sheet to retrieve. + + Returns: + _Sheet: The sheet object associated with `name`. + + Raises: + KeyError: If no sheet with the given name exists. + """ return self._sheets[name] class _Workbook: sheets = _Sheets() def _pre_step(_: ExtractionInputs, artifacts: ExtractionArtifacts) -> None: + """ + Populate artifacts with default minimal cell and merged-cell data for a single sheet. + + Parameters: + _ (ExtractionInputs): Unused extraction inputs placeholder. 
+ artifacts (ExtractionArtifacts): Mutable extraction artifacts that will be updated with + `cell_data` set to a single row for "Sheet1" and `merged_cell_data` set to an empty list + for "Sheet1". + """ artifacts.cell_data = {"Sheet1": [CellRow(r=1, c={"0": "A"})]} artifacts.merged_cell_data = {"Sheet1": []} def _fake_plan(_: ExtractionInputs) -> PipelinePlan: + """ + Create a fixed PipelinePlan for tests that forces COM usage and provides a single pre-COM step. + + Parameters: + _ (ExtractionInputs): Ignored input; present to match the PipelinePlan factory signature. + + Returns: + PipelinePlan: A plan with `pre_com_steps` set to a list containing `_pre_step`, `com_steps` empty, and `use_com` set to `True`. + """ return PipelinePlan(pre_com_steps=[_pre_step], com_steps=[], use_com=True) def _fake_detect_tables(_: object) -> list[str]: + """ + Provide a detector that always reports no table ranges. + + The input workbook-like object is ignored. + + Returns: + list[str]: An empty list of table range identifiers. + """ return [] def _fake_workbook(_: Path) -> object: + """ + Provide a context manager that yields a lightweight fake workbook for tests. + + Parameters: + _ (Path): Ignored file path parameter retained to match the real backend signature. + + Returns: + object: A context manager whose `__enter__` returns a new `_Workbook` instance and whose `__exit__` does not suppress exceptions (returns `None`). 
+ """ class _Context: def __enter__(self) -> _Workbook: return _Workbook() @@ -886,4 +1073,4 @@ def __exit__( result = run_extraction_pipeline(inputs) assert result.state.com_attempted is True assert result.state.com_succeeded is True - assert "Sheet1" in result.workbook.sheets + assert "Sheet1" in result.workbook.sheets \ No newline at end of file diff --git a/tests/core/test_pipeline_fallbacks.py b/tests/core/test_pipeline_fallbacks.py index 5099c87..322d9bc 100644 --- a/tests/core/test_pipeline_fallbacks.py +++ b/tests/core/test_pipeline_fallbacks.py @@ -51,6 +51,11 @@ def test_pipeline_fallback_skip_com_tests( def test_pipeline_fallback_com_unavailable( monkeypatch: MonkeyPatch, tmp_path: Path ) -> None: + """ + Verifies that the extraction pipeline falls back when COM access is unavailable. + + Creates a basic workbook, forces the COM-access entry point to raise, runs the extraction pipeline, and asserts that the pipeline records a fallback due to COM being unavailable (`FallbackReason.COM_UNAVAILABLE`), did not attempt COM (`com_attempted is False`), and that the resulting sheet "Sheet1" exists, contains rows, and has no shapes or charts. + """ path = tmp_path / "book.xlsx" _make_basic_book(path) monkeypatch.delenv("SKIP_COM_TESTS", raising=False) @@ -123,4 +128,4 @@ def _raise( sheet = result.workbook.sheets["Sheet1"] assert sheet.shapes == [] assert sheet.charts == [] - assert sheet.rows + assert sheet.rows \ No newline at end of file diff --git a/tests/engine/test_engine.py b/tests/engine/test_engine.py index 084ee12..3725036 100644 --- a/tests/engine/test_engine.py +++ b/tests/engine/test_engine.py @@ -38,6 +38,19 @@ def fake_extract( include_merged_cells: bool | None = None, include_merged_values_in_rows: bool = True, ) -> WorkbookData: + """ + Test helper that simulates workbook extraction for unit tests. 
+ + Records the received `mode` and `include_print_areas` into the outer `called` mapping and returns a minimal WorkbookData whose `book_name` is the input path's filename and whose `sheets` is empty. + + Parameters: + path (Path): Path to the workbook; its filename is used for the returned WorkbookData.book_name. + mode (str): Extraction mode passed through and recorded. + include_print_areas (bool): Whether print areas were requested; the value is recorded in `called`. + + Returns: + WorkbookData: A WorkbookData instance with `book_name` set to path.name and an empty `sheets` mapping. + """ called["mode"] = mode called["include_print_areas"] = include_print_areas return WorkbookData(book_name=path.name, sheets={}) @@ -280,4 +293,4 @@ def fake_images(file_path: Path, images_dir: Path, *, dpi: int) -> None: assert calls["pdf_path"].suffix == ".pdf" assert isinstance(calls["images_dir"], Path) assert calls["images_dir"].name.endswith("_images") - assert calls["dpi"] == 144 + assert calls["dpi"] == 144 \ No newline at end of file diff --git a/tests/models/test_models_export.py b/tests/models/test_models_export.py index ab41ad1..2ea52c2 100644 --- a/tests/models/test_models_export.py +++ b/tests/models/test_models_export.py @@ -22,6 +22,12 @@ def _sheet() -> SheetData: + """ + Create a sample SheetData containing one row, no shapes or charts, and a single table candidate. + + Returns: + SheetData: A SheetData instance with one CellRow (r=1, c={"0": "A"}), empty shapes and charts lists, and table_candidates set to ["A1:B2"]. 
+ """ return SheetData( rows=[CellRow(r=1, c={"0": "A"})], shapes=[], @@ -155,4 +161,4 @@ def test_sheet_json_includes_merged_cells_schema() -> None: ) data = json.loads(sheet.to_json()) assert data["merged_cells"]["schema"] == ["r1", "c1", "r2", "c2", "v"] - assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] + assert data["merged_cells"]["items"][0] == [1, 0, 1, 1, "merged"] \ No newline at end of file diff --git a/tests/render/test_render_init.py b/tests/render/test_render_init.py index 75731c6..3285144 100644 --- a/tests/render/test_render_init.py +++ b/tests/render/test_render_init.py @@ -583,6 +583,12 @@ def test_extract_print_areas_handles_exception() -> None: class _PageSetup: @property def PrintArea(self) -> str: + """ + Simulate accessing a worksheet's PrintArea and always raise an error to emulate a failure. + + Raises: + RuntimeError: Always raised to simulate an error when retrieving the PrintArea. + """ raise RuntimeError("boom") class _SheetApi: @@ -598,13 +604,33 @@ def test_iter_sheet_apis_prefers_worksheets_collection() -> None: class _WsApi: def __init__(self, name: str) -> None: + """ + Initialize the worksheet API stub with the given sheet name. + + Parameters: + name (str): The sheet's name to assign to the object's `Name` attribute. + """ self.Name = name class _Worksheets: def __init__(self) -> None: + """ + Initialize the fake worksheets collection. + + Sets the `Count` attribute to 2 to emulate a workbook with two worksheets. + """ self.Count = 2 def Item(self, index: int) -> _WsApi: + """ + Return a worksheet API stub for the sheet at the given index. + + Parameters: + index (int): One-based index of the worksheet within the workbook. + + Returns: + _WsApi: A worksheet API stub corresponding to the sheet at `index`. 
+ """ return _WsApi(f"Sheet{index}") class _Api: @@ -623,6 +649,12 @@ def test_export_pdf_propagates_render_error( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: def _raise() -> xw.App: + """ + Always raises a RenderError to simulate failure when obtaining an Excel application. + + Raises: + RenderError: Always raised with the message "boom". + """ raise RenderError("boom") monkeypatch.setattr(render, "_require_excel_app", _raise) @@ -649,9 +681,24 @@ class _SheetApi: pass def _fake_iter(_: xw.Book) -> list[tuple[int, str, _SheetApi]]: + """ + Return a single-item list that mimics iterating workbook sheets for tests. + + Returns: + A list with one tuple (index, sheet name, sheet API stub): (0, "Sheet1", _SheetApi()). + """ return [(0, "Sheet1", _SheetApi())] def _fake_extract(_: _SheetApi) -> list[str]: + """ + Provide two fake print-area ranges for testing. + + Parameters: + _ (_SheetApi): Ignored sheet API placeholder. + + Returns: + list[str]: Two print-area ranges: "A1:B2" and "C3:D4". + """ return ["A1:B2", "C3:D4"] monkeypatch.setattr(render, "_iter_sheet_apis", _fake_iter) @@ -678,10 +725,25 @@ def test_export_sheet_pdf_skips_invalid_print_area(tmp_path: Path) -> None: class _BadPageSetup: @property def PrintArea(self) -> str: + """ + Represents the worksheet's PrintArea setting as an Excel range string. + + Returns: + str: The PrintArea range (e.g., "A1:B2"). + """ return "A1:B2" @PrintArea.setter def PrintArea(self, _value: object) -> None: + """ + Simulated setter for PrintArea that always fails. + + Parameters: + _value (object): Ignored; the provided value is not used because the setter always raises. + + Raises: + RuntimeError: Always raised with the message "bad". 
+ """ raise RuntimeError("bad") class _SheetApi: @@ -690,6 +752,14 @@ class _SheetApi: def ExportAsFixedFormat( self, _file_format: int, _output_path: str, *args: object, **kwargs: object ) -> None: + """ + Simulate exporting a workbook/sheet to a fixed-format file by writing a minimal fake PDF header to the given path. + + Parameters: + _file_format (int): Ignored numeric format indicator. + _output_path (str): Filesystem path where the fake export file will be written. + *args, **kwargs: Additional arguments are accepted and ignored. + """ _ = args _ = kwargs @@ -730,6 +800,23 @@ def _fake_render( _dpi: int, _use_subprocess: bool, ) -> list[Path]: + """ + Simulates rendering a PDF sheet to image files for tests. + + On the first invocation this function returns an empty list to simulate a transient empty render result; on subsequent invocations it returns a single Path inside output_dir named "{sheet_index+1:02d}_{safe_name}.png". + + Parameters: + _pdfium: Ignored in the fake implementation (kept for signature compatibility). + _pdf_path: Ignored in the fake implementation (kept for signature compatibility). + output_dir (Path): Directory where the fake image path is located. + sheet_index (int): Zero-based index of the sheet; used to build the filename prefix. + safe_name (str): Sanitized sheet name used in the filename. + _dpi: Ignored in the fake implementation (kept for signature compatibility). + _use_subprocess: Ignored in the fake implementation (kept for signature compatibility). + + Returns: + list[Path]: Empty list on the first call, otherwise a list containing one Path pointing to the fake PNG file. + """ calls.append(1) if len(calls) == 1: return [] @@ -772,15 +859,35 @@ def test_export_sheet_pdf_does_not_swallow_export_errors(tmp_path: Path) -> None class _FlakyPageSetup: def __init__(self) -> None: + """ + Initialize a PageSetup-like test stub with a default print area and a setter call counter. 
+ + The instance starts with `_print_area` set to "A1" and `_set_calls` set to 0 to track how many times the print area setter has been invoked. + """ self._print_area: object = "A1" self._set_calls = 0 @property def PrintArea(self) -> object: + """ + Retrieve the current PrintArea value from the PageSetup stub. + + Returns: + print_area (object): The stored PrintArea value (typically a string) or whatever was set on the stub. + """ return self._print_area @PrintArea.setter def PrintArea(self, value: object) -> None: + """ + Set the PrintArea value on this stub PageSetup instance. + + Parameters: + value (object): The print area value to assign. + + Raises: + RuntimeError: If the setter is invoked more than once (simulates a restore failure). + """ if self._set_calls >= 1: raise RuntimeError("restore failed") self._print_area = value @@ -794,6 +901,12 @@ class _ExplodingSheetApi: def ExportAsFixedFormat( self, file_format: int, output_path: str, *args: object, **kwargs: object ) -> None: + """ + Simulate exporting to a fixed format; this stub always raises an export error. + + Raises: + RuntimeError: with message "export failed" when invoked. + """ _ = file_format _ = output_path _ = args @@ -807,4 +920,4 @@ def ExportAsFixedFormat( pdf_path, ignore_print_areas=False, print_area="A1:B2", - ) + ) \ No newline at end of file diff --git a/tests/utils.py b/tests/utils.py index c85b1df..8ed8bc0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -18,17 +18,18 @@ def parametrize( | None = None, scope: Literal["session", "package", "module", "class", "function"] | None = None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: - """Type-safe wrapper around pytest.mark.parametrize. - - Args: - argnames: Parameter names for the parametrized test. - argvalues: Parameter values for each test case. - indirect: Whether to treat parameters as fixtures. - ids: Optional case IDs or an ID factory. - scope: Optional fixture scope for parametrization. 
- + """ + Return a decorator that parametrizes a test callable with the given argument names and values. + + Parameters: + argnames: One or more parameter names (single string or sequence of strings) to inject into the test callable. + argvalues: An iterable of values or value-tuples to use for each generated test case. + indirect: If True or a sequence of names, treat corresponding parameters as fixtures and resolve them indirectly. + ids: Optional iterable of case identifiers or a callable that produces an identifier for each value. + scope: Optional fixture scope to apply when parameters are used as fixtures ("session", "package", "module", "class", or "function"). + Returns: - Decorator preserving the wrapped callable signature. + decorator: A decorator that applies the specified parametrization to a callable while preserving its signature. """ return cast( Callable[[Callable[P, R]], Callable[P, R]], @@ -39,4 +40,4 @@ def parametrize( ids=ids, scope=scope, ), - ) + ) \ No newline at end of file