From 30e14d8ab421580a8764c8c089a35b489848d60e Mon Sep 17 00:00:00 2001 From: Jared Dillard Date: Wed, 28 Jan 2026 11:59:48 -0800 Subject: [PATCH] Add hook to manipulate HTML --- docs/configuration.rst | 63 +++++++++++++ sphinx_simplepdf/builders/simplepdf.py | 125 ++++++++++++++++++++++++- 2 files changed, 183 insertions(+), 5 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 233c40e..4afcfb0 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -191,3 +191,66 @@ To reduce output noise the output can be filtered by a list of regular expressio ``simplepdf_weasyprint_filter = ["WARNING: Ignored"]`` To suppress all output, the quite flag `-q` should be used. + +simplepdf_html_hook +------------------- +.. versionadded:: 1.7 + +Path to a Python script and function that will be called to manipulate the HTML before PDF generation. +This allows custom transformations using BeautifulSoup. + +**Format:** ``"path/to/script.py:function_name"`` + +The path can be absolute or relative to the ``conf.py`` directory. + +**Example conf.py:** + +.. code-block:: python + + simplepdf_html_hook = "./hooks/pdf_hook.py:customize_html" + +**Example hook script (hooks/pdf_hook.py):** + +.. code-block:: python + + from bs4 import BeautifulSoup + + def customize_html(soup, app): + """ + Customize HTML before PDF generation. 
+ + Args: + soup: BeautifulSoup object with parsed HTML + app: Sphinx application instance + Returns: + Modified BeautifulSoup object + """ + # Example: Remove navigation elements + for nav in soup.find_all("nav"): + nav.decompose() + + # Example: Add watermark + watermark = soup.new_tag("div", attrs={"class": "watermark"}) + watermark.string = "DRAFT" + body = soup.find("body") + if body: + body.insert(0, watermark) + + return soup + +**Function signature:** + +The hook function must accept two arguments: + +:soup: A ``BeautifulSoup`` object containing the parsed HTML +:app: The Sphinx application instance (provides access to ``config``, ``srcdir``, ``outdir``, etc.) + +The function must return a ``BeautifulSoup`` object. + +**Error handling:** + +- If the script file is not found, a ``ConfigError`` is raised +- If the function is not found in the script, a ``ConfigError`` is raised +- If the hook returns ``None``, a warning is logged and the original HTML is used +- If the hook returns a non-BeautifulSoup type, an ``ExtensionError`` is raised +- If the hook raises an exception, it is wrapped in an ``ExtensionError`` diff --git a/sphinx_simplepdf/builders/simplepdf.py b/sphinx_simplepdf/builders/simplepdf.py index 509b486..fbc1669 100644 --- a/sphinx_simplepdf/builders/simplepdf.py +++ b/sphinx_simplepdf/builders/simplepdf.py @@ -1,8 +1,9 @@ +import importlib.util import os import re import subprocess from collections import Counter -from typing import Any +from typing import Any, Callable import sass import weasyprint @@ -10,6 +11,7 @@ from sphinx import __version__ from sphinx.application import Sphinx from sphinx.builders.singlehtml import SingleFileHTMLBuilder +from sphinx.errors import ConfigError, ExtensionError from sphinx.util import logging from sphinx_simplepdf.builders.debug import DebugPython @@ -114,7 +116,10 @@ def finish(self) -> None: with open(index_path, encoding="utf-8") as index_file: index_html = "".join(index_file.readlines()) - new_index_html = 
self._toctree_fix(index_html) + soup = BeautifulSoup(index_html, "html.parser") + soup = self._toctree_fix_soup(soup) + soup = self._execute_html_hook(soup) + new_index_html = str(soup) with open(index_path, "w", encoding="utf-8") as index_file: index_file.writelines(new_index_html) @@ -169,6 +174,108 @@ def finish(self) -> None: if (n == retries - 1) and not success: raise RuntimeError(f"maximum number of retries {retries} failed in weasyprint") + def _load_html_hook(self) -> Callable[[BeautifulSoup, Sphinx], BeautifulSoup] | None: + """Load the HTML hook function from the configured path. + + Returns: + The hook function if configured, None otherwise. + + Raises: + ConfigError: If the hook configuration is invalid. + """ + hook_path = self.config["simplepdf_html_hook"] + if hook_path is None: + return None + + # Parse the path:function_name format + if ":" not in hook_path: + raise ConfigError( + f"simplepdf_html_hook must be in format 'path/to/script.py:function_name', " + f"got '{hook_path}'" + ) + + script_path, function_name = hook_path.rsplit(":", 1) + + # Resolve path relative to conf.py directory + if not os.path.isabs(script_path): + script_path = os.path.join(self.app.confdir, script_path) + + # Check if file exists + if not os.path.isfile(script_path): + raise ConfigError( + f"simplepdf_html_hook script not found: {script_path}" + ) + + # Load the module + spec = importlib.util.spec_from_file_location("simplepdf_hook", script_path) + if spec is None or spec.loader is None: + raise ConfigError( + f"Failed to load simplepdf_html_hook script: {script_path}" + ) + + module = importlib.util.module_from_spec(spec) + try: + spec.loader.exec_module(module) + except Exception as e: + raise ConfigError( + f"Error loading simplepdf_html_hook script '{script_path}': {e}" + ) from e + + # Get the function + if not hasattr(module, function_name): + raise ConfigError( + f"Function '{function_name}' not found in simplepdf_html_hook script: {script_path}" + ) + + 
hook_func = getattr(module, function_name) + + if not callable(hook_func): + raise ConfigError( + f"simplepdf_html_hook '{function_name}' in '{script_path}' is not callable" + ) + + return hook_func + + def _execute_html_hook(self, soup: BeautifulSoup) -> BeautifulSoup: + """Execute the user-defined HTML hook if configured. + + Args: + soup: The BeautifulSoup object to pass to the hook. + + Returns: + The modified BeautifulSoup object. + + Raises: + ExtensionError: If the hook raises an exception or returns an invalid type. + """ + hook_func = self._load_html_hook() + if hook_func is None: + return soup + + logger.info("Executing simplepdf_html_hook") + + try: + result = hook_func(soup, self.app) + except Exception as e: + raise ExtensionError( + f"simplepdf_html_hook raised an exception: {e}" + ) from e + + if result is None: + logger.warning( + "simplepdf_html_hook returned None, using original HTML. " + "The hook should return a BeautifulSoup object." + ) + return soup + + if not isinstance(result, BeautifulSoup): + raise ExtensionError( + f"simplepdf_html_hook must return a BeautifulSoup object, " + f"got {type(result).__name__}" + ) + + return result + """ attempts to fix cases where a document has multiple chapters that have the same name. @@ -194,9 +301,16 @@ def finish(self) -> None: """ - def _toctree_fix(self, html): + def _toctree_fix_soup(self, soup: BeautifulSoup) -> BeautifulSoup: + """Fix toctree page numbering issues for documents with duplicate chapter names. + + Args: + soup: The BeautifulSoup object with parsed HTML. + + Returns: + The modified BeautifulSoup object. 
+ """ print("checking for potential toctree page numbering errors") - soup = BeautifulSoup(html, "html.parser") sidebar = soup.find("div", class_="sphinxsidebarwrapper") # sidebar contains the toctree @@ -315,7 +429,7 @@ def _toctree_fix(self, html): heading.attrs["class"] = class_attr logger.debug(soup.prettify(formatter="html")) - return str(soup) + return soup def setup(app: Sphinx) -> dict[str, Any]: @@ -330,6 +444,7 @@ def setup(app: Sphinx) -> dict[str, Any]: app.add_config_value("simplepdf_theme", "simplepdf_theme", "html", types=[str]) app.add_config_value("simplepdf_theme_options", {}, "html", types=[dict]) app.add_config_value("simplepdf_sidebars", {"**": ["localtoc.html"]}, "html", types=[dict]) + app.add_config_value("simplepdf_html_hook", None, "html", types=[str]) app.add_builder(SimplePdfBuilder) return {