useblocks · jdillard · Jan 28, 2026
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -191,3 +191,66 @@ To reduce output noise the output can be filtered by a list of regular expressio
 ``simplepdf_weasyprint_filter = ["WARNING: Ignored"]``
 
 To suppress all output, the quite flag `-q` should be used.
+
+simplepdf_html_hook
+-------------------
+.. versionadded:: 1.7
+
+Path to a Python script and function that will be called to manipulate the HTML before PDF generation.
+This allows custom transformations using BeautifulSoup.
+
+**Format:** ``"path/to/script.py:function_name"``
+
+The path can be absolute or relative to the ``conf.py`` directory.
+
+**Example conf.py:**
+
+.. code-block:: python
+
+   simplepdf_html_hook = "./hooks/pdf_hook.py:customize_html"
+
+**Example hook script (hooks/pdf_hook.py):**
+
+.. code-block:: python
+
+   from bs4 import BeautifulSoup
+
+   def customize_html(soup, app):
+       """
+       Customize HTML before PDF generation.
+
+       Args:
+           soup: BeautifulSoup object with parsed HTML
+           app: Sphinx application instance
+       Returns:
+           Modified BeautifulSoup object
+       """
+       # Example: Remove navigation elements
+       for nav in soup.find_all("nav"):
+           nav.decompose()
+
+       # Example: Add watermark
+       watermark = soup.new_tag("div", attrs={"class": "watermark"})
+       watermark.string = "DRAFT"
+       body = soup.find("body")
+       if body:
+           body.insert(0, watermark)
+
+       return soup
+
+**Function signature:**
+
+The hook function must accept two arguments:
+
+:soup: A ``BeautifulSoup`` object containing the parsed HTML
+:app: The Sphinx application instance (provides access to ``config``, ``srcdir``, ``outdir``, etc.)
+
+The function must return a ``BeautifulSoup`` object.
+
+**Error handling:**
+
+- If the script file is not found, a ``ConfigError`` is raised
+- If the function is not found in the script, a ``ConfigError`` is raised
+- If the hook returns ``None``, a warning is logged and the ``soup`` object that was passed to the hook is used, so any in-place changes the hook made are kept
+- If the hook returns a non-BeautifulSoup type, an ``ExtensionError`` is raised
+- If the hook raises an exception, it is wrapped in an ``ExtensionError``
diff --git a/sphinx_simplepdf/builders/simplepdf.py b/sphinx_simplepdf/builders/simplepdf.py
@@ -1,15 +1,17 @@
+import importlib.util
 import os
 import re
 import subprocess
 from collections import Counter
-from typing import Any
+from typing import Any, Callable
 
 import sass
 import weasyprint
 from bs4 import BeautifulSoup
 from sphinx import __version__
 from sphinx.application import Sphinx
 from sphinx.builders.singlehtml import SingleFileHTMLBuilder
+from sphinx.errors import ConfigError, ExtensionError
 from sphinx.util import logging
 
 from sphinx_simplepdf.builders.debug import DebugPython
@@ -114,7 +116,10 @@ def finish(self) -> None:
         with open(index_path, encoding="utf-8") as index_file:
             index_html = "".join(index_file.readlines())
 
-        new_index_html = self._toctree_fix(index_html)
+        soup = BeautifulSoup(index_html, "html.parser")
+        soup = self._toctree_fix_soup(soup)
+        soup = self._execute_html_hook(soup)
+        new_index_html = str(soup)
 
         with open(index_path, "w", encoding="utf-8") as index_file:
             index_file.writelines(new_index_html)
@@ -169,6 +174,108 @@ def finish(self) -> None:
                     if (n == retries - 1) and not success:
                         raise RuntimeError(f"maximum number of retries {retries} failed in weasyprint")
 
+    def _load_html_hook(self) -> Callable[[BeautifulSoup, Sphinx], BeautifulSoup] | None:
+        """Load the callable named by the ``simplepdf_html_hook`` config value.
+
+        The value has the form ``"path/to/script.py:function_name"``. A relative
+        script path is resolved against ``self.app.confdir``.
+
+        Returns:
+            The hook function, or None when the config value is None.
+
+        Raises:
+            ConfigError: If the value has no ``:``, the script is missing or fails
+                on execution, or the named attribute is absent or not callable.
+        """
+        import sys  # function-scope import; only this method needs it
+
+        hook_path = self.config["simplepdf_html_hook"]
+        if hook_path is None:
+            return None
+
+        if ":" not in hook_path:
+            raise ConfigError(
+                f"simplepdf_html_hook must be in format 'path/to/script.py:function_name', "
+                f"got '{hook_path}'"
+            )
+
+        # Split on the LAST colon so a colon inside the path (e.g. "C:\...") is kept.
+        script_path, function_name = hook_path.rsplit(":", 1)
+        if not os.path.isabs(script_path):
+            script_path = os.path.join(self.app.confdir, script_path)
+        if not os.path.isfile(script_path):
+            raise ConfigError(f"simplepdf_html_hook script not found: {script_path}")
+
+        module_name = "simplepdf_hook"
+        spec = importlib.util.spec_from_file_location(module_name, script_path)
+        if spec is None or spec.loader is None:
+            raise ConfigError(f"Failed to load simplepdf_html_hook script: {script_path}")
+
+        module = importlib.util.module_from_spec(spec)
+        # Register before executing, as the importlib source-file import recipe does,
+        # so code in the script that looks its module up in sys.modules can find it.
+        sys.modules[module_name] = module
+        try:
+            spec.loader.exec_module(module)
+        except Exception as e:
+            # Do not leave a partially initialised module registered.
+            sys.modules.pop(module_name, None)
+            raise ConfigError(
+                f"Error loading simplepdf_html_hook script '{script_path}': {e}"
+            ) from e
+
+        if not hasattr(module, function_name):
+            raise ConfigError(
+                f"Function '{function_name}' not found in simplepdf_html_hook script: {script_path}"
+            )
+
+        hook_func = getattr(module, function_name)
+        if not callable(hook_func):
+            raise ConfigError(
+                f"simplepdf_html_hook '{function_name}' in '{script_path}' is not callable"
+            )
+        return hook_func
+
+    def _execute_html_hook(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """Run the user-defined ``simplepdf_html_hook`` on the parsed HTML, if configured.
+
+        Args:
+            soup: The BeautifulSoup object to pass to the hook.
+
+        Returns:
+            The hook's result, or ``soup`` itself when no hook ran or it returned None.
+
+        Raises:
+            ConfigError: Propagated from ``_load_html_hook`` for an invalid setting.
+            ExtensionError: If the hook raises an exception or returns an invalid type.
+        """
+        hook_func = self._load_html_hook()
+        if hook_func is None:
+            return soup
+
+        logger.info("Executing simplepdf_html_hook")
+        try:
+            result = hook_func(soup, self.app)
+        except Exception as e:
+            raise ExtensionError(f"simplepdf_html_hook raised an exception: {e}") from e
+
+        if result is None:
+            # The same object that was handed to the hook is returned here, so any
+            # in-place edits the hook made are kept; the message must not claim
+            # that the unmodified HTML is used.
+            logger.warning(
+                "simplepdf_html_hook returned None, using the soup object passed to the hook "
+                "(in-place changes are kept). The hook should return a BeautifulSoup object."
+            )
+            return soup
+
+        if not isinstance(result, BeautifulSoup):
+            raise ExtensionError(
+                f"simplepdf_html_hook must return a BeautifulSoup object, "
+                f"got {type(result).__name__}"
+            )
+        return result
+
     """
     attempts to fix cases where a document has multiple chapters that have the same name.
 
@@ -194,9 +301,16 @@ def finish(self) -> None:
 
     """
 
-    def _toctree_fix(self, html):
+    def _toctree_fix_soup(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """Fix toctree page numbering issues for documents with duplicate chapter names.
+
+        Args:
+            soup: The BeautifulSoup object with parsed HTML.
+
+        Returns:
+            The modified BeautifulSoup object.
+        """
         print("checking for potential toctree page numbering errors")
-        soup = BeautifulSoup(html, "html.parser")
         sidebar = soup.find("div", class_="sphinxsidebarwrapper")
 
         # sidebar contains the toctree
@@ -315,7 +429,7 @@ def _toctree_fix(self, html):
                 heading.attrs["class"] = class_attr
 
         logger.debug(soup.prettify(formatter="html"))
-        return str(soup)
+        return soup
 
 
 def setup(app: Sphinx) -> dict[str, Any]:
@@ -330,6 +444,7 @@ def setup(app: Sphinx) -> dict[str, Any]:
     app.add_config_value("simplepdf_theme", "simplepdf_theme", "html", types=[str])
     app.add_config_value("simplepdf_theme_options", {}, "html", types=[dict])
     app.add_config_value("simplepdf_sidebars", {"**": ["localtoc.html"]}, "html", types=[dict])
+    app.add_config_value("simplepdf_html_hook", None, "html", types=[str])
     app.add_builder(SimplePdfBuilder)
 
     return {