Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,66 @@ To reduce output noise the output can be filtered by a list of regular expressio
``simplepdf_weasyprint_filter = ["WARNING: Ignored"]``

To suppress all output, the quiet flag `-q` should be used.

simplepdf_html_hook
-------------------
.. versionadded:: 1.7

Path to a Python script and function that will be called to manipulate the HTML before PDF generation.
This allows custom transformations using BeautifulSoup.

**Format:** ``"path/to/script.py:function_name"``

The path can be absolute or relative to the ``conf.py`` directory.

**Example conf.py:**

.. code-block:: python

simplepdf_html_hook = "./hooks/pdf_hook.py:customize_html"

**Example hook script (hooks/pdf_hook.py):**

.. code-block:: python

from bs4 import BeautifulSoup

def customize_html(soup, app):
"""
Customize HTML before PDF generation.

Args:
soup: BeautifulSoup object with parsed HTML
app: Sphinx application instance
Returns:
Modified BeautifulSoup object
"""
# Example: Remove navigation elements
for nav in soup.find_all("nav"):
nav.decompose()

# Example: Add watermark
watermark = soup.new_tag("div", attrs={"class": "watermark"})
watermark.string = "DRAFT"
body = soup.find("body")
if body:
body.insert(0, watermark)

return soup

**Function signature:**

The hook function must accept two arguments:

:soup: A ``BeautifulSoup`` object containing the parsed HTML
:app: The Sphinx application instance (provides access to ``config``, ``srcdir``, ``outdir``, etc.)

The function must return a ``BeautifulSoup`` object.

**Error handling:**

- If the script file is not found, a ``ConfigError`` is raised
- If the function is not found in the script, a ``ConfigError`` is raised
- If the hook returns ``None``, a warning is logged and the ``soup`` object that was passed to the hook is used (any in-place modifications the hook made to it are kept)
- If the hook returns any other non-BeautifulSoup type, an ``ExtensionError`` is raised
- If the hook raises an exception, it is wrapped in an ``ExtensionError``
125 changes: 120 additions & 5 deletions sphinx_simplepdf/builders/simplepdf.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import importlib.util
import os
import re
import subprocess
from collections import Counter
from typing import Any
from typing import Any, Callable

import sass
import weasyprint
from bs4 import BeautifulSoup
from sphinx import __version__
from sphinx.application import Sphinx
from sphinx.builders.singlehtml import SingleFileHTMLBuilder
from sphinx.errors import ConfigError, ExtensionError
from sphinx.util import logging

from sphinx_simplepdf.builders.debug import DebugPython
Expand Down Expand Up @@ -114,7 +116,10 @@ def finish(self) -> None:
with open(index_path, encoding="utf-8") as index_file:
index_html = "".join(index_file.readlines())

new_index_html = self._toctree_fix(index_html)
soup = BeautifulSoup(index_html, "html.parser")
soup = self._toctree_fix_soup(soup)
soup = self._execute_html_hook(soup)
new_index_html = str(soup)

with open(index_path, "w", encoding="utf-8") as index_file:
index_file.writelines(new_index_html)
Expand Down Expand Up @@ -169,6 +174,108 @@ def finish(self) -> None:
if (n == retries - 1) and not success:
raise RuntimeError(f"maximum number of retries {retries} failed in weasyprint")

def _load_html_hook(self) -> Callable[[BeautifulSoup, Sphinx], BeautifulSoup] | None:
    """Load the HTML hook function named by the ``simplepdf_html_hook`` config value.

    The config value has the form ``"path/to/script.py:function_name"``. The
    string is split at its last ``:``; a relative script path is resolved
    against ``self.app.confdir``. The script is executed as a module named
    ``simplepdf_hook``, which is registered in ``sys.modules`` before it runs
    so that code in the script which looks up its own module by name (for
    example ``dataclasses`` with string annotations, ``pickle`` or
    ``typing.get_type_hints``) works.

    Returns:
        The hook callable, or ``None`` when ``simplepdf_html_hook`` is ``None``.

    Raises:
        ConfigError: If the value contains no ``:``, the script file does not
            exist, the script cannot be loaded or raises while executing, the
            named attribute is missing, or the attribute is not callable.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    hook_path = self.config["simplepdf_html_hook"]
    if hook_path is None:
        return None

    # Parse the path:function_name format
    if ":" not in hook_path:
        raise ConfigError(
            f"simplepdf_html_hook must be in format 'path/to/script.py:function_name', "
            f"got '{hook_path}'"
        )

    # Split on the LAST colon so paths that contain one themselves
    # (e.g. a Windows drive letter) keep working.
    script_path, function_name = hook_path.rsplit(":", 1)

    # Resolve path relative to conf.py directory
    if not os.path.isabs(script_path):
        script_path = os.path.join(self.app.confdir, script_path)

    # Check if file exists
    if not os.path.isfile(script_path):
        raise ConfigError(
            f"simplepdf_html_hook script not found: {script_path}"
        )

    # Load the module
    module_name = "simplepdf_hook"
    spec = importlib.util.spec_from_file_location(module_name, script_path)
    if spec is None or spec.loader is None:
        raise ConfigError(
            f"Failed to load simplepdf_html_hook script: {script_path}"
        )

    module = importlib.util.module_from_spec(spec)

    # Register the module before executing it, as the importlib recipe for
    # importing a source file directly does. Without this,
    # sys.modules.get(cls.__module__) is None for classes defined in the
    # hook script, which breaks e.g. @dataclass with string annotations.
    previous_module = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except Exception as e:
        # Do not leave a half-initialised module behind; put back whatever
        # was registered under this name before.
        if previous_module is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous_module
        raise ConfigError(
            f"Error loading simplepdf_html_hook script '{script_path}': {e}"
        ) from e

    # Get the function
    if not hasattr(module, function_name):
        raise ConfigError(
            f"Function '{function_name}' not found in simplepdf_html_hook script: {script_path}"
        )

    hook_func = getattr(module, function_name)

    if not callable(hook_func):
        raise ConfigError(
            f"simplepdf_html_hook '{function_name}' in '{script_path}' is not callable"
        )

    return hook_func

def _execute_html_hook(self, soup: BeautifulSoup) -> BeautifulSoup:
    """Run the configured ``simplepdf_html_hook`` on the parsed document.

    The hook is obtained from ``self._load_html_hook()`` and called as
    ``hook(soup, self.app)``.

    Args:
        soup: The BeautifulSoup object handed to the hook.

    Returns:
        The hook's return value when it is a BeautifulSoup instance.
        When no hook is configured, or the hook returns ``None`` (a warning
        is logged in that case), the ``soup`` object that was passed in.

    Raises:
        ExtensionError: If the hook raises an exception, or returns something
            that is neither ``None`` nor a BeautifulSoup instance.
    """
    hook = self._load_html_hook()
    if hook is None:
        # Nothing configured: hand the document back as received.
        return soup

    logger.info("Executing simplepdf_html_hook")

    try:
        hooked = hook(soup, self.app)
    except Exception as exc:
        message = f"simplepdf_html_hook raised an exception: {exc}"
        raise ExtensionError(message) from exc

    # Expected case first: the hook returned a document.
    if isinstance(hooked, BeautifulSoup):
        return hooked

    # Anything else that is not None is a programming error in the hook.
    if hooked is not None:
        wrong_type = type(hooked).__name__
        raise ExtensionError(
            f"simplepdf_html_hook must return a BeautifulSoup object, "
            f"got {wrong_type}"
        )

    # The hook returned None (e.g. a missing ``return``): warn and fall back
    # to the object that was passed to it.
    logger.warning(
        "simplepdf_html_hook returned None, using original HTML. "
        "The hook should return a BeautifulSoup object."
    )
    return soup

"""
attempts to fix cases where a document has multiple chapters that have the same name.

Expand All @@ -194,9 +301,16 @@ def finish(self) -> None:

"""

def _toctree_fix(self, html):
def _toctree_fix_soup(self, soup: BeautifulSoup) -> BeautifulSoup:
"""Fix toctree page numbering issues for documents with duplicate chapter names.

Args:
soup: The BeautifulSoup object with parsed HTML.

Returns:
The modified BeautifulSoup object.
"""
print("checking for potential toctree page numbering errors")
soup = BeautifulSoup(html, "html.parser")
sidebar = soup.find("div", class_="sphinxsidebarwrapper")

# sidebar contains the toctree
Expand Down Expand Up @@ -315,7 +429,7 @@ def _toctree_fix(self, html):
heading.attrs["class"] = class_attr

logger.debug(soup.prettify(formatter="html"))
return str(soup)
return soup


def setup(app: Sphinx) -> dict[str, Any]:
Expand All @@ -330,6 +444,7 @@ def setup(app: Sphinx) -> dict[str, Any]:
app.add_config_value("simplepdf_theme", "simplepdf_theme", "html", types=[str])
app.add_config_value("simplepdf_theme_options", {}, "html", types=[dict])
app.add_config_value("simplepdf_sidebars", {"**": ["localtoc.html"]}, "html", types=[dict])
app.add_config_value("simplepdf_html_hook", None, "html", types=[str])
app.add_builder(SimplePdfBuilder)

return {
Expand Down
Loading