diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..37bb020b --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +sphinx-combined/_build +sphinx-combined/_doxygen diff --git a/docs/build_combined_docs.sh b/docs/build_combined_docs.sh new file mode 100755 index 00000000..feb6ea61 --- /dev/null +++ b/docs/build_combined_docs.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BUILD_DIR="${SCRIPT_DIR}/sphinx-combined/_build" +DOXYGEN_DIR="${SCRIPT_DIR}/sphinx-combined/_doxygen" + +mkdir -p "${BUILD_DIR}" "${DOXYGEN_DIR}" + +echo "Running Doxygen for combined C++ API..." +(cd "${SCRIPT_DIR}/sphinx-combined" && doxygen Doxyfile) + +echo "Building combined Sphinx docs..." +sphinx-build -E -b html "${SCRIPT_DIR}/sphinx-combined" "${BUILD_DIR}" + +echo "Combined docs available at ${BUILD_DIR}/index.html" diff --git a/docs/cli_help.md b/docs/cli_help.md index 15a89a73..5372474f 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -69,8 +69,7 @@ * `--axis `, `-a ` * Override an axis specification. - * See `--help-axis` - for [details on axis specifications](./cli_help_axis.md). + * See `--help-axis` for details on axis specifications. * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. 
diff --git a/docs/sphinx-combined/Doxyfile b/docs/sphinx-combined/Doxyfile new file mode 100644 index 00000000..9dfeb77f --- /dev/null +++ b/docs/sphinx-combined/Doxyfile @@ -0,0 +1,45 @@ +PROJECT_NAME = "NVBench" +PROJECT_BRIEF = "C++ NVBench Library" +OUTPUT_DIRECTORY = _doxygen +GENERATE_XML = YES +GENERATE_HTML = NO +GENERATE_LATEX = NO +QUIET = YES +WARN_IF_UNDOCUMENTED = NO +WARN_IF_DOC_ERROR = YES +WARN_LOGFILE = _doxygen/warnings.log +INPUT = ../../nvbench +EXCLUDE = ../../nvbench/cupti_profiler.cxx +EXCLUDE_SYMBOLS = type_strings \ + nvbench::detail \ + nvbench::internal \ + nvbench::tl \ + UNUSED \ + M_PI \ + NVBENCH_UNIQUE_IDENTIFIER_IMPL1 \ + NVBENCH_UNIQUE_IDENTIFIER_IMPL2 \ + main \ + NVBENCH_STATE_EXEC_GUARD \ + wrapped_type +FILE_PATTERNS = *.cuh *.cxx *.cu *.h *.hpp +EXTENSION_MAPPING = cuh=C++ cu=C++ +RECURSIVE = YES +EXTRACT_ALL = YES +EXTRACT_PRIVATE = YES +EXTRACT_STATIC = YES +JAVADOC_AUTOBRIEF = YES +MULTILINE_CPP_IS_BRIEF = YES +STRIP_FROM_PATH = ../../ +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = NO +GENERATE_TAGFILE = +XML_PROGRAMLISTING = NO +PREDEFINED = __device__= \ + __host__= \ + __global__= \ + __forceinline__= \ + __shared__= \ + __align__(x)= \ + __launch_bounds__(x)= \ + NVBENCH_HAS_CUDA=1 diff --git a/docs/sphinx-combined/_static/nvidia-logo.png b/docs/sphinx-combined/_static/nvidia-logo.png new file mode 100644 index 00000000..1779ad93 Binary files /dev/null and b/docs/sphinx-combined/_static/nvidia-logo.png differ diff --git a/docs/sphinx-combined/cli_overview.rst b/docs/sphinx-combined/cli_overview.rst new file mode 100644 index 00000000..6d01ed27 --- /dev/null +++ b/docs/sphinx-combined/cli_overview.rst @@ -0,0 +1,12 @@ +CLI Options +=========== + +Every benchmark created with NVBench supports command-line interface, +with a variety of options. + +.. include:: ../cli_help.md + :parser: myst_parser.sphinx_ + + +.. 
include:: ../cli_help_axis.md + :parser: myst_parser.sphinx_ diff --git a/docs/sphinx-combined/conf.py b/docs/sphinx-combined/conf.py new file mode 100644 index 00000000..26cc0049 --- /dev/null +++ b/docs/sphinx-combined/conf.py @@ -0,0 +1,104 @@ +import os + +project = "NVBench API" +author = "NVIDIA Corporation" + +extensions = [ + "breathe", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.autosummary", + "myst_parser", +] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "_doxygen"] + +autosummary_generate = True +autodoc_default_options = {"members": True, "undoc-members": True} + +release = "0.2.0" + +_here = os.path.abspath(os.path.dirname(__file__)) +_doxygen_xml = os.path.join(_here, "_doxygen", "xml") + +breathe_projects = {"nvbench": _doxygen_xml} +breathe_default_project = "nvbench" +breathe_domain_by_extension = {"cuh": "cpp", "cxx": "cpp", "cu": "cpp"} + + +def _patch_breathe_namespace_declarations() -> None: + try: + import breathe.renderer.sphinxrenderer as sphinxrenderer + from docutils import nodes + from sphinx import addnodes + except Exception: + return + + original = sphinxrenderer.SphinxRenderer.handle_declaration + + def handle_declaration(self, nodeDef, declaration, *args, **kwargs): + is_namespace = getattr(nodeDef, "kind", None) == "namespace" + if not is_namespace: + return original(self, nodeDef, declaration, *args, **kwargs) + + name = (declaration or "").strip() + if name.startswith("namespace "): + name = name[len("namespace ") :].strip() + if not name: + name = "" + + keyword = addnodes.desc_sig_keyword("namespace", "namespace") + sig_name = addnodes.desc_sig_name(name, name) + return [keyword, nodes.Text(" "), sig_name] + + sphinxrenderer.SphinxRenderer.handle_declaration = handle_declaration + + +def setup(app): + _patch_breathe_namespace_declarations() + + +###################################################### + +# -- Options for HTML output ------------------------------------------------- + 
+html_theme = "nvidia_sphinx_theme" + +html_logo = "_static/nvidia-logo.png" + +html_baseurl = ( + os.environ.get("NVBENCH_DOCS_BASE_URL", "https://nvidia.github.io/nvbench/").rstrip( + "/" + ) + + "/" +) + +html_theme_options = { + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/NVIDIA/nvbench", + "icon": "fa-brands fa-github", + "type": "fontawesome", + } + ], + "navigation_depth": 4, + "show_toc_level": 2, + "navbar_start": ["navbar-logo"], + "navbar_end": ["theme-switcher", "navbar-icon-links"], + "footer_start": ["copyright"], + "footer_end": ["sphinx-version"], + "sidebar_includehidden": True, + "collapse_navigation": False, + # "switcher": { + # "json_url": f"{html_baseurl}nv-versions.json", + # "version_match": release, + # }, +} + +html_static_path = ["_static"] if os.path.exists("_static") else [] + +# Images directory +if os.path.exists("img"): + html_static_path.append("img") diff --git a/docs/sphinx-combined/cpp_api.rst b/docs/sphinx-combined/cpp_api.rst new file mode 100644 index 00000000..410a6f7f --- /dev/null +++ b/docs/sphinx-combined/cpp_api.rst @@ -0,0 +1,40 @@ +NVBench C++ API Reference +========================= + +Index +----- + +.. doxygenindex:: + :project: nvbench + + +Free Functions +-------------- + +.. doxygenfunction:: nvbench::make_cuda_stream_view + :project: nvbench + +.. doxygenfunction:: nvbench::axis_type_to_string + :project: nvbench + +.. doxygenfunction:: nvbench::add_devices_section + :project: nvbench + +.. doxygenfunction:: nvbench::range + :project: nvbench + +.. doxygenfunction:: nvbench::sleep_kernel + :project: nvbench + +.. doxygenfunction:: nvbench::copy_kernel + :project: nvbench + +.. doxygenfunction:: nvbench::mod2_kernel + :project: nvbench + +.. doxygenfunction:: nvbench::demangle(const std::string &str) + :project: nvbench + +.. cpp:function:: template std::string nvbench::demangle() + + Returns demangled type name. 
diff --git a/docs/sphinx-combined/cpp_benchmarks.md b/docs/sphinx-combined/cpp_benchmarks.md new file mode 100644 index 00000000..46c6125b --- /dev/null +++ b/docs/sphinx-combined/cpp_benchmarks.md @@ -0,0 +1,530 @@ +# NVBench: benchmarking in C++ + +(minimal-benchmark)= +## Minimal Benchmark + +A basic kernel benchmark can be created with just a few lines of CUDA C++: + +```cpp +void my_benchmark(nvbench::state& state) { + state.exec([](nvbench::launch& launch) { + my_kernel<<>>(); + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +There are three main components in the definition of a benchmark: + +- A `KernelGenerator` callable (`my_benchmark` above) +- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and +- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros. + +The `KernelGenerator` is called with an `nvbench::state` object that provides +configuration information, as shown in later sections. The generator is +responsible for configuring and instantiating a `KernelLauncher`, which is +(unsurprisingly) responsible for launching a kernel. The launcher should contain +only the minimum amount of code necessary to start the CUDA kernel, +since `nvbench::exec` will execute it repeatedly to gather timing information. +An `nvbench::launch` object is provided to the launcher to specify kernel +execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers +the benchmark with NVBench and initializes various attributes, including its +name and parameter axes. + +## Benchmark Name + +By default, a benchmark is named by converting the first argument +of `NVBENCH_BENCH` into a string. + +This can be changed to something more descriptive if desired. +The `NVBENCH_BENCH` macro produces a customization object that allows such +attributes to be modified. + +```cpp +NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); +``` + +## CUDA Streams + +NVBench records GPU execution times on a specific CUDA stream. 
By default, a new +stream is created and passed to the `KernelLauncher` via the +`nvbench::launch::get_stream()` method, as shown in +[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other +stream-ordered work must be launched on this stream for NVBench to capture it. + +In some instances, it may be inconvenient or impossible to specify an explicit +CUDA stream for the benchmarked operation to use. For example, a library may +manage and use its own streams, or an opaque API may always launch work on the +default stream. In these situations, users may provide NVBench with an explicit +stream via `nvbench::state::set_cuda_stream` and `nvbench::make_cuda_stream_view`. +It is assumed that all work of interest executes on or synchronizes with this +stream. + +```cpp +void my_benchmark(nvbench::state& state) { + cudaStream_t default_stream = 0; + state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream)); + state.exec([](nvbench::launch&) { + my_func(); // a host API invoking GPU kernels on the default stream + my_kernel<<>>(); // or a kernel launched with the default stream + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +A full example can be found in [examples/stream.cu][CppExample_Stream]. + +## Parameter Axes + +Some kernels will be used with a variety of options, input data types/sizes, and +other factors that impact performance. NVBench explores these different +scenarios by sweeping through a set of user-defined parameter axes. + +A parameter axis defines a set of interesting values for a single kernel +parameter — for example, the size of the input, or the type of values being +processed. These parameter axes are used to customize a `KernelGenerator` with +static and runtime configurations. There are four supported types of parameters: +int64, float64, string, and type. + +More examples can be found in [examples/axes.cu][CppExample_Axes]. 
+ +### Int64 Axes + +A common example of a parameter axis is to vary the number of input values a +kernel should process during a benchmark measurement. An `int64_axis` is ideal +for this: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto num_inputs = state.get_int64("NumInputs"); + thrust::device_vector data = generate_input(num_inputs); + + state.exec([&data](nvbench::launch& launch) { + my_kernel<<>>(data.begin(), data.end()); + }); +} +NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096}); +``` + +NVBench will run the `benchmark` kernel generator once for each specified value +in the "NumInputs" axis. The `state` object provides the current parameter value +to `benchmark`. + +### Int64 Power-Of-Two Axes + +Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a +unique power-of-two mode that simplifies how such axes are defined and helps +provide more readable output. A power-of-two int64 axis is defined using the +integer exponents, but the benchmark will be run with the computed 2^N value. 
+ +```cpp +// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12} +NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", + {4, 6, 8, 10, 12}); +// Or, as shown in a later section: +NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", + nvbench::range(4, 12, 2)); +``` + +### Float64 Axes + +For floating point numbers, a `float64_axis` is available: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto quality = state.get_float64("Quality"); + + state.exec([&quality](nvbench::launch& launch) + { + my_kernel<<>>(quality); + }); +} +NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.}); +``` + +### String Axes + +For non-numeric data, an axis of arbitrary strings provides additional +flexibility: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto rng_dist = state.get_string("RNG Distribution"); + thrust::device_vector data = generate_input(rng_dist); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<>>(data.begin(), data.end()); + }); +} +NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); +``` + +A common use for string axes is to encode enum values, as shown in +[examples/enums.cu][CppExample_Enums]. + +### Type Axes + +Another common situation involves benchmarking a templated kernel with multiple +compile-time configurations. NVBench strives to make such benchmarks as easy to +write as possible through the use of type axes. + +A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in +a `nvbench::type_list`. The kernel generator becomes a template +function and will be instantiated using types defined by the axis. The current +configuration's type is passed into the kernel generator using +a `nvbench::type_list`. 
+ +```cpp +template +void my_benchmark(nvbench::state& state, nvbench::type_list) +{ + thrust::device_vector data = generate_input(); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<>>(data.begin(), data.end()); + }); +} +using my_types = nvbench::type_list; +NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types)) + .set_type_axes_names({"ValueType"}); +``` + +The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in +the `type_list<...>` from breaking macro parsing. + +Type axes can be used to encode compile-time enum and integral constants using +the `nvbench::enum_type_list` helper. See +[examples/enums.cu][CppExample_Enums] for detail. + +### Parameter sweeping + +Since parameter sweeps often explore a range of evenly-spaced numeric values, a +strided range can be generated using the `nvbench::range(start, end, stride=1)` +helper. + +```cpp +assert(nvbench::range(2, 5) == {2, 3, 4, 5}); +assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); +assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); +assert(nvbench::range(2, 12, 5) == {2, 7, 12}); +assert(nvbench::range(2, 12, 6) == {2, 8}); +assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); +``` + +Note that start and end are inclusive. This utility can be used to define axis +values for all numeric axes. + +### Multiple Parameter Axes + +If more than one axis is defined, the complete cartesian product of all axes +will be benchmarked. 
For example, consider a benchmark with two type axes, one +int64 axis, and one float64 axis: + +```cpp +// InputTypes: {char, int, unsigned int} +// OutputTypes: {float, double} +// NumInputs: {2^10, 2^20, 2^30} +// Quality: {0.5, 1.0} + +using input_types = nvbench::type_list; +using output_types = nvbench::type_list; +NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types)) + .set_type_axes_names({"InputType", "OutputType"}) + .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10)) + .add_float64_axis("Quality", {0.5, 1.0}); +``` + +This would generate a total of 36 configurations and instantiate the benchmark 6 +times. Keep the rapid growth of these combinations in mind when choosing the +number of values in an axis. See the section about combinatorial explosion for +more examples and information. + +## Throughput Measurements + +In addition to raw timing information, NVBench can track a kernel's +throughput, reporting the amount of data processed as: + +- Number of items per second +- Number of bytes per second +- Percentage of device's peak memory bandwidth utilized + +To enable throughput measurements, the kernel generator can specify the number +of items and/or bytes handled in a single kernel execution using +the `nvbench::state` API. + +```cpp +state.add_element_count(size); +state.add_global_memory_reads(size); +state.add_global_memory_writes(size); +``` + +In general: +- Add only the input element count (no outputs). +- Add all reads and writes to global memory. + +More examples can be found in [examples/throughput.cu][CppExample_Throughput]. + +(skip-uninteresting-or-invalid-benchmarks)= +## Skip Uninteresting / Invalid Benchmarks + +Sometimes particular combinations of parameters aren't useful or interesting — +or for type axes, some configurations may not even compile. + +The `nvbench::state` object provides a `skip("Reason")` method that can be used +to avoid running these benchmarks. 
To skip uncompilable type axis +configurations, create an overload for the kernel generator that selects for the +invalid type combination: + +```cpp +template +void my_benchmark(nvbench::state& state, nvbench::type_list) +{ + // Skip benchmarks at runtime: + if (should_skip_this_config) + { + state.skip("Reason for skip."); + return; + } + + /* ... */ +}; + +// Skip benchmarks at compile time -- for example, always skip when T == U +// (Note that the `type_list` argument defines the same type twice). +template +void my_benchmark(nvbench::state& state, + nvbench::type_list) +{ + state.skip("T must not be the same type as U."); +} +using Ts = nvbench::type_list<...>; +using Us = nvbench::type_list<...>; +NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us)); +``` + +More examples can be found in [examples/skip.cu][CppExample_Skip]. + +## Execution Tags For Special Cases + +By default, NVBench assumes that the entire execution time of the +`KernelLauncher` should be measured, and that no syncs are performed +(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`, +etc. are not called). + +Execution tags may be passed to `state.exec` when these assumptions are not +true: + +- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will + synchronize internally. +- `nvbench::exec_tag::timer` requests a timer object that can be used to + restrict the timed region. +- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables + them during execution to reduce runtime, and prevents their compilation to + reduce compile-time and binary size. +- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking + code from being compiled for a particular benchmark. A runtime error is emitted + if the benchmark is defined with `set_is_cpu_only(true)`. +- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking + code from being compiled for a particular benchmark. 
A runtime error is emitted + if the benchmark does not also define `set_is_cpu_only(true)`. + +Multiple execution tags may be combined using `operator|`, e.g. + +```cpp +state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [](nvbench::launch &launch, auto& timer) { /*...*/ }); +``` + +The following sections provide more details on these features. + +### Benchmarks that sync + +If a `KernelLauncher` synchronizes the CUDA device internally without passing +this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag +will fix this issue. Note that this disables batch measurements. + +```cpp +void sync_example(nvbench::state& state) +{ + // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: + state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { + /* Benchmark that implicitly syncs here. */ + }); +} +NVBENCH_BENCH(sync_example); +``` + +See [examples/exec_tag_sync.cu][CppExample_ExecTagSync] for a complete +example. + +(explicit-timer-mode)= +### Explicit timer mode: `nvbench::exec_tag::timer` + +For some kernels, the working data may need to be reset between launches. This +is particularly common for kernels that modify their input in-place. + +Resetting the input data to prepare for a new trial shouldn't be included in the +benchmark's execution time. NVBench provides a manual timer mode that allows the +kernel launcher to specify the critical section to be measured and exclude any +per-trial reset operations. + +To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` +to `state.exec`, and declare the kernel launcher with an +additional `auto& timer` argument. + +Note that using manual timer mode disables batch measurements. 
+ +```cpp +void timer_example(nvbench::state& state) +{ + // Pass the `timer` exec tag to request a timer: + state.exec(nvbench::exec_tag::timer, + // Lambda now accepts a timer: + [](nvbench::launch& launch, auto& timer) + { + /* Reset code here, excluded from timing */ + + /* Timed region is explicitly marked. + * The timer handles any synchronization, flushes, etc when/if + * needed for the current measurement. + */ + timer.start(); + /* Launch kernel on `launch.get_stream()` here */ + timer.stop(); + }); +} +NVBENCH_BENCH(timer_example); +``` + +See [examples/exec_tag_timer.cu][CppExample_ExecTagTimer] for a complete +example. + +### Compilation hints + +These execution tags are optional hints that disable the compilation of various +code paths when they are not needed. They apply only to a single benchmark. + +- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend. +- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends. + - Requires that the benchmark does not define `set_is_cpu_only(true)`. + - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size. + - Host-side CPU measurements of GPU kernel execution time are still provided. +- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends. + - Requires that the benchmark defines `set_is_cpu_only(true)`. + - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size. + - See also [CPU-only Benchmarks](#cpu-only-benchmarks). + +(cpu-only-benchmarks)= +## CPU-only Benchmarks + +NVBench provides CPU-only benchmarking facilities that are intended for measuring +significant CPU workloads. We do not recommend using these features for high-resolution +CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for +such applications. Examples are provided in [examples/cpu_only.cu][CppExample_CpuOnly]. 
+ +Note that NVBench still requires a CUDA compiler and runtime even if a project only contains +CPU-only benchmarks. + +The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements: + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec([](nvbench::launch &) { /* workload */ }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and +binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while +`is_cpu_only` is false. + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends + [](nvbench::launch &) { /* workload */ }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This +is useful for benchmarks that require additional per-sample setup/teardown. See the +[`nvbench::exec_tag::timer`](#explicit-timer-mode) section for more +details. + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends + nvbench::exec_tag::timer, // Request a timer object + [](nvbench::launch &, auto &timer) + { + // Setup here + timer.start(); + // timed workload + timer.stop(); + // teardown here + }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +## Beware of Combinatorial Explosion + +Be very careful of how quickly the configuration space can grow. The following +example generates 960 total runtime benchmark configurations, and will compile +192 different static parametrizations of the kernel generator. This is likely +excessive, especially for routine regression testing. 
+ +```cpp +using value_types = nvbench::type_list; +using op_types = nvbench::type_list, + thrust::multiplies<>, + thrust::maximum<>>; + +NVBENCH_BENCH_TYPES(my_benchmark, + NVBENCH_TYPE_AXES(value_types, + value_types, + value_types, + op_types)) + .set_type_axes_names({"T", "U", "V", "Op"}) + .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5)); +``` + +``` +960 total configs += 4 [T=(U8, I32, F32, F64)] +* 4 [U=(U8, I32, F32, F64)] +* 4 [V=(U8, I32, F32, F64)] +* 3 [Op=(plus, multiplies, max)] +* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)] +``` + +For large configuration spaces like this, pruning some of the less useful +combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques +described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting-or-invalid-benchmarks) +section can help immensely with keeping compile / run times manageable. + +Splitting a single large configuration space into multiple, more focused +benchmarks with reduced dimensionality will likely be worth the effort as well. 
+ +[CppExample_Stream]: https://github.com/NVIDIA/nvbench/blob/main/examples/stream.cu +[CppExample_Axes]: https://github.com/NVIDIA/nvbench/blob/main/examples/axes.cu +[CppExample_Enums]: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +[CppExample_Throughput]: https://github.com/NVIDIA/nvbench/blob/main/examples/throughput.cu +[CppExample_Skip]: https://github.com/NVIDIA/nvbench/blob/main/examples/skip.cu +[CppExample_CpuOnly]: https://github.com/NVIDIA/nvbench/blob/main/examples/cpu_only.cu +[CppExample_ExecTagSync]: https://github.com/NVIDIA/nvbench/blob/main/examples/exec_tag_sync.cu +[CppExample_ExecTagTimer]: https://github.com/NVIDIA/nvbench/blob/main/examples/exec_tag_timer.cu diff --git a/docs/sphinx-combined/index.rst b/docs/sphinx-combined/index.rst new file mode 100644 index 00000000..86107234 --- /dev/null +++ b/docs/sphinx-combined/index.rst @@ -0,0 +1,13 @@ +NVBench API +=========== + +Combined C++ and Python API documentation. + +.. toctree:: + :maxdepth: 2 + + cpp_benchmarks + py_benchmarks + cli_overview + cpp_api + python_api diff --git a/docs/sphinx-combined/py_benchmarks.md b/docs/sphinx-combined/py_benchmarks.md new file mode 100644 index 00000000..788d4e7d --- /dev/null +++ b/docs/sphinx-combined/py_benchmarks.md @@ -0,0 +1,38 @@ +# NVBench: benchmarking in Python + +The `cuda.bench` Python module provides Python API powered by C++ NVBench +library to benchmark GPU-aware Python code. + +## Minimal benchmark + +```python +from cuda.bench import State, Launch +from cuda.bench import register, run_all_registered +from typing import Callable + +from my_package import impl + +def benchmark_impl(state: State) -> None: + + # get state parameters + n = state.get_int64("Elements") + + # prepare inputs + data = generate(n, state.get_stream()) + + # body that is being timed. 
Must execute + # on the stream handed over by NVBench + launchable_fn : Callable[[Launch], None] = + lambda launch: impl(data, launch.get_stream()) + + state.exec(launchable_fn) + + +bench = register(benchmark_impl) +bench.add_int64_axis("Elements", [1000, 10000, 100000]) + + +if __name__ == "__main__": + import sys + run_all_registered(sys.argv) +``` diff --git a/docs/sphinx-combined/python_api.rst b/docs/sphinx-combined/python_api.rst new file mode 100644 index 00000000..22e24eff --- /dev/null +++ b/docs/sphinx-combined/python_api.rst @@ -0,0 +1,8 @@ +cuda.bench Python API Reference +=============================== + +.. automodule:: cuda.bench + :members: + :imported-members: + :undoc-members: + :show-inheritance: diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index e6e7753c..4d2f4963 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -69,12 +69,21 @@ def _get_cuda_major_version(): State = _nvbench_module.State register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks -test_cpp_exception = _nvbench_module.test_cpp_exception -test_py_exception = _nvbench_module.test_py_exception +_test_cpp_exception = _nvbench_module._test_cpp_exception +_test_py_exception = _nvbench_module._test_py_exception # Expose the module as _nvbench for backward compatibility (e.g., for tests) _nvbench = _nvbench_module +# Set module of exposed objects +Benchmark.__module__ = __name__ +CudaStream.__module__ = __name__ +Launch.__module__ = __name__ +NVBenchRuntimeError.__module__ = __name__ +State.__module__ = __name__ +register.__module__ = __name__ +run_all_benchmarks.__module__ = __name__ + # Clean up internal symbols del ( _nvbench_module, diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index aeadcc24..ab79d753 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -273,7 +273,7 @@ static void def_class_CudaStream(py::module_ m) // 
nvbench::cuda_stream::get_stream static constexpr const char *class_CudaStream_doc = R"XXX( -Represents CUDA stream + Represents CUDA stream Note ---- @@ -321,7 +321,7 @@ void def_class_Launch(py::module_ m) // nvbench::launch::get_stream -> nvbench::cuda_stream static constexpr const char *class_Launch_doc = R"XXXX( -Configuration object for function launch. + Configuration object for function launch. Note ---- @@ -363,13 +363,13 @@ static void def_class_Benchmark(py::module_ m) // nvbench::benchmark_base::set_min_samples static constexpr const char *class_Benchmark_doc = R"XXXX( -Represents NVBench benchmark. + Represents NVBench benchmark. Note ---- The class is not user-constructible. - Use `~register` function to create Benchmark and register + Use `register` function to create Benchmark and register it with NVBench. )XXXX"; auto py_benchmark_cls = py::class_(m, "Benchmark", class_Benchmark_doc); @@ -691,7 +691,7 @@ void def_class_State(py::module_ m) using state_ref_t = std::reference_wrapper; static constexpr const char *class_State_doc = R"XXXX( -Represent benchmark configuration state. + Represents benchmark configuration state. Note ---- @@ -736,7 +736,7 @@ Get device_id of the device from this configuration return std::ref(state.get_cuda_stream()); }; static constexpr const char *method_get_stream_doc = R"XXXX( -Get `~CudaStream` object from this configuration" +Get `CudaStream` object from this configuration )XXXX"; pystate_cls.def("get_stream", method_get_stream_impl, @@ -1014,10 +1014,10 @@ Use argument True to disable use of blocking kernel by NVBench" } }; static constexpr const char *method_exec_doc = R"XXXX( -Execute callable running the benchmark. + Execute callable running the benchmark. The callable may be executed multiple times. The callable - will be passed `~Launch` object argument. + will be passed `Launch` object argument. 
Parameters ---------- @@ -1194,8 +1194,8 @@ Register benchmark function of type Callable[[nvbench.State], None] py::arg("argv") = py::list()); // Testing utilities - m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); }); - m.def("test_py_exception", []() { + m.def("_test_cpp_exception", []() { throw nvbench_run_error("Test"); }); + m.def("_test_py_exception", []() { py::set_error(exc_storage.get_stored(), "Test"); throw py::error_already_set(); }); diff --git a/python/test/test_cuda_bench.py b/python/test/test_cuda_bench.py index 7d927e8f..b63d24d8 100644 --- a/python/test/test_cuda_bench.py +++ b/python/test/test_cuda_bench.py @@ -6,12 +6,12 @@ def test_cpp_exception(): with pytest.raises(RuntimeError, match="Test"): - bench._nvbench.test_cpp_exception() + bench._nvbench._test_cpp_exception() def test_py_exception(): with pytest.raises(bench.NVBenchRuntimeError, match="Test"): - bench._nvbench.test_py_exception() + bench._nvbench._test_py_exception() @pytest.mark.parametrize(