diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..37bb020b --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +sphinx-combined/_build +sphinx-combined/_doxygen diff --git a/docs/build_combined_docs.sh b/docs/build_combined_docs.sh new file mode 100755 index 00000000..feb6ea61 --- /dev/null +++ b/docs/build_combined_docs.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BUILD_DIR="${SCRIPT_DIR}/sphinx-combined/_build" +DOXYGEN_DIR="${SCRIPT_DIR}/sphinx-combined/_doxygen" + +mkdir -p "${BUILD_DIR}" "${DOXYGEN_DIR}" + +echo "Running Doxygen for combined C++ API..." +(cd "${SCRIPT_DIR}/sphinx-combined" && doxygen Doxyfile) + +echo "Building combined Sphinx docs..." +sphinx-build -E -b html "${SCRIPT_DIR}/sphinx-combined" "${BUILD_DIR}" + +echo "Combined docs available at ${BUILD_DIR}/index.html" diff --git a/docs/cli_help.md b/docs/cli_help.md index 15a89a73..5372474f 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -69,8 +69,7 @@ * `--axis `, `-a ` * Override an axis specification. - * See `--help-axis` - for [details on axis specifications](./cli_help_axis.md). + * See `--help-axis` for details on axis specifications. * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. 
diff --git a/docs/sphinx-combined/Doxyfile b/docs/sphinx-combined/Doxyfile new file mode 100644 index 00000000..9dfeb77f --- /dev/null +++ b/docs/sphinx-combined/Doxyfile @@ -0,0 +1,45 @@ +PROJECT_NAME = "NVBench" +PROJECT_BRIEF = "C++ NVBench Library" +OUTPUT_DIRECTORY = _doxygen +GENERATE_XML = YES +GENERATE_HTML = NO +GENERATE_LATEX = NO +QUIET = YES +WARN_IF_UNDOCUMENTED = NO +WARN_IF_DOC_ERROR = YES +WARN_LOGFILE = _doxygen/warnings.log +INPUT = ../../nvbench +EXCLUDE = ../../nvbench/cupti_profiler.cxx +EXCLUDE_SYMBOLS = type_strings \ + nvbench::detail \ + nvbench::internal \ + nvbench::tl \ + UNUSED \ + M_PI \ + NVBENCH_UNIQUE_IDENTIFIER_IMPL1 \ + NVBENCH_UNIQUE_IDENTIFIER_IMPL2 \ + main \ + NVBENCH_STATE_EXEC_GUARD \ + wrapped_type +FILE_PATTERNS = *.cuh *.cxx *.cu *.h *.hpp +EXTENSION_MAPPING = cuh=C++ cu=C++ +RECURSIVE = YES +EXTRACT_ALL = YES +EXTRACT_PRIVATE = YES +EXTRACT_STATIC = YES +JAVADOC_AUTOBRIEF = YES +MULTILINE_CPP_IS_BRIEF = YES +STRIP_FROM_PATH = ../../ +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = YES +EXPAND_ONLY_PREDEF = NO +GENERATE_TAGFILE = +XML_PROGRAMLISTING = NO +PREDEFINED = __device__= \ + __host__= \ + __global__= \ + __forceinline__= \ + __shared__= \ + __align__(x)= \ + __launch_bounds__(x)= \ + NVBENCH_HAS_CUDA=1 diff --git a/docs/sphinx-combined/_static/nvidia-logo.png b/docs/sphinx-combined/_static/nvidia-logo.png new file mode 100644 index 00000000..1779ad93 Binary files /dev/null and b/docs/sphinx-combined/_static/nvidia-logo.png differ diff --git a/docs/sphinx-combined/cli_overview.rst b/docs/sphinx-combined/cli_overview.rst new file mode 100644 index 00000000..6d01ed27 --- /dev/null +++ b/docs/sphinx-combined/cli_overview.rst @@ -0,0 +1,12 @@ +CLI Options +=========== + +Every benchmark created with NVBench supports command-line interface, +with a variety of options. + +.. include:: ../cli_help.md + :parser: myst_parser.sphinx_ + + +.. 
include:: ../cli_help_axis.md + :parser: myst_parser.sphinx_ diff --git a/docs/sphinx-combined/conf.py b/docs/sphinx-combined/conf.py new file mode 100644 index 00000000..26cc0049 --- /dev/null +++ b/docs/sphinx-combined/conf.py @@ -0,0 +1,104 @@ +import os + +project = "NVBench API" +author = "NVIDIA Corporation" + +extensions = [ + "breathe", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.autosummary", + "myst_parser", +] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "_doxygen"] + +autosummary_generate = True +autodoc_default_options = {"members": True, "undoc-members": True} + +release = "0.2.0" + +_here = os.path.abspath(os.path.dirname(__file__)) +_doxygen_xml = os.path.join(_here, "_doxygen", "xml") + +breathe_projects = {"nvbench": _doxygen_xml} +breathe_default_project = "nvbench" +breathe_domain_by_extension = {"cuh": "cpp", "cxx": "cpp", "cu": "cpp"} + + +def _patch_breathe_namespace_declarations() -> None: + try: + import breathe.renderer.sphinxrenderer as sphinxrenderer + from docutils import nodes + from sphinx import addnodes + except Exception: + return + + original = sphinxrenderer.SphinxRenderer.handle_declaration + + def handle_declaration(self, nodeDef, declaration, *args, **kwargs): + is_namespace = getattr(nodeDef, "kind", None) == "namespace" + if not is_namespace: + return original(self, nodeDef, declaration, *args, **kwargs) + + name = (declaration or "").strip() + if name.startswith("namespace "): + name = name[len("namespace ") :].strip() + if not name: + name = "" + + keyword = addnodes.desc_sig_keyword("namespace", "namespace") + sig_name = addnodes.desc_sig_name(name, name) + return [keyword, nodes.Text(" "), sig_name] + + sphinxrenderer.SphinxRenderer.handle_declaration = handle_declaration + + +def setup(app): + _patch_breathe_namespace_declarations() + + +###################################################### + +# -- Options for HTML output ------------------------------------------------- + 
+html_theme = "nvidia_sphinx_theme" + +html_logo = "_static/nvidia-logo.png" + +html_baseurl = ( + os.environ.get("NVBENCH_DOCS_BASE_URL", "https://nvidia.github.io/nvbench/").rstrip( + "/" + ) + + "/" +) + +html_theme_options = { + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/NVIDIA/nvbench", + "icon": "fa-brands fa-github", + "type": "fontawesome", + } + ], + "navigation_depth": 4, + "show_toc_level": 2, + "navbar_start": ["navbar-logo"], + "navbar_end": ["theme-switcher", "navbar-icon-links"], + "footer_start": ["copyright"], + "footer_end": ["sphinx-version"], + "sidebar_includehidden": True, + "collapse_navigation": False, + # "switcher": { + # "json_url": f"{html_baseurl}nv-versions.json", + # "version_match": release, + # }, +} + +html_static_path = ["_static"] if os.path.exists("_static") else [] + +# Images directory +if os.path.exists("img"): + html_static_path.append("img") diff --git a/docs/sphinx-combined/cpp_api.rst b/docs/sphinx-combined/cpp_api.rst new file mode 100644 index 00000000..410a6f7f --- /dev/null +++ b/docs/sphinx-combined/cpp_api.rst @@ -0,0 +1,40 @@ +NVBench C++ API Reference +========================= + +Index +----- + +.. doxygenindex:: + :project: nvbench + + +Free Functions +-------------- + +.. doxygenfunction:: nvbench::make_cuda_stream_view + :project: nvbench + +.. doxygenfunction:: nvbench::axis_type_to_string + :project: nvbench + +.. doxygenfunction:: nvbench::add_devices_section + :project: nvbench + +.. doxygenfunction:: nvbench::range + :project: nvbench + +.. doxygenfunction:: nvbench::sleep_kernel + :project: nvbench + +.. doxygenfunction:: nvbench::copy_kernel + :project: nvbench + +.. doxygenfunction:: nvbench::mod2_kernel + :project: nvbench + +.. doxygenfunction:: nvbench::demangle(const std::string &str) + :project: nvbench + +.. cpp:function:: template std::string nvbench::demangle() + + Returns demangled type name. 
diff --git a/docs/sphinx-combined/cpp_benchmarks.md b/docs/sphinx-combined/cpp_benchmarks.md new file mode 100644 index 00000000..46c6125b --- /dev/null +++ b/docs/sphinx-combined/cpp_benchmarks.md @@ -0,0 +1,530 @@ +# NVBench: benchmarking in C++ + +(minimal-benchmark)= +## Minimal Benchmark + +A basic kernel benchmark can be created with just a few lines of CUDA C++: + +```cpp +void my_benchmark(nvbench::state& state) { + state.exec([](nvbench::launch& launch) { + my_kernel<<>>(); + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +There are three main components in the definition of a benchmark: + +- A `KernelGenerator` callable (`my_benchmark` above) +- A `KernelLauncher` callable (the lambda passed to `nvbench::exec`), and +- A `BenchmarkDeclaration` using `NVBENCH_BENCH` or similar macros. + +The `KernelGenerator` is called with an `nvbench::state` object that provides +configuration information, as shown in later sections. The generator is +responsible for configuring and instantiating a `KernelLauncher`, which is +(unsurprisingly) responsible for launching a kernel. The launcher should contain +only the minimum amount of code necessary to start the CUDA kernel, +since `nvbench::exec` will execute it repeatedly to gather timing information. +An `nvbench::launch` object is provided to the launcher to specify kernel +execution details, such as the CUDA stream to use. `NVBENCH_BENCH` registers +the benchmark with NVBench and initializes various attributes, including its +name and parameter axes. + +## Benchmark Name + +By default, a benchmark is named by converting the first argument +of `NVBENCH_BENCH` into a string. + +This can be changed to something more descriptive if desired. +The `NVBENCH_BENCH` macro produces a customization object that allows such +attributes to be modified. + +```cpp +NVBENCH_BENCH(my_benchmark).set_name("my_kernel<<>>"); +``` + +## CUDA Streams + +NVBench records GPU execution times on a specific CUDA stream. 
By default, a new +stream is created and passed to the `KernelLauncher` via the +`nvbench::launch::get_stream()` method, as shown in +[Minimal Benchmark](#minimal-benchmark). All benchmarked kernels and other +stream-ordered work must be launched on this stream for NVBench to capture it. + +In some instances, it may be inconvenient or impossible to specify an explicit +CUDA stream for the benchmarked operation to use. For example, a library may +manage and use its own streams, or an opaque API may always launch work on the +default stream. In these situations, users may provide NVBench with an explicit +stream via `nvbench::state::set_cuda_stream` and `nvbench::make_cuda_stream_view`. +It is assumed that all work of interest executes on or synchronizes with this +stream. + +```cpp +void my_benchmark(nvbench::state& state) { + cudaStream_t default_stream = 0; + state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream)); + state.exec([](nvbench::launch&) { + my_func(); // a host API invoking GPU kernels on the default stream + my_kernel<<>>(); // or a kernel launched with the default stream + }); +} +NVBENCH_BENCH(my_benchmark); +``` + +A full example can be found in [examples/stream.cu][CppExample_Stream]. + +## Parameter Axes + +Some kernels will be used with a variety of options, input data types/sizes, and +other factors that impact performance. NVBench explores these different +scenarios by sweeping through a set of user-defined parameter axes. + +A parameter axis defines a set of interesting values for a single kernel +parameter — for example, the size of the input, or the type of values being +processed. These parameter axes are used to customize a `KernelGenerator` with +static and runtime configurations. There are four supported types of parameters: +int64, float64, string, and type. + +More examples can be found in [examples/axes.cu][CppExample_Axes]. 
+ +### Int64 Axes + +A common example of a parameter axis is to vary the number of input values a +kernel should process during a benchmark measurement. An `int64_axis` is ideal +for this: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto num_inputs = state.get_int64("NumInputs"); + thrust::device_vector data = generate_input(num_inputs); + + state.exec([&data](nvbench::launch& launch) { + my_kernel<<>>(data.begin(), data.end()); + }); +} +NVBENCH_BENCH(benchmark).add_int64_axis("NumInputs", {16, 64, 256, 1024, 4096}); +``` + +NVBench will run the `benchmark` kernel generator once for each specified value +in the "NumInputs" axis. The `state` object provides the current parameter value +to `benchmark`. + +### Int64 Power-Of-Two Axes + +Using powers-of-two is quite common for these sorts of axes. `int64_axis` has a +unique power-of-two mode that simplifies how such axes are defined and helps +provide more readable output. A power-of-two int64 axis is defined using the +integer exponents, but the benchmark will be run with the computed 2^N value. 
+ +```cpp +// Equivalent to above, {16, 64, 256, 1024, 4096} = {2^4, 2^6, 2^8, 2^10, 2^12} +NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", + {4, 6, 8, 10, 12}); +// Or, as shown in a later section: +NVBENCH_BENCH(benchmark).add_int64_power_of_two_axis("NumInputs", + nvbench::range(4, 12, 2)); +``` + +### Float64 Axes + +For floating point numbers, a `float64_axis` is available: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto quality = state.get_float64("Quality"); + + state.exec([&quality](nvbench::launch& launch) + { + my_kernel<<>>(quality); + }); +} +NVBENCH_BENCH(benchmark).add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.}); +``` + +### String Axes + +For non-numeric data, an axis of arbitrary strings provides additional +flexibility: + +```cpp +void benchmark(nvbench::state& state) +{ + const auto rng_dist = state.get_string("RNG Distribution"); + thrust::device_vector data = generate_input(rng_dist); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<>>(data.begin(), data.end()); + }); +} +NVBENCH_BENCH(benchmark).add_string_axis("RNG Distribution", {"Uniform", "Gaussian"}); +``` + +A common use for string axes is to encode enum values, as shown in +[examples/enums.cu][CppExample_Enums]. + +### Type Axes + +Another common situation involves benchmarking a templated kernel with multiple +compile-time configurations. NVBench strives to make such benchmarks as easy to +write as possible through the use of type axes. + +A `type_axis` is a list of types (`T1`, `T2`, `Ts`...) wrapped in +a `nvbench::type_list`. The kernel generator becomes a template +function and will be instantiated using types defined by the axis. The current +configuration's type is passed into the kernel generator using +a `nvbench::type_list`. 
+ +```cpp +template +void my_benchmark(nvbench::state& state, nvbench::type_list) +{ + thrust::device_vector data = generate_input(); + + state.exec([&data](nvbench::launch& launch) + { + my_kernel<<>>(data.begin(), data.end()); + }); +} +using my_types = nvbench::type_list; +NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types)) + .set_type_axes_names({"ValueType"}); +``` + +The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in +the `type_list<...>` from breaking macro parsing. + +Type axes can be used to encode compile-time enum and integral constants using +the `nvbench::enum_type_list` helper. See +[examples/enums.cu][CppExample_Enums] for detail. + +### Parameter sweeping + +Since parameter sweeps often explore a range of evenly-spaced numeric values, a +strided range can be generated using the `nvbench::range(start, end, stride=1)` +helper. + +```cpp +assert(nvbench::range(2, 5) == {2, 3, 4, 5}); +assert(nvbench::range(2.0, 5.0) == {2.0, 3.0, 4.0, 5.0}); +assert(nvbench::range(2, 12, 2) == {2, 4, 6, 8, 10, 12}); +assert(nvbench::range(2, 12, 5) == {2, 7, 12}); +assert(nvbench::range(2, 12, 6) == {2, 8}); +assert(nvbench::range(0.0, 10.0, 2.5) == { 0.0, 2.5, 5.0, 7.5, 10.0}); +``` + +Note that start and end are inclusive. This utility can be used to define axis +values for all numeric axes. + +### Multiple Parameter Axes + +If more than one axis is defined, the complete cartesian product of all axes +will be benchmarked. 
For example, consider a benchmark with two type axes, one +int64 axis, and one float64 axis: + +```cpp +// InputTypes: {char, int, unsigned int} +// OutputTypes: {float, double} +// NumInputs: {2^10, 2^20, 2^30} +// Quality: {0.5, 1.0} + +using input_types = nvbench::type_list; +using output_types = nvbench::type_list; +NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types)) + .set_type_axes_names({"InputType", "OutputType"}) + .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 10)) + .add_float64_axis("Quality", {0.5, 1.0}); +``` + +This would generate a total of 36 configurations and instantiate the benchmark 6 +times. Keep the rapid growth of these combinations in mind when choosing the +number of values in an axis. See the section about combinatorial explosion for +more examples and information. + +## Throughput Measurements + +In addition to raw timing information, NVBench can track a kernel's +throughput, reporting the amount of data processed as: + +- Number of items per second +- Number of bytes per second +- Percentage of device's peak memory bandwidth utilized + +To enable throughput measurements, the kernel generator can specify the number +of items and/or bytes handled in a single kernel execution using +the `nvbench::state` API. + +```cpp +state.add_element_count(size); +state.add_global_memory_reads(size); +state.add_global_memory_writes(size); +``` + +In general: +- Add only the input element count (no outputs). +- Add all reads and writes to global memory. + +More examples can be found in [examples/throughput.cu][CppExample_Throughput]. + +(skip-uninteresting-or-invalid-benchmarks)= +## Skip Uninteresting / Invalid Benchmarks + +Sometimes particular combinations of parameters aren't useful or interesting — +or for type axes, some configurations may not even compile. + +The `nvbench::state` object provides a `skip("Reason")` method that can be used +to avoid running these benchmarks. 
To skip uncompilable type axis +configurations, create an overload for the kernel generator that selects for the +invalid type combination: + +```cpp +template +void my_benchmark(nvbench::state& state, nvbench::type_list) +{ + // Skip benchmarks at runtime: + if (should_skip_this_config) + { + state.skip("Reason for skip."); + return; + } + + /* ... */ +}; + +// Skip benchmarks at compile time -- for example, always skip when T == U +// (Note that the `type_list` argument defines the same type twice). +template +void my_benchmark(nvbench::state& state, + nvbench::type_list) +{ + state.skip("T must not be the same type as U."); +} +using Ts = nvbench::type_list<...>; +using Us = nvbench::type_list<...>; +NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(Ts, Us)); +``` + +More examples can be found in [examples/skip.cu][CppExample_Skip]. + +## Execution Tags For Special Cases + +By default, NVBench assumes that the entire execution time of the +`KernelLauncher` should be measured, and that no syncs are performed +(e.g. `cudaDeviceSynchronize`, `cudaStreamSynchronize`, `cudaEventSynchronize`, +etc. are not called). + +Execution tags may be passed to `state.exec` when these assumptions are not +true: + +- `nvbench::exec_tag::sync` tells NVBench that the kernel launcher will + synchronize internally. +- `nvbench::exec_tag::timer` requests a timer object that can be used to + restrict the timed region. +- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables + them during execution to reduce runtime, and prevents their compilation to + reduce compile-time and binary size. +- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking + code from being compiled for a particular benchmark. A runtime error is emitted + if the benchmark is defined with `set_is_cpu_only(true)`. +- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking + code from being compiled for a particular benchmark. 
A runtime error is emitted + if the benchmark does not also define `set_is_cpu_only(true)`. + +Multiple execution tags may be combined using `operator|`, e.g. + +```cpp +state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [](nvbench::launch &launch, auto& timer) { /*...*/ }); +``` + +The following sections provide more details on these features. + +### Benchmarks that sync + +If a `KernelLauncher` synchronizes the CUDA device internally without passing +this tag, **the benchmark will deadlock at runtime**. Passing the `sync` tag +will fix this issue. Note that this disables batch measurements. + +```cpp +void sync_example(nvbench::state& state) +{ + // Pass the `sync` exec tag to tell NVBench that this benchmark will sync: + state.exec(nvbench::exec_tag::sync, [](nvbench::launch& launch) { + /* Benchmark that implicitly syncs here. */ + }); +} +NVBENCH_BENCH(sync_example); +``` + +See [examples/exec_tag_sync.cu][CppExample_ExecTagSync] for a complete +example. + +(explicit-timer-mode)= +### Explicit timer mode: `nvbench::exec_tag::timer` + +For some kernels, the working data may need to be reset between launches. This +is particularly common for kernels that modify their input in-place. + +Resetting the input data to prepare for a new trial shouldn't be included in the +benchmark's execution time. NVBench provides a manual timer mode that allows the +kernel launcher to specify the critical section to be measured and exclude any +per-trial reset operations. + +To enable the manual timer mode, pass the tag object `nvbench::exec_tag::timer` +to `state.exec`, and declare the kernel launcher with an +additional `auto& timer` argument. + +Note that using manual timer mode disables batch measurements. 
+ +```cpp +void timer_example(nvbench::state& state) +{ + // Pass the `timer` exec tag to request a timer: + state.exec(nvbench::exec_tag::timer, + // Lambda now accepts a timer: + [](nvbench::launch& launch, auto& timer) + { + /* Reset code here, excluded from timing */ + + /* Timed region is explicitly marked. + * The timer handles any synchronization, flushes, etc when/if + * needed for the current measurement. + */ + timer.start(); + /* Launch kernel on `launch.get_stream()` here */ + timer.stop(); + }); +} +NVBENCH_BENCH(timer_example); +``` + +See [examples/exec_tag_timer.cu][CppExample_ExecTagTimer] for a complete +example. + +### Compilation hints + +These execution tags are optional hints that disable the compilation of various +code paths when they are not needed. They apply only to a single benchmark. + +- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend. +- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends. + - Requires that the benchmark does not define `set_is_cpu_only(true)`. + - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size. + - Host-side CPU measurements of GPU kernel execution time are still provided. +- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends. + - Requires that the benchmark defines `set_is_cpu_only(true)`. + - Optional; this has no effect on runtime measurements, but reduces compile-time and binary size. + - See also [CPU-only Benchmarks](#cpu-only-benchmarks). + +(cpu-only-benchmarks)= +## CPU-only Benchmarks + +NVBench provides CPU-only benchmarking facilities that are intended for measuring +significant CPU workloads. We do not recommend using these features for high-resolution +CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for +such applications. Examples are provided in [examples/cpu_only.cu][CppExample_CpuOnly]. 
+ +Note that NVBench still requires a CUDA compiler and runtime even if a project only contains +CPU-only benchmarks. + +The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements: + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec([](nvbench::launch &) { /* workload */ }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and +binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while +`is_cpu_only` is false. + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends + [](nvbench::launch &) { /* workload */ }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This +is useful for benchmarks that require additional per-sample setup/teardown. See the +[`nvbench::exec_tag::timer`](#explicit-timer-mode) section for more +details. + +```cpp +void my_cpu_benchmark(nvbench::state &state) +{ + state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends + nvbench::exec_tag::timer, // Request a timer object + [](nvbench::launch &, auto &timer) + { + // Setup here + timer.start(); + // timed workload + timer.stop(); + // teardown here + }); +} +NVBENCH_BENCH(my_cpu_benchmark) + .set_is_cpu_only(true); // Mark as CPU-only. +``` + +## Beware of Combinatorial Explosion + +Be very careful of how quickly the configuration space can grow. The following +example generates 960 total runtime benchmark configurations, and will compile +192 different static parametrizations of the kernel generator. This is likely +excessive, especially for routine regression testing. 
+ +```cpp +using value_types = nvbench::type_list; +using op_types = nvbench::type_list, + thrust::multiplies<>, + thrust::maximum<>>; + +NVBENCH_BENCH_TYPES(my_benchmark, + NVBENCH_TYPE_AXES(value_types, + value_types, + value_types, + op_types)) + .set_type_axes_names({"T", "U", "V", "Op"}) + .add_int64_power_of_two_axis("NumInputs", nvbench::range(10, 30, 5)); +``` + +``` +960 total configs += 4 [T=(U8, I32, F32, F64)] +* 4 [U=(U8, I32, F32, F64)] +* 4 [V=(U8, I32, F32, F64)] +* 3 [Op=(plus, multiplies, max)] +* 5 [NumInputs=(2^10, 2^15, 2^20, 2^25, 2^30)] +``` + +For large configuration spaces like this, pruning some of the less useful +combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques +described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting-or-invalid-benchmarks) +section can help immensely with keeping compile / run times manageable. + +Splitting a single large configuration space into multiple, more focused +benchmarks with reduced dimensionality will likely be worth the effort as well. 
+ +[CppExample_Stream]: https://github.com/NVIDIA/nvbench/blob/main/examples/stream.cu +[CppExample_Axes]: https://github.com/NVIDIA/nvbench/blob/main/examples/axes.cu +[CppExample_Enums]: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu +[CppExample_Throughput]: https://github.com/NVIDIA/nvbench/blob/main/examples/throughput.cu +[CppExample_Skip]: https://github.com/NVIDIA/nvbench/blob/main/examples/skip.cu +[CppExample_CpuOnly]: https://github.com/NVIDIA/nvbench/blob/main/examples/cpu_only.cu +[CppExample_ExecTagSync]: https://github.com/NVIDIA/nvbench/blob/main/examples/exec_tag_sync.cu +[CppExample_ExecTagTimer]: https://github.com/NVIDIA/nvbench/blob/main/examples/exec_tag_timer.cu diff --git a/docs/sphinx-combined/index.rst b/docs/sphinx-combined/index.rst new file mode 100644 index 00000000..86107234 --- /dev/null +++ b/docs/sphinx-combined/index.rst @@ -0,0 +1,13 @@ +NVBench API +=========== + +Combined C++ and Python API documentation. + +.. toctree:: + :maxdepth: 2 + + cpp_benchmarks + py_benchmarks + cli_overview + cpp_api + python_api diff --git a/docs/sphinx-combined/py_benchmarks.md b/docs/sphinx-combined/py_benchmarks.md new file mode 100644 index 00000000..788d4e7d --- /dev/null +++ b/docs/sphinx-combined/py_benchmarks.md @@ -0,0 +1,38 @@ +# NVBench: benchmarking in Python + +The `cuda.bench` Python module provides Python API powered by C++ NVBench +library to benchmark GPU-aware Python code. + +## Minimal benchmark + +```python +from cuda.bench import State, Launch +from cuda.bench import register, run_all_registered +from typing import Callable + +from my_package import impl + +def benchmark_impl(state: State) -> None: + + # get state parameters + n = state.get_int64("Elements") + + # prepare inputs + data = generate(n, state.get_stream()) + + # body that is being timed. 
Must execute + # on the stream handed over by NVBench + launchable_fn : Callable[[Launch], None] = + lambda launch: impl(data, launch.get_stream()) + + state.exec(launchable_fn) + + +bench = register(benchmark_impl) +bench.add_int64_axis("Elements", [1000, 10000, 100000]) + + +if __name__ == "__main__": + import sys + run_all_registered(sys.argv) +``` diff --git a/docs/sphinx-combined/python_api.rst b/docs/sphinx-combined/python_api.rst new file mode 100644 index 00000000..22e24eff --- /dev/null +++ b/docs/sphinx-combined/python_api.rst @@ -0,0 +1,8 @@ +cuda.bench Python API Reference +=============================== + +.. automodule:: cuda.bench + :members: + :imported-members: + :undoc-members: + :show-inheritance: diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index e6e7753c..4d2f4963 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -69,12 +69,21 @@ def _get_cuda_major_version(): State = _nvbench_module.State register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks -test_cpp_exception = _nvbench_module.test_cpp_exception -test_py_exception = _nvbench_module.test_py_exception +_test_cpp_exception = _nvbench_module._test_cpp_exception +_test_py_exception = _nvbench_module._test_py_exception # Expose the module as _nvbench for backward compatibility (e.g., for tests) _nvbench = _nvbench_module +# Set module of exposed objects +Benchmark.__module__ = __name__ +CudaStream.__module__ = __name__ +Launch.__module__ = __name__ +NVBenchRuntimeError.__module__ = __name__ +State.__module__ = __name__ +register.__module__ = __name__ +run_all_benchmarks.__module__ = __name__ + # Clean up internal symbols del ( _nvbench_module, diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index aeadcc24..ab79d753 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -273,7 +273,7 @@ static void def_class_CudaStream(py::module_ m) // 
nvbench::cuda_stream::get_stream static constexpr const char *class_CudaStream_doc = R"XXX( -Represents CUDA stream + Represents CUDA stream Note ---- @@ -321,7 +321,7 @@ void def_class_Launch(py::module_ m) // nvbench::launch::get_stream -> nvbench::cuda_stream static constexpr const char *class_Launch_doc = R"XXXX( -Configuration object for function launch. + Configuration object for function launch. Note ---- @@ -363,13 +363,13 @@ static void def_class_Benchmark(py::module_ m) // nvbench::benchmark_base::set_min_samples static constexpr const char *class_Benchmark_doc = R"XXXX( -Represents NVBench benchmark. + Represents NVBench benchmark. Note ---- The class is not user-constructible. - Use `~register` function to create Benchmark and register + Use `register` function to create Benchmark and register it with NVBench. )XXXX"; auto py_benchmark_cls = py::class_(m, "Benchmark", class_Benchmark_doc); @@ -691,7 +691,7 @@ void def_class_State(py::module_ m) using state_ref_t = std::reference_wrapper; static constexpr const char *class_State_doc = R"XXXX( -Represent benchmark configuration state. + Represents benchmark configuration state. Note ---- @@ -736,7 +736,7 @@ Get device_id of the device from this configuration return std::ref(state.get_cuda_stream()); }; static constexpr const char *method_get_stream_doc = R"XXXX( -Get `~CudaStream` object from this configuration" +Get `CudaStream` object from this configuration )XXXX"; pystate_cls.def("get_stream", method_get_stream_impl, @@ -1014,10 +1014,10 @@ Use argument True to disable use of blocking kernel by NVBench" } }; static constexpr const char *method_exec_doc = R"XXXX( -Execute callable running the benchmark. + Execute callable running the benchmark. The callable may be executed multiple times. The callable - will be passed `~Launch` object argument. + will be passed `Launch` object argument. 
Parameters ---------- @@ -1194,8 +1194,8 @@ Register benchmark function of type Callable[[nvbench.State], None] py::arg("argv") = py::list()); // Testing utilities - m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); }); - m.def("test_py_exception", []() { + m.def("_test_cpp_exception", []() { throw nvbench_run_error("Test"); }); + m.def("_test_py_exception", []() { py::set_error(exc_storage.get_stored(), "Test"); throw py::error_already_set(); }); diff --git a/python/test/test_cuda_bench.py b/python/test/test_cuda_bench.py index 7d927e8f..b63d24d8 100644 --- a/python/test/test_cuda_bench.py +++ b/python/test/test_cuda_bench.py @@ -6,12 +6,12 @@ def test_cpp_exception(): with pytest.raises(RuntimeError, match="Test"): - bench._nvbench.test_cpp_exception() + bench._nvbench._test_cpp_exception() def test_py_exception(): with pytest.raises(bench.NVBenchRuntimeError, match="Test"): - bench._nvbench.test_py_exception() + bench._nvbench._test_py_exception() @pytest.mark.parametrize(