From 2249bef6dcff279b459c4c858e67e540f2dfe6a1 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Tue, 16 Dec 2025 07:41:24 +0000 Subject: [PATCH 01/18] start metric profile branch --- requirements.txt | 1 + src/profile.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 src/profile.py diff --git a/requirements.txt b/requirements.txt index 253e57da..5f2ae387 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ pydra_config pytest ninja cupy-cuda12x +nsight-python # Numerics einops diff --git a/src/profile.py b/src/profile.py new file mode 100644 index 00000000..3459e106 --- /dev/null +++ b/src/profile.py @@ -0,0 +1,49 @@ +#### +# Profiling Related Functions +# TODO: @kesavan @simon @arya +#### + +import torch +import nsight + +# wrapper with tool to measure hardware metric + + +# nsight-python +# https://docs.nvidia.com/nsight-python/overview + +def check_ncu_available() -> bool: + """Check if ncu is in PATH. Returns True if found, False otherwise.""" + from shutil import which + return which('ncu') is not None + + +# Note you need access to hardware counter +# you also need to have ncu installed and point to the right path +# sudo env "PATH=$PATH" $(which python) src/profile.py +@nsight.analyze.kernel +def benchmark_matmul(n): + """ + The simplest possible benchmark. + We create two matrices and multiply them. + follow example from nsight-python docs + """ + # Create two NxN matrices on GPU + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + + # Mark the kernel we want to profile + with nsight.annotate("matmul"): + c = a @ b + + return c + +if __name__ == "__main__": + if not check_ncu_available(): + print("ncu not found in PATH. Please install ncu and point to path.") + exit(1) + # Run the benchmark + result = benchmark_matmul(1024) + +# pytorch profiler +# migrate from old repo during ICML / caesar repo \ No newline at end of file From 845dcbb835c045bdf80b827d2d614600ac04f249 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Tue, 16 Dec 2025 17:56:13 +0000 Subject: [PATCH 02/18] clean up remaining places for cuda events --- scripts/generate_baseline_time.py | 50 --------------- scripts/generate_baseline_time_modal.py | 68 ++++----------------- scripts/get_baseline_time_single_problem.py | 13 ++-- scripts/inspect_baseline.py | 3 +- scripts/inspect_triton.py | 9 ++- src/timing.py | 1 - 6 files changed, 27 insertions(+), 117 deletions(-) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index 0a1f608b..47975ebd 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -243,53 +243,3 @@ def test_measure_particular_program(level_num: int, problem_id: int): # get_time(2, 43, torch_compile=True) - - -################################################################################ -# Deprecated -################################################################################ - - -def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False): - raise DeprecationWarning("Use New measure_program_time instead") - ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( - level_num, problem_id, with_name=True - ) - ref_arch_name = ref_arch_name.split("/")[-1] - context = {} - Model, get_init_inputs, get_inputs = load_original_model_and_inputs( - ref_arch_src, context - ) - try: - with torch.no_grad(): - torch.cuda.synchronize(device=device) - set_seed(42) - inputs = get_inputs() - set_seed(42) - init_inputs = 
get_init_inputs() - inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in inputs - ] - init_inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in init_inputs - ] - model = Model(*init_inputs) - - if torch_compile: - model = torch.compile(model) - print("Compiled model Done") - model = model.cuda(device=device) - torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=False, device=device - ) - runtime_stats = get_timing_stats(elapsed_times, device=device) - # json_results[f"level{level_num}"][ref_arch_name] = runtime_stats - print(f"{ref_arch_name} {runtime_stats}") - return (ref_arch_name, runtime_stats) - except Exception as e: - print(f"[Eval] Error in Measuring Performance: {e}") - - diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py index a0039193..c85824d1 100644 --- a/scripts/generate_baseline_time_modal.py +++ b/scripts/generate_baseline_time_modal.py @@ -2,11 +2,10 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import get_timing_function, get_timing_stats from src.dataset import construct_problem_dataset_from_problem_dir from src.utils import read_file import os @@ -160,6 +159,7 @@ def measure_program_time( ref_arch_name: str, ref_arch_src: str, num_trials: int = 100, + timing_method: str="cuda_event", use_torch_compile: bool = False, torch_compile_backend: str="inductor", torch_compile_options: str="default", @@ -199,9 +199,16 @@ def measure_program_time( print(f"Using PyTorch Eager Execution on {ref_arch_name}") model = model.cuda(device=device) + timing_func = get_timing_function(timing_method) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=verbose, device=device + elapsed_times = timing_func( + model, + inputs, + num_warmup=3, # or any default you prefer + num_trials=num_trials, + discard_first=1, # or 0 to include first trial + verbose=verbose, + device=device, ) runtime_stats = get_timing_stats(elapsed_times, device=device) @@ -247,7 +254,8 @@ def record_baseline_times(config: BaselineConfig, ref_arch_name, ref_arch_src, config.num_trials, - use_torch_compile, + timing_method, + use_torch_compile, torch_compile_backend, torch_compile_options, torch.device(f"cuda:0"), @@ -341,53 +349,3 @@ def main(config: BaselineConfig): # get_time(2, 43, torch_compile=True) - - -################################################################################ -# Deprecated -################################################################################ - - -def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False): - raise DeprecationWarning("Use New measure_program_time instead") - ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( - level_num, problem_id, with_name=True - ) - ref_arch_name = ref_arch_name.split("/")[-1] - context = {} - Model, get_init_inputs, get_inputs = load_original_model_and_inputs( - ref_arch_src, context - ) - try: - with torch.no_grad(): - torch.cuda.synchronize(device=device) - set_seed(42) - inputs = get_inputs() - set_seed(42) - init_inputs = get_init_inputs() - inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in inputs - ] - init_inputs = [ - x.cuda(device=device) if isinstance(x, 
torch.Tensor) else x - for x in init_inputs - ] - model = Model(*init_inputs) - - if torch_compile: - model = torch.compile(model) - print("Compiled model Done") - model = model.cuda(device=device) - torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=False, device=device - ) - runtime_stats = get_timing_stats(elapsed_times, device=device) - # json_results[f"level{level_num}"][ref_arch_name] = runtime_stats - print(f"{ref_arch_name} {runtime_stats}") - return (ref_arch_name, runtime_stats) - except Exception as e: - print(f"[Eval] Error in Measuring Performance: {e}") - - diff --git a/scripts/get_baseline_time_single_problem.py b/scripts/get_baseline_time_single_problem.py index 476fe2fd..e6f8155a 100644 --- a/scripts/get_baseline_time_single_problem.py +++ b/scripts/get_baseline_time_single_problem.py @@ -2,16 +2,17 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import get_timing_function, get_timing_stats + def measure_program_time( ref_arch_name: str, ref_arch_src: str, num_trials: int = 100, + timing_method: str="cuda_event", use_torch_compile: bool = False, torch_compile_backend: str="inductor", torch_compile_options: str="default", @@ -52,8 +53,9 @@ def measure_program_time( model = model.cuda(device=device) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=verbose, device=device + timing_func = get_timing_function(timing_method ) + elapsed_times = timing_func( + model, inputs, num_warmup=3, num_trials=num_trials, discard_first=1, verbose=verbose, device=device ) runtime_stats = get_timing_stats(elapsed_times, device=device) @@ -87,5 +89,4 @@ def get_inputs(): def get_init_inputs(): return [] # No special initialization inputs needed """ - print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False)) - print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=True)) \ No newline at end of file + print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event")) \ No newline at end of file diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py index e7811f64..a39716fd 100644 --- a/scripts/inspect_baseline.py +++ b/scripts/inspect_baseline.py @@ -5,11 +5,10 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import get_timing_function, get_timing_stats from src.dataset import construct_problem_dataset_from_problem_dir import os, sys import logging diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py index 4f13c8af..f664b6d6 100644 --- a/scripts/inspect_triton.py +++ b/scripts/inspect_triton.py @@ -21,10 +21,9 @@ from src.eval import ( load_custom_model, load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, ) +from src.timing import get_timing_function, get_timing_stats def fetch_ref_arch_from_dataset(dataset: list[str], problem_id: int) -> tuple[str, str, str]: @@ -123,6 +122,8 @@ def run_profile_and_save_trace(dataset: list[str], problem_id: int, num_trials=1 def get_torch_compile_triton(level_num, problem_id): """ Get the triton code generated by torch compile for a particular problem + 
+ Made this during ICML rebuttal in March 2025 """ ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset( dataset, problem_id, with_name=True @@ -167,7 +168,9 @@ def get_torch_compile_triton(level_num, problem_id): torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( + timing_method = "cuda_event" # use cuda event for timing here + time_func_cuda_event = get_timing_function(timing_method) + elapsed_times = time_func_cuda_event( model, *inputs, num_trials=1, verbose=False, device=device ) runtime_stats = get_timing_stats(elapsed_times, device=device) diff --git a/src/timing.py b/src/timing.py index 8a36522b..7f192625 100644 --- a/src/timing.py +++ b/src/timing.py @@ -90,7 +90,6 @@ def get_timing_function( NOTE: we have a WIP blogpost on this topic covering the various timing approaches """ - def time_execution_with_cuda_event( kernel_fn: callable, args: list[Any], From b3387a42a301cea6d1d31dd2dd3265e170b15b5f Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 17 Dec 2025 21:19:51 +0000 Subject: [PATCH 03/18] add in nsight-python metric evaluation script and example usage, have to patch some nsight python functions to get this to work --- src/profile.py | 127 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 108 insertions(+), 19 deletions(-) diff --git a/src/profile.py b/src/profile.py index 3459e106..8a99f802 100644 --- a/src/profile.py +++ b/src/profile.py @@ -4,46 +4,135 @@ #### import torch -import nsight +import pandas as pd # wrapper with tool to measure hardware metric -# nsight-python -# https://docs.nvidia.com/nsight-python/overview +# Check if nsight-python is available +# To patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug +try: + import nsight + NSIGHT_AVAILABLE = True + + # Patch 1: Multiple metrics support + _orig_ncu_init = nsight.collection.ncu.NCUCollector.__init__ + nsight.collection.ncu.NCUCollector.__init__ = lambda self, metric="gpu__time_duration.sum", *a, **kw: \ + _orig_ncu_init(self, ",".join(metric) if isinstance(metric, (list, tuple)) else metric, *a, **kw) + + # Patch 2: Extract all metrics from comma-separated string + _orig_extract = nsight.extraction.extract_df_from_report + def _patched_extract(path, metric, *a, **kw): + if "," not in metric: + return _orig_extract(path, metric, *a, **kw) + rows = [] + for m in metric.split(","): + df = _orig_extract(path, m.strip(), *a, **kw) + if df is not None and not df.empty: + df = df.copy() + df['Metric Name'] = m.strip() + rows.extend(df.to_dict('records')) + return pd.DataFrame(rows) if rows else None + nsight.extraction.extract_df_from_report = _patched_extract + + # Patch 3: Fix "cannot insert Annotation" bug + _orig_agg = nsight.transformation.aggregate_data + def _patched_agg(df, func, norm, progress): + _orig_gb = pd.DataFrame.groupby + pd.DataFrame.groupby = lambda self, *a, **kw: _orig_gb(self, *a, **{**kw, 'as_index': False}) + try: + return _orig_agg(df, func, norm, progress) + finally: + pd.DataFrame.groupby = _orig_gb + nsight.transformation.aggregate_data = _patched_agg + +except ImportError: + NSIGHT_AVAILABLE = False + + +def profile_with_nsight(func, metrics=None, *args, **kwargs): + """Profile a PyTorch function. 
Returns {metric_name: value}.""" + if not NSIGHT_AVAILABLE: + raise RuntimeError("nsight-python not available") + + metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) + + @nsight.analyze.kernel(metric=metrics, runs=1, configs=[(0,)], + combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0)) + def profiled(_): + with nsight.annotate("kernel"): + return func(*args, **kwargs) + + try: + result = profiled() + if result is None: + return {m: None for m in metrics} + + df = result.to_dataframe() + if df is None or df.empty: + return {m: None for m in metrics} + + if 'Metric Name' in df.columns: + return {row['Metric Name']: float(row['AvgValue']) for _, row in df.iterrows()} + return {metrics[0]: float(df['AvgValue'].iloc[0])} + except Exception as e: + print(f"Error profiling: {e}") + return {m: None for m in metrics} + + +# example function to profile with nsight +def example_ncu_python_profile(): + # Test with simple kernel + def test_kernel(x, y): + """Simple matmul kernel.""" + return x @ y + + print("Creating test tensors...") + a = torch.randn(256, 256, device="cuda") + b = torch.randn(256, 256, device="cuda") + + print("Running nsight profiling...") + metric_values = profile_with_nsight( + test_kernel, + ['sm__cycles_active.avg', 'sm__cycles_elapsed.sum', "smsp__inst_executed_pipe_tensor_op_hmma.sum"], + a, b + ) + + print("\nProfiling results:") + for metric_name, value in metric_values.items(): + print(f" {metric_name}: {value}") + return + def check_ncu_available() -> bool: - """Check if ncu is in PATH. Returns True if found, False otherwise.""" from shutil import which return which('ncu') is not None -# Note you need access to hardware counter -# you also need to have ncu installed and point to the right path -# sudo env "PATH=$PATH" $(which python) src/profile.py @nsight.analyze.kernel def benchmark_matmul(n): - """ - The simplest possible benchmark. - We create two matrices and multiply them. - follow example from nsight-python docs - """ - # Create two NxN matrices on GPU + """Standard benchmark following nsight-python docs.""" a = torch.randn(n, n, device="cuda") b = torch.randn(n, n, device="cuda") - - # Mark the kernel we want to profile with nsight.annotate("matmul"): c = a @ b - return c + if __name__ == "__main__": if not check_ncu_available(): - print("ncu not found in PATH. Please install ncu and point to path.") + print("ncu not found in PATH. 
Install Nsight Compute.") + exit(1) + + if not torch.cuda.is_available(): + print("CUDA is not available.") exit(1) - # Run the benchmark - result = benchmark_matmul(1024) + + + # test the example_ncu_python_profile + example_ncu_python_profile() + + # pytorch profiler # migrate from old repo during ICML / caesar repo \ No newline at end of file From 9d36c26c638c86db61933ab32abc0c894b249d1a Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Tue, 6 Jan 2026 22:00:41 +0000 Subject: [PATCH 04/18] merge main From daa08cb360e8cbef19ebd0404dba2c15ecca2ee0 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 02:04:27 +0000 Subject: [PATCH 05/18] add in profiling with nsight python capability, as well as code path for timing witih nsight python --- pyproject.toml | 1 + requirements.txt | 3 +- scripts/inspect_baseline.py | 4 + src/kernelbench/timing.py | 199 ++++++++++++++++++++++++++++++++++++ src/profile.py | 180 ++++++++++++++++++++++++++++++-- 5 files changed, 377 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85a690c5..7c0bccb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ gpu = [ "nvidia-cutlass-dsl", "tilelang", "cupy-cuda12x", + "nsight-python", ] dev = [ "pytest", diff --git a/requirements.txt b/requirements.txt index 1e8db6e7..07603a86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,12 +19,11 @@ tilelang tqdm>=4.67.1 packaging pydra-config -pytest ninja>=1.13.0 cupy-cuda12x==13.6.0 -nsight-python tomli>=2.3.0 tabulate>=0.9.0 +nsight-python # Numerics einops>=0.8.1 diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py index 7043a3f2..9f9f6b7c 100644 --- a/scripts/inspect_baseline.py +++ b/scripts/inspect_baseline.py @@ -5,10 +5,14 @@ import numpy as np from kernelbench.eval import ( load_original_model_and_inputs, + time_execution_with_cuda_event, + get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) from kernelbench.dataset import construct_problem_dataset_from_problem_dir +import os, sys +import logging import json device = torch.device("cuda:0") diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 7f860005..6b81e8c7 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -77,6 +77,8 @@ def get_timing_function( return time_execution_with_do_bench_impl case "host_time": return time_execution_with_host_time + case "nsight_python_time": + return time_execution_with_nsight_python # we might add other methods in the future case _: raise ValueError(f"Unsupported timing method: {method}") @@ -387,6 +389,75 @@ def time_execution_with_host_time( return elapsed_times +def time_execution_with_nsight_python( + kernel_fn: callable, + args: list[Any], + num_warmup: int = 3, + num_trials: int = 10, + discard_first: int = 1, # not used here + verbose: bool = True, + device: torch.device | None = None) -> list[float]: + """ + Time a CUDA kernel function using nsight-python. + + Note: nsight returns an average time across num_trials runs. + Returns a list with a single value (average time) for API consistency. + GPU time from nsight is in nanoseconds, converted to milliseconds. 
+ + Returns: + List containing one float: average elapsed time in milliseconds + """ + import sys + import os + # Add parent directory to path to import profile module + parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + from profile import profile_with_nsight + + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + with torch.cuda.device(device): + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + # Clear cache for cold start + torch.cuda.empty_cache() + clear_l2_cache(device=device) + + if verbose: + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") + + # Profile with nsight - returns average time in nanoseconds + def wrapped_kernel(): + return kernel_fn(*args) + + metric_values = profile_with_nsight( + wrapped_kernel, + metrics=["gpu__time_duration.sum"], + num_trials=num_trials + ) + + # Convert from nanoseconds to milliseconds + gpu_time_ns = metric_values.get("gpu__time_duration.sum") + if gpu_time_ns is None: + raise RuntimeError("Failed to get GPU time from nsight") + + # Convert nanoseconds to milliseconds + # nsight returns average across num_trials, so we return a single value in a list + gpu_time_ms = gpu_time_ns / 1_000_000.0 + + if verbose: + print(f"Average GPU time: {gpu_time_ms:.3f} ms (across {num_trials} trials)") + + # Return list with single average value for API consistency + return [gpu_time_ms] + ######################################################## # Timing stats # tools to help compute speedup and other time @@ -438,3 +509,131 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> stats["device"] = str(device) # for debugging return stats + + +# def test_nsight_timing_accuracy( +# num_trials: int = 10, +# num_warmup: int = 3, +# device: torch.device | None = None, +# verbose: bool = True +# ): +# """ +# Test and benchmark time_execution_with_nsight_python against other timing methods. + +# Compares nsight_python_time with cuda_event and host_time to verify accuracy. +# All times are reported in milliseconds for consistency. + +# Args: +# num_trials: Number of timing trials to run +# num_warmup: Number of warmup iterations +# device: CUDA device to use, defaults to current device +# verbose: Whether to print detailed comparison results + +# Returns: +# Dictionary with timing results and comparison statistics +# """ +# if not torch.cuda.is_available(): +# raise RuntimeError("CUDA is not available. 
This test requires a CUDA device.") + +# if device is None: +# device = torch.cuda.current_device() + +# # Create a simple test kernel (matrix multiplication) +# M, N, K = 1024, 1024, 1024 +# a = torch.randn(M, K, device=device, dtype=torch.float32) +# b = torch.randn(K, N, device=device, dtype=torch.float32) + +# def matmul_kernel(a, b): +# return torch.matmul(a, b) + +# args = [a, b] + +# if verbose: +# print("=" * 80) +# print("Testing nsight_python_time accuracy against other timing methods") +# print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") +# print(f"Device: {device} ({torch.cuda.get_device_name(device)})") +# print(f"Trials: {num_trials}, Warmup: {num_warmup}") +# print("=" * 80) + +# results = {} + +# # Test each timing method +# timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] + +# for method in timing_methods: +# if verbose: +# print(f"\n[Testing] {method}...") + +# try: +# timing_func = get_timing_function(method) +# elapsed_times = timing_func( +# matmul_kernel, +# args=args, +# num_warmup=num_warmup, +# num_trials=num_trials, +# discard_first=1, +# verbose=False, +# device=device +# ) + +# # Get statistics +# stats = get_timing_stats(elapsed_times, device=device) +# results[method] = { +# "times": elapsed_times, +# "stats": stats +# } + +# if verbose: +# print(f" Mean: {stats['mean']:.3f} ms") +# print(f" Std: {stats['std']:.3f} ms") +# print(f" Min: {stats['min']:.3f} ms") +# print(f" Max: {stats['max']:.3f} ms") + +# except Exception as e: +# if verbose: +# print(f" ERROR: {e}") +# results[method] = {"error": str(e)} + +# # Compare results +# if verbose: +# print("\n" + "=" * 80) +# print("Comparison Summary (all times in milliseconds):") +# print("=" * 80) + +# if "cuda_event" in results and "stats" in results["cuda_event"]: +# cuda_mean = results["cuda_event"]["stats"]["mean"] + +# if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: +# nsight_mean = results["nsight_python_time"]["stats"]["mean"] +# diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) * 100 +# print(f"\nnsight_python_time vs cuda_event:") +# print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") +# print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") +# print(f" Difference: {diff_pct:+.2f}%") + +# # Check if within reasonable range (within 20% of cuda_event) +# # Note: nsight measures pure GPU time, cuda_event includes some overhead +# # So nsight should be slightly lower or similar +# if abs(diff_pct) < 20: +# print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") +# else: +# print(f" ⚠ Accuracy check WARNING (difference > 20%)") +# print(f" Note: nsight measures pure GPU time, may differ from cuda_event") + +# if "host_time" in results and "stats" in results["host_time"]: +# host_mean = results["host_time"]["stats"]["mean"] +# print(f"\nhost_time vs cuda_event:") +# print(f" cuda_event: {cuda_mean:.3f} ms") +# print(f" host_time: {host_mean:.3f} ms") +# print(f" (host_time typically higher due to CPU overhead)") + +# return results + + +if __name__ == "__main__": + # Run the test if executed directly + if torch.cuda.is_available(): + test_nsight_timing_accuracy(num_trials=10, num_warmup=3, verbose=True) + else: + print("CUDA not available. 
Skipping timing accuracy test.") diff --git a/src/profile.py b/src/profile.py index 8a99f802..21ece54b 100644 --- a/src/profile.py +++ b/src/profile.py @@ -3,6 +3,7 @@ # TODO: @kesavan @simon @arya #### +import os import torch import pandas as pd @@ -50,14 +51,14 @@ def _patched_agg(df, func, norm, progress): NSIGHT_AVAILABLE = False -def profile_with_nsight(func, metrics=None, *args, **kwargs): +def profile_with_nsight(func, metrics=None, num_trials=1, *args, **kwargs): """Profile a PyTorch function. Returns {metric_name: value}.""" if not NSIGHT_AVAILABLE: raise RuntimeError("nsight-python not available") metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) - @nsight.analyze.kernel(metric=metrics, runs=1, configs=[(0,)], + @nsight.analyze.kernel(metric=metrics, runs=num_trials, configs=[(0,)], combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0)) def profiled(_): with nsight.annotate("kernel"): @@ -119,6 +120,172 @@ def benchmark_matmul(n): return c + + +# pytorch profiler +# migrate from old repo during ICML / caesar repo + + +def profile_kernelbench_model_with_nsight( + custom_model_src: str, + ref_model_src: str = None, + metrics: list = None, + num_trials: int = 1, + seed: int = 42, + device: torch.device = None, + backend: str = "cuda", + precision: torch.dtype = torch.float32, + build_dir: str = None, + verbose: bool = False, +) -> dict: + """ + Profile a kernelbench model using nsight-python. + + Assumes model has been validated and compiled successfully via eval. + No error checking is performed - this should only be called after eval proves correctness. + + Args: + custom_model_src: Source code string for the custom model (ModelNew class) + ref_model_src: Optional source code string for reference model to get get_inputs/get_init_inputs. + If None, will try to get these from custom_model_src. + metrics: List of nsight metrics to collect. Default: ['sm__cycles_active.avg'] + num_trials: Number of trials to run. Default: 1 + seed: Random seed for reproducible inputs + device: CUDA device to run on. Default: cuda:0 + backend: Backend type ('cuda', 'triton', 'tilelang', 'cute'). Default: 'cuda' + precision: torch.dtype for computation. Default: torch.float32 + build_dir: Build directory for compiled kernels. Default: None + verbose: Whether to print verbose output. 
Default: False + + Returns: + Dictionary mapping metric names to their values + """ + from kernelbench.eval import ( + load_custom_model, + load_custom_model_with_tempfile, + load_original_model_and_inputs, + _process_input_tensor, + set_seed, + graceful_eval_cleanup, + ) + from kernelbench.utils import set_gpu_arch + + device = device or torch.device("cuda:0") + metrics = metrics or ['sm__cycles_active.avg'] + metrics = [metrics] if isinstance(metrics, str) else metrics + + set_gpu_arch(["Ada"]) + torch.cuda.set_device(device) + + # Load input functions using existing eval function + input_source = ref_model_src or custom_model_src + context = {} + _, get_init_inputs, get_inputs = load_original_model_and_inputs(input_source, context) + + # Prepare inputs + set_seed(seed) + init_inputs = [_process_input_tensor(x, device, backend, precision) for x in get_init_inputs()] + + # Load and compile model + if verbose: + print("[Profile] Loading and compiling custom model...") + + os.environ["TORCH_USE_CUDA_DSA"] = "1" + tempfile = None + + if backend.lower() in ["triton", "tilelang", "cute"]: + ModelNew, tempfile = load_custom_model_with_tempfile(custom_model_src, entry_point="ModelNew") + else: + ModelNew = load_custom_model(custom_model_src, {}, build_dir) + + torch.cuda.synchronize(device=device) + + # Instantiate model + with torch.no_grad(): + set_seed(seed) + custom_model = ModelNew(*init_inputs) + custom_model = custom_model.to(device=device, dtype=precision) + torch.cuda.synchronize(device=device) + + if verbose: + print("[Profile] Model instantiated successfully") + + # Prepare profiling inputs + set_seed(seed) + inputs = [_process_input_tensor(x, device, backend, precision) for x in get_inputs()] + + # Profile + if verbose: + print(f"[Profile] Profiling with nsight (metrics: {metrics})...") + + def model_forward(): + with torch.no_grad(): + return custom_model(*inputs) + + metric_values = profile_with_nsight(model_forward, metrics=metrics, num_trials=num_trials) + + if verbose: + print("[Profile] Profiling completed successfully") + + # Cleanup using existing eval function + graceful_eval_cleanup(context, device, tempfile) + + return metric_values + +def test_flash_attention_profile(): + """ + Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. 
+ """ + import os + from kernelbench.utils import read_file + + # Get the paths to the reference and custom model files + # profile.py is in src/, so we need to go up one level to get repo root + repo_root = os.path.dirname(os.path.dirname(__file__)) + ref_model_path = os.path.join( + repo_root, + "src/kernelbench/prompts/few_shot/model_ex_flash_attn.py" + ) + custom_model_path = os.path.join( + repo_root, + "src/kernelbench/prompts/few_shot/model_new_ex_flash_attn.py" + ) + + # Read the model source files + print("[Test] Reading model source files...") + ref_model_src = read_file(ref_model_path) + custom_model_src = read_file(custom_model_path) + + + print("[Test] Starting profiling with nsight...") + + # Profile the custom model + metrics = profile_kernelbench_model_with_nsight( + custom_model_src=custom_model_src, + ref_model_src=ref_model_src, + metrics=[ + 'gpu__time_duration.sum' + ], + seed=42, + backend="cuda", + precision=torch.float32, + verbose=True + ) + print(metrics) + + print("\n[Test] Profiling results:") + print("=" * 60) + for metric_name, value in metrics.items(): + if value is not None: + print(f" {metric_name}: {value}") + else: + print(f" {metric_name}: ") + print("=" * 60) + + return metrics + + + if __name__ == "__main__": if not check_ncu_available(): print("ncu not found in PATH. Install Nsight Compute.") @@ -130,9 +297,6 @@ def benchmark_matmul(n): # test the example_ncu_python_profile - example_ncu_python_profile() - - - -# pytorch profiler -# migrate from old repo during ICML / caesar repo \ No newline at end of file + # example_ncu_python_profile() + test_flash_attention_profile() + \ No newline at end of file From 24cc9fc3785dc3e52d78576d156c05617492777c Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 02:06:31 +0000 Subject: [PATCH 06/18] add in profiling via nsight python From ce4469d57037cf81cc2032995cc0c889472eed27 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 06:28:24 +0000 Subject: [PATCH 07/18] move profile.py --- src/{ => kernelbench}/profile.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{ => kernelbench}/profile.py (100%) diff --git a/src/profile.py b/src/kernelbench/profile.py similarity index 100% rename from src/profile.py rename to src/kernelbench/profile.py From decf8c684814642756f98203cfdb070eea0e8700 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 06:51:08 +0000 Subject: [PATCH 08/18] fixing profiling and timing --- src/kernelbench/profile.py | 41 +++++--- src/kernelbench/timing.py | 208 ++++++++++++++++++------------------- 2 files changed, 127 insertions(+), 122 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 21ece54b..160ac69b 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -51,18 +51,22 @@ def _patched_agg(df, func, norm, progress): NSIGHT_AVAILABLE = False -def profile_with_nsight(func, metrics=None, num_trials=1, *args, **kwargs): +def profile_with_nsight(func, metrics=None, num_trials=1): """Profile a PyTorch function. 
Returns {metric_name: value}.""" if not NSIGHT_AVAILABLE: raise RuntimeError("nsight-python not available") metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) - @nsight.analyze.kernel(metric=metrics, runs=num_trials, configs=[(0,)], - combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0)) + @nsight.analyze.kernel( + metric=metrics, + runs=num_trials, + configs=[(0,)], + combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), + ) def profiled(_): with nsight.annotate("kernel"): - return func(*args, **kwargs) + return func() try: result = profiled() @@ -93,10 +97,15 @@ def test_kernel(x, y): b = torch.randn(256, 256, device="cuda") print("Running nsight profiling...") + + # Wrap kernel + def test_kernel_forward(): + return test_kernel(a, b) + metric_values = profile_with_nsight( - test_kernel, + test_kernel_forward, ['sm__cycles_active.avg', 'sm__cycles_elapsed.sum', "smsp__inst_executed_pipe_tensor_op_hmma.sum"], - a, b + num_trials=1, ) print("\nProfiling results:") @@ -160,7 +169,7 @@ def profile_kernelbench_model_with_nsight( Returns: Dictionary mapping metric names to their values """ - from kernelbench.eval import ( + from eval import ( load_custom_model, load_custom_model_with_tempfile, load_original_model_and_inputs, @@ -168,13 +177,14 @@ def profile_kernelbench_model_with_nsight( set_seed, graceful_eval_cleanup, ) - from kernelbench.utils import set_gpu_arch + from utils import set_gpu_arch device = device or torch.device("cuda:0") metrics = metrics or ['sm__cycles_active.avg'] metrics = [metrics] if isinstance(metrics, str) else metrics - set_gpu_arch(["Ada"]) + # set_gpu_arch(["Ada"]) # NOTE: can set GPU arch here if needed + torch.cuda.set_device(device) # Load input functions using existing eval function @@ -237,18 +247,17 @@ def test_flash_attention_profile(): Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. 
""" import os - from kernelbench.utils import read_file + from utils import read_file # Get the paths to the reference and custom model files - # profile.py is in src/, so we need to go up one level to get repo root - repo_root = os.path.dirname(os.path.dirname(__file__)) + REPO_ROOT = os.path.dirname(__file__) ref_model_path = os.path.join( - repo_root, - "src/kernelbench/prompts/few_shot/model_ex_flash_attn.py" + REPO_ROOT, + "prompts/few_shot/model_ex_flash_attn.py" ) custom_model_path = os.path.join( - repo_root, - "src/kernelbench/prompts/few_shot/model_new_ex_flash_attn.py" + REPO_ROOT, + "prompts/few_shot/model_new_ex_flash_attn.py" ) # Read the model source files diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 6b81e8c7..91fe8fac 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -407,13 +407,8 @@ def time_execution_with_nsight_python( Returns: List containing one float: average elapsed time in milliseconds """ - import sys - import os - # Add parent directory to path to import profile module - parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) - from profile import profile_with_nsight + + from kernelbench.profile import profile_with_nsight if device is None: if verbose: @@ -434,12 +429,13 @@ def time_execution_with_nsight_python( print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") # Profile with nsight - returns average time in nanoseconds + # Wrap kernel function to avoid argument passing confusion def wrapped_kernel(): return kernel_fn(*args) metric_values = profile_with_nsight( - wrapped_kernel, - metrics=["gpu__time_duration.sum"], + wrapped_kernel, + metrics=["gpu__time_duration.sum"], num_trials=num_trials ) @@ -511,124 +507,124 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> return stats -# def test_nsight_timing_accuracy( -# num_trials: int = 10, -# num_warmup: int = 3, -# device: torch.device | None = None, -# verbose: bool = True -# ): -# """ -# Test and benchmark time_execution_with_nsight_python against other timing methods. +def test_nsight_timing_accuracy( + num_trials: int = 10, + num_warmup: int = 3, + device: torch.device | None = None, + verbose: bool = True +): + """ + Test and benchmark time_execution_with_nsight_python against other timing methods. -# Compares nsight_python_time with cuda_event and host_time to verify accuracy. -# All times are reported in milliseconds for consistency. + Compares nsight_python_time with cuda_event and host_time to verify accuracy. + All times are reported in milliseconds for consistency. -# Args: -# num_trials: Number of timing trials to run -# num_warmup: Number of warmup iterations -# device: CUDA device to use, defaults to current device -# verbose: Whether to print detailed comparison results + Args: + num_trials: Number of timing trials to run + num_warmup: Number of warmup iterations + device: CUDA device to use, defaults to current device + verbose: Whether to print detailed comparison results -# Returns: -# Dictionary with timing results and comparison statistics -# """ -# if not torch.cuda.is_available(): -# raise RuntimeError("CUDA is not available. This test requires a CUDA device.") + Returns: + Dictionary with timing results and comparison statistics + """ + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available. 
This test requires a CUDA device.") -# if device is None: -# device = torch.cuda.current_device() + if device is None: + device = torch.cuda.current_device() -# # Create a simple test kernel (matrix multiplication) -# M, N, K = 1024, 1024, 1024 -# a = torch.randn(M, K, device=device, dtype=torch.float32) -# b = torch.randn(K, N, device=device, dtype=torch.float32) + # Create a simple test kernel (matrix multiplication) + M, N, K = 1024, 1024, 1024 + a = torch.randn(M, K, device=device, dtype=torch.float32) + b = torch.randn(K, N, device=device, dtype=torch.float32) -# def matmul_kernel(a, b): -# return torch.matmul(a, b) + def matmul_kernel(a, b): + return torch.matmul(a, b) -# args = [a, b] + args = [a, b] -# if verbose: -# print("=" * 80) -# print("Testing nsight_python_time accuracy against other timing methods") -# print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") -# print(f"Device: {device} ({torch.cuda.get_device_name(device)})") -# print(f"Trials: {num_trials}, Warmup: {num_warmup}") -# print("=" * 80) + if verbose: + print("=" * 80) + print("Testing nsight_python_time accuracy against other timing methods") + print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") + print(f"Device: {device} ({torch.cuda.get_device_name(device)})") + print(f"Trials: {num_trials}, Warmup: {num_warmup}") + print("=" * 80) -# results = {} + results = {} -# # Test each timing method -# timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] + # Test each timing method + timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] -# for method in timing_methods: -# if verbose: -# print(f"\n[Testing] {method}...") + for method in timing_methods: + if verbose: + print(f"\n[Testing] {method}...") -# try: -# timing_func = get_timing_function(method) -# elapsed_times = timing_func( -# matmul_kernel, -# args=args, -# num_warmup=num_warmup, -# num_trials=num_trials, -# discard_first=1, -# verbose=False, -# device=device -# ) + try: + timing_func = get_timing_function(method) + elapsed_times = timing_func( + matmul_kernel, + args=args, + num_warmup=num_warmup, + num_trials=num_trials, + discard_first=1, + verbose=False, + device=device + ) -# # Get statistics -# stats = get_timing_stats(elapsed_times, device=device) -# results[method] = { -# "times": elapsed_times, -# "stats": stats -# } + # Get statistics + stats = get_timing_stats(elapsed_times, device=device) + results[method] = { + "times": elapsed_times, + "stats": stats + } -# if verbose: -# print(f" Mean: {stats['mean']:.3f} ms") -# print(f" Std: {stats['std']:.3f} ms") -# print(f" Min: {stats['min']:.3f} ms") -# print(f" Max: {stats['max']:.3f} ms") + if verbose: + print(f" Mean: {stats['mean']:.3f} ms") + print(f" Std: {stats['std']:.3f} ms") + print(f" Min: {stats['min']:.3f} ms") + print(f" Max: {stats['max']:.3f} ms") -# except Exception as e: -# if verbose: -# print(f" ERROR: {e}") -# results[method] = {"error": str(e)} + except Exception as e: + if verbose: + print(f" ERROR: {e}") + results[method] = {"error": str(e)} -# # Compare results -# if verbose: -# print("\n" + "=" * 80) -# print("Comparison Summary (all times in milliseconds):") -# print("=" * 80) + # Compare results + if verbose: + print("\n" + "=" * 80) + print("Comparison Summary (all times in milliseconds):") + print("=" * 80) -# if "cuda_event" in results and "stats" in results["cuda_event"]: -# cuda_mean = results["cuda_event"]["stats"]["mean"] + if "cuda_event" in results and "stats" in results["cuda_event"]: + cuda_mean = 
results["cuda_event"]["stats"]["mean"] -# if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: -# nsight_mean = results["nsight_python_time"]["stats"]["mean"] -# diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) * 100 -# print(f"\nnsight_python_time vs cuda_event:") -# print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") -# print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") -# print(f" Difference: {diff_pct:+.2f}%") + if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: + nsight_mean = results["nsight_python_time"]["stats"]["mean"] + diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) * 100 + print(f"\nnsight_python_time vs cuda_event:") + print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") + print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") + print(f" Difference: {diff_pct:+.2f}%") -# # Check if within reasonable range (within 20% of cuda_event) -# # Note: nsight measures pure GPU time, cuda_event includes some overhead -# # So nsight should be slightly lower or similar -# if abs(diff_pct) < 20: -# print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") -# else: -# print(f" ⚠ Accuracy check WARNING (difference > 20%)") -# print(f" Note: nsight measures pure GPU time, may differ from cuda_event") + # Check if within reasonable range (within 20% of cuda_event) + # Note: nsight measures pure GPU time, cuda_event includes some overhead + # So nsight should be slightly lower or similar + if abs(diff_pct) < 20: + print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") + else: + print(f" ⚠ Accuracy check WARNING (difference > 20%)") + print(f" Note: nsight measures pure GPU time, may differ from cuda_event") -# if "host_time" in results and "stats" in results["host_time"]: -# host_mean = results["host_time"]["stats"]["mean"] -# print(f"\nhost_time vs cuda_event:") -# print(f" cuda_event: {cuda_mean:.3f} ms") -# print(f" host_time: {host_mean:.3f} ms") -# print(f" (host_time typically higher due to CPU overhead)") + if "host_time" in results and "stats" in results["host_time"]: + host_mean = results["host_time"]["stats"]["mean"] + print(f"\nhost_time vs cuda_event:") + print(f" cuda_event: {cuda_mean:.3f} ms") + print(f" host_time: {host_mean:.3f} ms") + print(f" (host_time typically higher due to CPU overhead)") -# return results + return results if __name__ == "__main__": From 6a16b50757ed3ad7021067ee5b523b13d07e7cb1 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 06:53:32 +0000 Subject: [PATCH 09/18] merge in main From 3b4b342783dc8ccdc5dd039c4f0d2f104e891154 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 07:09:56 +0000 Subject: [PATCH 10/18] clean up comments --- src/kernelbench/profile.py | 18 ++++++++++-------- src/kernelbench/timing.py | 3 ++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 160ac69b..40d3da02 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -1,6 +1,6 @@ #### -# Profiling Related Functions -# TODO: @kesavan @simon @arya +# Nsight Profiling Related Functions +# Currently showcases how to profile a kernelbench model with nsight-python #### import os @@ -11,7 +11,7 @@ # Check if nsight-python is available -# To patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug +# Monkey-patch nsight-python to support 
multiple metrics and fix the "cannot insert Annotation" bug try: import nsight NSIGHT_AVAILABLE = True @@ -62,7 +62,7 @@ def profile_with_nsight(func, metrics=None, num_trials=1): metric=metrics, runs=num_trials, configs=[(0,)], - combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), + combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), # NOTE: some torch ops launch multiple kernels, so we need to combine metrics ) def profiled(_): with nsight.annotate("kernel"): @@ -169,7 +169,7 @@ def profile_kernelbench_model_with_nsight( Returns: Dictionary mapping metric names to their values """ - from eval import ( + from kernelbench.eval import ( load_custom_model, load_custom_model_with_tempfile, load_original_model_and_inputs, @@ -177,7 +177,7 @@ def profile_kernelbench_model_with_nsight( set_seed, graceful_eval_cleanup, ) - from utils import set_gpu_arch + from kernelbench.utils import set_gpu_arch device = device or torch.device("cuda:0") metrics = metrics or ['sm__cycles_active.avg'] @@ -228,10 +228,12 @@ def profile_kernelbench_model_with_nsight( if verbose: print(f"[Profile] Profiling with nsight (metrics: {metrics})...") + # Wrap model forward pass def model_forward(): with torch.no_grad(): return custom_model(*inputs) + # Profile with nsight metric_values = profile_with_nsight(model_forward, metrics=metrics, num_trials=num_trials) if verbose: @@ -247,7 +249,7 @@ def test_flash_attention_profile(): Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. """ import os - from utils import read_file + from kernelbench.utils import read_file # Get the paths to the reference and custom model files REPO_ROOT = os.path.dirname(__file__) @@ -306,6 +308,6 @@ def test_flash_attention_profile(): # test the example_ncu_python_profile - # example_ncu_python_profile() + example_ncu_python_profile() test_flash_attention_profile() \ No newline at end of file diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 7a2b1cd2..5cc8da3a 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -429,10 +429,11 @@ def time_execution_with_nsight_python( print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") # Profile with nsight - returns average time in nanoseconds - # Wrap kernel function to avoid argument passing confusion + # Wrap kernel function def wrapped_kernel(): return kernel_fn(*args) + # Profile with nsight, use gpu_time_duration.sum metric for GPU time metric_values = profile_with_nsight( wrapped_kernel, metrics=["gpu__time_duration.sum"], From 14658ab3c6e95c6678fcb628cfee8592ce9616df Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 07:11:10 +0000 Subject: [PATCH 11/18] reference nsight python --- src/kernelbench/profile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 40d3da02..397bc146 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -12,6 +12,7 @@ # Check if nsight-python is available # Monkey-patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug +# Reference: https://docs.nvidia.com/nsight-python try: import nsight NSIGHT_AVAILABLE = True From e6729abd6236b6f242399877e2fbaeccdb0fb030 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 07:16:12 +0000 Subject: [PATCH 12/18] clean up --- src/kernelbench/profile.py | 2 +- src/kernelbench/timing.py | 127 
------------------------------------- 2 files changed, 1 insertion(+), 128 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 397bc146..3f0913d9 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -308,7 +308,7 @@ def test_flash_attention_profile(): exit(1) - # test the example_ncu_python_profile + # test the example_ncu_python_profile, and flash attention profile example_ncu_python_profile() test_flash_attention_profile() \ No newline at end of file diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 5cc8da3a..b48698cb 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -507,130 +507,3 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> return stats - -def test_nsight_timing_accuracy( - num_trials: int = 10, - num_warmup: int = 3, - device: torch.device | None = None, - verbose: bool = True -): - """ - Test and benchmark time_execution_with_nsight_python against other timing methods. - - Compares nsight_python_time with cuda_event and host_time to verify accuracy. - All times are reported in milliseconds for consistency. - - Args: - num_trials: Number of timing trials to run - num_warmup: Number of warmup iterations - device: CUDA device to use, defaults to current device - verbose: Whether to print detailed comparison results - - Returns: - Dictionary with timing results and comparison statistics - """ - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. This test requires a CUDA device.") - - if device is None: - device = torch.cuda.current_device() - - # Create a simple test kernel (matrix multiplication) - M, N, K = 1024, 1024, 1024 - a = torch.randn(M, K, device=device, dtype=torch.float32) - b = torch.randn(K, N, device=device, dtype=torch.float32) - - def matmul_kernel(a, b): - return torch.matmul(a, b) - - args = [a, b] - - if verbose: - print("=" * 80) - print("Testing nsight_python_time accuracy against other timing methods") - print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") - print(f"Device: {device} ({torch.cuda.get_device_name(device)})") - print(f"Trials: {num_trials}, Warmup: {num_warmup}") - print("=" * 80) - - results = {} - - # Test each timing method - timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] - - for method in timing_methods: - if verbose: - print(f"\n[Testing] {method}...") - - try: - timing_func = get_timing_function(method) - elapsed_times = timing_func( - matmul_kernel, - args=args, - num_warmup=num_warmup, - num_trials=num_trials, - discard_first=1, - verbose=False, - device=device - ) - - # Get statistics - stats = get_timing_stats(elapsed_times, device=device) - results[method] = { - "times": elapsed_times, - "stats": stats - } - - if verbose: - print(f" Mean: {stats['mean']:.3f} ms") - print(f" Std: {stats['std']:.3f} ms") - print(f" Min: {stats['min']:.3f} ms") - print(f" Max: {stats['max']:.3f} ms") - - except Exception as e: - if verbose: - print(f" ERROR: {e}") - results[method] = {"error": str(e)} - - # Compare results - if verbose: - print("\n" + "=" * 80) - print("Comparison Summary (all times in milliseconds):") - print("=" * 80) - - if "cuda_event" in results and "stats" in results["cuda_event"]: - cuda_mean = results["cuda_event"]["stats"]["mean"] - - if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: - nsight_mean = results["nsight_python_time"]["stats"]["mean"] - diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) 
* 100 - print(f"\nnsight_python_time vs cuda_event:") - print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") - print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") - print(f" Difference: {diff_pct:+.2f}%") - - # Check if within reasonable range (within 20% of cuda_event) - # Note: nsight measures pure GPU time, cuda_event includes some overhead - # So nsight should be slightly lower or similar - if abs(diff_pct) < 20: - print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") - else: - print(f" ⚠ Accuracy check WARNING (difference > 20%)") - print(f" Note: nsight measures pure GPU time, may differ from cuda_event") - - if "host_time" in results and "stats" in results["host_time"]: - host_mean = results["host_time"]["stats"]["mean"] - print(f"\nhost_time vs cuda_event:") - print(f" cuda_event: {cuda_mean:.3f} ms") - print(f" host_time: {host_mean:.3f} ms") - print(f" (host_time typically higher due to CPU overhead)") - - return results - - -if __name__ == "__main__": - # Run the test if executed directly - if torch.cuda.is_available(): - test_nsight_timing_accuracy(num_trials=10, num_warmup=3, verbose=True) - else: - print("CUDA not available. Skipping timing accuracy test.") From 7b018455cff64512b96148e51376c5744bc39f34 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Wed, 7 Jan 2026 08:13:50 +0000 Subject: [PATCH 13/18] put wheel pkd data fix in this PR --- pyproject.toml | 5 +- src/kernelbench/prompt_constructor_toml.py | 34 ++++++-------- src/kernelbench/utils.py | 54 ++++++++++++++++++++++ 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7c0bccb8..bed37150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,4 +52,7 @@ dev = [ [tool.setuptools.packages.find] where = ["src"] -include = ["kernelbench*"] \ No newline at end of file +include = ["kernelbench*"] + +[tool.setuptools.package-data] +kernelbench = ["prompts/**/*"] \ No newline at end of file diff --git a/src/kernelbench/prompt_constructor_toml.py b/src/kernelbench/prompt_constructor_toml.py index 7b8b1bdf..4349a74d 100644 --- a/src/kernelbench/prompt_constructor_toml.py +++ b/src/kernelbench/prompt_constructor_toml.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional -from kernelbench.utils import read_file +from kernelbench.utils import read_file, get_package_resource_path, resolve_path, REPO_TOP_PATH """ TOML-based prompt constructor for managing prompt templates and configurations. @@ -14,11 +14,9 @@ You can easily check some of the prompt templates we have provided and create your own. 
""" -REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) -PROMPTS_TOML = os.path.join(REPO_TOP_PATH, "src/kernelbench/prompts/prompts.toml") - -assert os.path.exists(PROMPTS_TOML), f"Prompts.toml not found at {PROMPTS_TOML}" -GPU_SPECS_PY = "src/kernelbench/prompts/hardware/gpu_specs.py" +# Resolve paths using the helper from utils +PROMPTS_TOML = get_package_resource_path("prompts/prompts.toml") +GPU_SPECS_PY = get_package_resource_path("prompts/hardware/gpu_specs.py") HARDWARE_COMPONENT_KEYS = [ "hardware_header", "hardware_specs", @@ -26,12 +24,6 @@ "hardware_best_practices", ] -def _abs_path(rel: str) -> str: - """Convert relative path to absolute path from repo root.""" - if os.path.isabs(rel): - return rel - return os.path.join(REPO_TOP_PATH, rel) - @dataclass class PromptConfig: """ @@ -255,17 +247,17 @@ def render_example_entry(input_code: str, output_code: str, example_label: str) # Use multiple examples (true few-shot) examples_intro = intro_few_shot for i, (input_path, output_path) in enumerate(few_shot_examples, 1): - input_code = read_file(_abs_path(input_path)) - output_code = read_file(_abs_path(output_path)) + input_code = read_file(resolve_path(input_path)) + output_code = read_file(resolve_path(output_path)) examples_entries.append( render_example_entry(input_code, output_code, f"Example {i}:") ) else: # Fall back to one-shot - ex_arch_path = _abs_path( + ex_arch_path = resolve_path( backend_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") ) - ex_new_path = _abs_path(backend_data["one_shot_new_arch"]) + ex_new_path = resolve_path(backend_data["one_shot_new_arch"]) input_code = read_file(ex_arch_path) output_code = read_file(ex_new_path) examples_entries.append( @@ -274,10 +266,10 @@ def render_example_entry(input_code: str, output_code: str, example_label: str) elif requires_example == "one_shot": # Always use one-shot - ex_arch_path = _abs_path( + ex_arch_path = resolve_path( backend_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") ) - ex_new_path = _abs_path(backend_data["one_shot_new_arch"]) + ex_new_path = resolve_path(backend_data["one_shot_new_arch"]) input_code = read_file(ex_arch_path) output_code = read_file(ex_new_path) examples_entries.append( @@ -296,7 +288,7 @@ def render_example_entry(input_code: str, output_code: str, example_label: str) raise ValueError( f"Hardware info requested for option '{option}'; provide gpu_specs_py and gpu_name" ) - context = {**context, **_gpu_context_from_gpu_specs(_abs_path(gpu_specs_py), gpu_name)} + context = {**context, **_gpu_context_from_gpu_specs(resolve_path(gpu_specs_py), gpu_name)} # Builds the prompt from the components in the toml file. prompt_parts = [] @@ -416,10 +408,10 @@ def test_prompt(): generation. Customize the reference architecture or custom_prompt_key if you want to try different inputs. 
""" - REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) ref_arch_src = read_file(os.path.join(REPO_TOP_PATH, "KernelBench", "level1", "1_Square_matrix_multiplication_.py")) assert len(ref_arch_src) > 0, "ref_arch_src is empty" - + + print("Testing prompt construction...") scratch_dir = os.path.join(REPO_TOP_PATH, "scratch") # baseline prompt baseline_prompt = get_prompt_for_backend( diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py index abe067b1..cf8b0ad8 100644 --- a/src/kernelbench/utils.py +++ b/src/kernelbench/utils.py @@ -15,6 +15,7 @@ import os import json from tqdm import tqdm +from importlib.resources import files, as_file # API clients from openai import OpenAI @@ -273,6 +274,59 @@ def read_file(file_path) -> str: return "" +######################################################## +# Path Resolution Helpers +######################################################## +REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + +def get_package_resource_path(relative_path: str) -> str: + """ + Get absolute path to a kernelbench package resource. + Works for all three usage modes: + - Running from repo directly + - As a git submodule + - As an installed pip/uv dependency + + Args: + relative_path: Path relative to kernelbench/, e.g. "prompts/prompts.toml" + """ + # Try importlib.resources first (installed package) + try: + resource = files("kernelbench").joinpath(relative_path) + with as_file(resource) as path: + if path.exists(): + return str(path) + except (TypeError, FileNotFoundError): + pass + + # Try repo path (running from source / submodule) + repo_path = os.path.join(REPO_TOP_PATH, "src/kernelbench", relative_path) + if os.path.exists(repo_path): + return repo_path + + raise FileNotFoundError(f"Could not find resource: {relative_path}") + + +def resolve_path(rel: str) -> str: + """ + Resolve a relative path to absolute. Handles paths like "src/kernelbench/prompts/..." + from prompts.toml which reference files relative to repo root. + """ + if os.path.isabs(rel): + return rel + + # Convert "src/kernelbench/..." 
paths to package-relative + if rel.startswith("src/kernelbench/"): + return get_package_resource_path(rel[len("src/kernelbench/"):]) + + # Otherwise treat as repo-relative + repo_path = os.path.join(REPO_TOP_PATH, rel) + if os.path.exists(repo_path): + return repo_path + + raise FileNotFoundError(f"Could not resolve path: {rel}") + + def print_messages(messages): for message in messages: print(message["role"]) From 977ecb6f434d862afa88c81744fcf6fa6f267bc4 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Thu, 8 Jan 2026 00:57:19 +0000 Subject: [PATCH 14/18] update to new nsight version --- src/kernelbench/profile.py | 52 ++++++++------------------------------ 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 3f0913d9..e490c369 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -6,48 +6,17 @@ import os import torch import pandas as pd +import numpy as np # wrapper with tool to measure hardware metric # Check if nsight-python is available -# Monkey-patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug # Reference: https://docs.nvidia.com/nsight-python try: import nsight NSIGHT_AVAILABLE = True - # Patch 1: Multiple metrics support - _orig_ncu_init = nsight.collection.ncu.NCUCollector.__init__ - nsight.collection.ncu.NCUCollector.__init__ = lambda self, metric="gpu__time_duration.sum", *a, **kw: \ - _orig_ncu_init(self, ",".join(metric) if isinstance(metric, (list, tuple)) else metric, *a, **kw) - - # Patch 2: Extract all metrics from comma-separated string - _orig_extract = nsight.extraction.extract_df_from_report - def _patched_extract(path, metric, *a, **kw): - if "," not in metric: - return _orig_extract(path, metric, *a, **kw) - rows = [] - for m in metric.split(","): - df = _orig_extract(path, m.strip(), *a, **kw) - if df is not None and not df.empty: - df = df.copy() - df['Metric Name'] = m.strip() - rows.extend(df.to_dict('records')) - return pd.DataFrame(rows) if rows else None - nsight.extraction.extract_df_from_report = _patched_extract - - # Patch 3: Fix "cannot insert Annotation" bug - _orig_agg = nsight.transformation.aggregate_data - def _patched_agg(df, func, norm, progress): - _orig_gb = pd.DataFrame.groupby - pd.DataFrame.groupby = lambda self, *a, **kw: _orig_gb(self, *a, **{**kw, 'as_index': False}) - try: - return _orig_agg(df, func, norm, progress) - finally: - pd.DataFrame.groupby = _orig_gb - nsight.transformation.aggregate_data = _patched_agg - except ImportError: NSIGHT_AVAILABLE = False @@ -60,7 +29,7 @@ def profile_with_nsight(func, metrics=None, num_trials=1): metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) @nsight.analyze.kernel( - metric=metrics, + metrics=metrics, runs=num_trials, configs=[(0,)], combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), # NOTE: some torch ops launch multiple kernels, so we need to combine metrics @@ -120,14 +89,15 @@ def check_ncu_available() -> bool: return which('ncu') is not None -@nsight.analyze.kernel -def benchmark_matmul(n): - """Standard benchmark following nsight-python docs.""" - a = torch.randn(n, n, device="cuda") - b = torch.randn(n, n, device="cuda") - with nsight.annotate("matmul"): - c = a @ b - return c +if NSIGHT_AVAILABLE: + @nsight.analyze.kernel + def benchmark_matmul(n): + """Standard benchmark following nsight-python docs.""" + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + with 
nsight.annotate("matmul"): + c = a @ b + return c From 8d9461d51ec2eafba57e0cdba244307febbd294a Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Thu, 8 Jan 2026 01:34:24 +0000 Subject: [PATCH 15/18] for kesavan to further fix --- src/kernelbench/profile.py | 45 ++++++++++++------- .../unit_tests/test_eval_timing.py | 7 ++- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index e490c369..659c4593 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -44,14 +44,20 @@ def profiled(_): return {m: None for m in metrics} df = result.to_dataframe() - if df is None or df.empty: + if df is None or (hasattr(df, 'empty') and df.empty): return {m: None for m in metrics} - if 'Metric Name' in df.columns: + if isinstance(df, pd.DataFrame) and 'Metric Name' in df.columns: return {row['Metric Name']: float(row['AvgValue']) for _, row in df.iterrows()} - return {metrics[0]: float(df['AvgValue'].iloc[0])} + elif isinstance(df, pd.DataFrame): + return {metrics[0]: float(df['AvgValue'].iloc[0])} + else: + # Handle case where df is not a DataFrame + return {m: None for m in metrics} except Exception as e: print(f"Error profiling: {e}") + import traceback + traceback.print_exc() return {m: None for m in metrics} @@ -89,18 +95,6 @@ def check_ncu_available() -> bool: return which('ncu') is not None -if NSIGHT_AVAILABLE: - @nsight.analyze.kernel - def benchmark_matmul(n): - """Standard benchmark following nsight-python docs.""" - a = torch.randn(n, n, device="cuda") - b = torch.randn(n, n, device="cuda") - with nsight.annotate("matmul"): - c = a @ b - return c - - - # pytorch profiler # migrate from old repo during ICML / caesar repo @@ -215,6 +209,24 @@ def model_forward(): return metric_values + + +if NSIGHT_AVAILABLE: + @nsight.analyze.kernel + def benchmark_matmul(n): + """Standard benchmark following nsight-python docs.""" + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + with nsight.annotate("matmul"): + c = a @ b + return c + + + + +########### +# +############ def test_flash_attention_profile(): """ Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. @@ -246,7 +258,8 @@ def test_flash_attention_profile(): custom_model_src=custom_model_src, ref_model_src=ref_model_src, metrics=[ - 'gpu__time_duration.sum' + 'gpu__time_duration.sum', + 'sm__cycles_elapsed.sum' ], seed=42, backend="cuda", diff --git a/src/kernelbench/unit_tests/test_eval_timing.py b/src/kernelbench/unit_tests/test_eval_timing.py index 8be8824b..e524ee46 100644 --- a/src/kernelbench/unit_tests/test_eval_timing.py +++ b/src/kernelbench/unit_tests/test_eval_timing.py @@ -10,7 +10,7 @@ Test Timing We want to systematically study different timing methodologies. 
""" -REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) # use exampls in the few shot directory EXAMPLES_PATH = os.path.join(REPO_PATH, "src", "kernelbench", "prompts", "few_shot") @@ -83,6 +83,11 @@ def run_all_timing_tests(device="cuda"): _run_timing_smoke_test_matmul(timing_method, device=device) +# TODO: make the ncu profiling a spearte one +# because not veeryone can have hardware counter +# def run_ncu_against_cuda_event(): +# pass + # need ncu install first (can call the helper function in profile.py to check if ncu exists) test_device = torch.device("cuda:5") run_all_timing_tests(test_device) From a9a293290f6022de7b70468220d7a3e87bfec487 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Thu, 8 Jan 2026 01:51:32 +0000 Subject: [PATCH 16/18] clean up profile and make verbose --- src/kernelbench/profile.py | 407 +++++++++++++++++++++++++------------ 1 file changed, 272 insertions(+), 135 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 659c4593..2a3243c0 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -1,105 +1,152 @@ -#### -# Nsight Profiling Related Functions -# Currently showcases how to profile a kernelbench model with nsight-python -#### +""" +Nsight Profiling Module for KernelBench +======================================== + +This module provides GPU profiling capabilities using NVIDIA Nsight Compute (ncu). +It allows collecting hardware-level metrics from kernels. + +Key Features: +- Profile arbitrary PyTorch functions with hardware metrics +- Profile KernelBench models (ModelNew) with automatic setup/teardown +- Combine metrics from multi-kernel operations (common in PyTorch) + +Requirements: +- NVIDIA Nsight Compute CLI (ncu) must be installed and in PATH +- nsight-python package + +Common Metrics: +- gpu__time_duration.sum: Total GPU time in nanoseconds +- sm__cycles_elapsed.sum: Total SM cycles elapsed +- sm__cycles_active.avg: Average active cycles per SM + +Reference: https://docs.nvidia.com/nsight-python +""" import os -import torch -import pandas as pd -import numpy as np +from shutil import which -# wrapper with tool to measure hardware metric +import torch +# ============================================================================= +# Nsight Availability Check +# ============================================================================= -# Check if nsight-python is available -# Reference: https://docs.nvidia.com/nsight-python try: import nsight NSIGHT_AVAILABLE = True - except ImportError: NSIGHT_AVAILABLE = False +# ============================================================================= +# Utility Functions +# ============================================================================= + +def check_ncu_available() -> bool: + """ + Check if NVIDIA Nsight Compute CLI (ncu) is available in PATH. + + The ncu command-line tool is required for collecting GPU hardware metrics. + It's typically installed with the CUDA Toolkit or NVIDIA Nsight Compute. + + Returns: + True if ncu is found in PATH, False otherwise. + """ + return which('ncu') is not None + + +# ============================================================================= +# Core Profiling Functions +# ============================================================================= + def profile_with_nsight(func, metrics=None, num_trials=1): - """Profile a PyTorch function. 
Returns {metric_name: value}.""" - if not NSIGHT_AVAILABLE: - raise RuntimeError("nsight-python not available") + """ + Profile a PyTorch function and collect hardware metrics. - metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) + Handles complexity: + - Setting up the Nsight kernel analyzer + - Combining metrics when PyTorch ops launch multiple CUDA kernels + - Extracting results from Nsight's DataFrame format + Args: + func: A callable (no arguments) that executes the code to profile. + Typically a closure that captures the model and inputs. + metrics: List of Nsight metric names to collect. If None, defaults to + ['sm__cycles_active.avg']. Can also pass a single string. + num_trials: Number of times to run the function for averaging. + + Returns: + Dictionary mapping metric names to their values (float). + Returns None for metrics that couldn't be collected. + + Example: + >>> def my_kernel(): + ... return torch.matmul(a, b) + >>> results = profile_with_nsight(my_kernel, ['gpu__time_duration.sum']) + >>> print(results['gpu__time_duration.sum']) # Time in nanoseconds + + Raises: + RuntimeError: If nsight-python is not installed. + """ + if not NSIGHT_AVAILABLE: + raise RuntimeError( + "nsight-python not available." + ) + + # Normalize metrics to a list + if metrics is None: + metrics = ['sm__cycles_active.avg'] + elif isinstance(metrics, str): + metrics = [metrics] + + # Define the profiled function with Nsight decorator + # NOTE: PyTorch operations often launch multiple CUDA kernels (e.g., a matmul + # might have separate kernels for the computation and memory operations). + # We use combine_kernel_metrics to sum these together for a single measurement. @nsight.analyze.kernel( metrics=metrics, runs=num_trials, - configs=[(0,)], - combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), # NOTE: some torch ops launch multiple kernels, so we need to combine metrics + configs=[(0,)], # Use default GPU config + combine_kernel_metrics=lambda a, b: (0 if a is None else a) + (0 if b is None else b), ) def profiled(_): + # The nsight.annotate context marks the region we care about with nsight.annotate("kernel"): return func() try: + # Run profiling - this invokes ncu under the hood result = profiled() - if result is None: - return {m: None for m in metrics} - df = result.to_dataframe() - if df is None or (hasattr(df, 'empty') and df.empty): + # Convert results to DataFrame + df = result.to_dataframe() if result else None + if df is None or df.empty: return {m: None for m in metrics} - if isinstance(df, pd.DataFrame) and 'Metric Name' in df.columns: - return {row['Metric Name']: float(row['AvgValue']) for _, row in df.iterrows()} - elif isinstance(df, pd.DataFrame): - return {metrics[0]: float(df['AvgValue'].iloc[0])} - else: - # Handle case where df is not a DataFrame + # Nsight returns a DataFrame with columns like: + # - 'Metric': The metric name (e.g., 'gpu__time_duration.sum') + # - 'AvgValue': The measured value + # We need to find these columns (names may vary slightly) + metric_col = next((c for c in df.columns if c.lower() == 'metric'), None) + value_col = next((c for c in df.columns if 'value' in c.lower()), None) + + if not metric_col or not value_col: return {m: None for m in metrics} + + # Build a dictionary of all metrics in the DataFrame + metric_dict = { + row[metric_col]: float(row[value_col]) + for _, row in df.iterrows() + } + + # Return only the requested metrics (None if not found) + return {m: metric_dict.get(m) for m in metrics} + 
except Exception as e: print(f"Error profiling: {e}") - import traceback - traceback.print_exc() return {m: None for m in metrics} -# example function to profile with nsight -def example_ncu_python_profile(): - # Test with simple kernel - def test_kernel(x, y): - """Simple matmul kernel.""" - return x @ y - - print("Creating test tensors...") - a = torch.randn(256, 256, device="cuda") - b = torch.randn(256, 256, device="cuda") - - print("Running nsight profiling...") - - # Wrap kernel - def test_kernel_forward(): - return test_kernel(a, b) - - metric_values = profile_with_nsight( - test_kernel_forward, - ['sm__cycles_active.avg', 'sm__cycles_elapsed.sum', "smsp__inst_executed_pipe_tensor_op_hmma.sum"], - num_trials=1, - ) - - print("\nProfiling results:") - for metric_name, value in metric_values.items(): - print(f" {metric_name}: {value}") - return - - -def check_ncu_available() -> bool: - from shutil import which - return which('ncu') is not None - - - -# pytorch profiler -# migrate from old repo during ICML / caesar repo - - def profile_kernelbench_model_with_nsight( custom_model_src: str, ref_model_src: str = None, @@ -113,27 +160,46 @@ def profile_kernelbench_model_with_nsight( verbose: bool = False, ) -> dict: """ - Profile a kernelbench model using nsight-python. + Profile a KernelBench model (ModelNew) using Nsight hardware metrics. + + This is the high-level profiling function designed for KernelBench workflows. + It handles the full lifecycle: + 1. Load and compile the custom model from source code + 2. Generate inputs using the model's get_inputs() function + 3. Profile the forward pass with Nsight + 4. Clean up resources - Assumes model has been validated and compiled successfully via eval. - No error checking is performed - this should only be called after eval proves correctness. + IMPORTANT: This function assumes the model has already been validated for + correctness via eval. No correctness checking is performed here. Args: - custom_model_src: Source code string for the custom model (ModelNew class) - ref_model_src: Optional source code string for reference model to get get_inputs/get_init_inputs. - If None, will try to get these from custom_model_src. - metrics: List of nsight metrics to collect. Default: ['sm__cycles_active.avg'] - num_trials: Number of trials to run. Default: 1 - seed: Random seed for reproducible inputs - device: CUDA device to run on. Default: cuda:0 - backend: Backend type ('cuda', 'triton', 'tilelang', 'cute'). Default: 'cuda' - precision: torch.dtype for computation. Default: torch.float32 - build_dir: Build directory for compiled kernels. Default: None - verbose: Whether to print verbose output. Default: False + custom_model_src: Python source code string containing the ModelNew class. + ref_model_src: Optional source code for the reference model. Used to get + get_inputs() and get_init_inputs() if they're not in + custom_model_src. If None, uses custom_model_src. + metrics: List of Nsight metrics to collect. Defaults to ['sm__cycles_active.avg']. + num_trials: Number of profiling runs for averaging. Default: 1. + seed: Random seed for reproducible input generation. Default: 42. + device: CUDA device to run on. Default: cuda:0. + backend: Compilation backend ('cuda', 'triton', 'tilelang', 'cute'). + precision: torch.dtype for computation. Default: torch.float32. + build_dir: Directory for compiled kernel artifacts. Default: None. + verbose: Print progress messages. Default: False. 
Returns: - Dictionary mapping metric names to their values + Dictionary mapping metric names to their measured values. + Values are None if the metric couldn't be collected. + + Example: + >>> results = profile_kernelbench_model_with_nsight( + ... custom_model_src=my_model_code, + ... ref_model_src=ref_model_code, + ... metrics=['gpu__time_duration.sum', 'sm__cycles_elapsed.sum'], + ... verbose=True + ... ) + >>> print(f"GPU time: {results['gpu__time_duration.sum']} ns") """ + # Import eval utilities (deferred to avoid circular imports) from kernelbench.eval import ( load_custom_model, load_custom_model_with_tempfile, @@ -142,40 +208,57 @@ def profile_kernelbench_model_with_nsight( set_seed, graceful_eval_cleanup, ) - from kernelbench.utils import set_gpu_arch + # Set defaults device = device or torch.device("cuda:0") - metrics = metrics or ['sm__cycles_active.avg'] - metrics = [metrics] if isinstance(metrics, str) else metrics - - # set_gpu_arch(["Ada"]) # NOTE: can set GPU arch here if needed + if metrics is None: + metrics = ['sm__cycles_active.avg'] + elif isinstance(metrics, str): + metrics = [metrics] torch.cuda.set_device(device) - # Load input functions using existing eval function + # ------------------------------------------------------------------------- + # Step 1: Load input generation functions from model source + # ------------------------------------------------------------------------- + # The model source should define get_inputs() and get_init_inputs() functions + # that return the tensors needed to run the model. input_source = ref_model_src or custom_model_src context = {} _, get_init_inputs, get_inputs = load_original_model_and_inputs(input_source, context) - # Prepare inputs + # Generate initialization inputs (for model constructor) set_seed(seed) - init_inputs = [_process_input_tensor(x, device, backend, precision) for x in get_init_inputs()] - - # Load and compile model + init_inputs = [ + _process_input_tensor(x, device, backend, precision) + for x in get_init_inputs() + ] + + # ------------------------------------------------------------------------- + # Step 2: Load and compile the custom model + # ------------------------------------------------------------------------- if verbose: print("[Profile] Loading and compiling custom model...") + # Enable CUDA Device-Side Assertions for better error messages os.environ["TORCH_USE_CUDA_DSA"] = "1" tempfile = None + # Different backends require different loading mechanisms if backend.lower() in ["triton", "tilelang", "cute"]: - ModelNew, tempfile = load_custom_model_with_tempfile(custom_model_src, entry_point="ModelNew") + # These backends need a temp file for proper module loading + ModelNew, tempfile = load_custom_model_with_tempfile( + custom_model_src, entry_point="ModelNew" + ) else: + # Standard CUDA backend ModelNew = load_custom_model(custom_model_src, {}, build_dir) torch.cuda.synchronize(device=device) - # Instantiate model + # ------------------------------------------------------------------------- + # Step 3: Instantiate the model + # ------------------------------------------------------------------------- with torch.no_grad(): set_seed(seed) custom_model = ModelNew(*init_inputs) @@ -185,94 +268,119 @@ def profile_kernelbench_model_with_nsight( if verbose: print("[Profile] Model instantiated successfully") - # Prepare profiling inputs + # ------------------------------------------------------------------------- + # Step 4: Profile the forward pass + # 
------------------------------------------------------------------------- + # Generate forward pass inputs set_seed(seed) - inputs = [_process_input_tensor(x, device, backend, precision) for x in get_inputs()] + inputs = [ + _process_input_tensor(x, device, backend, precision) + for x in get_inputs() + ] - # Profile if verbose: print(f"[Profile] Profiling with nsight (metrics: {metrics})...") - # Wrap model forward pass + # Create a closure for the forward pass def model_forward(): with torch.no_grad(): return custom_model(*inputs) - # Profile with nsight - metric_values = profile_with_nsight(model_forward, metrics=metrics, num_trials=num_trials) + # Run profiling + metric_values = profile_with_nsight( + model_forward, + metrics=metrics, + num_trials=num_trials + ) if verbose: print("[Profile] Profiling completed successfully") - # Cleanup using existing eval function + # ------------------------------------------------------------------------- + # Step 5: Cleanup + # ------------------------------------------------------------------------- graceful_eval_cleanup(context, device, tempfile) return metric_values +# ============================================================================= +# Examples and Tests +# ============================================================================= -if NSIGHT_AVAILABLE: - @nsight.analyze.kernel - def benchmark_matmul(n): - """Standard benchmark following nsight-python docs.""" - a = torch.randn(n, n, device="cuda") - b = torch.randn(n, n, device="cuda") - with nsight.annotate("matmul"): - c = a @ b - return c - - +def example_ncu_python_profile(): + """ + Simple example demonstrating how to profile a basic matrix multiplication. + + This shows the minimal setup needed to use profile_with_nsight(). + """ + print("Creating test tensors...") + a = torch.randn(256, 256, device="cuda") + b = torch.randn(256, 256, device="cuda") + + # Create a closure that captures the tensors + def matmul_kernel(): + return a @ b + + print("Running nsight profiling...") + + metric_values = profile_with_nsight( + matmul_kernel, + metrics=[ + 'sm__cycles_active.avg', # Average active cycles per SM + 'sm__cycles_elapsed.sum', # Total cycles elapsed + 'smsp__inst_executed_pipe_tensor_op_hmma.sum', # Tensor core ops + ], + num_trials=1, + ) + + print("\nProfiling results:") + for metric_name, value in metric_values.items(): + print(f" {metric_name}: {value}") -########### -# -############ def test_flash_attention_profile(): """ - Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. + Test profiling a Flash Attention model from the KernelBench examples. + + This demonstrates the full workflow of profiling a KernelBench model + using profile_kernelbench_model_with_nsight(). 
""" - import os from kernelbench.utils import read_file - # Get the paths to the reference and custom model files + # Locate the example model files REPO_ROOT = os.path.dirname(__file__) ref_model_path = os.path.join( - REPO_ROOT, - "prompts/few_shot/model_ex_flash_attn.py" + REPO_ROOT, "prompts/few_shot/model_ex_flash_attn.py" ) custom_model_path = os.path.join( - REPO_ROOT, - "prompts/few_shot/model_new_ex_flash_attn.py" + REPO_ROOT, "prompts/few_shot/model_new_ex_flash_attn.py" ) - # Read the model source files print("[Test] Reading model source files...") ref_model_src = read_file(ref_model_path) custom_model_src = read_file(custom_model_path) - print("[Test] Starting profiling with nsight...") - # Profile the custom model metrics = profile_kernelbench_model_with_nsight( custom_model_src=custom_model_src, ref_model_src=ref_model_src, metrics=[ - 'gpu__time_duration.sum', - 'sm__cycles_elapsed.sum' + 'gpu__time_duration.sum', # Total GPU execution time (ns) + 'sm__cycles_elapsed.sum', # Total SM cycles ], seed=42, backend="cuda", precision=torch.float32, verbose=True ) - print(metrics) print("\n[Test] Profiling results:") print("=" * 60) for metric_name, value in metrics.items(): if value is not None: - print(f" {metric_name}: {value}") + print(f" {metric_name}: {value:,.0f}") else: print(f" {metric_name}: ") print("=" * 60) @@ -280,18 +388,47 @@ def test_flash_attention_profile(): return metrics +# Optional: Decorated benchmark function for direct use with nsight +if NSIGHT_AVAILABLE: + @nsight.analyze.kernel + def benchmark_matmul(n): + """ + Standard benchmark following nsight-python documentation style. + + This shows how to use the @nsight.analyze.kernel decorator directly + for simple benchmarking scenarios. + """ + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + with nsight.annotate("matmul"): + c = a @ b + return c + + +# ============================================================================= +# Main Entry Point +# ============================================================================= if __name__ == "__main__": + # Verify prerequisites if not check_ncu_available(): - print("ncu not found in PATH. 
Install Nsight Compute.") + print("ERROR: ncu not found in PATH.") + print("Install NVIDIA Nsight Compute from:") + print(" https://developer.nvidia.com/nsight-compute") exit(1) if not torch.cuda.is_available(): - print("CUDA is not available.") + print("ERROR: CUDA is not available.") exit(1) - - # test the example_ncu_python_profile, and flash attention profile + print("=" * 60) + print("Running Nsight Profiling Examples") + print("=" * 60) + + # Run the simple example first + print("\n--- Example: Basic Matrix Multiplication ---\n") example_ncu_python_profile() + + # Run the full KernelBench model test + print("\n--- Test: Flash Attention Model ---\n") test_flash_attention_profile() - \ No newline at end of file From cd6415612ef6f5be431f233e8270dc2631bb3f79 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Thu, 8 Jan 2026 01:56:32 +0000 Subject: [PATCH 17/18] add in tests to test_eval_timing --- .../unit_tests/test_eval_timing.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/kernelbench/unit_tests/test_eval_timing.py b/src/kernelbench/unit_tests/test_eval_timing.py index e524ee46..6ceb3adb 100644 --- a/src/kernelbench/unit_tests/test_eval_timing.py +++ b/src/kernelbench/unit_tests/test_eval_timing.py @@ -5,6 +5,7 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from kernelbench import timing +from kernelbench.profile import NSIGHT_AVAILABLE, check_ncu_available """ Test Timing @@ -78,19 +79,59 @@ def matmul_kernel(a, b): # test all currently available timing methods def run_all_timing_tests(device="cuda"): timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] - # timing_methods = ["cuda_event", "do_bench_impl"] for timing_method in timing_methods: _run_timing_smoke_test_matmul(timing_method, device=device) -# TODO: make the ncu profiling a spearte one -# because not veeryone can have hardware counter -# def run_ncu_against_cuda_event(): -# pass - # need ncu install first (can call the helper function in profile.py to check if ncu exists) +def run_nsight_timing_test(device="cuda"): + """ + Run nsight-python timing test if available. + + Nsight requires: + - nsight-python package installed + - ncu (Nsight Compute CLI) in PATH + + Compares nsight GPU time against cuda_event timing for the same matmul operation. + """ + if not NSIGHT_AVAILABLE: + print("[SKIP] nsight-python not installed") + return None + + if not check_ncu_available(): + print("[SKIP] ncu not found in PATH") + return None + + print("\n" + "=" * 60) + print("Running nsight-python timing benchmark") + print("=" * 60) + + # Run nsight timing + print("\n--- nsight_python_time ---") + _run_timing_smoke_test_matmul("nsight_python_time", device=device) + + # Run cuda_event for comparison + print("\n--- cuda_event (for comparison) ---") + _run_timing_smoke_test_matmul("cuda_event", device=device) + + print("\n" + "=" * 60) + print("nsight-python timing benchmark complete") + print("=" * 60) + + +def run_all_timing_tests_with_nsight(device="cuda"): + """ + Run all timing methods including nsight-python if available. 
+    """
+    # Standard timing methods (always available)
+    run_all_timing_tests(device=device)
+
+    # Nsight timing (requires ncu + nsight-python)
+    run_nsight_timing_test(device=device)
+
 
 test_device = torch.device("cuda:5")
 run_all_timing_tests(test_device)
+run_nsight_timing_test(test_device)

From 95468c30a67157ad944fc982371328480b520bc0 Mon Sep 17 00:00:00 2001
From: Simon Guo
Date: Thu, 8 Jan 2026 02:07:26 +0000
Subject: [PATCH 18/18] get ready for merge

---
 EVAL.md                                        | 3 +++
 README.md                                      | 4 +++-
 src/kernelbench/profile.py                     | 4 ++++
 src/kernelbench/timing.py                      | 2 ++
 src/kernelbench/unit_tests/test_eval_timing.py | 1 +
 5 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/EVAL.md b/EVAL.md
index b9880560..faebda6f 100644
--- a/EVAL.md
+++ b/EVAL.md
@@ -36,6 +36,9 @@ We have (and continue to) implement various approaches to conduct kernel timing
 
 Check out `timing.py` to see available timing methods and `src/unit_tests/test_eval_timing.py` to test out various timing methods (including leveraging `cuda_event` marker, Triton `do_bench`, `host_time` E2E time). @palic and team is working on a blogpost explaining the different tradeoffs soon.
 
+### Profiling
+We have experimental profiling support leveraging NVIDIA NCU in `profile.py`.
+
 ### Checkers
 There are potentially many ways model might reward hack and we would like to catch the known ways through checkers [experimental and WIP]. We start with `kernel_static_checker.py`, which is a regex-based checker on the genenrated code against set of rules. We plan to add AST-based, LM-as-a-judge, and more runtime checks in the future. We welcome suggestions and contributions here.
 
diff --git a/README.md b/README.md
index 3686574a..7343e73b 100644
--- a/README.md
+++ b/README.md
@@ -66,13 +66,15 @@ We organize the repo into the following structure:
 KernelBench/
 ├── assets/
 ├── KernelBench/ # Benchmark dataset files
-├── src/ # KernelBench logic code
+├── src/kernelbench/ # KernelBench logic code
 │   ├── unit_tests/
 │   ├── prompts/
 │   ├── ....
 ├── scripts/ # helpful scripts to run the benchmark
 ├── results/ # baseline times across hardware
 ├── runs/ # where your runs will be stored
+├── notebooks/ # example notebooks for analysis
+├── pyproject.toml # Project configuration and dependencies
 ```
 
 ## 🔧 Set up
diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py
index 2a3243c0..8326324e 100644
--- a/src/kernelbench/profile.py
+++ b/src/kernelbench/profile.py
@@ -5,6 +5,10 @@
 This module provides GPU profiling capabilities using NVIDIA Nsight Compute (ncu).
 It allows collecting hardware-level metrics from kernels.
 
+NOTE: this is an experimental module, not part of the default eval pass.
+You need hardware counter access (usually requires sudo) to collect these metrics.
+We only support local mode for this feature; it is not available on Modal.
+
 Key Features:
 - Profile arbitrary PyTorch functions with hardware metrics
 - Profile KernelBench models (ModelNew) with automatic setup/teardown
 
diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py
index b48698cb..83908391 100644
--- a/src/kernelbench/timing.py
+++ b/src/kernelbench/timing.py
@@ -452,6 +452,8 @@ def wrapped_kernel():
     if verbose:
         print(f"Average GPU time: {gpu_time_ms:.3f} ms (across {num_trials} trials)")
 
+    # NOTE: nsight only reports the average time across num_trials, so we return a single value in a list
+    # the kernel did run num_trials times, but only the average (1 item) is returned
     # Return list with single average value for API consistency
     return [gpu_time_ms]
 
diff --git a/src/kernelbench/unit_tests/test_eval_timing.py b/src/kernelbench/unit_tests/test_eval_timing.py
index 6ceb3adb..470c4254 100644
--- a/src/kernelbench/unit_tests/test_eval_timing.py
+++ b/src/kernelbench/unit_tests/test_eval_timing.py
@@ -129,6 +129,7 @@ def run_all_timing_tests_with_nsight(device="cuda"):
     run_nsight_timing_test(device=device)
 
 
+# select a free GPU here or set CUDA_VISIBLE_DEVICES
 test_device = torch.device("cuda:5")
 run_all_timing_tests(test_device)
 run_nsight_timing_test(test_device)
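
Below is a minimal, illustrative driver for the profiling entry points introduced in this series; it is not itself part of any patch above. It assumes the installed `kernelbench` package layout from `pyproject.toml`, that `nsight-python` and the `ncu` CLI are available, and that the process has hardware-counter access (usually via sudo). The metric names are examples taken from `profile.py`; everything else (the script name, tensor sizes) is hypothetical.

import torch

from kernelbench.profile import (
    NSIGHT_AVAILABLE,
    check_ncu_available,
    profile_with_nsight,
)


def main() -> None:
    # Requires nsight-python, the ncu CLI, and a CUDA device (plus counter access).
    if not (NSIGHT_AVAILABLE and check_ncu_available() and torch.cuda.is_available()):
        print("Skipping profiling: nsight-python, ncu, or CUDA is unavailable.")
        return

    a = torch.randn(512, 512, device="cuda")
    b = torch.randn(512, 512, device="cuda")

    # profile_with_nsight takes a zero-argument callable and returns a dict
    # mapping each requested metric to a float (or None if it was not collected).
    results = profile_with_nsight(
        lambda: a @ b,
        metrics=["gpu__time_duration.sum", "sm__cycles_elapsed.sum"],
        num_trials=1,
    )
    for name, value in results.items():
        print(f"{name}: {value}")


if __name__ == "__main__":
    main()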