From 2249bef6dcff279b459c4c858e67e540f2dfe6a1 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Tue, 16 Dec 2025 07:41:24 +0000 Subject: [PATCH 01/18] start metric profile branch --- requirements.txt | 1 + src/profile.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 src/profile.py diff --git a/requirements.txt b/requirements.txt index 253e57da..5f2ae387 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ pydra_config pytest ninja cupy-cuda12x +nsight-python # Numerics einops diff --git a/src/profile.py b/src/profile.py new file mode 100644 index 00000000..3459e106 --- /dev/null +++ b/src/profile.py @@ -0,0 +1,49 @@ +#### +# Profiling Related Functions +# TODO: @kesavan @simon @arya +#### + +import torch +import nsight + +# wrapper with tool to measure hardware metric + + +# nsight-python +# https://docs.nvidia.com/nsight-python/overview + +def check_ncu_available() -> bool: + """Check if ncu is in PATH. Returns True if found, False otherwise.""" + from shutil import which + return which('ncu') is not None + + +# Note you need access to hardware counter +# you also need to have ncu installed and point to the right path +# sudo env "PATH=$PATH" $(which python) src/profile.py +@nsight.analyze.kernel +def benchmark_matmul(n): + """ + The simplest possible benchmark. + We create two matrices and multiply them. + follow example from nsight-python docs + """ + # Create two NxN matrices on GPU + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + + # Mark the kernel we want to profile + with nsight.annotate("matmul"): + c = a @ b + + return c + +if __name__ == "__main__": + if not check_ncu_available(): + print("ncu not found in PATH. Please install ncu and point to path.") + exit(1) + # Run the benchmark + result = benchmark_matmul(1024) + +# pytorch profiler +# migrate from old repo during ICML / caesar repo \ No newline at end of file From 845dcbb835c045bdf80b827d2d614600ac04f249 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Tue, 16 Dec 2025 17:56:13 +0000 Subject: [PATCH 02/18] clean up remaining places for cuda events --- scripts/generate_baseline_time.py | 50 --------------- scripts/generate_baseline_time_modal.py | 68 ++++----------------- scripts/get_baseline_time_single_problem.py | 13 ++-- scripts/inspect_baseline.py | 3 +- scripts/inspect_triton.py | 9 ++- src/timing.py | 1 - 6 files changed, 27 insertions(+), 117 deletions(-) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index 0a1f608b..47975ebd 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -243,53 +243,3 @@ def test_measure_particular_program(level_num: int, problem_id: int): # get_time(2, 43, torch_compile=True) - - -################################################################################ -# Deprecated -################################################################################ - - -def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False): - raise DeprecationWarning("Use New measure_program_time instead") - ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( - level_num, problem_id, with_name=True - ) - ref_arch_name = ref_arch_name.split("/")[-1] - context = {} - Model, get_init_inputs, get_inputs = load_original_model_and_inputs( - ref_arch_src, context - ) - try: - with torch.no_grad(): - torch.cuda.synchronize(device=device) - set_seed(42) - inputs = get_inputs() - set_seed(42) - init_inputs = 
get_init_inputs() - inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in inputs - ] - init_inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in init_inputs - ] - model = Model(*init_inputs) - - if torch_compile: - model = torch.compile(model) - print("Compiled model Done") - model = model.cuda(device=device) - torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=False, device=device - ) - runtime_stats = get_timing_stats(elapsed_times, device=device) - # json_results[f"level{level_num}"][ref_arch_name] = runtime_stats - print(f"{ref_arch_name} {runtime_stats}") - return (ref_arch_name, runtime_stats) - except Exception as e: - print(f"[Eval] Error in Measuring Performance: {e}") - - diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py index a0039193..c85824d1 100644 --- a/scripts/generate_baseline_time_modal.py +++ b/scripts/generate_baseline_time_modal.py @@ -2,11 +2,10 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import get_timing_function, get_timing_stats from src.dataset import construct_problem_dataset_from_problem_dir from src.utils import read_file import os @@ -160,6 +159,7 @@ def measure_program_time( ref_arch_name: str, ref_arch_src: str, num_trials: int = 100, + timing_method: str="cuda_event", use_torch_compile: bool = False, torch_compile_backend: str="inductor", torch_compile_options: str="default", @@ -199,9 +199,16 @@ def measure_program_time( print(f"Using PyTorch Eager Execution on {ref_arch_name}") model = model.cuda(device=device) + timing_func = get_timing_function(timing_method) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=verbose, device=device + elapsed_times = timing_func( + model, + inputs, + num_warmup=3, # or any default you prefer + num_trials=num_trials, + discard_first=1, # or 0 to include first trial + verbose=verbose, + device=device, ) runtime_stats = get_timing_stats(elapsed_times, device=device) @@ -247,7 +254,8 @@ def record_baseline_times(config: BaselineConfig, ref_arch_name, ref_arch_src, config.num_trials, - use_torch_compile, + timing_method, + use_torch_compile, torch_compile_backend, torch_compile_options, torch.device(f"cuda:0"), @@ -341,53 +349,3 @@ def main(config: BaselineConfig): # get_time(2, 43, torch_compile=True) - - -################################################################################ -# Deprecated -################################################################################ - - -def get_time_old(level_num, problem_id, num_trials=100, torch_compile=False): - raise DeprecationWarning("Use New measure_program_time instead") - ref_arch_name, ref_arch_src = fetch_ref_arch_from_level_problem_id( - level_num, problem_id, with_name=True - ) - ref_arch_name = ref_arch_name.split("/")[-1] - context = {} - Model, get_init_inputs, get_inputs = load_original_model_and_inputs( - ref_arch_src, context - ) - try: - with torch.no_grad(): - torch.cuda.synchronize(device=device) - set_seed(42) - inputs = get_inputs() - set_seed(42) - init_inputs = get_init_inputs() - inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in inputs - ] - init_inputs = [ - x.cuda(device=device) if isinstance(x, 
torch.Tensor) else x - for x in init_inputs - ] - model = Model(*init_inputs) - - if torch_compile: - model = torch.compile(model) - print("Compiled model Done") - model = model.cuda(device=device) - torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=False, device=device - ) - runtime_stats = get_timing_stats(elapsed_times, device=device) - # json_results[f"level{level_num}"][ref_arch_name] = runtime_stats - print(f"{ref_arch_name} {runtime_stats}") - return (ref_arch_name, runtime_stats) - except Exception as e: - print(f"[Eval] Error in Measuring Performance: {e}") - - diff --git a/scripts/get_baseline_time_single_problem.py b/scripts/get_baseline_time_single_problem.py index 476fe2fd..e6f8155a 100644 --- a/scripts/get_baseline_time_single_problem.py +++ b/scripts/get_baseline_time_single_problem.py @@ -2,16 +2,17 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import get_timing_function, get_timing_stats + def measure_program_time( ref_arch_name: str, ref_arch_src: str, num_trials: int = 100, + timing_method: str="cuda_event", use_torch_compile: bool = False, torch_compile_backend: str="inductor", torch_compile_options: str="default", @@ -52,8 +53,9 @@ def measure_program_time( model = model.cuda(device=device) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=verbose, device=device + timing_func = get_timing_function(timing_method ) + elapsed_times = timing_func( + model, inputs, num_warmup=3, num_trials=num_trials, discard_first=1, verbose=verbose, device=device ) runtime_stats = get_timing_stats(elapsed_times, device=device) @@ -87,5 +89,4 @@ def get_inputs(): def get_init_inputs(): return [] # No special initialization inputs needed """ - print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False)) - print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=True)) \ No newline at end of file + print(measure_program_time(ref_arch_name, ref_arch_src, use_torch_compile=False, timing_method="cuda_event")) \ No newline at end of file diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py index e7811f64..a39716fd 100644 --- a/scripts/inspect_baseline.py +++ b/scripts/inspect_baseline.py @@ -5,11 +5,10 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import get_timing_function, get_timing_stats from src.dataset import construct_problem_dataset_from_problem_dir import os, sys import logging diff --git a/scripts/inspect_triton.py b/scripts/inspect_triton.py index 4f13c8af..f664b6d6 100644 --- a/scripts/inspect_triton.py +++ b/scripts/inspect_triton.py @@ -21,10 +21,9 @@ from src.eval import ( load_custom_model, load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, ) +from src.timing import get_timing_function, get_timing_stats def fetch_ref_arch_from_dataset(dataset: list[str], problem_id: int) -> tuple[str, str, str]: @@ -123,6 +122,8 @@ def run_profile_and_save_trace(dataset: list[str], problem_id: int, num_trials=1 def get_torch_compile_triton(level_num, problem_id): """ Get the triton code generated by torch compile for a particular problem + 
+ Made this during ICML rebuttal in March 2025 """ ref_arch_path, ref_arch_name, ref_arch_src = fetch_ref_arch_from_dataset( dataset, problem_id, with_name=True @@ -167,7 +168,9 @@ def get_torch_compile_triton(level_num, problem_id): torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( + timing_method = "cuda_event" # use cuda event for timing here + time_func_cuda_event = get_timing_function(timing_method) + elapsed_times = time_func_cuda_event( model, *inputs, num_trials=1, verbose=False, device=device ) runtime_stats = get_timing_stats(elapsed_times, device=device) diff --git a/src/timing.py b/src/timing.py index 8a36522b..7f192625 100644 --- a/src/timing.py +++ b/src/timing.py @@ -90,7 +90,6 @@ def get_timing_function( NOTE: we have a WIP blogpost on this topic covering the various timing approaches """ - def time_execution_with_cuda_event( kernel_fn: callable, args: list[Any], From b3387a42a301cea6d1d31dd2dd3265e170b15b5f Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 17 Dec 2025 21:19:51 +0000 Subject: [PATCH 03/18] add in nsight-python metric evaluation script and example usage, have to patch some nsight python functions to get this to work --- src/profile.py | 127 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 108 insertions(+), 19 deletions(-) diff --git a/src/profile.py b/src/profile.py index 3459e106..8a99f802 100644 --- a/src/profile.py +++ b/src/profile.py @@ -4,46 +4,135 @@ #### import torch -import nsight +import pandas as pd # wrapper with tool to measure hardware metric -# nsight-python -# https://docs.nvidia.com/nsight-python/overview +# Check if nsight-python is available +# To patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug +try: + import nsight + NSIGHT_AVAILABLE = True + + # Patch 1: Multiple metrics support + _orig_ncu_init = nsight.collection.ncu.NCUCollector.__init__ + nsight.collection.ncu.NCUCollector.__init__ = lambda self, metric="gpu__time_duration.sum", *a, **kw: \ + _orig_ncu_init(self, ",".join(metric) if isinstance(metric, (list, tuple)) else metric, *a, **kw) + + # Patch 2: Extract all metrics from comma-separated string + _orig_extract = nsight.extraction.extract_df_from_report + def _patched_extract(path, metric, *a, **kw): + if "," not in metric: + return _orig_extract(path, metric, *a, **kw) + rows = [] + for m in metric.split(","): + df = _orig_extract(path, m.strip(), *a, **kw) + if df is not None and not df.empty: + df = df.copy() + df['Metric Name'] = m.strip() + rows.extend(df.to_dict('records')) + return pd.DataFrame(rows) if rows else None + nsight.extraction.extract_df_from_report = _patched_extract + + # Patch 3: Fix "cannot insert Annotation" bug + _orig_agg = nsight.transformation.aggregate_data + def _patched_agg(df, func, norm, progress): + _orig_gb = pd.DataFrame.groupby + pd.DataFrame.groupby = lambda self, *a, **kw: _orig_gb(self, *a, **{**kw, 'as_index': False}) + try: + return _orig_agg(df, func, norm, progress) + finally: + pd.DataFrame.groupby = _orig_gb + nsight.transformation.aggregate_data = _patched_agg + +except ImportError: + NSIGHT_AVAILABLE = False + + +def profile_with_nsight(func, metrics=None, *args, **kwargs): + """Profile a PyTorch function. 
Returns {metric_name: value}.""" + if not NSIGHT_AVAILABLE: + raise RuntimeError("nsight-python not available") + + metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) + + @nsight.analyze.kernel(metric=metrics, runs=1, configs=[(0,)], + combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0)) + def profiled(_): + with nsight.annotate("kernel"): + return func(*args, **kwargs) + + try: + result = profiled() + if result is None: + return {m: None for m in metrics} + + df = result.to_dataframe() + if df is None or df.empty: + return {m: None for m in metrics} + + if 'Metric Name' in df.columns: + return {row['Metric Name']: float(row['AvgValue']) for _, row in df.iterrows()} + return {metrics[0]: float(df['AvgValue'].iloc[0])} + except Exception as e: + print(f"Error profiling: {e}") + return {m: None for m in metrics} + + +# example function to profile with nsight +def example_ncu_python_profile(): + # Test with simple kernel + def test_kernel(x, y): + """Simple matmul kernel.""" + return x @ y + + print("Creating test tensors...") + a = torch.randn(256, 256, device="cuda") + b = torch.randn(256, 256, device="cuda") + + print("Running nsight profiling...") + metric_values = profile_with_nsight( + test_kernel, + ['sm__cycles_active.avg', 'sm__cycles_elapsed.sum', "smsp__inst_executed_pipe_tensor_op_hmma.sum"], + a, b + ) + + print("\nProfiling results:") + for metric_name, value in metric_values.items(): + print(f" {metric_name}: {value}") + return + def check_ncu_available() -> bool: - """Check if ncu is in PATH. Returns True if found, False otherwise.""" from shutil import which return which('ncu') is not None -# Note you need access to hardware counter -# you also need to have ncu installed and point to the right path -# sudo env "PATH=$PATH" $(which python) src/profile.py @nsight.analyze.kernel def benchmark_matmul(n): - """ - The simplest possible benchmark. - We create two matrices and multiply them. - follow example from nsight-python docs - """ - # Create two NxN matrices on GPU + """Standard benchmark following nsight-python docs.""" a = torch.randn(n, n, device="cuda") b = torch.randn(n, n, device="cuda") - - # Mark the kernel we want to profile with nsight.annotate("matmul"): c = a @ b - return c + if __name__ == "__main__": if not check_ncu_available(): - print("ncu not found in PATH. Please install ncu and point to path.") + print("ncu not found in PATH. 
Install Nsight Compute.") + exit(1) + + if not torch.cuda.is_available(): + print("CUDA is not available.") exit(1) - # Run the benchmark - result = benchmark_matmul(1024) + + + # test the example_ncu_python_profile + example_ncu_python_profile() + + # pytorch profiler # migrate from old repo during ICML / caesar repo \ No newline at end of file From 9d36c26c638c86db61933ab32abc0c894b249d1a Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Tue, 6 Jan 2026 22:00:41 +0000 Subject: [PATCH 04/18] merge main From daa08cb360e8cbef19ebd0404dba2c15ecca2ee0 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 02:04:27 +0000 Subject: [PATCH 05/18] add in profiling with nsight python capability, as well as code path for timing witih nsight python --- pyproject.toml | 1 + requirements.txt | 3 +- scripts/inspect_baseline.py | 4 + src/kernelbench/timing.py | 199 ++++++++++++++++++++++++++++++++++++ src/profile.py | 180 ++++++++++++++++++++++++++++++-- 5 files changed, 377 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85a690c5..7c0bccb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ gpu = [ "nvidia-cutlass-dsl", "tilelang", "cupy-cuda12x", + "nsight-python", ] dev = [ "pytest", diff --git a/requirements.txt b/requirements.txt index 1e8db6e7..07603a86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,12 +19,11 @@ tilelang tqdm>=4.67.1 packaging pydra-config -pytest ninja>=1.13.0 cupy-cuda12x==13.6.0 -nsight-python tomli>=2.3.0 tabulate>=0.9.0 +nsight-python # Numerics einops>=0.8.1 diff --git a/scripts/inspect_baseline.py b/scripts/inspect_baseline.py index 7043a3f2..9f9f6b7c 100644 --- a/scripts/inspect_baseline.py +++ b/scripts/inspect_baseline.py @@ -5,10 +5,14 @@ import numpy as np from kernelbench.eval import ( load_original_model_and_inputs, + time_execution_with_cuda_event, + get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) from kernelbench.dataset import construct_problem_dataset_from_problem_dir +import os, sys +import logging import json device = torch.device("cuda:0") diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 7f860005..6b81e8c7 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -77,6 +77,8 @@ def get_timing_function( return time_execution_with_do_bench_impl case "host_time": return time_execution_with_host_time + case "nsight_python_time": + return time_execution_with_nsight_python # we might add other methods in the future case _: raise ValueError(f"Unsupported timing method: {method}") @@ -387,6 +389,75 @@ def time_execution_with_host_time( return elapsed_times +def time_execution_with_nsight_python( + kernel_fn: callable, + args: list[Any], + num_warmup: int = 3, + num_trials: int = 10, + discard_first: int = 1, # not used here + verbose: bool = True, + device: torch.device | None = None) -> list[float]: + """ + Time a CUDA kernel function using nsight-python. + + Note: nsight returns an average time across num_trials runs. + Returns a list with a single value (average time) for API consistency. + GPU time from nsight is in nanoseconds, converted to milliseconds. 
+ + Returns: + List containing one float: average elapsed time in milliseconds + """ + import sys + import os + # Add parent directory to path to import profile module + parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + from profile import profile_with_nsight + + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + with torch.cuda.device(device): + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + # Clear cache for cold start + torch.cuda.empty_cache() + clear_l2_cache(device=device) + + if verbose: + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") + + # Profile with nsight - returns average time in nanoseconds + def wrapped_kernel(): + return kernel_fn(*args) + + metric_values = profile_with_nsight( + wrapped_kernel, + metrics=["gpu__time_duration.sum"], + num_trials=num_trials + ) + + # Convert from nanoseconds to milliseconds + gpu_time_ns = metric_values.get("gpu__time_duration.sum") + if gpu_time_ns is None: + raise RuntimeError("Failed to get GPU time from nsight") + + # Convert nanoseconds to milliseconds + # nsight returns average across num_trials, so we return a single value in a list + gpu_time_ms = gpu_time_ns / 1_000_000.0 + + if verbose: + print(f"Average GPU time: {gpu_time_ms:.3f} ms (across {num_trials} trials)") + + # Return list with single average value for API consistency + return [gpu_time_ms] + ######################################################## # Timing stats # tools to help compute speedup and other time @@ -438,3 +509,131 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> stats["device"] = str(device) # for debugging return stats + + +# def test_nsight_timing_accuracy( +# num_trials: int = 10, +# num_warmup: int = 3, +# device: torch.device | None = None, +# verbose: bool = True +# ): +# """ +# Test and benchmark time_execution_with_nsight_python against other timing methods. + +# Compares nsight_python_time with cuda_event and host_time to verify accuracy. +# All times are reported in milliseconds for consistency. + +# Args: +# num_trials: Number of timing trials to run +# num_warmup: Number of warmup iterations +# device: CUDA device to use, defaults to current device +# verbose: Whether to print detailed comparison results + +# Returns: +# Dictionary with timing results and comparison statistics +# """ +# if not torch.cuda.is_available(): +# raise RuntimeError("CUDA is not available. 
This test requires a CUDA device.") + +# if device is None: +# device = torch.cuda.current_device() + +# # Create a simple test kernel (matrix multiplication) +# M, N, K = 1024, 1024, 1024 +# a = torch.randn(M, K, device=device, dtype=torch.float32) +# b = torch.randn(K, N, device=device, dtype=torch.float32) + +# def matmul_kernel(a, b): +# return torch.matmul(a, b) + +# args = [a, b] + +# if verbose: +# print("=" * 80) +# print("Testing nsight_python_time accuracy against other timing methods") +# print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") +# print(f"Device: {device} ({torch.cuda.get_device_name(device)})") +# print(f"Trials: {num_trials}, Warmup: {num_warmup}") +# print("=" * 80) + +# results = {} + +# # Test each timing method +# timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] + +# for method in timing_methods: +# if verbose: +# print(f"\n[Testing] {method}...") + +# try: +# timing_func = get_timing_function(method) +# elapsed_times = timing_func( +# matmul_kernel, +# args=args, +# num_warmup=num_warmup, +# num_trials=num_trials, +# discard_first=1, +# verbose=False, +# device=device +# ) + +# # Get statistics +# stats = get_timing_stats(elapsed_times, device=device) +# results[method] = { +# "times": elapsed_times, +# "stats": stats +# } + +# if verbose: +# print(f" Mean: {stats['mean']:.3f} ms") +# print(f" Std: {stats['std']:.3f} ms") +# print(f" Min: {stats['min']:.3f} ms") +# print(f" Max: {stats['max']:.3f} ms") + +# except Exception as e: +# if verbose: +# print(f" ERROR: {e}") +# results[method] = {"error": str(e)} + +# # Compare results +# if verbose: +# print("\n" + "=" * 80) +# print("Comparison Summary (all times in milliseconds):") +# print("=" * 80) + +# if "cuda_event" in results and "stats" in results["cuda_event"]: +# cuda_mean = results["cuda_event"]["stats"]["mean"] + +# if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: +# nsight_mean = results["nsight_python_time"]["stats"]["mean"] +# diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) * 100 +# print(f"\nnsight_python_time vs cuda_event:") +# print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") +# print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") +# print(f" Difference: {diff_pct:+.2f}%") + +# # Check if within reasonable range (within 20% of cuda_event) +# # Note: nsight measures pure GPU time, cuda_event includes some overhead +# # So nsight should be slightly lower or similar +# if abs(diff_pct) < 20: +# print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") +# else: +# print(f" ⚠ Accuracy check WARNING (difference > 20%)") +# print(f" Note: nsight measures pure GPU time, may differ from cuda_event") + +# if "host_time" in results and "stats" in results["host_time"]: +# host_mean = results["host_time"]["stats"]["mean"] +# print(f"\nhost_time vs cuda_event:") +# print(f" cuda_event: {cuda_mean:.3f} ms") +# print(f" host_time: {host_mean:.3f} ms") +# print(f" (host_time typically higher due to CPU overhead)") + +# return results + + +if __name__ == "__main__": + # Run the test if executed directly + if torch.cuda.is_available(): + test_nsight_timing_accuracy(num_trials=10, num_warmup=3, verbose=True) + else: + print("CUDA not available. 
Skipping timing accuracy test.") diff --git a/src/profile.py b/src/profile.py index 8a99f802..21ece54b 100644 --- a/src/profile.py +++ b/src/profile.py @@ -3,6 +3,7 @@ # TODO: @kesavan @simon @arya #### +import os import torch import pandas as pd @@ -50,14 +51,14 @@ def _patched_agg(df, func, norm, progress): NSIGHT_AVAILABLE = False -def profile_with_nsight(func, metrics=None, *args, **kwargs): +def profile_with_nsight(func, metrics=None, num_trials=1, *args, **kwargs): """Profile a PyTorch function. Returns {metric_name: value}.""" if not NSIGHT_AVAILABLE: raise RuntimeError("nsight-python not available") metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) - @nsight.analyze.kernel(metric=metrics, runs=1, configs=[(0,)], + @nsight.analyze.kernel(metric=metrics, runs=num_trials, configs=[(0,)], combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0)) def profiled(_): with nsight.annotate("kernel"): @@ -119,6 +120,172 @@ def benchmark_matmul(n): return c + + +# pytorch profiler +# migrate from old repo during ICML / caesar repo + + +def profile_kernelbench_model_with_nsight( + custom_model_src: str, + ref_model_src: str = None, + metrics: list = None, + num_trials: int = 1, + seed: int = 42, + device: torch.device = None, + backend: str = "cuda", + precision: torch.dtype = torch.float32, + build_dir: str = None, + verbose: bool = False, +) -> dict: + """ + Profile a kernelbench model using nsight-python. + + Assumes model has been validated and compiled successfully via eval. + No error checking is performed - this should only be called after eval proves correctness. + + Args: + custom_model_src: Source code string for the custom model (ModelNew class) + ref_model_src: Optional source code string for reference model to get get_inputs/get_init_inputs. + If None, will try to get these from custom_model_src. + metrics: List of nsight metrics to collect. Default: ['sm__cycles_active.avg'] + num_trials: Number of trials to run. Default: 1 + seed: Random seed for reproducible inputs + device: CUDA device to run on. Default: cuda:0 + backend: Backend type ('cuda', 'triton', 'tilelang', 'cute'). Default: 'cuda' + precision: torch.dtype for computation. Default: torch.float32 + build_dir: Build directory for compiled kernels. Default: None + verbose: Whether to print verbose output. 
Default: False + + Returns: + Dictionary mapping metric names to their values + """ + from kernelbench.eval import ( + load_custom_model, + load_custom_model_with_tempfile, + load_original_model_and_inputs, + _process_input_tensor, + set_seed, + graceful_eval_cleanup, + ) + from kernelbench.utils import set_gpu_arch + + device = device or torch.device("cuda:0") + metrics = metrics or ['sm__cycles_active.avg'] + metrics = [metrics] if isinstance(metrics, str) else metrics + + set_gpu_arch(["Ada"]) + torch.cuda.set_device(device) + + # Load input functions using existing eval function + input_source = ref_model_src or custom_model_src + context = {} + _, get_init_inputs, get_inputs = load_original_model_and_inputs(input_source, context) + + # Prepare inputs + set_seed(seed) + init_inputs = [_process_input_tensor(x, device, backend, precision) for x in get_init_inputs()] + + # Load and compile model + if verbose: + print("[Profile] Loading and compiling custom model...") + + os.environ["TORCH_USE_CUDA_DSA"] = "1" + tempfile = None + + if backend.lower() in ["triton", "tilelang", "cute"]: + ModelNew, tempfile = load_custom_model_with_tempfile(custom_model_src, entry_point="ModelNew") + else: + ModelNew = load_custom_model(custom_model_src, {}, build_dir) + + torch.cuda.synchronize(device=device) + + # Instantiate model + with torch.no_grad(): + set_seed(seed) + custom_model = ModelNew(*init_inputs) + custom_model = custom_model.to(device=device, dtype=precision) + torch.cuda.synchronize(device=device) + + if verbose: + print("[Profile] Model instantiated successfully") + + # Prepare profiling inputs + set_seed(seed) + inputs = [_process_input_tensor(x, device, backend, precision) for x in get_inputs()] + + # Profile + if verbose: + print(f"[Profile] Profiling with nsight (metrics: {metrics})...") + + def model_forward(): + with torch.no_grad(): + return custom_model(*inputs) + + metric_values = profile_with_nsight(model_forward, metrics=metrics, num_trials=num_trials) + + if verbose: + print("[Profile] Profiling completed successfully") + + # Cleanup using existing eval function + graceful_eval_cleanup(context, device, tempfile) + + return metric_values + +def test_flash_attention_profile(): + """ + Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. 
+ """ + import os + from kernelbench.utils import read_file + + # Get the paths to the reference and custom model files + # profile.py is in src/, so we need to go up one level to get repo root + repo_root = os.path.dirname(os.path.dirname(__file__)) + ref_model_path = os.path.join( + repo_root, + "src/kernelbench/prompts/few_shot/model_ex_flash_attn.py" + ) + custom_model_path = os.path.join( + repo_root, + "src/kernelbench/prompts/few_shot/model_new_ex_flash_attn.py" + ) + + # Read the model source files + print("[Test] Reading model source files...") + ref_model_src = read_file(ref_model_path) + custom_model_src = read_file(custom_model_path) + + + print("[Test] Starting profiling with nsight...") + + # Profile the custom model + metrics = profile_kernelbench_model_with_nsight( + custom_model_src=custom_model_src, + ref_model_src=ref_model_src, + metrics=[ + 'gpu__time_duration.sum' + ], + seed=42, + backend="cuda", + precision=torch.float32, + verbose=True + ) + print(metrics) + + print("\n[Test] Profiling results:") + print("=" * 60) + for metric_name, value in metrics.items(): + if value is not None: + print(f" {metric_name}: {value}") + else: + print(f" {metric_name}: ") + print("=" * 60) + + return metrics + + + if __name__ == "__main__": if not check_ncu_available(): print("ncu not found in PATH. Install Nsight Compute.") @@ -130,9 +297,6 @@ def benchmark_matmul(n): # test the example_ncu_python_profile - example_ncu_python_profile() - - - -# pytorch profiler -# migrate from old repo during ICML / caesar repo \ No newline at end of file + # example_ncu_python_profile() + test_flash_attention_profile() + \ No newline at end of file From 24cc9fc3785dc3e52d78576d156c05617492777c Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 02:06:31 +0000 Subject: [PATCH 06/18] add in profiling via nsight python From ce4469d57037cf81cc2032995cc0c889472eed27 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 06:28:24 +0000 Subject: [PATCH 07/18] move profile.py --- src/{ => kernelbench}/profile.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{ => kernelbench}/profile.py (100%) diff --git a/src/profile.py b/src/kernelbench/profile.py similarity index 100% rename from src/profile.py rename to src/kernelbench/profile.py From decf8c684814642756f98203cfdb070eea0e8700 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 06:51:08 +0000 Subject: [PATCH 08/18] fixing profiling and timing --- src/kernelbench/profile.py | 41 +++++--- src/kernelbench/timing.py | 208 ++++++++++++++++++------------------- 2 files changed, 127 insertions(+), 122 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 21ece54b..160ac69b 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -51,18 +51,22 @@ def _patched_agg(df, func, norm, progress): NSIGHT_AVAILABLE = False -def profile_with_nsight(func, metrics=None, num_trials=1, *args, **kwargs): +def profile_with_nsight(func, metrics=None, num_trials=1): """Profile a PyTorch function. 
Returns {metric_name: value}.""" if not NSIGHT_AVAILABLE: raise RuntimeError("nsight-python not available") metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) - @nsight.analyze.kernel(metric=metrics, runs=num_trials, configs=[(0,)], - combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0)) + @nsight.analyze.kernel( + metric=metrics, + runs=num_trials, + configs=[(0,)], + combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), + ) def profiled(_): with nsight.annotate("kernel"): - return func(*args, **kwargs) + return func() try: result = profiled() @@ -93,10 +97,15 @@ def test_kernel(x, y): b = torch.randn(256, 256, device="cuda") print("Running nsight profiling...") + + # Wrap kernel + def test_kernel_forward(): + return test_kernel(a, b) + metric_values = profile_with_nsight( - test_kernel, + test_kernel_forward, ['sm__cycles_active.avg', 'sm__cycles_elapsed.sum', "smsp__inst_executed_pipe_tensor_op_hmma.sum"], - a, b + num_trials=1, ) print("\nProfiling results:") @@ -160,7 +169,7 @@ def profile_kernelbench_model_with_nsight( Returns: Dictionary mapping metric names to their values """ - from kernelbench.eval import ( + from eval import ( load_custom_model, load_custom_model_with_tempfile, load_original_model_and_inputs, @@ -168,13 +177,14 @@ def profile_kernelbench_model_with_nsight( set_seed, graceful_eval_cleanup, ) - from kernelbench.utils import set_gpu_arch + from utils import set_gpu_arch device = device or torch.device("cuda:0") metrics = metrics or ['sm__cycles_active.avg'] metrics = [metrics] if isinstance(metrics, str) else metrics - set_gpu_arch(["Ada"]) + # set_gpu_arch(["Ada"]) # NOTE: can set GPU arch here if needed + torch.cuda.set_device(device) # Load input functions using existing eval function @@ -237,18 +247,17 @@ def test_flash_attention_profile(): Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. 
""" import os - from kernelbench.utils import read_file + from utils import read_file # Get the paths to the reference and custom model files - # profile.py is in src/, so we need to go up one level to get repo root - repo_root = os.path.dirname(os.path.dirname(__file__)) + REPO_ROOT = os.path.dirname(__file__) ref_model_path = os.path.join( - repo_root, - "src/kernelbench/prompts/few_shot/model_ex_flash_attn.py" + REPO_ROOT, + "prompts/few_shot/model_ex_flash_attn.py" ) custom_model_path = os.path.join( - repo_root, - "src/kernelbench/prompts/few_shot/model_new_ex_flash_attn.py" + REPO_ROOT, + "prompts/few_shot/model_new_ex_flash_attn.py" ) # Read the model source files diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 6b81e8c7..91fe8fac 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -407,13 +407,8 @@ def time_execution_with_nsight_python( Returns: List containing one float: average elapsed time in milliseconds """ - import sys - import os - # Add parent directory to path to import profile module - parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) - from profile import profile_with_nsight + + from kernelbench.profile import profile_with_nsight if device is None: if verbose: @@ -434,12 +429,13 @@ def time_execution_with_nsight_python( print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") # Profile with nsight - returns average time in nanoseconds + # Wrap kernel function to avoid argument passing confusion def wrapped_kernel(): return kernel_fn(*args) metric_values = profile_with_nsight( - wrapped_kernel, - metrics=["gpu__time_duration.sum"], + wrapped_kernel, + metrics=["gpu__time_duration.sum"], num_trials=num_trials ) @@ -511,124 +507,124 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> return stats -# def test_nsight_timing_accuracy( -# num_trials: int = 10, -# num_warmup: int = 3, -# device: torch.device | None = None, -# verbose: bool = True -# ): -# """ -# Test and benchmark time_execution_with_nsight_python against other timing methods. +def test_nsight_timing_accuracy( + num_trials: int = 10, + num_warmup: int = 3, + device: torch.device | None = None, + verbose: bool = True +): + """ + Test and benchmark time_execution_with_nsight_python against other timing methods. -# Compares nsight_python_time with cuda_event and host_time to verify accuracy. -# All times are reported in milliseconds for consistency. + Compares nsight_python_time with cuda_event and host_time to verify accuracy. + All times are reported in milliseconds for consistency. -# Args: -# num_trials: Number of timing trials to run -# num_warmup: Number of warmup iterations -# device: CUDA device to use, defaults to current device -# verbose: Whether to print detailed comparison results + Args: + num_trials: Number of timing trials to run + num_warmup: Number of warmup iterations + device: CUDA device to use, defaults to current device + verbose: Whether to print detailed comparison results -# Returns: -# Dictionary with timing results and comparison statistics -# """ -# if not torch.cuda.is_available(): -# raise RuntimeError("CUDA is not available. This test requires a CUDA device.") + Returns: + Dictionary with timing results and comparison statistics + """ + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available. 
This test requires a CUDA device.") -# if device is None: -# device = torch.cuda.current_device() + if device is None: + device = torch.cuda.current_device() -# # Create a simple test kernel (matrix multiplication) -# M, N, K = 1024, 1024, 1024 -# a = torch.randn(M, K, device=device, dtype=torch.float32) -# b = torch.randn(K, N, device=device, dtype=torch.float32) + # Create a simple test kernel (matrix multiplication) + M, N, K = 1024, 1024, 1024 + a = torch.randn(M, K, device=device, dtype=torch.float32) + b = torch.randn(K, N, device=device, dtype=torch.float32) -# def matmul_kernel(a, b): -# return torch.matmul(a, b) + def matmul_kernel(a, b): + return torch.matmul(a, b) -# args = [a, b] + args = [a, b] -# if verbose: -# print("=" * 80) -# print("Testing nsight_python_time accuracy against other timing methods") -# print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") -# print(f"Device: {device} ({torch.cuda.get_device_name(device)})") -# print(f"Trials: {num_trials}, Warmup: {num_warmup}") -# print("=" * 80) + if verbose: + print("=" * 80) + print("Testing nsight_python_time accuracy against other timing methods") + print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") + print(f"Device: {device} ({torch.cuda.get_device_name(device)})") + print(f"Trials: {num_trials}, Warmup: {num_warmup}") + print("=" * 80) -# results = {} + results = {} -# # Test each timing method -# timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] + # Test each timing method + timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] -# for method in timing_methods: -# if verbose: -# print(f"\n[Testing] {method}...") + for method in timing_methods: + if verbose: + print(f"\n[Testing] {method}...") -# try: -# timing_func = get_timing_function(method) -# elapsed_times = timing_func( -# matmul_kernel, -# args=args, -# num_warmup=num_warmup, -# num_trials=num_trials, -# discard_first=1, -# verbose=False, -# device=device -# ) + try: + timing_func = get_timing_function(method) + elapsed_times = timing_func( + matmul_kernel, + args=args, + num_warmup=num_warmup, + num_trials=num_trials, + discard_first=1, + verbose=False, + device=device + ) -# # Get statistics -# stats = get_timing_stats(elapsed_times, device=device) -# results[method] = { -# "times": elapsed_times, -# "stats": stats -# } + # Get statistics + stats = get_timing_stats(elapsed_times, device=device) + results[method] = { + "times": elapsed_times, + "stats": stats + } -# if verbose: -# print(f" Mean: {stats['mean']:.3f} ms") -# print(f" Std: {stats['std']:.3f} ms") -# print(f" Min: {stats['min']:.3f} ms") -# print(f" Max: {stats['max']:.3f} ms") + if verbose: + print(f" Mean: {stats['mean']:.3f} ms") + print(f" Std: {stats['std']:.3f} ms") + print(f" Min: {stats['min']:.3f} ms") + print(f" Max: {stats['max']:.3f} ms") -# except Exception as e: -# if verbose: -# print(f" ERROR: {e}") -# results[method] = {"error": str(e)} + except Exception as e: + if verbose: + print(f" ERROR: {e}") + results[method] = {"error": str(e)} -# # Compare results -# if verbose: -# print("\n" + "=" * 80) -# print("Comparison Summary (all times in milliseconds):") -# print("=" * 80) + # Compare results + if verbose: + print("\n" + "=" * 80) + print("Comparison Summary (all times in milliseconds):") + print("=" * 80) -# if "cuda_event" in results and "stats" in results["cuda_event"]: -# cuda_mean = results["cuda_event"]["stats"]["mean"] + if "cuda_event" in results and "stats" in results["cuda_event"]: + cuda_mean = 
results["cuda_event"]["stats"]["mean"] -# if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: -# nsight_mean = results["nsight_python_time"]["stats"]["mean"] -# diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) * 100 -# print(f"\nnsight_python_time vs cuda_event:") -# print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") -# print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") -# print(f" Difference: {diff_pct:+.2f}%") + if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: + nsight_mean = results["nsight_python_time"]["stats"]["mean"] + diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) * 100 + print(f"\nnsight_python_time vs cuda_event:") + print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") + print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") + print(f" Difference: {diff_pct:+.2f}%") -# # Check if within reasonable range (within 20% of cuda_event) -# # Note: nsight measures pure GPU time, cuda_event includes some overhead -# # So nsight should be slightly lower or similar -# if abs(diff_pct) < 20: -# print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") -# else: -# print(f" ⚠ Accuracy check WARNING (difference > 20%)") -# print(f" Note: nsight measures pure GPU time, may differ from cuda_event") + # Check if within reasonable range (within 20% of cuda_event) + # Note: nsight measures pure GPU time, cuda_event includes some overhead + # So nsight should be slightly lower or similar + if abs(diff_pct) < 20: + print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") + else: + print(f" ⚠ Accuracy check WARNING (difference > 20%)") + print(f" Note: nsight measures pure GPU time, may differ from cuda_event") -# if "host_time" in results and "stats" in results["host_time"]: -# host_mean = results["host_time"]["stats"]["mean"] -# print(f"\nhost_time vs cuda_event:") -# print(f" cuda_event: {cuda_mean:.3f} ms") -# print(f" host_time: {host_mean:.3f} ms") -# print(f" (host_time typically higher due to CPU overhead)") + if "host_time" in results and "stats" in results["host_time"]: + host_mean = results["host_time"]["stats"]["mean"] + print(f"\nhost_time vs cuda_event:") + print(f" cuda_event: {cuda_mean:.3f} ms") + print(f" host_time: {host_mean:.3f} ms") + print(f" (host_time typically higher due to CPU overhead)") -# return results + return results if __name__ == "__main__": From 6a16b50757ed3ad7021067ee5b523b13d07e7cb1 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 06:53:32 +0000 Subject: [PATCH 09/18] merge in main From 3b4b342783dc8ccdc5dd039c4f0d2f104e891154 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 07:09:56 +0000 Subject: [PATCH 10/18] clean up comments --- src/kernelbench/profile.py | 18 ++++++++++-------- src/kernelbench/timing.py | 3 ++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 160ac69b..40d3da02 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -1,6 +1,6 @@ #### -# Profiling Related Functions -# TODO: @kesavan @simon @arya +# Nsight Profiling Related Functions +# Currently showcases how to profile a kernelbench model with nsight-python #### import os @@ -11,7 +11,7 @@ # Check if nsight-python is available -# To patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug +# Monkey-patch nsight-python to support 
multiple metrics and fix the "cannot insert Annotation" bug try: import nsight NSIGHT_AVAILABLE = True @@ -62,7 +62,7 @@ def profile_with_nsight(func, metrics=None, num_trials=1): metric=metrics, runs=num_trials, configs=[(0,)], - combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), + combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), # NOTE: some torch ops launch multiple kernels, so we need to combine metrics ) def profiled(_): with nsight.annotate("kernel"): @@ -169,7 +169,7 @@ def profile_kernelbench_model_with_nsight( Returns: Dictionary mapping metric names to their values """ - from eval import ( + from kernelbench.eval import ( load_custom_model, load_custom_model_with_tempfile, load_original_model_and_inputs, @@ -177,7 +177,7 @@ def profile_kernelbench_model_with_nsight( set_seed, graceful_eval_cleanup, ) - from utils import set_gpu_arch + from kernelbench.utils import set_gpu_arch device = device or torch.device("cuda:0") metrics = metrics or ['sm__cycles_active.avg'] @@ -228,10 +228,12 @@ def profile_kernelbench_model_with_nsight( if verbose: print(f"[Profile] Profiling with nsight (metrics: {metrics})...") + # Wrap model forward pass def model_forward(): with torch.no_grad(): return custom_model(*inputs) + # Profile with nsight metric_values = profile_with_nsight(model_forward, metrics=metrics, num_trials=num_trials) if verbose: @@ -247,7 +249,7 @@ def test_flash_attention_profile(): Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. """ import os - from utils import read_file + from kernelbench.utils import read_file # Get the paths to the reference and custom model files REPO_ROOT = os.path.dirname(__file__) @@ -306,6 +308,6 @@ def test_flash_attention_profile(): # test the example_ncu_python_profile - # example_ncu_python_profile() + example_ncu_python_profile() test_flash_attention_profile() \ No newline at end of file diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 7a2b1cd2..5cc8da3a 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -429,10 +429,11 @@ def time_execution_with_nsight_python( print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") # Profile with nsight - returns average time in nanoseconds - # Wrap kernel function to avoid argument passing confusion + # Wrap kernel function def wrapped_kernel(): return kernel_fn(*args) + # Profile with nsight, use gpu_time_duration.sum metric for GPU time metric_values = profile_with_nsight( wrapped_kernel, metrics=["gpu__time_duration.sum"], From 14658ab3c6e95c6678fcb628cfee8592ce9616df Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 07:11:10 +0000 Subject: [PATCH 11/18] reference nsight python --- src/kernelbench/profile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 40d3da02..397bc146 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -12,6 +12,7 @@ # Check if nsight-python is available # Monkey-patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug +# Reference: https://docs.nvidia.com/nsight-python try: import nsight NSIGHT_AVAILABLE = True From e6729abd6236b6f242399877e2fbaeccdb0fb030 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Wed, 7 Jan 2026 07:16:12 +0000 Subject: [PATCH 12/18] clean up --- src/kernelbench/profile.py | 2 +- src/kernelbench/timing.py | 127 
------------------------------------- 2 files changed, 1 insertion(+), 128 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 397bc146..3f0913d9 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -308,7 +308,7 @@ def test_flash_attention_profile(): exit(1) - # test the example_ncu_python_profile + # test the example_ncu_python_profile, and flash attention profile example_ncu_python_profile() test_flash_attention_profile() \ No newline at end of file diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py index 5cc8da3a..b48698cb 100644 --- a/src/kernelbench/timing.py +++ b/src/kernelbench/timing.py @@ -507,130 +507,3 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> return stats - -def test_nsight_timing_accuracy( - num_trials: int = 10, - num_warmup: int = 3, - device: torch.device | None = None, - verbose: bool = True -): - """ - Test and benchmark time_execution_with_nsight_python against other timing methods. - - Compares nsight_python_time with cuda_event and host_time to verify accuracy. - All times are reported in milliseconds for consistency. - - Args: - num_trials: Number of timing trials to run - num_warmup: Number of warmup iterations - device: CUDA device to use, defaults to current device - verbose: Whether to print detailed comparison results - - Returns: - Dictionary with timing results and comparison statistics - """ - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. This test requires a CUDA device.") - - if device is None: - device = torch.cuda.current_device() - - # Create a simple test kernel (matrix multiplication) - M, N, K = 1024, 1024, 1024 - a = torch.randn(M, K, device=device, dtype=torch.float32) - b = torch.randn(K, N, device=device, dtype=torch.float32) - - def matmul_kernel(a, b): - return torch.matmul(a, b) - - args = [a, b] - - if verbose: - print("=" * 80) - print("Testing nsight_python_time accuracy against other timing methods") - print(f"Kernel: {M}x{K} @ {K}x{N} matrix multiplication") - print(f"Device: {device} ({torch.cuda.get_device_name(device)})") - print(f"Trials: {num_trials}, Warmup: {num_warmup}") - print("=" * 80) - - results = {} - - # Test each timing method - timing_methods = ["cuda_event", "do_bench", "host_time", "nsight_python_time"] - - for method in timing_methods: - if verbose: - print(f"\n[Testing] {method}...") - - try: - timing_func = get_timing_function(method) - elapsed_times = timing_func( - matmul_kernel, - args=args, - num_warmup=num_warmup, - num_trials=num_trials, - discard_first=1, - verbose=False, - device=device - ) - - # Get statistics - stats = get_timing_stats(elapsed_times, device=device) - results[method] = { - "times": elapsed_times, - "stats": stats - } - - if verbose: - print(f" Mean: {stats['mean']:.3f} ms") - print(f" Std: {stats['std']:.3f} ms") - print(f" Min: {stats['min']:.3f} ms") - print(f" Max: {stats['max']:.3f} ms") - - except Exception as e: - if verbose: - print(f" ERROR: {e}") - results[method] = {"error": str(e)} - - # Compare results - if verbose: - print("\n" + "=" * 80) - print("Comparison Summary (all times in milliseconds):") - print("=" * 80) - - if "cuda_event" in results and "stats" in results["cuda_event"]: - cuda_mean = results["cuda_event"]["stats"]["mean"] - - if "nsight_python_time" in results and "stats" in results["nsight_python_time"]: - nsight_mean = results["nsight_python_time"]["stats"]["mean"] - diff_pct = ((nsight_mean - cuda_mean) / cuda_mean) 
* 100 - print(f"\nnsight_python_time vs cuda_event:") - print(f" cuda_event: {cuda_mean:.3f} ms (mean of {num_trials} trials)") - print(f" nsight_python_time: {nsight_mean:.3f} ms (average across {num_trials} trials)") - print(f" Difference: {diff_pct:+.2f}%") - - # Check if within reasonable range (within 20% of cuda_event) - # Note: nsight measures pure GPU time, cuda_event includes some overhead - # So nsight should be slightly lower or similar - if abs(diff_pct) < 20: - print(f" ✓ Accuracy check PASSED (within 20% of cuda_event)") - else: - print(f" ⚠ Accuracy check WARNING (difference > 20%)") - print(f" Note: nsight measures pure GPU time, may differ from cuda_event") - - if "host_time" in results and "stats" in results["host_time"]: - host_mean = results["host_time"]["stats"]["mean"] - print(f"\nhost_time vs cuda_event:") - print(f" cuda_event: {cuda_mean:.3f} ms") - print(f" host_time: {host_mean:.3f} ms") - print(f" (host_time typically higher due to CPU overhead)") - - return results - - -if __name__ == "__main__": - # Run the test if executed directly - if torch.cuda.is_available(): - test_nsight_timing_accuracy(num_trials=10, num_warmup=3, verbose=True) - else: - print("CUDA not available. Skipping timing accuracy test.") From 7b018455cff64512b96148e51376c5744bc39f34 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Wed, 7 Jan 2026 08:13:50 +0000 Subject: [PATCH 13/18] put wheel pkd data fix in this PR --- pyproject.toml | 5 +- src/kernelbench/prompt_constructor_toml.py | 34 ++++++-------- src/kernelbench/utils.py | 54 ++++++++++++++++++++++ 3 files changed, 71 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7c0bccb8..bed37150 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,4 +52,7 @@ dev = [ [tool.setuptools.packages.find] where = ["src"] -include = ["kernelbench*"] \ No newline at end of file +include = ["kernelbench*"] + +[tool.setuptools.package-data] +kernelbench = ["prompts/**/*"] \ No newline at end of file diff --git a/src/kernelbench/prompt_constructor_toml.py b/src/kernelbench/prompt_constructor_toml.py index 7b8b1bdf..4349a74d 100644 --- a/src/kernelbench/prompt_constructor_toml.py +++ b/src/kernelbench/prompt_constructor_toml.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional -from kernelbench.utils import read_file +from kernelbench.utils import read_file, get_package_resource_path, resolve_path, REPO_TOP_PATH """ TOML-based prompt constructor for managing prompt templates and configurations. @@ -14,11 +14,9 @@ You can easily check some of the prompt templates we have provided and create your own. 
""" -REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) -PROMPTS_TOML = os.path.join(REPO_TOP_PATH, "src/kernelbench/prompts/prompts.toml") - -assert os.path.exists(PROMPTS_TOML), f"Prompts.toml not found at {PROMPTS_TOML}" -GPU_SPECS_PY = "src/kernelbench/prompts/hardware/gpu_specs.py" +# Resolve paths using the helper from utils +PROMPTS_TOML = get_package_resource_path("prompts/prompts.toml") +GPU_SPECS_PY = get_package_resource_path("prompts/hardware/gpu_specs.py") HARDWARE_COMPONENT_KEYS = [ "hardware_header", "hardware_specs", @@ -26,12 +24,6 @@ "hardware_best_practices", ] -def _abs_path(rel: str) -> str: - """Convert relative path to absolute path from repo root.""" - if os.path.isabs(rel): - return rel - return os.path.join(REPO_TOP_PATH, rel) - @dataclass class PromptConfig: """ @@ -255,17 +247,17 @@ def render_example_entry(input_code: str, output_code: str, example_label: str) # Use multiple examples (true few-shot) examples_intro = intro_few_shot for i, (input_path, output_path) in enumerate(few_shot_examples, 1): - input_code = read_file(_abs_path(input_path)) - output_code = read_file(_abs_path(output_path)) + input_code = read_file(resolve_path(input_path)) + output_code = read_file(resolve_path(output_path)) examples_entries.append( render_example_entry(input_code, output_code, f"Example {i}:") ) else: # Fall back to one-shot - ex_arch_path = _abs_path( + ex_arch_path = resolve_path( backend_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") ) - ex_new_path = _abs_path(backend_data["one_shot_new_arch"]) + ex_new_path = resolve_path(backend_data["one_shot_new_arch"]) input_code = read_file(ex_arch_path) output_code = read_file(ex_new_path) examples_entries.append( @@ -274,10 +266,10 @@ def render_example_entry(input_code: str, output_code: str, example_label: str) elif requires_example == "one_shot": # Always use one-shot - ex_arch_path = _abs_path( + ex_arch_path = resolve_path( backend_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") ) - ex_new_path = _abs_path(backend_data["one_shot_new_arch"]) + ex_new_path = resolve_path(backend_data["one_shot_new_arch"]) input_code = read_file(ex_arch_path) output_code = read_file(ex_new_path) examples_entries.append( @@ -296,7 +288,7 @@ def render_example_entry(input_code: str, output_code: str, example_label: str) raise ValueError( f"Hardware info requested for option '{option}'; provide gpu_specs_py and gpu_name" ) - context = {**context, **_gpu_context_from_gpu_specs(_abs_path(gpu_specs_py), gpu_name)} + context = {**context, **_gpu_context_from_gpu_specs(resolve_path(gpu_specs_py), gpu_name)} # Builds the prompt from the components in the toml file. prompt_parts = [] @@ -416,10 +408,10 @@ def test_prompt(): generation. Customize the reference architecture or custom_prompt_key if you want to try different inputs. 
""" - REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) ref_arch_src = read_file(os.path.join(REPO_TOP_PATH, "KernelBench", "level1", "1_Square_matrix_multiplication_.py")) assert len(ref_arch_src) > 0, "ref_arch_src is empty" - + + print("Testing prompt construction...") scratch_dir = os.path.join(REPO_TOP_PATH, "scratch") # baseline prompt baseline_prompt = get_prompt_for_backend( diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py index abe067b1..cf8b0ad8 100644 --- a/src/kernelbench/utils.py +++ b/src/kernelbench/utils.py @@ -15,6 +15,7 @@ import os import json from tqdm import tqdm +from importlib.resources import files, as_file # API clients from openai import OpenAI @@ -273,6 +274,59 @@ def read_file(file_path) -> str: return "" +######################################################## +# Path Resolution Helpers +######################################################## +REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + +def get_package_resource_path(relative_path: str) -> str: + """ + Get absolute path to a kernelbench package resource. + Works for all three usage modes: + - Running from repo directly + - As a git submodule + - As an installed pip/uv dependency + + Args: + relative_path: Path relative to kernelbench/, e.g. "prompts/prompts.toml" + """ + # Try importlib.resources first (installed package) + try: + resource = files("kernelbench").joinpath(relative_path) + with as_file(resource) as path: + if path.exists(): + return str(path) + except (TypeError, FileNotFoundError): + pass + + # Try repo path (running from source / submodule) + repo_path = os.path.join(REPO_TOP_PATH, "src/kernelbench", relative_path) + if os.path.exists(repo_path): + return repo_path + + raise FileNotFoundError(f"Could not find resource: {relative_path}") + + +def resolve_path(rel: str) -> str: + """ + Resolve a relative path to absolute. Handles paths like "src/kernelbench/prompts/..." + from prompts.toml which reference files relative to repo root. + """ + if os.path.isabs(rel): + return rel + + # Convert "src/kernelbench/..." 
paths to package-relative + if rel.startswith("src/kernelbench/"): + return get_package_resource_path(rel[len("src/kernelbench/"):]) + + # Otherwise treat as repo-relative + repo_path = os.path.join(REPO_TOP_PATH, rel) + if os.path.exists(repo_path): + return repo_path + + raise FileNotFoundError(f"Could not resolve path: {rel}") + + def print_messages(messages): for message in messages: print(message["role"]) From 977ecb6f434d862afa88c81744fcf6fa6f267bc4 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Thu, 8 Jan 2026 00:57:19 +0000 Subject: [PATCH 14/18] update to new nsight version --- src/kernelbench/profile.py | 52 ++++++++------------------------------ 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 3f0913d9..e490c369 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -6,48 +6,17 @@ import os import torch import pandas as pd +import numpy as np # wrapper with tool to measure hardware metric # Check if nsight-python is available -# Monkey-patch nsight-python to support multiple metrics and fix the "cannot insert Annotation" bug # Reference: https://docs.nvidia.com/nsight-python try: import nsight NSIGHT_AVAILABLE = True - # Patch 1: Multiple metrics support - _orig_ncu_init = nsight.collection.ncu.NCUCollector.__init__ - nsight.collection.ncu.NCUCollector.__init__ = lambda self, metric="gpu__time_duration.sum", *a, **kw: \ - _orig_ncu_init(self, ",".join(metric) if isinstance(metric, (list, tuple)) else metric, *a, **kw) - - # Patch 2: Extract all metrics from comma-separated string - _orig_extract = nsight.extraction.extract_df_from_report - def _patched_extract(path, metric, *a, **kw): - if "," not in metric: - return _orig_extract(path, metric, *a, **kw) - rows = [] - for m in metric.split(","): - df = _orig_extract(path, m.strip(), *a, **kw) - if df is not None and not df.empty: - df = df.copy() - df['Metric Name'] = m.strip() - rows.extend(df.to_dict('records')) - return pd.DataFrame(rows) if rows else None - nsight.extraction.extract_df_from_report = _patched_extract - - # Patch 3: Fix "cannot insert Annotation" bug - _orig_agg = nsight.transformation.aggregate_data - def _patched_agg(df, func, norm, progress): - _orig_gb = pd.DataFrame.groupby - pd.DataFrame.groupby = lambda self, *a, **kw: _orig_gb(self, *a, **{**kw, 'as_index': False}) - try: - return _orig_agg(df, func, norm, progress) - finally: - pd.DataFrame.groupby = _orig_gb - nsight.transformation.aggregate_data = _patched_agg - except ImportError: NSIGHT_AVAILABLE = False @@ -60,7 +29,7 @@ def profile_with_nsight(func, metrics=None, num_trials=1): metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) @nsight.analyze.kernel( - metric=metrics, + metrics=metrics, runs=num_trials, configs=[(0,)], combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), # NOTE: some torch ops launch multiple kernels, so we need to combine metrics @@ -120,14 +89,15 @@ def check_ncu_available() -> bool: return which('ncu') is not None -@nsight.analyze.kernel -def benchmark_matmul(n): - """Standard benchmark following nsight-python docs.""" - a = torch.randn(n, n, device="cuda") - b = torch.randn(n, n, device="cuda") - with nsight.annotate("matmul"): - c = a @ b - return c +if NSIGHT_AVAILABLE: + @nsight.analyze.kernel + def benchmark_matmul(n): + """Standard benchmark following nsight-python docs.""" + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + with 
nsight.annotate("matmul"): + c = a @ b + return c From 8d9461d51ec2eafba57e0cdba244307febbd294a Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Thu, 8 Jan 2026 01:34:24 +0000 Subject: [PATCH 15/18] for kesavan to further fix --- src/kernelbench/profile.py | 45 ++++++++++++------- .../unit_tests/test_eval_timing.py | 7 ++- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index e490c369..659c4593 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -44,14 +44,20 @@ def profiled(_): return {m: None for m in metrics} df = result.to_dataframe() - if df is None or df.empty: + if df is None or (hasattr(df, 'empty') and df.empty): return {m: None for m in metrics} - if 'Metric Name' in df.columns: + if isinstance(df, pd.DataFrame) and 'Metric Name' in df.columns: return {row['Metric Name']: float(row['AvgValue']) for _, row in df.iterrows()} - return {metrics[0]: float(df['AvgValue'].iloc[0])} + elif isinstance(df, pd.DataFrame): + return {metrics[0]: float(df['AvgValue'].iloc[0])} + else: + # Handle case where df is not a DataFrame + return {m: None for m in metrics} except Exception as e: print(f"Error profiling: {e}") + import traceback + traceback.print_exc() return {m: None for m in metrics} @@ -89,18 +95,6 @@ def check_ncu_available() -> bool: return which('ncu') is not None -if NSIGHT_AVAILABLE: - @nsight.analyze.kernel - def benchmark_matmul(n): - """Standard benchmark following nsight-python docs.""" - a = torch.randn(n, n, device="cuda") - b = torch.randn(n, n, device="cuda") - with nsight.annotate("matmul"): - c = a @ b - return c - - - # pytorch profiler # migrate from old repo during ICML / caesar repo @@ -215,6 +209,24 @@ def model_forward(): return metric_values + + +if NSIGHT_AVAILABLE: + @nsight.analyze.kernel + def benchmark_matmul(n): + """Standard benchmark following nsight-python docs.""" + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + with nsight.annotate("matmul"): + c = a @ b + return c + + + + +########### +# +############ def test_flash_attention_profile(): """ Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. @@ -246,7 +258,8 @@ def test_flash_attention_profile(): custom_model_src=custom_model_src, ref_model_src=ref_model_src, metrics=[ - 'gpu__time_duration.sum' + 'gpu__time_duration.sum', + 'sm__cycles_elapsed.sum' ], seed=42, backend="cuda", diff --git a/src/kernelbench/unit_tests/test_eval_timing.py b/src/kernelbench/unit_tests/test_eval_timing.py index 8be8824b..e524ee46 100644 --- a/src/kernelbench/unit_tests/test_eval_timing.py +++ b/src/kernelbench/unit_tests/test_eval_timing.py @@ -10,7 +10,7 @@ Test Timing We want to systematically study different timing methodologies. 
""" -REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) # use exampls in the few shot directory EXAMPLES_PATH = os.path.join(REPO_PATH, "src", "kernelbench", "prompts", "few_shot") @@ -83,6 +83,11 @@ def run_all_timing_tests(device="cuda"): _run_timing_smoke_test_matmul(timing_method, device=device) +# TODO: make the ncu profiling a spearte one +# because not veeryone can have hardware counter +# def run_ncu_against_cuda_event(): +# pass + # need ncu install first (can call the helper function in profile.py to check if ncu exists) test_device = torch.device("cuda:5") run_all_timing_tests(test_device) From a9a293290f6022de7b70468220d7a3e87bfec487 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Thu, 8 Jan 2026 01:51:32 +0000 Subject: [PATCH 16/18] clean up profile and make verbose --- src/kernelbench/profile.py | 407 +++++++++++++++++++++++++------------ 1 file changed, 272 insertions(+), 135 deletions(-) diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py index 659c4593..2a3243c0 100644 --- a/src/kernelbench/profile.py +++ b/src/kernelbench/profile.py @@ -1,105 +1,152 @@ -#### -# Nsight Profiling Related Functions -# Currently showcases how to profile a kernelbench model with nsight-python -#### +""" +Nsight Profiling Module for KernelBench +======================================== + +This module provides GPU profiling capabilities using NVIDIA Nsight Compute (ncu). +It allows collecting hardware-level metrics from kernels. + +Key Features: +- Profile arbitrary PyTorch functions with hardware metrics +- Profile KernelBench models (ModelNew) with automatic setup/teardown +- Combine metrics from multi-kernel operations (common in PyTorch) + +Requirements: +- NVIDIA Nsight Compute CLI (ncu) must be installed and in PATH +- nsight-python package + +Common Metrics: +- gpu__time_duration.sum: Total GPU time in nanoseconds +- sm__cycles_elapsed.sum: Total SM cycles elapsed +- sm__cycles_active.avg: Average active cycles per SM + +Reference: https://docs.nvidia.com/nsight-python +""" import os -import torch -import pandas as pd -import numpy as np +from shutil import which -# wrapper with tool to measure hardware metric +import torch +# ============================================================================= +# Nsight Availability Check +# ============================================================================= -# Check if nsight-python is available -# Reference: https://docs.nvidia.com/nsight-python try: import nsight NSIGHT_AVAILABLE = True - except ImportError: NSIGHT_AVAILABLE = False +# ============================================================================= +# Utility Functions +# ============================================================================= + +def check_ncu_available() -> bool: + """ + Check if NVIDIA Nsight Compute CLI (ncu) is available in PATH. + + The ncu command-line tool is required for collecting GPU hardware metrics. + It's typically installed with the CUDA Toolkit or NVIDIA Nsight Compute. + + Returns: + True if ncu is found in PATH, False otherwise. + """ + return which('ncu') is not None + + +# ============================================================================= +# Core Profiling Functions +# ============================================================================= + def profile_with_nsight(func, metrics=None, num_trials=1): - """Profile a PyTorch function. 
Returns {metric_name: value}.""" - if not NSIGHT_AVAILABLE: - raise RuntimeError("nsight-python not available") + """ + Profile a PyTorch function and collect hardware metrics. - metrics = [metrics] if isinstance(metrics, str) else (metrics or ['sm__cycles_active.avg']) + Handles complexity: + - Setting up the Nsight kernel analyzer + - Combining metrics when PyTorch ops launch multiple CUDA kernels + - Extracting results from Nsight's DataFrame format + Args: + func: A callable (no arguments) that executes the code to profile. + Typically a closure that captures the model and inputs. + metrics: List of Nsight metric names to collect. If None, defaults to + ['sm__cycles_active.avg']. Can also pass a single string. + num_trials: Number of times to run the function for averaging. + + Returns: + Dictionary mapping metric names to their values (float). + Returns None for metrics that couldn't be collected. + + Example: + >>> def my_kernel(): + ... return torch.matmul(a, b) + >>> results = profile_with_nsight(my_kernel, ['gpu__time_duration.sum']) + >>> print(results['gpu__time_duration.sum']) # Time in nanoseconds + + Raises: + RuntimeError: If nsight-python is not installed. + """ + if not NSIGHT_AVAILABLE: + raise RuntimeError( + "nsight-python not available." + ) + + # Normalize metrics to a list + if metrics is None: + metrics = ['sm__cycles_active.avg'] + elif isinstance(metrics, str): + metrics = [metrics] + + # Define the profiled function with Nsight decorator + # NOTE: PyTorch operations often launch multiple CUDA kernels (e.g., a matmul + # might have separate kernels for the computation and memory operations). + # We use combine_kernel_metrics to sum these together for a single measurement. @nsight.analyze.kernel( metrics=metrics, runs=num_trials, - configs=[(0,)], - combine_kernel_metrics=lambda a, b: (a or 0) + (b or 0), # NOTE: some torch ops launch multiple kernels, so we need to combine metrics + configs=[(0,)], # Use default GPU config + combine_kernel_metrics=lambda a, b: (0 if a is None else a) + (0 if b is None else b), ) def profiled(_): + # The nsight.annotate context marks the region we care about with nsight.annotate("kernel"): return func() try: + # Run profiling - this invokes ncu under the hood result = profiled() - if result is None: - return {m: None for m in metrics} - df = result.to_dataframe() - if df is None or (hasattr(df, 'empty') and df.empty): + # Convert results to DataFrame + df = result.to_dataframe() if result else None + if df is None or df.empty: return {m: None for m in metrics} - if isinstance(df, pd.DataFrame) and 'Metric Name' in df.columns: - return {row['Metric Name']: float(row['AvgValue']) for _, row in df.iterrows()} - elif isinstance(df, pd.DataFrame): - return {metrics[0]: float(df['AvgValue'].iloc[0])} - else: - # Handle case where df is not a DataFrame + # Nsight returns a DataFrame with columns like: + # - 'Metric': The metric name (e.g., 'gpu__time_duration.sum') + # - 'AvgValue': The measured value + # We need to find these columns (names may vary slightly) + metric_col = next((c for c in df.columns if c.lower() == 'metric'), None) + value_col = next((c for c in df.columns if 'value' in c.lower()), None) + + if not metric_col or not value_col: return {m: None for m in metrics} + + # Build a dictionary of all metrics in the DataFrame + metric_dict = { + row[metric_col]: float(row[value_col]) + for _, row in df.iterrows() + } + + # Return only the requested metrics (None if not found) + return {m: metric_dict.get(m) for m in metrics} + 
except Exception as e: print(f"Error profiling: {e}") - import traceback - traceback.print_exc() return {m: None for m in metrics} -# example function to profile with nsight -def example_ncu_python_profile(): - # Test with simple kernel - def test_kernel(x, y): - """Simple matmul kernel.""" - return x @ y - - print("Creating test tensors...") - a = torch.randn(256, 256, device="cuda") - b = torch.randn(256, 256, device="cuda") - - print("Running nsight profiling...") - - # Wrap kernel - def test_kernel_forward(): - return test_kernel(a, b) - - metric_values = profile_with_nsight( - test_kernel_forward, - ['sm__cycles_active.avg', 'sm__cycles_elapsed.sum', "smsp__inst_executed_pipe_tensor_op_hmma.sum"], - num_trials=1, - ) - - print("\nProfiling results:") - for metric_name, value in metric_values.items(): - print(f" {metric_name}: {value}") - return - - -def check_ncu_available() -> bool: - from shutil import which - return which('ncu') is not None - - - -# pytorch profiler -# migrate from old repo during ICML / caesar repo - - def profile_kernelbench_model_with_nsight( custom_model_src: str, ref_model_src: str = None, @@ -113,27 +160,46 @@ def profile_kernelbench_model_with_nsight( verbose: bool = False, ) -> dict: """ - Profile a kernelbench model using nsight-python. + Profile a KernelBench model (ModelNew) using Nsight hardware metrics. + + This is the high-level profiling function designed for KernelBench workflows. + It handles the full lifecycle: + 1. Load and compile the custom model from source code + 2. Generate inputs using the model's get_inputs() function + 3. Profile the forward pass with Nsight + 4. Clean up resources - Assumes model has been validated and compiled successfully via eval. - No error checking is performed - this should only be called after eval proves correctness. + IMPORTANT: This function assumes the model has already been validated for + correctness via eval. No correctness checking is performed here. Args: - custom_model_src: Source code string for the custom model (ModelNew class) - ref_model_src: Optional source code string for reference model to get get_inputs/get_init_inputs. - If None, will try to get these from custom_model_src. - metrics: List of nsight metrics to collect. Default: ['sm__cycles_active.avg'] - num_trials: Number of trials to run. Default: 1 - seed: Random seed for reproducible inputs - device: CUDA device to run on. Default: cuda:0 - backend: Backend type ('cuda', 'triton', 'tilelang', 'cute'). Default: 'cuda' - precision: torch.dtype for computation. Default: torch.float32 - build_dir: Build directory for compiled kernels. Default: None - verbose: Whether to print verbose output. Default: False + custom_model_src: Python source code string containing the ModelNew class. + ref_model_src: Optional source code for the reference model. Used to get + get_inputs() and get_init_inputs() if they're not in + custom_model_src. If None, uses custom_model_src. + metrics: List of Nsight metrics to collect. Defaults to ['sm__cycles_active.avg']. + num_trials: Number of profiling runs for averaging. Default: 1. + seed: Random seed for reproducible input generation. Default: 42. + device: CUDA device to run on. Default: cuda:0. + backend: Compilation backend ('cuda', 'triton', 'tilelang', 'cute'). + precision: torch.dtype for computation. Default: torch.float32. + build_dir: Directory for compiled kernel artifacts. Default: None. + verbose: Print progress messages. Default: False. 
Returns: - Dictionary mapping metric names to their values + Dictionary mapping metric names to their measured values. + Values are None if the metric couldn't be collected. + + Example: + >>> results = profile_kernelbench_model_with_nsight( + ... custom_model_src=my_model_code, + ... ref_model_src=ref_model_code, + ... metrics=['gpu__time_duration.sum', 'sm__cycles_elapsed.sum'], + ... verbose=True + ... ) + >>> print(f"GPU time: {results['gpu__time_duration.sum']} ns") """ + # Import eval utilities (deferred to avoid circular imports) from kernelbench.eval import ( load_custom_model, load_custom_model_with_tempfile, @@ -142,40 +208,57 @@ def profile_kernelbench_model_with_nsight( set_seed, graceful_eval_cleanup, ) - from kernelbench.utils import set_gpu_arch + # Set defaults device = device or torch.device("cuda:0") - metrics = metrics or ['sm__cycles_active.avg'] - metrics = [metrics] if isinstance(metrics, str) else metrics - - # set_gpu_arch(["Ada"]) # NOTE: can set GPU arch here if needed + if metrics is None: + metrics = ['sm__cycles_active.avg'] + elif isinstance(metrics, str): + metrics = [metrics] torch.cuda.set_device(device) - # Load input functions using existing eval function + # ------------------------------------------------------------------------- + # Step 1: Load input generation functions from model source + # ------------------------------------------------------------------------- + # The model source should define get_inputs() and get_init_inputs() functions + # that return the tensors needed to run the model. input_source = ref_model_src or custom_model_src context = {} _, get_init_inputs, get_inputs = load_original_model_and_inputs(input_source, context) - # Prepare inputs + # Generate initialization inputs (for model constructor) set_seed(seed) - init_inputs = [_process_input_tensor(x, device, backend, precision) for x in get_init_inputs()] - - # Load and compile model + init_inputs = [ + _process_input_tensor(x, device, backend, precision) + for x in get_init_inputs() + ] + + # ------------------------------------------------------------------------- + # Step 2: Load and compile the custom model + # ------------------------------------------------------------------------- if verbose: print("[Profile] Loading and compiling custom model...") + # Enable CUDA Device-Side Assertions for better error messages os.environ["TORCH_USE_CUDA_DSA"] = "1" tempfile = None + # Different backends require different loading mechanisms if backend.lower() in ["triton", "tilelang", "cute"]: - ModelNew, tempfile = load_custom_model_with_tempfile(custom_model_src, entry_point="ModelNew") + # These backends need a temp file for proper module loading + ModelNew, tempfile = load_custom_model_with_tempfile( + custom_model_src, entry_point="ModelNew" + ) else: + # Standard CUDA backend ModelNew = load_custom_model(custom_model_src, {}, build_dir) torch.cuda.synchronize(device=device) - # Instantiate model + # ------------------------------------------------------------------------- + # Step 3: Instantiate the model + # ------------------------------------------------------------------------- with torch.no_grad(): set_seed(seed) custom_model = ModelNew(*init_inputs) @@ -185,94 +268,119 @@ def profile_kernelbench_model_with_nsight( if verbose: print("[Profile] Model instantiated successfully") - # Prepare profiling inputs + # ------------------------------------------------------------------------- + # Step 4: Profile the forward pass + # 
------------------------------------------------------------------------- + # Generate forward pass inputs set_seed(seed) - inputs = [_process_input_tensor(x, device, backend, precision) for x in get_inputs()] + inputs = [ + _process_input_tensor(x, device, backend, precision) + for x in get_inputs() + ] - # Profile if verbose: print(f"[Profile] Profiling with nsight (metrics: {metrics})...") - # Wrap model forward pass + # Create a closure for the forward pass def model_forward(): with torch.no_grad(): return custom_model(*inputs) - # Profile with nsight - metric_values = profile_with_nsight(model_forward, metrics=metrics, num_trials=num_trials) + # Run profiling + metric_values = profile_with_nsight( + model_forward, + metrics=metrics, + num_trials=num_trials + ) if verbose: print("[Profile] Profiling completed successfully") - # Cleanup using existing eval function + # ------------------------------------------------------------------------- + # Step 5: Cleanup + # ------------------------------------------------------------------------- graceful_eval_cleanup(context, device, tempfile) return metric_values +# ============================================================================= +# Examples and Tests +# ============================================================================= -if NSIGHT_AVAILABLE: - @nsight.analyze.kernel - def benchmark_matmul(n): - """Standard benchmark following nsight-python docs.""" - a = torch.randn(n, n, device="cuda") - b = torch.randn(n, n, device="cuda") - with nsight.annotate("matmul"): - c = a @ b - return c - - +def example_ncu_python_profile(): + """ + Simple example demonstrating how to profile a basic matrix multiplication. + + This shows the minimal setup needed to use profile_with_nsight(). + """ + print("Creating test tensors...") + a = torch.randn(256, 256, device="cuda") + b = torch.randn(256, 256, device="cuda") + + # Create a closure that captures the tensors + def matmul_kernel(): + return a @ b + + print("Running nsight profiling...") + + metric_values = profile_with_nsight( + matmul_kernel, + metrics=[ + 'sm__cycles_active.avg', # Average active cycles per SM + 'sm__cycles_elapsed.sum', # Total cycles elapsed + 'smsp__inst_executed_pipe_tensor_op_hmma.sum', # Tensor core ops + ], + num_trials=1, + ) + + print("\nProfiling results:") + for metric_name, value in metric_values.items(): + print(f" {metric_name}: {value}") -########### -# -############ def test_flash_attention_profile(): """ - Test the profile_kernelbench_model_with_nsight function using the tiled_matmul example. + Test profiling a Flash Attention model from the KernelBench examples. + + This demonstrates the full workflow of profiling a KernelBench model + using profile_kernelbench_model_with_nsight(). 
""" - import os from kernelbench.utils import read_file - # Get the paths to the reference and custom model files + # Locate the example model files REPO_ROOT = os.path.dirname(__file__) ref_model_path = os.path.join( - REPO_ROOT, - "prompts/few_shot/model_ex_flash_attn.py" + REPO_ROOT, "prompts/few_shot/model_ex_flash_attn.py" ) custom_model_path = os.path.join( - REPO_ROOT, - "prompts/few_shot/model_new_ex_flash_attn.py" + REPO_ROOT, "prompts/few_shot/model_new_ex_flash_attn.py" ) - # Read the model source files print("[Test] Reading model source files...") ref_model_src = read_file(ref_model_path) custom_model_src = read_file(custom_model_path) - print("[Test] Starting profiling with nsight...") - # Profile the custom model metrics = profile_kernelbench_model_with_nsight( custom_model_src=custom_model_src, ref_model_src=ref_model_src, metrics=[ - 'gpu__time_duration.sum', - 'sm__cycles_elapsed.sum' + 'gpu__time_duration.sum', # Total GPU execution time (ns) + 'sm__cycles_elapsed.sum', # Total SM cycles ], seed=42, backend="cuda", precision=torch.float32, verbose=True ) - print(metrics) print("\n[Test] Profiling results:") print("=" * 60) for metric_name, value in metrics.items(): if value is not None: - print(f" {metric_name}: {value}") + print(f" {metric_name}: {value:,.0f}") else: print(f" {metric_name}: ") print("=" * 60) @@ -280,18 +388,47 @@ def test_flash_attention_profile(): return metrics +# Optional: Decorated benchmark function for direct use with nsight +if NSIGHT_AVAILABLE: + @nsight.analyze.kernel + def benchmark_matmul(n): + """ + Standard benchmark following nsight-python documentation style. + + This shows how to use the @nsight.analyze.kernel decorator directly + for simple benchmarking scenarios. + """ + a = torch.randn(n, n, device="cuda") + b = torch.randn(n, n, device="cuda") + with nsight.annotate("matmul"): + c = a @ b + return c + + +# ============================================================================= +# Main Entry Point +# ============================================================================= if __name__ == "__main__": + # Verify prerequisites if not check_ncu_available(): - print("ncu not found in PATH. 
Install Nsight Compute.") + print("ERROR: ncu not found in PATH.") + print("Install NVIDIA Nsight Compute from:") + print(" https://developer.nvidia.com/nsight-compute") exit(1) if not torch.cuda.is_available(): - print("CUDA is not available.") + print("ERROR: CUDA is not available.") exit(1) - - # test the example_ncu_python_profile, and flash attention profile + print("=" * 60) + print("Running Nsight Profiling Examples") + print("=" * 60) + + # Run the simple example first + print("\n--- Example: Basic Matrix Multiplication ---\n") example_ncu_python_profile() + + # Run the full KernelBench model test + print("\n--- Test: Flash Attention Model ---\n") test_flash_attention_profile() - \ No newline at end of file From cd6415612ef6f5be431f233e8270dc2631bb3f79 Mon Sep 17 00:00:00 2001 From: Kesavan Ramakrishnan Date: Thu, 8 Jan 2026 01:56:32 +0000 Subject: [PATCH 17/18] add in tests to test_eval_timing --- .../unit_tests/test_eval_timing.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/kernelbench/unit_tests/test_eval_timing.py b/src/kernelbench/unit_tests/test_eval_timing.py index e524ee46..6ceb3adb 100644 --- a/src/kernelbench/unit_tests/test_eval_timing.py +++ b/src/kernelbench/unit_tests/test_eval_timing.py @@ -5,6 +5,7 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from kernelbench import timing +from kernelbench.profile import NSIGHT_AVAILABLE, check_ncu_available """ Test Timing @@ -78,19 +79,59 @@ def matmul_kernel(a, b): # test all currently available timing methods def run_all_timing_tests(device="cuda"): timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] - # timing_methods = ["cuda_event", "do_bench_impl"] for timing_method in timing_methods: _run_timing_smoke_test_matmul(timing_method, device=device) -# TODO: make the ncu profiling a spearte one -# because not veeryone can have hardware counter -# def run_ncu_against_cuda_event(): -# pass - # need ncu install first (can call the helper function in profile.py to check if ncu exists) +def run_nsight_timing_test(device="cuda"): + """ + Run nsight-python timing test if available. + + Nsight requires: + - nsight-python package installed + - ncu (Nsight Compute CLI) in PATH + + Compares nsight GPU time against cuda_event timing for the same matmul operation. + """ + if not NSIGHT_AVAILABLE: + print("[SKIP] nsight-python not installed") + return None + + if not check_ncu_available(): + print("[SKIP] ncu not found in PATH") + return None + + print("\n" + "=" * 60) + print("Running nsight-python timing benchmark") + print("=" * 60) + + # Run nsight timing + print("\n--- nsight_python_time ---") + _run_timing_smoke_test_matmul("nsight_python_time", device=device) + + # Run cuda_event for comparison + print("\n--- cuda_event (for comparison) ---") + _run_timing_smoke_test_matmul("cuda_event", device=device) + + print("\n" + "=" * 60) + print("nsight-python timing benchmark complete") + print("=" * 60) + + +def run_all_timing_tests_with_nsight(device="cuda"): + """ + Run all timing methods including nsight-python if available. 
+    """
+    # Standard timing methods (always available)
+    run_all_timing_tests(device=device)
+
+    # Nsight timing (requires ncu + nsight-python)
+    run_nsight_timing_test(device=device)
+
 
 test_device = torch.device("cuda:5")
 run_all_timing_tests(test_device)
+run_nsight_timing_test(test_device)

From 95468c30a67157ad944fc982371328480b520bc0 Mon Sep 17 00:00:00 2001
From: Simon Guo
Date: Thu, 8 Jan 2026 02:07:26 +0000
Subject: [PATCH 18/18] get ready for merge

---
 EVAL.md                                        | 3 +++
 README.md                                      | 4 +++-
 src/kernelbench/profile.py                     | 4 ++++
 src/kernelbench/timing.py                      | 2 ++
 src/kernelbench/unit_tests/test_eval_timing.py | 1 +
 5 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/EVAL.md b/EVAL.md
index b9880560..faebda6f 100644
--- a/EVAL.md
+++ b/EVAL.md
@@ -36,6 +36,9 @@ We have (and continue to) implement various approaches to conduct kernel timing
 
 Check out `timing.py` to see available timing methods and `src/unit_tests/test_eval_timing.py` to test out various timing methods (including leveraging `cuda_event` marker, Triton `do_bench`, `host_time` E2E time). @palic and team is working on a blogpost explaining the different tradeoffs soon.
 
+### Profiling
+We have experimental profiling support leveraging NVIDIA NCU in `profile.py`.
+
 ### Checkers
 There are potentially many ways model might reward hack and we would like to catch the known ways through checkers [experimental and WIP]. We start with `kernel_static_checker.py`, which is a regex-based checker on the genenrated code against set of rules. We plan to add AST-based, LM-as-a-judge, and more runtime checks in the future. We welcome suggestions and contributions here.
 
diff --git a/README.md b/README.md
index 3686574a..7343e73b 100644
--- a/README.md
+++ b/README.md
@@ -66,13 +66,15 @@ We organize the repo into the following structure:
 KernelBench/
 ├── assets/
 ├── KernelBench/ # Benchmark dataset files
-├── src/ # KernelBench logic code
+├── src/kernelbench/ # KernelBench logic code
 │   ├── unit_tests/
 │   ├── prompts/
 │   ├── ....
 ├── scripts/ # helpful scripts to run the benchmark
 ├── results/ # baseline times across hardware
 ├── runs/ # where your runs will be stored
+├── notebooks/ # example notebooks for analysis
+├── pyproject.toml # Project configuration and dependencies
 ```
 
 ## 🔧 Set up
diff --git a/src/kernelbench/profile.py b/src/kernelbench/profile.py
index 2a3243c0..8326324e 100644
--- a/src/kernelbench/profile.py
+++ b/src/kernelbench/profile.py
@@ -5,6 +5,10 @@
 This module provides GPU profiling capabilities using NVIDIA Nsight Compute (ncu).
 It allows collecting hardware-level metrics from kernels.
 
+NOTE: this is an experimental module, not part of the default eval pass.
+You need hardware counter access (usually requires sudo) to collect these metrics.
+We only support local mode for this feature; it is not available on Modal.
+
 Key Features:
 - Profile arbitrary PyTorch functions with hardware metrics
 - Profile KernelBench models (ModelNew) with automatic setup/teardown
 
diff --git a/src/kernelbench/timing.py b/src/kernelbench/timing.py
index b48698cb..83908391 100644
--- a/src/kernelbench/timing.py
+++ b/src/kernelbench/timing.py
@@ -452,6 +452,8 @@ def wrapped_kernel():
     if verbose:
         print(f"Average GPU time: {gpu_time_ms:.3f} ms (across {num_trials} trials)")
 
+    # NOTE: nsight only reports the average time across num_trials, so we return a single value in a list
+    # the kernel did run num_trials times, but only the average (1 item) is returned
     # Return list with single average value for API consistency
     return [gpu_time_ms]
 
diff --git a/src/kernelbench/unit_tests/test_eval_timing.py b/src/kernelbench/unit_tests/test_eval_timing.py
index 6ceb3adb..470c4254 100644
--- a/src/kernelbench/unit_tests/test_eval_timing.py
+++ b/src/kernelbench/unit_tests/test_eval_timing.py
@@ -129,6 +129,7 @@ def run_all_timing_tests_with_nsight(device="cuda"):
     run_nsight_timing_test(device=device)
 
 
+# select a free GPU here or set CUDA_VISIBLE_DEVICES
 test_device = torch.device("cuda:5")
 run_all_timing_tests(test_device)
 run_nsight_timing_test(test_device)
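
Below is a minimal, illustrative driver for the profiling entry points introduced in this series; it is not itself part of any patch above. It assumes the installed `kernelbench` package layout from `pyproject.toml`, that `nsight-python` and the `ncu` CLI are available, and that the process has hardware-counter access (usually via sudo). The metric names are examples taken from `profile.py`; everything else (the script name, tensor sizes) is hypothetical.

import torch

from kernelbench.profile import (
    NSIGHT_AVAILABLE,
    check_ncu_available,
    profile_with_nsight,
)


def main() -> None:
    # Requires nsight-python, the ncu CLI, and a CUDA device (plus counter access).
    if not (NSIGHT_AVAILABLE and check_ncu_available() and torch.cuda.is_available()):
        print("Skipping profiling: nsight-python, ncu, or CUDA is unavailable.")
        return

    a = torch.randn(512, 512, device="cuda")
    b = torch.randn(512, 512, device="cuda")

    # profile_with_nsight takes a zero-argument callable and returns a dict
    # mapping each requested metric to a float (or None if it was not collected).
    results = profile_with_nsight(
        lambda: a @ b,
        metrics=["gpu__time_duration.sum", "sm__cycles_elapsed.sum"],
        num_trials=1,
    )
    for name, value in results.items():
        print(f"{name}: {value}")


if __name__ == "__main__":
    main()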