3 changes: 3 additions & 0 deletions EVAL.md
@@ -36,6 +36,9 @@ We have implemented (and continue to implement) various approaches to conduct kernel timing

Check out `timing.py` to see available timing methods and `src/unit_tests/test_eval_timing.py` to test out the various timing methods (including leveraging the `cuda_event` marker, Triton's `do_bench`, and `host_time` E2E time). @palic and team are working on a blog post explaining the different tradeoffs soon.
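
For intuition, here is a generic sketch of the CUDA-event timing pattern (illustrative only, not the exact code in `timing.py`; the helper name is ours):

```python
import torch

def time_with_cuda_events(fn, warmup=10, iters=100):
    """Time a GPU callable with CUDA events (device time, excluding host overhead)."""
    for _ in range(warmup):
        fn()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()  # wait for the recorded work to finish
    return start.elapsed_time(end) / iters  # average milliseconds per call

# Triton's built-in helper handles warmup and cache management for you:
# from triton.testing import do_bench
# ms = do_bench(fn)
```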

### Checkers
There are potentially many ways a model might reward hack, and we would like to catch the known ways through checkers [experimental and WIP]. We start with `kernel_static_checker.py`, a regex-based checker that validates the generated code against a set of rules. We plan to add AST-based checks, LM-as-a-judge, and more runtime checks in the future. We welcome suggestions and contributions here.
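
For intuition, here is a minimal sketch of what a regex-based rule check might look like (the rules and names below are illustrative, not the actual set shipped in `kernel_static_checker.py`):

```python
import re

# Illustrative rules only; the real checker defines its own forbidden/warning patterns
FORBIDDEN = {
    "falls back to a torch op instead of a custom kernel": re.compile(r"\btorch\.(matmul|mm|bmm)\("),
    "loads cached results instead of computing them": re.compile(r"\b(torch|pickle)\.load\("),
}

def validate_source(src: str) -> tuple[bool, list[str]]:
    """Return (passed, violations) for a kernel source string."""
    violations = [rule for rule, pat in FORBIDDEN.items() if pat.search(src)]
    return (not violations, violations)

ok, errs = validate_source("out = torch.matmul(a, b)")
print(ok, errs)  # False ['falls back to a torch op instead of a custom kernel']
```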

### Unit Tests with Adversarial Examples
We've included some unit tests for the eval script in `src/unit_tests/test_eval_adversarial.py`. These tests run adversarial kernels (see `src/unit_tests/test_kernels/`) that contain examples of reward hacking we've seen from LLMs, and ensure that the eval script catches them, either by failing their correctness checks or flagging them for excessive speedups. Examples include:
- Reusing computations cached during the PyTorch reference
18 changes: 2 additions & 16 deletions README.md
@@ -117,24 +117,10 @@ uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface
* **`precision`** - You can specify the tensor precision with `precision=fp32`. Currently all of our reported results are `fp32`, but we have added support for `fp16` & `bf16`.
* **`backend`** - We also support GPU programming languages beyond `cuda`: simply specify `backend=triton`. For now we support the DSLs `cuda`, `triton`, `cute`, `tilelang`, and `thunderkittens`.

Check the config fields for a comprehensive set of options. Note we provide the model with a one-shot example by default along with the minimum set of info; you can check out other prompt settings or construct your own in `src/prompt_constructor_toml.py`.

### Running Thunderkittens Locally
If you plan on running `scripts/generate_and_eval_single_sample.py` with `backend=thunderkittens`, make sure to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory:

```bash
export THUNDERKITTENS_ROOT=/Users/willychan/Desktop/projects/KernelBench/ThunderKittens
```

As seen in `src/kernelbench/prompts/model_new_ex_add_thunderkittens.py`, the generated kernels should have the following line:
Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, git clone the ThunderKittens repo and set the environment variable `export THUNDERKITTENS_ROOT=<path to ThunderKittens folder>` to point to your local checkout. As shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), all ThunderKittens programs should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enables the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now.

```python
tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")
```

This allows the kernel to include the right TK primitives.
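
As a small illustration, a generated kernel might resolve the checkout like this (the `include` subdirectory and the assert are our assumptions about a typical setup, not part of the repo's template):

```python
import os

# Fall back to the default path used in the one-shot example
tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")

# Assumed layout: headers live under <tk_root>/include
tk_include = os.path.join(tk_root, "include")
assert os.path.isdir(tk_root), f"ThunderKittens not found at {tk_root}; set THUNDERKITTENS_ROOT"
```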

*NOTE*: Right now, all generated ThunderKittens kernels are required to use the BF16 datatype. FP16 support is TBD.
Check the config fields for a comprehensive set of options. Note we provide the model with a one-shot example by default along with the minimum set of info; you can check out other prompt settings or construct your own in `src/prompt_constructor_toml.py`.

### Run on all problems

15 changes: 15 additions & 0 deletions scripts/generate_and_eval_single_sample.py
@@ -83,6 +83,8 @@ def __init__(self):
self.hardware_gpu_name = None
self.custom_prompt_key = None

self.check_kernel = True # [experimental] optional static checker catching potential hacking patterns

def verbose_logging(self):
self.log = True
self.log_prompt = True
@@ -260,6 +262,19 @@ def main(config: EvalConfig):
custom_kernel is not None
), f"Custom {config.backend} kernel code generation failed"

# Optional: static code checker for kernel code using regex matching
# NOTE: by no means is this checker complete, but it could help catch some potential hacks
if config.check_kernel:
from kernelbench.kernel_static_checker import validate_kernel_static
static_check_status, errors, warnings = validate_kernel_static(
custom_kernel,
backend=config.backend,
precision=config.precision,
)
assert static_check_status, f"Static check failed for level {config.level} problem {config.problem_id}. Errors: {errors}. Warnings: {warnings}"
if warnings:
print(f"Static check warnings for level {config.level} problem {config.problem_id}: {warnings}")

# this should be optional
if config.log:
with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f:
15 changes: 15 additions & 0 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -80,6 +80,8 @@ def __init__(self):
self.hardware_gpu_name = None
self.custom_prompt_key = None

self.check_kernel = True # [experimental] optional static checker catching potential hacking patterns

def verbose_logging(self):
self.log = True
self.log_prompt = True
@@ -283,6 +285,19 @@ def main(config: EvalConfig):
# check LLM is able to generate custom kernel code
assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed"

# Optional: static code checker for kernel code using regex matching
# NOTE: by no means is this checker complete, but it could help catch some potential hacks
if config.check_kernel:
from kernelbench.kernel_static_checker import validate_kernel_static
static_check_status, errors, warnings = validate_kernel_static(
custom_kernel,
backend=config.backend,
precision=config.precision,
)
assert static_check_status, f"Static check failed for level {config.level} problem {config.problem_id}. Errors: {errors}. Warnings: {warnings}"
if warnings:
print(f"Static check warnings for level {config.level} problem {config.problem_id}: {warnings}")

# this should be optional
if config.log:
with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f:
16 changes: 16 additions & 0 deletions scripts/generate_samples.py
@@ -18,6 +18,7 @@
read_file,
set_gpu_arch,
)
from kernelbench.kernel_static_checker import validate_kernel_static

"""
Batch Generate Samples for Particular Level
@@ -84,6 +85,8 @@ def __init__(self):
self.hardware_gpu_name = None
self.custom_prompt_key = None

self.check_kernel = True # [experimental] optional static checker catching potential hacking patterns

def greedy(self):
# For greedy decoding, especially baseline eval
self.greedy_sample = True
@@ -162,6 +165,19 @@ def generate_sample_single(
# check LLM is able to generate custom CUDA code
assert custom_kernel is not None, "Custom CUDA code generation failed"

# Optional: we provide a static code checker for kernel code using regex matching
# NOTE: by no means is this checker complete, but it could help catch some potential hacks and issues
if config.check_kernel:
static_check_status, error, warnings = validate_kernel_static(custom_kernel,
backend=config.backend,
precision=config.precision,
# uses the default set of forbidden and warning patterns;
# you could adapt the patterns to your own setting (e.g. the degree of banning CUDA streams, or allowing some torch ops)
)
assert static_check_status, f"Static check failed for sample {work.sample_id} for problem {problem_number}: {problem_name}. Error: {error}. Warnings: {warnings}"
if warnings:
print(f"Static check warnings for sample {work.sample_id} for problem {problem_number}: {problem_name}. Warnings: {warnings}")

if config.verbose:
print(
f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}"
15 changes: 15 additions & 0 deletions scripts/run_and_check.py
@@ -10,6 +10,7 @@
from kernelbench import utils as kernel_utils
from scripts.generate_baseline_time import measure_program_time
from kernelbench.utils import read_file
from kernelbench.kernel_static_checker import validate_kernel_static

# Modal setup
app = modal.App("run_and_check")
@@ -120,6 +121,8 @@ def __init__(self):
self.precision = "fp32"
self.backend = "cuda"

self.check_kernel = True # [experimental] optional static checker catching potential hacking patterns

def __repr__(self):
return f"ScriptConfig({self.to_dict()})"

@@ -279,6 +282,18 @@ def main(config: ScriptConfig):

kernel_src = read_file(config.kernel_src_path)

# Optional: static code checker for kernel code using regex matching
# NOTE: by no means is this checker complete, but it could help catch some potential hacks
if config.check_kernel:
static_check_status, errors, warnings = validate_kernel_static(
kernel_src,
backend=config.backend,
precision=config.precision,
)
assert static_check_status, f"Static check failed. Errors: {errors}. Warnings: {warnings}"
if warnings:
print(f"[WARN] Static check warnings: {warnings}")

# Start Evaluation
assert config.eval_mode in ["local", "modal"], "eval_mode must be either 'local' or 'modal'"
