From 044db0ee7285edc2fce4c5c801d3efee9f945ee1 Mon Sep 17 00:00:00 2001 From: Sokserey Sun Date: Mon, 3 Nov 2025 18:27:21 -0800 Subject: [PATCH 1/5] preliminary toml file change for prompt constructor. Will add tilelang --- scripts/generate_and_eval_single_sample.py | 4 +- .../generate_and_eval_single_sample_modal.py | 4 +- scripts/generate_samples.py | 6 +- src/loader.py | 117 ++++ src/prompt_constructor_multilang.py | 600 ++---------------- src/prompts/prompts.toml | 291 +++++++++ 6 files changed, 476 insertions(+), 546 deletions(-) create mode 100644 src/loader.py create mode 100644 src/prompts/prompts.toml diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index ff71e4bc..79e670bd 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -146,9 +146,7 @@ def main(config: EvalConfig): ) # Use appropriate prompt constructor based on backend - if config.backend == "cuda": - custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) - elif config.backend in ["triton", "cute"]: # removed "tilelang" + if config.backend in ["cuda", "triton", "cute"]: custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend) else: raise ValueError( diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index e9e0866a..7c8031ce 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -192,9 +192,7 @@ def main(config: EvalConfig): # Use appropriate prompt constructor based on backend - if config.backend == "cuda": - custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) - elif config.backend in ["triton", "cute"]: # removed "tilelang" + if config.backend in ["cuda", "triton", "cute"]: custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend) else: raise ValueError(f"Unsupported backend: {config.backend}. 
Must be 'cuda', 'triton', or 'cute'.") diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 5ee217cf..c214cc6a 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -120,11 +120,7 @@ def generate_sample_single( ), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" # Construct Prompt - if config.backend == "cuda": - custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template( - ref_arch_src - ) - elif config.backend in ["triton", "cute"]: # removed "tilelang" + if config.backend in ["cuda", "triton", "cute"]: custom_cuda_prompt = get_prompt_for_backend(ref_arch_src, config.backend) else: raise ValueError( diff --git a/src/loader.py b/src/loader.py new file mode 100644 index 00000000..d19de9a2 --- /dev/null +++ b/src/loader.py @@ -0,0 +1,117 @@ +# src/loader.py +import os +import runpy +import tomli # pip install tomli +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from .utils import read_file # your existing util + +REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + +def _abs_path(rel: str) -> str: + if os.path.isabs(rel): + return rel + return os.path.join(REPO_TOP_PATH, rel) + +@dataclass +class PromptConfig: + data: Dict[str, Any] + + @classmethod + def from_toml(cls, path: str) -> "PromptConfig": + with open(path, "rb") as f: + data = tomli.load(f) + return cls(data) + + def compose_blocks(self, keys: List[str]) -> str: + text_parts = [] + for key in keys: + node: Any = self.data + for part in key.split("."): + if part not in node: + raise KeyError(f"compose key not found: {key}") + node = node[part] + if not isinstance(node, str): + raise TypeError(f"compose key must resolve to string: {key}") + text_parts.append(node.strip() + "\n") + return "\n".join(text_parts).strip() + "\n" + + def get_template_node(self, backend: str, template: str) -> Dict[str, Any]: + try: + return self.data["backends"][backend]["templates"][template] + except KeyError as e: + raise KeyError(f"Unknown backend/template: {backend}/{template}") from e + +def _gpu_context_from_py(py_path: str, gpu_name: str) -> Dict[str, str]: + """ + Load GPU_* dicts from a Python file (no exec of raw strings; use runpy). + Expected globals: + - GPU_SPEC_INFO: dict[str, dict] + - GPU_DEFINITIONS: dict[str, str] + - GPU_BEST_PRACTICES: list[str] OR {"list": [...]} for compatibility + """ + mod = runpy.run_path(py_path) + spec_info = mod.get("GPU_SPEC_INFO", {}) + definitions = mod.get("GPU_DEFINITIONS", {}) + best = mod.get("GPU_BEST_PRACTICES", []) + + if not spec_info or not definitions or best is None: + raise ValueError("GPU_SPEC_INFO / GPU_DEFINITIONS / GPU_BEST_PRACTICES missing in gpu specs .py") + + if isinstance(best, dict) and "list" in best: + best = best["list"] + + if gpu_name not in spec_info: + raise KeyError(f"GPU name {gpu_name} not found in GPU_SPEC_INFO") + + curr = spec_info[gpu_name] + gpu_architecture = curr.get("GPU Architecture", "Unknown") + specs_bullets = "\n".join([f"- We have {v} of {k}." 
for k, v in curr.items() if k != "GPU Architecture"]) + defs_bullets = "\n".join([f"- {k}: {v}" for k, v in definitions.items()]) + best_bullets = "\n".join([f"- {x}" for x in (best or [])]) + + return { + "gpu_name": gpu_name, + "gpu_architecture": gpu_architecture, + "gpu_specs_bullets": specs_bullets, + "gpu_definitions_bullets": defs_bullets, + "gpu_best_practices_bullets": best_bullets, + } + +def render_prompt( + *, + prompts_toml: str, + backend: str, + template: str, + context: Dict[str, str], + gpu_specs_py: Optional[str] = None, + gpu_name: Optional[str] = None, +) -> str: + cfg = PromptConfig.from_toml(prompts_toml) + node = cfg.get_template_node(backend, template) + + # Load example files if requested + if node.get("requires_example"): + ex_arch_path = _abs_path(node["example_arch_path"]) + ex_new_path = _abs_path(node["example_new_arch_path"]) + context = { + **context, + "example_arch_src": read_file(ex_arch_path), + "example_new_arch_src": read_file(ex_new_path), + } + + # Load GPU details (from .py) if requested + if node.get("requires_gpu"): + if not (gpu_specs_py and gpu_name): + raise ValueError("Template requires GPU info; provide gpu_specs_py and gpu_name") + context = {**context, **_gpu_context_from_py(_abs_path(gpu_specs_py), gpu_name)} + + # Compose & fill + compose_keys = node["compose"] + prompt_text = cfg.compose_blocks(compose_keys) + + try: + return prompt_text.format(**context).strip() + "\n" + except KeyError as e: + raise KeyError(f"Missing placeholder in context: {e.args[0]}") from e diff --git a/src/prompt_constructor_multilang.py b/src/prompt_constructor_multilang.py index 39d16243..4a744cee 100644 --- a/src/prompt_constructor_multilang.py +++ b/src/prompt_constructor_multilang.py @@ -1,553 +1,83 @@ +# src/prompts/prompt_constructor.py (public facade; keep old imports working) import os -from .utils import read_file +from .loader import render_prompt, _abs_path -""" -Multi-Language Prompt Constructor +REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +PROMPTS_TOML = _abs_path("src/prompts/prompts.toml") +GPU_SPECS_PY = "src/prompts/hardware/gpu_specs.py" # still a Python file -Supports: Triton, CuTe (TileLang currently disabled/commented out) - -Design principles: -- To evaluate base model performance on KernelBench, we use the simplest prompt possible to guide model output to generated desired output format. -- However, we do not do extensive prompt engineering or few-shot examples in the LLM to steer behaviour. 
-""" - -REPO_TOP_PATH = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - "..", +def get_prompt_for_backend(ref_arch_src: str, backend: str = "triton") -> str: + return render_prompt( + prompts_toml=PROMPTS_TOML, + backend=backend.lower(), + template="default", + context={"ref_arch_src": ref_arch_src}, ) -) -KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") - - -def get_arch_definition_from_file(arch_path): - arch_src = read_file(arch_path) - return get_arch_definition(arch_src) - - -def get_arch_definition(arch_src): - """ - Construct torch definition from original torch nn.Module definition - """ - prompt = f"Here is a pytorch defintion of a neural network architecture in the file model.py: ```{arch_src}```\n" - return prompt - - -################################################################################ -# Triton Backend -################################################################################ - -TRITON_PROBLEM_STATEMENT = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups. \n - You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -""" - -TRITON_PROBLEM_INSTRUCTION = """ -Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n -""" - -TRITON_PROBLEM_STATEMENT_CLEANED = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -""" - -TRITON_PROBLEM_INSTRUCTION_CLEANED = """ -Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! 
\n -""" - - -def prompt_generate_custom_triton( - arc_src: str, example_arch_src: str, example_new_arch_src: str -) -> str: - prompt = TRITON_PROBLEM_STATEMENT - - assert ( - "@triton.jit" in example_new_arch_src - ), "Example new arch must contain Triton kernel" - - if example_arch_src != "" and example_new_arch_src != "": - prompt += f""" - Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: \n - ``` \n - {example_arch_src} - ``` \n - The example new arch with custom Triton kernels looks like this: \n - ``` - {example_new_arch_src} - ``` \n - """ - - prompt += f""" - You are given the following architecture: \n - ``` - {arc_src} - ``` - """ - prompt += TRITON_PROBLEM_INSTRUCTION - return prompt - - -def prompt_generate_custom_triton_fewshot_and_template( - ref_arch_src: str, shots: list -) -> str: - raise NotImplementedError("This function has not been implemented yet") - -def prompt_generate_ex_with_CoT_template_triton(ref_arch_src: str, cot_example: str) -> str: - raise NotImplementedError("This function has not been implemented yet") - - -def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str: - """ - Using prompt example (an element-wise addition) for prompt templates - The most basic form of example just to show LLM the task and the expected output format - """ - arch = ref_arch_src - - # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom Triton kernels) - example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") - example_new_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" +def get_prompt_with_hardware(ref_arch_src: str, backend: str, gpu_name: str) -> str: + return render_prompt( + prompts_toml=PROMPTS_TOML, + backend=backend.lower(), + template="with_hardware", + context={"ref_arch_src": ref_arch_src}, + gpu_specs_py=GPU_SPECS_PY, # <-- python file, not TOML + gpu_name=gpu_name, ) - if not os.path.exists(example_arch_path): - raise FileNotFoundError( - f"Example architecture file not found: {example_arch_path}" - ) - if not os.path.exists(example_new_arch_path): - raise FileNotFoundError( - f"Example new architecture file not found: {example_new_arch_path}" - ) - - example_arch = read_file(example_arch_path) - example_new_arch = read_file(example_new_arch_path) - - return prompt_generate_custom_triton(arch, example_arch, example_new_arch) - - -def prompt_generate_prompt_with_hardware_info_from_template_triton( - ref_arch_src: str, gpu_name: str -) -> str: - """ - Similar to prompt_generate_custom_triton_from_prompt_template, - but with hardware information for the given GPU - """ - arch = ref_arch_src - - example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") - example_new_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" +def prompt_fix_compile(backend: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: + return render_prompt( + prompts_toml=PROMPTS_TOML, + backend=backend.lower(), + template="fix_compile", + context={ + "ref_arch_src": ref_arch_src, + "custom_kernel": custom_kernel, + "metadata": metadata, + "failure_type": "to compile", + }, ) - gpu_spec_file_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/hardware/gpu_specs.py" - ) - - example_arch = read_file(example_arch_path) - example_new_arch = read_file(example_new_arch_path) - gpu_spec_info = read_file(gpu_spec_file_path) - return 
prompt_generate_prompt_with_hardware_info_triton( - ref_arch_src=arch, - gpu_name=gpu_name, - example_arch_src=example_arch, - example_new_arch_src=example_new_arch, - gpu_spec_info_src=gpu_spec_info, +def prompt_fix_correctness(backend: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: + return render_prompt( + prompts_toml=PROMPTS_TOML, + backend=backend.lower(), + template="fix_correctness", + context={ + "ref_arch_src": ref_arch_src, + "custom_kernel": custom_kernel, + "metadata": metadata, + "failure_type": "correctness", + }, ) - -def prompt_generate_prompt_with_hardware_info_triton( - ref_arch_src: str, - gpu_name: str, - example_arch_src: str, - example_new_arch_src: str, - gpu_spec_info_src: str, -) -> str: - """ - Generate a prompt with hardware information for the given GPU - gpu_spec_info_src: str of the gpu spec src file - """ - local_dict = {} - exec(gpu_spec_info_src, {}, local_dict) - - GPU_SPEC_INFO = local_dict.get("GPU_SPEC_INFO") - GPU_DEFINITIONS = local_dict.get("GPU_DEFINITIONS") - GPU_BEST_PRACTICES = local_dict.get("GPU_BEST_PRACTICES") - - if not GPU_SPEC_INFO or not GPU_DEFINITIONS or not GPU_BEST_PRACTICES: - raise ValueError( - "GPU_SPEC_INFO or GPU_DEFINITIONS or GPU_BEST_PRACTICES not found in gpu_spec_info_src" - ) - - assert gpu_name in GPU_SPEC_INFO, f"GPU name {gpu_name} not found in GPU_SPEC_INFO" - - prompt = TRITON_PROBLEM_STATEMENT - - if example_arch_src != "" and example_new_arch_src != "": - prompt += f""" - Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: \n - ``` \n - {example_arch_src} - ``` \n - The example new arch with custom Triton kernels looks like this: - ``` - {example_new_arch_src} - ``` \n - """ - - curr_gpu_spec_info = GPU_SPEC_INFO[gpu_name] - gpu_architecture = curr_gpu_spec_info.get("GPU Architecture") - prompt += f""" - Here is some information about the underlying hardware that you should keep in mind. \n\n -The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture.\n\n""" - - for key, value in curr_gpu_spec_info.items(): - if key == "GPU Architecture": - continue - prompt += f"""- We have {value} of {key}.\n""" - - prompt += f"""\n\n -Here are some concepts about the GPU architecture that could be helpful: \n\n""" - for key, value in GPU_DEFINITIONS.items(): - prompt += f"""- {key}: {value}\n""" - - prompt += f"""\n\n -Here are some best practices for writing Triton kernels on GPU: \n\n""" - for best_practice in GPU_BEST_PRACTICES: - prompt += f"""- {best_practice}\n""" - - prompt += f""" - You are given the following architecture: \n - ``` - {ref_arch_src} - ``` - """ - - prompt += TRITON_PROBLEM_INSTRUCTION - return prompt - - +# Optional legacy convenience wrappers (if callers use backend-specific names) def prompt_fix_compile_triton(ref_arch_src, custom_kernel, metadata): - prompt = TRITON_PROBLEM_STATEMENT - prompt += f""" - With the following architecture: - ``` - {ref_arch_src} - ``` - You generated the following solution and it failed to compile: - ``` - {custom_kernel} - ``` - Here's the metadata of the compilation error: - ``` - {metadata} - ``` - - Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. 
- """ - return prompt - + return prompt_fix_compile("triton", ref_arch_src, custom_kernel, metadata) def prompt_fix_correctness_triton(ref_arch_src, custom_kernel, metadata): - prompt = TRITON_PROBLEM_STATEMENT - prompt += f""" - With the following architecture: - ``` - {ref_arch_src} - ``` - You generated the following solution and it failed correctness: - ``` - {custom_kernel} - ``` - Here's the metadata of the correctness error: - ``` - {metadata} - ``` - Please consider how your custom Triton kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. - """ - return prompt - - -################################################################################ -# TileLang Backend - COMMENTED OUT (not working currently) -################################################################################ - -# TILELANG_PROBLEM_STATEMENT = """You write custom TileLang kernels to replace the pytorch operators in the given architecture to get speedups. \n -# You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom TileLang kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -# """ -# -# TILELANG_PROBLEM_INSTRUCTION = """ -# Optimize the architecture named Model with custom TileLang kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n -# """ -# -# TILELANG_PROBLEM_STATEMENT_CLEANED = """You write custom TileLang kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom TileLang kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -# """ -# -# TILELANG_PROBLEM_INSTRUCTION_CLEANED = """ -# Optimize the architecture named Model with custom TileLang kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! 
\n -# """ -# -# -# def prompt_generate_custom_tilelang( -# arc_src: str, example_arch_src: str, example_new_arch_src: str -# ) -> str: -# prompt = TILELANG_PROBLEM_STATEMENT -# -# if example_arch_src != "" and example_new_arch_src != "": -# prompt += f""" -# Here's an example to show you the syntax of inline embedding custom TileLang kernels in torch: The example given architecture is: \n -# ``` \n -# {example_arch_src} -# ``` \n -# The example new arch with custom TileLang kernels looks like this: \n -# ``` -# {example_new_arch_src} -# ``` \n -# """ -# -# prompt += f""" -# You are given the following architecture: \n -# ``` -# {arc_src} -# ``` -# """ -# prompt += TILELANG_PROBLEM_INSTRUCTION -# return prompt -# -# -# def prompt_generate_custom_tilelang_from_prompt_template(ref_arch_src: str) -> str: -# """ -# Using prompt example for TileLang -# Note: You'll need to create a TileLang example file similar to the Triton one -# """ -# arch = ref_arch_src -# -# # TODO: Create model_new_ex_add_tilelang.py example file -# example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") -# example_new_arch_path = os.path.join( -# REPO_TOP_PATH, f"src/prompts/model_new_ex_add_tilelang.py" -# ) -# -# if not os.path.exists(example_arch_path): -# raise FileNotFoundError( -# f"Example architecture file not found: {example_arch_path}" -# ) -# if not os.path.exists(example_new_arch_path): -# # For now, use a basic template without examples if file doesn't exist -# return prompt_generate_custom_tilelang(arch, "", "") -# -# example_arch = read_file(example_arch_path) -# example_new_arch = read_file(example_new_arch_path) -# -# return prompt_generate_custom_tilelang(arch, example_arch, example_new_arch) -# -# -# def prompt_fix_compile_tilelang(ref_arch_src, custom_kernel, metadata): -# prompt = TILELANG_PROBLEM_STATEMENT -# prompt += f""" -# With the following architecture: -# ``` -# {ref_arch_src} -# ``` -# You generated the following solution and it failed to compile: -# ``` -# {custom_kernel} -# ``` -# Here's the metadata of the compilation error: -# ``` -# {metadata} -# ``` -# -# Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. -# """ -# return prompt -# -# -# def prompt_fix_correctness_tilelang(ref_arch_src, custom_kernel, metadata): -# prompt = TILELANG_PROBLEM_STATEMENT -# prompt += f""" -# With the following architecture: -# ``` -# {ref_arch_src} -# ``` -# You generated the following solution and it failed correctness: -# ``` -# {custom_kernel} -# ``` -# Here's the metadata of the correctness error: -# ``` -# {metadata} -# ``` -# Please consider how your custom TileLang kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. -# """ -# return prompt - - -################################################################################ -# CuTe Backend -################################################################################ - -CUTE_PROBLEM_STATEMENT = """You write custom CuTe (CUTLASS) kernels to replace the pytorch operators in the given architecture to get speedups. \n - You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CuTe kernels and leave others unchanged. 
You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -""" - -CUTE_PROBLEM_INSTRUCTION = """ -Optimize the architecture named Model with custom CuTe (CUTLASS) kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n -""" - -CUTE_PROBLEM_STATEMENT_CLEANED = """You write custom CuTe (CUTLASS) kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CuTe kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -""" - -CUTE_PROBLEM_INSTRUCTION_CLEANED = """ -Optimize the architecture named Model with custom CuTe (CUTLASS) kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n -""" - - -def prompt_generate_custom_cute( - arc_src: str, example_arch_src: str, example_new_arch_src: str -) -> str: - prompt = CUTE_PROBLEM_STATEMENT - - if example_arch_src != "" and example_new_arch_src != "": - prompt += f""" - Here's an example to show you the syntax of inline embedding custom CuTe (CUTLASS) kernels in torch: The example given architecture is: \n - ``` \n - {example_arch_src} - ``` \n - The example new arch with custom CuTe kernels looks like this: \n - ``` - {example_new_arch_src} - ``` \n - """ - - prompt += f""" - You are given the following architecture: \n - ``` - {arc_src} - ``` - """ - prompt += CUTE_PROBLEM_INSTRUCTION - return prompt - - -def prompt_generate_custom_cute_from_prompt_template(ref_arch_src: str) -> str: - """ - Using prompt example for CuTe - Note: You'll need to create a CuTe example file - """ - arch = ref_arch_src - - # TODO: Create model_new_ex_add_cute.py example file - example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") - example_new_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_new_ex_add_cute.py" - ) - - if not os.path.exists(example_arch_path): - raise FileNotFoundError( - f"Example architecture file not found: {example_arch_path}" - ) - if not os.path.exists(example_new_arch_path): - # For now, use a basic template without examples if file doesn't exist - return prompt_generate_custom_cute(arch, "", "") - - example_arch = read_file(example_arch_path) - example_new_arch = read_file(example_new_arch_path) - - return prompt_generate_custom_cute(arch, example_arch, example_new_arch) - + return prompt_fix_correctness("triton", ref_arch_src, custom_kernel, metadata) def prompt_fix_compile_cute(ref_arch_src, custom_kernel, metadata): - prompt = CUTE_PROBLEM_STATEMENT - prompt += f""" - With the following architecture: - ``` - {ref_arch_src} - ``` - You generated the 
following solution and it failed to compile: - ``` - {custom_kernel} - ``` - Here's the metadata of the compilation error: - ``` - {metadata} - ``` - - Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. - """ - return prompt - + return prompt_fix_compile("cute", ref_arch_src, custom_kernel, metadata) def prompt_fix_correctness_cute(ref_arch_src, custom_kernel, metadata): - prompt = CUTE_PROBLEM_STATEMENT - prompt += f""" - With the following architecture: - ``` - {ref_arch_src} - ``` - You generated the following solution and it failed correctness: - ``` - {custom_kernel} - ``` - Here's the metadata of the correctness error: - ``` - {metadata} - ``` - Please consider how your custom CuTe kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. - """ - return prompt - - -################################################################################ -# Unified API -################################################################################ - -def get_prompt_for_backend(ref_arch_src: str, backend: str = "triton") -> str: - """ - Unified API to get prompt for any supported backend - - Args: - ref_arch_src: Reference architecture source code - backend: One of 'triton', 'cute' (tilelang removed - not working) - - Returns: - Prompt string for the specified backend - """ - backend_lower = backend.lower() - - if backend_lower == "triton": - return prompt_generate_custom_triton_from_prompt_template(ref_arch_src) - # elif backend_lower == "tilelang": - # return prompt_generate_custom_tilelang_from_prompt_template(ref_arch_src) - elif backend_lower == "cute": - return prompt_generate_custom_cute_from_prompt_template(ref_arch_src) - else: - raise ValueError( - f"Unsupported backend: {backend}. 
Must be one of: 'triton', 'cute'" - ) - - -################################################################################ -# Main (for testing) -################################################################################ - -def main(): - gpu_name = "L40S" - backend = "triton" # Change this to test different backends - - ref_arch_src = read_file(os.path.join(KERNEL_BENCH_PATH, f"level1/19_ReLU.py")) - assert len(ref_arch_src) > 0, "ref_arch_src is empty" - - prompt = get_prompt_for_backend(ref_arch_src, backend) - print(f"\n{'='*80}\n{backend.upper()} PROMPT:\n{'='*80}\n") - print(prompt) - - # Write prompt to temp file - temp_file_path = os.path.join(REPO_TOP_PATH, "scratch", f"prompt_{backend}_draft.txt") - os.makedirs(os.path.dirname(temp_file_path), exist_ok=True) - with open(temp_file_path, "w") as f: - f.write(prompt) - print(f"\nPrompt written to: {temp_file_path}") - - -if __name__ == "__main__": - main() - - - + return prompt_fix_correctness("cute", ref_arch_src, custom_kernel, metadata) + +def prompt_fix_compile_cuda(ref_arch_src, custom_kernel, metadata): + return prompt_fix_compile("cuda", ref_arch_src, custom_kernel, metadata) + +def prompt_fix_correctness_cuda(ref_arch_src, custom_kernel, metadata): + return prompt_fix_correctness("cuda", ref_arch_src, custom_kernel, metadata) + +__all__ = [ + "get_prompt_for_backend", + "get_prompt_with_hardware", + "prompt_fix_compile", + "prompt_fix_correctness", + "prompt_fix_compile_triton", + "prompt_fix_correctness_triton", + "prompt_fix_compile_cute", + "prompt_fix_correctness_cute", + "prompt_fix_compile_cuda", + "prompt_fix_correctness_cuda", +] diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml new file mode 100644 index 00000000..aeda733a --- /dev/null +++ b/src/prompts/prompts.toml @@ -0,0 +1,291 @@ +[meta] +version = "1.0" +default_backend = "triton" + +[shared] +# Centralized text blocks reused by backends +triton_problem_statement = """ +You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups. + +You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. +""" + +triton_instruction = """ +Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! +""" + +cute_problem_statement = """ +You write custom CuTe (CUTLASS) kernels to replace the pytorch operators in the given architecture to get speedups. + +You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CuTe kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. 
+""" + +cute_instruction = """ +Optimize the architecture named Model with custom CuTe (CUTLASS) kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! +""" + +cuda_problem_statement = """ +You write custom CUDA kernels to replace the pytorch operators in the given architecture to get speedups. + +You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CUDA kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. +""" + +cuda_instruction = """ +Optimize the architecture named Model with custom CUDA operators! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! +""" + +# ------------------------------------------------------------------------- +# Backends +# ------------------------------------------------------------------------- + +[backends.triton.templates.default] +compose = ["shared.triton_problem_statement", "templates.triton.example_block", "templates.triton.arch_block", "shared.triton_instruction"] +requires_example = true +example_arch_path = "src/prompts/model_ex_add.py" +example_new_arch_path = "src/prompts/model_new_ex_add_triton.py" + +[backends.triton.templates.with_hardware] +compose = ["shared.triton_problem_statement", "templates.triton.example_block", "templates.hardware.header", "templates.hardware.specs", "templates.hardware.definitions", "templates.hardware.best_practices", "templates.triton.arch_block", "shared.triton_instruction"] +requires_example = true +requires_gpu = true +example_arch_path = "src/prompts/model_ex_add.py" +example_new_arch_path = "src/prompts/model_new_ex_add_triton.py" + +[backends.triton.templates.fix_compile] +compose = ["shared.triton_problem_statement", "templates.triton.with_arch", "templates.triton.failed_kernel", "templates.triton.compile_metadata", "templates.triton.fix_footer"] + +[backends.triton.templates.fix_correctness] +compose = ["shared.triton_problem_statement", "templates.triton.with_arch", "templates.triton.failed_kernel", "templates.triton.correctness_metadata", "templates.triton.fix_footer"] + +[backends.cute.templates.default] +compose = ["shared.cute_problem_statement", "templates.cute.example_block", "templates.cute.arch_block", "shared.cute_instruction"] +requires_example = true +example_arch_path = "src/prompts/model_ex_add.py" +example_new_arch_path = "src/prompts/model_new_ex_add_cute.py" + +[backends.cute.templates.fix_compile] +compose = ["shared.cute_problem_statement", "templates.cute.with_arch", "templates.cute.failed_kernel", "templates.cute.compile_metadata", "templates.cute.fix_footer"] + +[backends.cute.templates.fix_correctness] +compose = ["shared.cute_problem_statement", "templates.cute.with_arch", "templates.cute.failed_kernel", "templates.cute.correctness_metadata", "templates.cute.fix_footer"] + +[backends.cuda.templates.default] +compose = 
["shared.cuda_problem_statement", "templates.cuda.example_block", "templates.cuda.arch_block", "shared.cuda_instruction"] +requires_example = true +example_arch_path = "src/prompts/model_ex_add.py" +example_new_arch_path = "src/prompts/model_new_ex_add.py" + +[backends.cuda.templates.with_hardware] +compose = ["shared.cuda_problem_statement", "templates.cuda.example_block", "templates.hardware.header", "templates.hardware.specs", "templates.hardware.definitions", "templates.hardware.best_practices", "templates.cuda.arch_block", "shared.cuda_instruction"] +requires_example = true +requires_gpu = true +example_arch_path = "src/prompts/model_ex_add.py" +example_new_arch_path = "src/prompts/model_new_ex_add.py" + +[backends.cuda.templates.fix_compile] +compose = ["shared.cuda_problem_statement", "templates.cuda.with_arch", "templates.cuda.failed_kernel", "templates.cuda.compile_metadata", "templates.cuda.fix_footer"] + +[backends.cuda.templates.fix_correctness] +compose = ["shared.cuda_problem_statement", "templates.cuda.with_arch", "templates.cuda.failed_kernel", "templates.cuda.correctness_metadata", "templates.cuda.fix_footer"] + +# ------------------------------------------------------------------------- +# Reusable partials (templated blocks with placeholders) +# ------------------------------------------------------------------------- + +[templates.triton] +example_block = """ +Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: + +{example_arch_src} + + +The example new arch with custom Triton kernels looks like this: + + +{example_new_arch_src} + +""" + +arch_block = """ +You are given the following architecture: + + +{ref_arch_src} + +""" + +with_arch = """ +With the following architecture: + + +{ref_arch_src} + +""" + +failed_kernel = """ +You generated the following solution and it failed {failure_type}: + + +{custom_kernel} + +""" + +compile_metadata = """ +Here's the metadata of the compilation error: + + +{metadata} + +""" + +correctness_metadata = """ +Here's the metadata of the correctness error: + + +{metadata} + +""" + +fix_footer = """ +Please fix the {failure_type} in the new model code. Please output the corrected code in codeblocks. +""" + +[templates.cute] +# Same structure as templates.triton but wording says CuTe (CUTLASS) +example_block = """ +Here's an example to show you the syntax of inline embedding custom CuTe (CUTLASS) kernels in torch: The example given architecture is: + +{example_arch_src} + + +The example new arch with custom CuTe kernels looks like this: + + +{example_new_arch_src} + +""" + +arch_block = """ +You are given the following architecture: + + +{ref_arch_src} + +""" + +with_arch = """ +With the following architecture: + + +{ref_arch_src} + +""" + +failed_kernel = """ +You generated the following solution and it failed {failure_type}: + + +{custom_kernel} + +""" + +compile_metadata = """ +Here's the metadata of the compilation error: + + +{metadata} + +""" + +correctness_metadata = """ +Here's the metadata of the correctness error: + + +{metadata} + +""" + +fix_footer = """ +Please fix the {failure_type} in the new model code. Please output the corrected code in codeblocks. +""" + +[templates.hardware] +header = """ +Here is some information about the underlying hardware that you should keep in mind. +""" + +specs = """ +The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture. 
+ +{gpu_specs_bullets} +""" + +definitions = """ +Here are some concepts about the GPU architecture that could be helpful: + +{gpu_definitions_bullets} +""" + +best_practices = """ +Here are some best practices for writing Triton kernels on GPU: + +{gpu_best_practices_bullets} +""" + +[templates.cuda] +example_block = """ +Here's an example to show you the syntax of inline embedding custom CUDA operators in torch: The example given architecture is: + +{example_arch_src} + + +The example new arch with custom CUDA kernels looks like this: + + +{example_new_arch_src} + +""" + +arch_block = """ +You are given the following architecture: + + +{ref_arch_src} + +""" + +with_arch = """ +With the following architecture: + + +{ref_arch_src} + +""" + +failed_kernel = """ +You generated the following solution and it failed {failure_type}: + + +{custom_kernel} + +""" + +compile_metadata = """ +Here's the metadata of the compilation error: + + +{metadata} + +""" + +correctness_metadata = """ +Here's the metadata of the correctness error: + + +{metadata} + +""" + +fix_footer = """ +Please fix the {failure_type} in the new model code. Please output the corrected code in codeblocks. +""" \ No newline at end of file From 1f054c715193ac16670119b53342c77d0d53344b Mon Sep 17 00:00:00 2001 From: Sokserey Sun Date: Sat, 8 Nov 2025 17:31:30 -0800 Subject: [PATCH 2/5] Cleaned up the toml file and added logic to use the toml --- scripts/generate_and_eval_single_sample.py | 4 +- .../generate_and_eval_single_sample_modal.py | 4 +- scripts/generate_samples.py | 4 +- src/loader.py | 100 +++++-- src/prompt_constructor_multilang.py | 99 ++++--- src/prompts/prompts.toml | 262 +++++------------- 6 files changed, 202 insertions(+), 271 deletions(-) diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 79e670bd..c7e637cb 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -10,7 +10,7 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_multilang import get_prompt_for_backend +from src.prompt_constructor_multilang import get_prompt_for_language from src.utils import ( create_inference_server_from_presets, extract_first_code, @@ -147,7 +147,7 @@ def main(config: EvalConfig): # Use appropriate prompt constructor based on backend if config.backend in ["cuda", "triton", "cute"]: - custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend) + custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") else: raise ValueError( f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'." 
diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 7c8031ce..3e337547 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -16,7 +16,7 @@ #from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_multilang import get_prompt_for_backend +from src.prompt_constructor_multilang import get_prompt_for_language from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets app = modal.App("eval_single_sample") @@ -193,7 +193,7 @@ def main(config: EvalConfig): # Use appropriate prompt constructor based on backend if config.backend in ["cuda", "triton", "cute"]: - custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend) + custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") else: raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'.") diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index c214cc6a..d5e04fe7 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -11,7 +11,7 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_multilang import get_prompt_for_backend +from src.prompt_constructor_multilang import get_prompt_for_language from src.utils import ( create_inference_server_from_presets, extract_first_code, @@ -121,7 +121,7 @@ def generate_sample_single( # Construct Prompt if config.backend in ["cuda", "triton", "cute"]: - custom_cuda_prompt = get_prompt_for_backend(ref_arch_src, config.backend) + custom_cuda_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") else: raise ValueError( f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'." diff --git a/src/loader.py b/src/loader.py index d19de9a2..f9156c37 100644 --- a/src/loader.py +++ b/src/loader.py @@ -37,12 +37,6 @@ def compose_blocks(self, keys: List[str]) -> str: text_parts.append(node.strip() + "\n") return "\n".join(text_parts).strip() + "\n" - def get_template_node(self, backend: str, template: str) -> Dict[str, Any]: - try: - return self.data["backends"][backend]["templates"][template] - except KeyError as e: - raise KeyError(f"Unknown backend/template: {backend}/{template}") from e - def _gpu_context_from_py(py_path: str, gpu_name: str) -> Dict[str, str]: """ Load GPU_* dicts from a Python file (no exec of raw strings; use runpy). 
@@ -79,39 +73,97 @@ def _gpu_context_from_py(py_path: str, gpu_name: str) -> Dict[str, str]: "gpu_best_practices_bullets": best_bullets, } -def render_prompt( +def render_prompt_by_option( *, prompts_toml: str, - backend: str, - template: str, + language: str, + option: str, context: Dict[str, str], gpu_specs_py: Optional[str] = None, gpu_name: Optional[str] = None, ) -> str: + """ + New function that uses languages.X and options.Y structure + + Args: + prompts_toml: Path to the prompts.toml file + language: The kernel language (triton, cuda, cute) + option: The prompt option (basic, few_shot, hardware_info, fix_compile, fix_correctness) + context: Variables to fill in the prompt template + gpu_specs_py: Optional path to GPU specs Python file + gpu_name: Optional GPU name (required if option requires_gpu) + """ cfg = PromptConfig.from_toml(prompts_toml) - node = cfg.get_template_node(backend, template) - + + # Get language-specific content + try: + lang_data = cfg.data["languages"][language] + except KeyError: + raise KeyError(f"Unknown language: {language}") + + # Get option configuration + try: + option_data = cfg.data["options"][option] + except KeyError: + raise KeyError(f"Unknown option: {option}") + + # Get shared templates + shared = cfg.data.get("shared", {}) + language_display = lang_data.get("language_display", language.upper()) + + # Fill in shared templates with language-specific terms + problem_statement = shared.get("problem_statement", "").format(language_display=language_display) + instruction = shared.get("instruction", "").format(language_display=language_display) + + # Add language-specific content to context + context = { + **context, + "language": language.upper() if language in ["cuda", "cute"] else language.capitalize(), + "language_display": language_display, + "problem_statement": problem_statement, + "instruction": instruction, + } + # Load example files if requested - if node.get("requires_example"): - ex_arch_path = _abs_path(node["example_arch_path"]) - ex_new_path = _abs_path(node["example_new_arch_path"]) + if option_data.get("requires_example"): + # Use language-specific example arch, or fall back to shared one + ex_arch_path = _abs_path( + lang_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") + ) + ex_new_path = _abs_path(lang_data["few_shot_new_arch"]) context = { **context, "example_arch_src": read_file(ex_arch_path), "example_new_arch_src": read_file(ex_new_path), } - - # Load GPU details (from .py) if requested - if node.get("requires_gpu"): + + # Load GPU details if requested + if option_data.get("requires_gpu"): if not (gpu_specs_py and gpu_name): - raise ValueError("Template requires GPU info; provide gpu_specs_py and gpu_name") + raise ValueError(f"Option '{option}' requires GPU info; provide gpu_specs_py and gpu_name") context = {**context, **_gpu_context_from_py(_abs_path(gpu_specs_py), gpu_name)} - - # Compose & fill - compose_keys = node["compose"] - prompt_text = cfg.compose_blocks(compose_keys) - + + # Build the prompt from components + prompt_parts = [] + for component in option_data["components"]: + if component == "problem_statement": + # Use the already-formatted problem_statement from context + prompt_parts.append(context["problem_statement"]) + elif component == "instruction": + # Use the already-formatted instruction from context + prompt_parts.append(context["instruction"]) + elif component.startswith("hardware_"): + # Hardware components from templates.hardware + template_key = f"templates.hardware.{component}" 
+ prompt_parts.append(cfg.compose_blocks([template_key])) + else: + # Other components from templates.common + template_key = f"templates.common.{component}" + prompt_parts.append(cfg.compose_blocks([template_key])) + + prompt_text = "\n".join(prompt_parts).strip() + "\n" + try: return prompt_text.format(**context).strip() + "\n" except KeyError as e: - raise KeyError(f"Missing placeholder in context: {e.args[0]}") from e + raise KeyError(f"Missing placeholder in context: {e.args[0]}. Available: {list(context.keys())}") from e diff --git a/src/prompt_constructor_multilang.py b/src/prompt_constructor_multilang.py index 4a744cee..c52e0eeb 100644 --- a/src/prompt_constructor_multilang.py +++ b/src/prompt_constructor_multilang.py @@ -1,34 +1,59 @@ -# src/prompts/prompt_constructor.py (public facade; keep old imports working) +# src/prompt_constructor_multilang.py (new option-based prompt constructor) import os -from .loader import render_prompt, _abs_path +from .loader import render_prompt_by_option, _abs_path REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) PROMPTS_TOML = _abs_path("src/prompts/prompts.toml") GPU_SPECS_PY = "src/prompts/hardware/gpu_specs.py" # still a Python file -def get_prompt_for_backend(ref_arch_src: str, backend: str = "triton") -> str: - return render_prompt( +def get_prompt_for_language(ref_arch_src: str, language: str = "triton", option: str = "few_shot") -> str: + """ + Generate a prompt for a specific language and option. + + Args: + ref_arch_src: The reference architecture source code + language: The kernel language (triton, cuda, cute) + option: The prompt option (basic, few_shot, hardware_info) + """ + return render_prompt_by_option( prompts_toml=PROMPTS_TOML, - backend=backend.lower(), - template="default", + language=language.lower(), + option=option, context={"ref_arch_src": ref_arch_src}, ) -def get_prompt_with_hardware(ref_arch_src: str, backend: str, gpu_name: str) -> str: - return render_prompt( +def get_prompt_with_hardware(ref_arch_src: str, language: str, gpu_name: str) -> str: + """ + Generate a hardware-aware prompt for a specific language. + + Args: + ref_arch_src: The reference architecture source code + language: The kernel language (triton, cuda, cute) + gpu_name: The name of the GPU (e.g., "A100", "H100") + """ + return render_prompt_by_option( prompts_toml=PROMPTS_TOML, - backend=backend.lower(), - template="with_hardware", + language=language.lower(), + option="hardware_info", context={"ref_arch_src": ref_arch_src}, - gpu_specs_py=GPU_SPECS_PY, # <-- python file, not TOML + gpu_specs_py=GPU_SPECS_PY, gpu_name=gpu_name, ) -def prompt_fix_compile(backend: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: - return render_prompt( +def prompt_fix_compile(language: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: + """ + Generate a prompt to fix compilation errors. 
+ + Args: + language: The kernel language (triton, cuda, cute) + ref_arch_src: The reference architecture source code + custom_kernel: The custom kernel code that failed + metadata: Compilation error metadata + """ + return render_prompt_by_option( prompts_toml=PROMPTS_TOML, - backend=backend.lower(), - template="fix_compile", + language=language.lower(), + option="fix_compile", context={ "ref_arch_src": ref_arch_src, "custom_kernel": custom_kernel, @@ -37,11 +62,20 @@ def prompt_fix_compile(backend: str, ref_arch_src: str, custom_kernel: str, meta }, ) -def prompt_fix_correctness(backend: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: - return render_prompt( +def prompt_fix_correctness(language: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: + """ + Generate a prompt to fix correctness errors. + + Args: + language: The kernel language (triton, cuda, cute) + ref_arch_src: The reference architecture source code + custom_kernel: The custom kernel code that failed + metadata: Correctness error metadata + """ + return render_prompt_by_option( prompts_toml=PROMPTS_TOML, - backend=backend.lower(), - template="fix_correctness", + language=language.lower(), + option="fix_correctness", context={ "ref_arch_src": ref_arch_src, "custom_kernel": custom_kernel, @@ -50,34 +84,9 @@ def prompt_fix_correctness(backend: str, ref_arch_src: str, custom_kernel: str, }, ) -# Optional legacy convenience wrappers (if callers use backend-specific names) -def prompt_fix_compile_triton(ref_arch_src, custom_kernel, metadata): - return prompt_fix_compile("triton", ref_arch_src, custom_kernel, metadata) - -def prompt_fix_correctness_triton(ref_arch_src, custom_kernel, metadata): - return prompt_fix_correctness("triton", ref_arch_src, custom_kernel, metadata) - -def prompt_fix_compile_cute(ref_arch_src, custom_kernel, metadata): - return prompt_fix_compile("cute", ref_arch_src, custom_kernel, metadata) - -def prompt_fix_correctness_cute(ref_arch_src, custom_kernel, metadata): - return prompt_fix_correctness("cute", ref_arch_src, custom_kernel, metadata) - -def prompt_fix_compile_cuda(ref_arch_src, custom_kernel, metadata): - return prompt_fix_compile("cuda", ref_arch_src, custom_kernel, metadata) - -def prompt_fix_correctness_cuda(ref_arch_src, custom_kernel, metadata): - return prompt_fix_correctness("cuda", ref_arch_src, custom_kernel, metadata) - __all__ = [ - "get_prompt_for_backend", + "get_prompt_for_language", "get_prompt_with_hardware", "prompt_fix_compile", "prompt_fix_correctness", - "prompt_fix_compile_triton", - "prompt_fix_correctness_triton", - "prompt_fix_compile_cute", - "prompt_fix_correctness_cute", - "prompt_fix_compile_cuda", - "prompt_fix_correctness_cuda", ] diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml index aeda733a..38763c2a 100644 --- a/src/prompts/prompts.toml +++ b/src/prompts/prompts.toml @@ -1,111 +1,77 @@ [meta] version = "1.0" -default_backend = "triton" +default_language = "cuda" +# ------------------------------------------------------------------------- +# Shared Templates: Used by all languages with placeholders +# ------------------------------------------------------------------------- [shared] -# Centralized text blocks reused by backends -triton_problem_statement = """ -You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups. +problem_statement = """ +You write custom {language_display} to replace the pytorch operators in the given architecture to get speedups. 
-You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. +You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom {language_display} and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. """ -triton_instruction = """ -Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! +instruction = """ +Optimize the architecture named Model with custom {language_display}! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! """ -cute_problem_statement = """ -You write custom CuTe (CUTLASS) kernels to replace the pytorch operators in the given architecture to get speedups. +# Shared example architecture (same for all languages) +few_shot_example_arch = "src/prompts/model_ex_add.py" -You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CuTe kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. -""" - -cute_instruction = """ -Optimize the architecture named Model with custom CuTe (CUTLASS) kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! -""" +# ------------------------------------------------------------------------- +# Languages: Language-specific configuration (minimal, just what varies) +# ------------------------------------------------------------------------- +[languages.triton] +language_display = "Triton kernels" +few_shot_new_arch = "src/prompts/model_new_ex_add_triton.py" -cuda_problem_statement = """ -You write custom CUDA kernels to replace the pytorch operators in the given architecture to get speedups. +[languages.cuda] +language_display = "CUDA operators" +few_shot_new_arch = "src/prompts/model_new_ex_add.py" -You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CUDA kernels and leave others unchanged. 
You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination. -""" - -cuda_instruction = """ -Optimize the architecture named Model with custom CUDA operators! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! -""" +[languages.cute] +language_display = "CuTe (CUTLASS) kernels" +few_shot_new_arch = "src/prompts/model_new_ex_add_cute.py" # ------------------------------------------------------------------------- -# Backends +# Options: Different prompt construction modes # ------------------------------------------------------------------------- - -[backends.triton.templates.default] -compose = ["shared.triton_problem_statement", "templates.triton.example_block", "templates.triton.arch_block", "shared.triton_instruction"] +[options.basic] +# Basic prompt: problem statement + architecture + instruction +description = "Minimal prompt with just problem statement and architecture" +components = ["problem_statement", "arch_block", "instruction"] + +[options.few_shot] +# With few-shot examples +description = "Includes few-shot examples to demonstrate the task" +components = ["problem_statement", "few_shot_block", "arch_block", "instruction"] requires_example = true -example_arch_path = "src/prompts/model_ex_add.py" -example_new_arch_path = "src/prompts/model_new_ex_add_triton.py" -[backends.triton.templates.with_hardware] -compose = ["shared.triton_problem_statement", "templates.triton.example_block", "templates.hardware.header", "templates.hardware.specs", "templates.hardware.definitions", "templates.hardware.best_practices", "templates.triton.arch_block", "shared.triton_instruction"] -requires_example = true +[options.hardware_info] +# Hardware-aware prompt +description = "Includes GPU hardware specifications and best practices" +components = ["problem_statement", "few_shot_block", "hardware_header", "hardware_specs", "hardware_definitions", "hardware_best_practices", "arch_block", "instruction"] requires_gpu = true -example_arch_path = "src/prompts/model_ex_add.py" -example_new_arch_path = "src/prompts/model_new_ex_add_triton.py" - -[backends.triton.templates.fix_compile] -compose = ["shared.triton_problem_statement", "templates.triton.with_arch", "templates.triton.failed_kernel", "templates.triton.compile_metadata", "templates.triton.fix_footer"] - -[backends.triton.templates.fix_correctness] -compose = ["shared.triton_problem_statement", "templates.triton.with_arch", "templates.triton.failed_kernel", "templates.triton.correctness_metadata", "templates.triton.fix_footer"] - -[backends.cute.templates.default] -compose = ["shared.cute_problem_statement", "templates.cute.example_block", "templates.cute.arch_block", "shared.cute_instruction"] requires_example = true -example_arch_path = "src/prompts/model_ex_add.py" -example_new_arch_path = "src/prompts/model_new_ex_add_cute.py" -[backends.cute.templates.fix_compile] -compose = ["shared.cute_problem_statement", "templates.cute.with_arch", "templates.cute.failed_kernel", "templates.cute.compile_metadata", "templates.cute.fix_footer"] +[options.fix_compile] +# For fixing compilation errors +description = "Prompt for fixing compilation 
errors" +components = ["problem_statement", "arch_with_context", "failed_kernel", "compile_metadata", "fix_footer"] -[backends.cute.templates.fix_correctness] -compose = ["shared.cute_problem_statement", "templates.cute.with_arch", "templates.cute.failed_kernel", "templates.cute.correctness_metadata", "templates.cute.fix_footer"] - -[backends.cuda.templates.default] -compose = ["shared.cuda_problem_statement", "templates.cuda.example_block", "templates.cuda.arch_block", "shared.cuda_instruction"] -requires_example = true -example_arch_path = "src/prompts/model_ex_add.py" -example_new_arch_path = "src/prompts/model_new_ex_add.py" - -[backends.cuda.templates.with_hardware] -compose = ["shared.cuda_problem_statement", "templates.cuda.example_block", "templates.hardware.header", "templates.hardware.specs", "templates.hardware.definitions", "templates.hardware.best_practices", "templates.cuda.arch_block", "shared.cuda_instruction"] -requires_example = true -requires_gpu = true -example_arch_path = "src/prompts/model_ex_add.py" -example_new_arch_path = "src/prompts/model_new_ex_add.py" - -[backends.cuda.templates.fix_compile] -compose = ["shared.cuda_problem_statement", "templates.cuda.with_arch", "templates.cuda.failed_kernel", "templates.cuda.compile_metadata", "templates.cuda.fix_footer"] - -[backends.cuda.templates.fix_correctness] -compose = ["shared.cuda_problem_statement", "templates.cuda.with_arch", "templates.cuda.failed_kernel", "templates.cuda.correctness_metadata", "templates.cuda.fix_footer"] +[options.fix_correctness] +# For fixing correctness errors +description = "Prompt for fixing correctness errors" +components = ["problem_statement", "arch_with_context", "failed_kernel", "correctness_metadata", "fix_footer"] # ------------------------------------------------------------------------- -# Reusable partials (templated blocks with placeholders) +# Templates: Reusable text blocks with placeholders # ------------------------------------------------------------------------- +[templates.common] -[templates.triton] -example_block = """ -Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: - -{example_arch_src} - - -The example new arch with custom Triton kernels looks like this: - - -{example_new_arch_src} - -""" - +# --- Architecture Presentation --- +# Used in prompts to present the reference architecture that needs optimization arch_block = """ You are given the following architecture: @@ -114,7 +80,8 @@ You are given the following architecture: """ -with_arch = """ +# Used in fix prompts to reference the architecture with contextual phrasing +arch_with_context = """ With the following architecture: @@ -122,65 +89,23 @@ With the following architecture: """ -failed_kernel = """ -You generated the following solution and it failed {failure_type}: - - -{custom_kernel} - -""" - -compile_metadata = """ -Here's the metadata of the compilation error: - - -{metadata} - -""" - -correctness_metadata = """ -Here's the metadata of the correctness error: - - -{metadata} - -""" - -fix_footer = """ -Please fix the {failure_type} in the new model code. Please output the corrected code in codeblocks. 
-""" - -[templates.cute] -# Same structure as templates.triton but wording says CuTe (CUTLASS) -example_block = """ -Here's an example to show you the syntax of inline embedding custom CuTe (CUTLASS) kernels in torch: The example given architecture is: +# --- Few-Shot Learning --- +# Shows an example of input architecture and its optimized version +few_shot_block = """ +Here's an example to show you the syntax of inline embedding custom {language_display} in torch: The example given architecture is: {example_arch_src} -The example new arch with custom CuTe kernels looks like this: +The example new arch with custom {language_display} looks like this: {example_new_arch_src} """ -arch_block = """ -You are given the following architecture: - - -{ref_arch_src} - -""" - -with_arch = """ -With the following architecture: - - -{ref_arch_src} - -""" - +# --- Error Fix Templates --- +# Presents a kernel that failed (used in fix_compile and fix_correctness options) failed_kernel = """ You generated the following solution and it failed {failure_type}: @@ -209,83 +134,28 @@ fix_footer = """ Please fix the {failure_type} in the new model code. Please output the corrected code in codeblocks. """ +# ------------------------------------------------------------------------- +# Hardware Templates: GPU-specific information blocks +# ------------------------------------------------------------------------- [templates.hardware] -header = """ +hardware_header = """ Here is some information about the underlying hardware that you should keep in mind. """ -specs = """ +hardware_specs = """ The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture. {gpu_specs_bullets} """ -definitions = """ +hardware_definitions = """ Here are some concepts about the GPU architecture that could be helpful: {gpu_definitions_bullets} """ -best_practices = """ -Here are some best practices for writing Triton kernels on GPU: +hardware_best_practices = """ +Here are some best practices for writing kernels on GPU: {gpu_best_practices_bullets} """ - -[templates.cuda] -example_block = """ -Here's an example to show you the syntax of inline embedding custom CUDA operators in torch: The example given architecture is: - -{example_arch_src} - - -The example new arch with custom CUDA kernels looks like this: - - -{example_new_arch_src} - -""" - -arch_block = """ -You are given the following architecture: - - -{ref_arch_src} - -""" - -with_arch = """ -With the following architecture: - - -{ref_arch_src} - -""" - -failed_kernel = """ -You generated the following solution and it failed {failure_type}: - - -{custom_kernel} - -""" - -compile_metadata = """ -Here's the metadata of the compilation error: - - -{metadata} - -""" - -correctness_metadata = """ -Here's the metadata of the correctness error: - - -{metadata} - -""" - -fix_footer = """ -Please fix the {failure_type} in the new model code. Please output the corrected code in codeblocks. 
-""" \ No newline at end of file From b72a7ddcc17f89f859a8c86906a9b3fad31e4f4c Mon Sep 17 00:00:00 2001 From: chilinghan Date: Mon, 10 Nov 2025 16:59:45 -0800 Subject: [PATCH 3/5] Integrate with prompt-toml branch for TT-Pykernel generation --- .../generate_and_eval_single_sample_modal.py | 6 +- scripts/generate_samples.py | 8 +- src/loader.py | 112 ++++++---- src/prompt_constructor_multilang.py | 15 +- src/prompts/hardware/hardware_specs.toml | 202 +++++++++++++++++ src/prompts/model_new_ex_add_pykernel.py | 207 ++++++++++++++++++ src/prompts/prompts.toml | 20 +- 7 files changed, 498 insertions(+), 72 deletions(-) create mode 100644 src/prompts/hardware/hardware_specs.toml create mode 100644 src/prompts/model_new_ex_add_pykernel.py diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 3e337547..82b7b545 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -15,7 +15,6 @@ #from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref -from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template from src.prompt_constructor_multilang import get_prompt_for_language from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets @@ -192,10 +191,7 @@ def main(config: EvalConfig): # Use appropriate prompt constructor based on backend - if config.backend in ["cuda", "triton", "cute"]: - custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") - else: - raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'.") + custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") if config.log_prompt: with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f: diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index d5e04fe7..cb6b8aa9 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -10,7 +10,6 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref -from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template from src.prompt_constructor_multilang import get_prompt_for_language from src.utils import ( create_inference_server_from_presets, @@ -120,12 +119,7 @@ def generate_sample_single( ), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" # Construct Prompt - if config.backend in ["cuda", "triton", "cute"]: - custom_cuda_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") - else: - raise ValueError( - f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'." 
- ) + custom_cuda_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") if config.log_prompt: prompt_path = os.path.join( run_dir, diff --git a/src/loader.py b/src/loader.py index f9156c37..68ad1c04 100644 --- a/src/loader.py +++ b/src/loader.py @@ -15,11 +15,12 @@ def _abs_path(rel: str) -> str: return os.path.join(REPO_TOP_PATH, rel) @dataclass -class PromptConfig: +class TomlConfig: data: Dict[str, Any] @classmethod - def from_toml(cls, path: str) -> "PromptConfig": + def from_toml(cls, path: str) -> "TomlConfig": + """Load a TOML file and return a TomlConfig instance.""" with open(path, "rb") as f: data = tomli.load(f) return cls(data) @@ -37,40 +38,56 @@ def compose_blocks(self, keys: List[str]) -> str: text_parts.append(node.strip() + "\n") return "\n".join(text_parts).strip() + "\n" -def _gpu_context_from_py(py_path: str, gpu_name: str) -> Dict[str, str]: +def _hardware_context_from_path(specs_path: str, hardware_type: str, hardware_name: str) -> Dict[str, str]: """ - Load GPU_* dicts from a Python file (no exec of raw strings; use runpy). - Expected globals: - - GPU_SPEC_INFO: dict[str, dict] - - GPU_DEFINITIONS: dict[str, str] - - GPU_BEST_PRACTICES: list[str] OR {"list": [...]} for compatibility + Load hardware spec dicts from a TOML file (.toml). + - HARDWARE_SPEC_INFO = { type: { name: { ... } } } + + For TOML files we expect structure like [hardware_type.hardware_name], plus optional + [hardware_type.definitions], [hardware_type.best_practices], etc. """ - mod = runpy.run_path(py_path) - spec_info = mod.get("GPU_SPEC_INFO", {}) - definitions = mod.get("GPU_DEFINITIONS", {}) - best = mod.get("GPU_BEST_PRACTICES", []) + cfg = TomlConfig.from_toml(specs_path) + data = cfg.data + + # Resolve hardware_type default from meta if not provided + if not hardware_type: + hardware_type = data.get("meta", {}).get("default_hardware_type") + + hw_section = data.get(hardware_type) + if not isinstance(hw_section, dict): + raise KeyError(f"Hardware type '{hardware_type}' not found in specs TOML") - if not spec_info or not definitions or best is None: - raise ValueError("GPU_SPEC_INFO / GPU_DEFINITIONS / GPU_BEST_PRACTICES missing in gpu specs .py") + curr = None + if isinstance(hw_section.get(hardware_name), dict): + curr = hw_section[hardware_name] + if curr is None: + raise KeyError(f"Hardware '{hardware_name}' not found under type '{hardware_type}' in {specs_path}") - if isinstance(best, dict) and "list" in best: - best = best["list"] + # definitions + definitions = {} + if isinstance(hw_section.get("definitions"), dict): + definitions.update(hw_section.get("definitions")) - if gpu_name not in spec_info: - raise KeyError(f"GPU name {gpu_name} not found in GPU_SPEC_INFO") + # best practices + best_list: List[str] = [] + if isinstance(hw_section.get("best_practices"), dict): + best_list.extend(hw_section.get("best_practices", {}).get("items", [])) - curr = spec_info[gpu_name] - gpu_architecture = curr.get("GPU Architecture", "Unknown") - specs_bullets = "\n".join([f"- We have {v} of {k}." 
for k, v in curr.items() if k != "GPU Architecture"]) + # Derive architecture name from common keys + hardware_architecture = curr.get("architecture") or "Unknown" + + # Build human-readable bullets for specs/definitions/best practices + specs_bullets = "\n".join([f"- {k}: {v}" for k, v in curr.items() if k != "architecture"]) defs_bullets = "\n".join([f"- {k}: {v}" for k, v in definitions.items()]) - best_bullets = "\n".join([f"- {x}" for x in (best or [])]) + best_bullets = "\n".join([f"- {x}" for x in best_list]) return { - "gpu_name": gpu_name, - "gpu_architecture": gpu_architecture, - "gpu_specs_bullets": specs_bullets, - "gpu_definitions_bullets": defs_bullets, - "gpu_best_practices_bullets": best_bullets, + "hardware_type": hardware_type, + "hardware_name": hardware_name, + "hardware_architecture": hardware_architecture, + "hardware_specs_bullets": specs_bullets, + "hardware_definitions_bullets": defs_bullets, + "hardware_best_practices_bullets": best_bullets, } def render_prompt_by_option( @@ -79,21 +96,23 @@ def render_prompt_by_option( language: str, option: str, context: Dict[str, str], - gpu_specs_py: Optional[str] = None, - gpu_name: Optional[str] = None, + hardware_specs_toml: Optional[str] = None, + hardware_type: Optional[str] = None, + hardware_name: Optional[str] = None, ) -> str: """ - New function that uses languages.X and options.Y structure - + New function that uses languages.X and options.Y structure from prompts.toml + Args: prompts_toml: Path to the prompts.toml file language: The kernel language (triton, cuda, cute) option: The prompt option (basic, few_shot, hardware_info, fix_compile, fix_correctness) context: Variables to fill in the prompt template - gpu_specs_py: Optional path to GPU specs Python file - gpu_name: Optional GPU name (required if option requires_gpu) + hardware_specs_py: Optional path to hardware specs file (.py or .toml) + hardware_type: Hardware type (e.g., "GPU", "Tenstorrent") + hardware_name: Hardware name (e.g., "A100", "H100") """ - cfg = PromptConfig.from_toml(prompts_toml) + cfg = TomlConfig.from_toml(prompts_toml) # Get language-specific content try: @@ -131,18 +150,19 @@ def render_prompt_by_option( lang_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") ) ex_new_path = _abs_path(lang_data["few_shot_new_arch"]) - context = { - **context, - "example_arch_src": read_file(ex_arch_path), - "example_new_arch_src": read_file(ex_new_path), - } - - # Load GPU details if requested - if option_data.get("requires_gpu"): - if not (gpu_specs_py and gpu_name): - raise ValueError(f"Option '{option}' requires GPU info; provide gpu_specs_py and gpu_name") - context = {**context, **_gpu_context_from_py(_abs_path(gpu_specs_py), gpu_name)} - + context.update( + { + "example_arch_src": read_file(ex_arch_path), + "example_new_arch_src": read_file(ex_new_path), + } + ) + + # Load hardware details if requested + if option_data.get("requires_hardware"): + if not (hardware_specs_toml and hardware_type and hardware_name): + raise ValueError(f"Option '{option}' requires hardware info; provide hardware_specs_toml, hardware_type, and hardware_name") + context.update(**_hardware_context_from_path(_abs_path(hardware_specs_toml), hardware_type, hardware_name),) + # Build the prompt from components prompt_parts = [] for component in option_data["components"]: diff --git a/src/prompt_constructor_multilang.py b/src/prompt_constructor_multilang.py index c52e0eeb..0c210ec4 100644 --- a/src/prompt_constructor_multilang.py +++ 
b/src/prompt_constructor_multilang.py @@ -4,7 +4,8 @@ REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) PROMPTS_TOML = _abs_path("src/prompts/prompts.toml") -GPU_SPECS_PY = "src/prompts/hardware/gpu_specs.py" # still a Python file +# Default hardware specs TOML file (contains HARDWARE_SPEC_INFO) +HARDWARE_SPECS_TOML = _abs_path("src/prompts/hardware/hardware_specs.toml") def get_prompt_for_language(ref_arch_src: str, language: str = "triton", option: str = "few_shot") -> str: """ @@ -22,22 +23,24 @@ def get_prompt_for_language(ref_arch_src: str, language: str = "triton", option: context={"ref_arch_src": ref_arch_src}, ) -def get_prompt_with_hardware(ref_arch_src: str, language: str, gpu_name: str) -> str: +def get_prompt_with_hardware(ref_arch_src: str, language: str, hardware_name: str, hardware_type: str = "GPU") -> str: """ Generate a hardware-aware prompt for a specific language. - + Args: ref_arch_src: The reference architecture source code language: The kernel language (triton, cuda, cute) - gpu_name: The name of the GPU (e.g., "A100", "H100") + hardware_name: The name of the hardware (e.g., "A100", "H100", "TENSILICA") + hardware_type: The type of the hardware (e.g., "GPU", "Tenstorrent") """ return render_prompt_by_option( prompts_toml=PROMPTS_TOML, language=language.lower(), option="hardware_info", context={"ref_arch_src": ref_arch_src}, - gpu_specs_py=GPU_SPECS_PY, - gpu_name=gpu_name, + hardware_specs_py=HARDWARE_SPECS_TOML, + hardware_type=hardware_type, + hardware_name=hardware_name, ) def prompt_fix_compile(language: str, ref_arch_src: str, custom_kernel: str, metadata: str) -> str: diff --git a/src/prompts/hardware/hardware_specs.toml b/src/prompts/hardware/hardware_specs.toml new file mode 100644 index 00000000..982f2d7b --- /dev/null +++ b/src/prompts/hardware/hardware_specs.toml @@ -0,0 +1,202 @@ +[meta] +version = "1.0" +default_hardware_type = "GPU" + +# ------------------------------------------------------------------------- +# Hardware Types: GPU, Tenstorrent, etc. 
+# ------------------------------------------------------------------------- +[GPU] +description = "NVIDIA GPU specifications and best practices" + +[GPU.L40S] +architecture = "Ada" +GPU_Memory = "48GB GDDR6 with ECC" +Memory_Bandwidth = "864 GB/s" +RT_Core_Performance_TFLOPS = "212" +FP32_TFLOPS = "91.6" +TF32_Tensor_Core_TFLOPS = "183.2 (366 with sparsity)" +FP16_Tensor_Core_TFLOPS = "362.05 (733 with sparsity)" +FP8_Tensor_Core_TFLOPS = "733 (1466 with sparsity)" +Peak_INT8_Tensor_TOPS = "733 (1466 with sparsity)" +Peak_INT4_Tensor_TOPS = "733 (1466 with sparsity)" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "24" +Shared_memory_capacity_per_SM = "100 KB" +Maximum_shared_memory_per_thread_block = "99 KB" + +[GPU.H100] +architecture = "Hopper" +GPU_Memory = "80GB" +Memory_Bandwidth = "3.35 TB/s" +FP64_TFLOPS = "34" +FP64_Tensor_Core_TFLOPS = "67" +FP32_TFLOPS = "67" +TF32_Tensor_Core_TFLOPS = "989 with sparsity" +BFLOAT16_Tensor_Core_TFLOPS = "1979 with sparsity" +FP16_Tensor_Core_TFLOPS = "1979 with sparsity" +FP8_Tensor_Core_TFLOPS = "3958 with sparsity" +INT8_Tensor_Core_TOPS = "3958 with sparsity" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "32" +Shared_memory_capacity_per_SM = "228 KB" +Maximum_shared_memory_per_thread_block = "227 KB" + +[GPU.A100] +architecture = "Ampere" +GPU_Memory = "40GB" +Memory_Bandwidth = "1935 GB/s" +FP64_TFLOPS = "9.7" +FP64_Tensor_Core_TFLOPS = "19.5" +FP32_TFLOPS = "19.5" +TF32_Tensor_Core_TFLOPS = "156 (312 with sparsity)" +BFLOAT16_Tensor_Core_TFLOPS = "312 (624 with sparsity)" +FP16_Tensor_Core_TFLOPS = "312 (624 with sparsity)" +INT8_Tensor_Core_TOPS = "624 (1248 with sparsity)" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "32" +Shared_memory_capacity_per_SM = "164 KB" +Maximum_shared_memory_per_thread_block = "163 KB" + +[GPU.A100-80GB] +architecture = "Ampere" +GPU_Memory = "80GB" +Memory_Bandwidth = "1935 GB/s" +FP64_TFLOPS = "9.7" +FP64_Tensor_Core_TFLOPS = "19.5" +FP32_TFLOPS = "19.5" +TF32_Tensor_Core_TFLOPS = "156 (312 with sparsity)" +BFLOAT16_Tensor_Core_TFLOPS = "312 (624 with sparsity)" +FP16_Tensor_Core_TFLOPS = "312 (624 with sparsity)" +INT8_Tensor_Core_TOPS = "624 (1248 with sparsity)" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "32" +Shared_memory_capacity_per_SM = "164 KB" +Maximum_shared_memory_per_thread_block = "163 KB" + +[GPU.L4] +architecture = "Ada" +GPU_Memory = "24GB" +Memory_Bandwidth = "300 GB/s" +FP32_TFLOPS = "30.3" +TF32_Tensor_Core_TFLOPS = "120 with sparsity" +BFLOAT16_Tensor_Core_TFLOPS = "242 with sparsity" +FP8_Tensor_Core_TFLOPS = "485 with sparsity" +INT8_Tensor_Core_TOPS = "485 with sparsity" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "24" +Shared_memory_capacity_per_SM = "100 KB" +Maximum_shared_memory_per_thread_block = "99 KB" + +[GPU.T4] +architecture = "Turing" +GPU_Memory = "16 GB GDDR6" +Memory_Bandwidth = "300 GB/s" +Single_Precision_TFLOPS = "8.1" +Mixed_Precision_FP16_FP32_TFLOPS = "65" +INT8_TOPS = "130" +INT4_TOPS = "260" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "16" +Shared_memory_capacity_per_SM = "64 KB" + +[GPU.A10G] +architecture = 
"Ampere" +GPU_Memory = "24GB GDDR6" +Memory_Bandwidth = "600 GB/s" +FP32_TFLOPS = "31.2" +TF32_Tensor_Core_TFLOPS = "62.5 (125 with sparsity)" +BFLOAT16_Tensor_Core_TFLOPS = "125 (250 with sparsity)" +FP16_Tensor_Core_TFLOPS = "125 (250 with sparsity)" +INT8_Tensor_Core_TOPS = "250 (500 with sparsity)" +INT4_Tensor_Core_TOPS = "500 (1000 with sparsity)" +Register_File_Size = "64K 32-bit registers per SM" +Maximum_registers_per_thread = "255" +Maximum_thread_blocks_per_SM = "32" +Shared_memory_capacity_per_SM = "164 KB" +Maximum_shared_memory_per_thread_block = "163 KB" + +# ------------------------------------------------------------------------- +# GPU-specific Definitions and Best Practices +# ------------------------------------------------------------------------- + +[GPU.definitions] +Thread = "A thread is a single execution unit that can run a single instruction at a time." +Thread_Block = "A thread block is a group of threads that can cooperate with each other." +Warp = "A warp is a group of threads that are scheduled together and execute in parallel." +Shared_Memory = "Shared memory is a memory space that can be accessed by all threads in a thread block." +Register = "A register is a small memory space that can be accessed by a single thread." +Memory_Hierarchy = "Memory hierarchy is a pyramid of memory types with different speeds and sizes." +Memory_Bandwidth = "Memory bandwidth is the rate at which data can be read from or stored into memory." +Cache = "Cache is a small memory space that stores frequently accessed data." +HBM = "HBM is a high-bandwidth memory technology that uses 3D-stacked DRAM." + +[GPU.best_practices] +items = [ + "Find ways to parallelize sequential code.", + "Minimize data transfers between the host and the device.", + "Adjust kernel launch configuration to maximize device utilization.", + "Ensure that global memory accesses are coalesced.", + "Minimize redundant accesses to global memory whenever possible.", + "Avoid long sequences of diverged execution by threads within the same warp.", + "Use specialized instructions based on the specific GPU architecture", +] + +# ------------------------------------------------------------------------- +# Tenstorrent Hardware Type (example) +# ------------------------------------------------------------------------- + +[TT] +description = "Tenstorrent accelerator specifications and best practices" + +[TT.Wormhole] +architecture = "Wormhole" +compute_units = "144 Tensix cores" +memory_capacity = "16GB LPDDR4x" +memory_bandwidth = "100 GB/s" +mesh_size = "12x12 mesh" +peak_bfloat16_tops = "150 TOPS" +peak_int8_tops = "300 TOPS" +on_chip_network_bandwidth = "6 TB/s" + +[TT.Blackhole] +architecture = "Blackhole" +compute_units = "288 Tensix cores" +memory_capacity = "32GB LPDDR5" +memory_bandwidth = "200 GB/s" +mesh_size = "18x16 mesh" +peak_bfloat16_tops = "600 TOPS" +peak_int8_tops = "1200 TOPS" +on_chip_network_bandwidth = "12 TB/s" + +[TT.Blackhole_Pro] +architecture = "Blackhole Pro" +compute_units = "512 Tensix cores" +memory_capacity = "64GB LPDDR5" +memory_bandwidth = "400 GB/s" +mesh_size = "24x24 mesh" +peak_bfloat16_tops = "900 TOPS" +peak_int8_tops = "1800 TOPS" +on_chip_network_bandwidth = "20 TB/s" + +# ------------------------------------------------------------------------- +# TT-specific Definitions and Best Practices +# ------------------------------------------------------------------------- + +[TT.definitions] +Thread = "A thread is a compute unit in the Tenstorrent mesh architecture." 
+Tensor_Core = "A compute unit optimized for tensor operations." +Mesh_Grid = "A 2D grid of compute units in Tenstorrent architecture." + +[TT.best_practices] +items = [ + "Optimize for tensor-parallel execution.", + "Maximize data locality in mesh architecture.", + "Use host-device data transfer efficiently.", + "Leverage mesh interconnect bandwidth.", +] diff --git a/src/prompts/model_new_ex_add_pykernel.py b/src/prompts/model_new_ex_add_pykernel.py new file mode 100644 index 00000000..ea139fec --- /dev/null +++ b/src/prompts/model_new_ex_add_pykernel.py @@ -0,0 +1,207 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +from pykernel import ( + PyKernelOp, + reader_thread, + writer_thread, + compute_thread, + CircularBuffer, +) + +from math import ceil + +import ttnn +import torch +import torch.nn as nn + + +class VecAddMulticorePyKernelOp(PyKernelOp): + def __init__(self, max_core_ranges=None): + super().__init__() + self.max_core_ranges = max_core_ranges + + # KERNEL DEFINITIONS + @compute_thread() + def add_multicore( + cb_in0: CircularBuffer, + cb_in1: CircularBuffer, + cb_out: CircularBuffer, + num_tiles, + start_tile_id, + ): + binary_op_init_common(cb_in0, cb_in1, cb_out) + add_tiles_init(cb_in0, cb_in1) + + end_tile_id = start_tile_id + num_tiles + dst_reg = 0 + + for i in range(start_tile_id, end_tile_id, 1): + cb_wait_front(cb_in0, 1) + cb_wait_front(cb_in1, 1) + tile_regs_acquire() + add_tiles(cb_in0, cb_in1, 0, 0, dst_reg) + tile_regs_commit() + + cb_reserve_back(cb_out, 1) + tile_regs_wait() + pack_tile(dst_reg, cb_out, 0) + tile_regs_release() + + cb_push_back(cb_out, 1) + cb_pop_front(cb_in0, 1) + cb_pop_front(cb_in1, 1) + tile_regs_release() + return + + @writer_thread() + def writer_multicore( + cb_out: CircularBuffer, + dst_addr, + num_tiles, + start_id, + ): + onetile = 1 + + tile_bytes = get_tile_size(cb_out) + tensor_accessor_args = TensorAccessorArgs(1, 0) + s0 = TensorAccessor(tensor_accessor_args, dst_addr, tile_bytes) + + end_id = start_id + num_tiles + for i in range(start_id, end_id, onetile): + cb_wait_front(cb_out, onetile) + l1_read_addr = get_read_ptr(cb_out) + noc_async_write_tile(i, s0, l1_read_addr) + noc_async_write_barrier() + cb_pop_front(cb_out, onetile) + return + + @reader_thread() + def reader_binary_interleaved( + cb_in0: CircularBuffer, + cb_in1: CircularBuffer, + src_addr0, + src_addr1, + num_tiles, + start_id, + ): + onetile = 1 + + tile_bytes0 = get_tile_size(cb_in0) + tensor_accessor_args = TensorAccessorArgs(2, 0) + s0 = TensorAccessor(tensor_accessor_args, src_addr0, tile_bytes0) + + tile_bytes1 = get_tile_size(cb_in1) + tensor_accessor_args = TensorAccessorArgs(2, 0) + s1 = TensorAccessor(tensor_accessor_args, src_addr1, tile_bytes1) + + end_id = start_id + num_tiles + for i in range(start_id, end_id, onetile): + cb_reserve_back(cb_in0, onetile) + cb_reserve_back(cb_in1, onetile) + + src0_write_addr = get_write_ptr(cb_in0) + src1_write_addr = get_write_ptr(cb_in1) + + noc_async_read_tile(i, s0, src0_write_addr) + noc_async_read_tile(i, s1, src1_write_addr) + + noc_async_read_barrier() + cb_push_back(cb_in0, onetile) + cb_push_back(cb_in1, onetile) + return + + def define_core_ranges(self, tensors, options): + core_0 = ttnn.CoreCoord(0, 0) + if self.max_core_ranges is None: + core_1 = ttnn.CoreCoord(1, 1) + else: + core_1 = self.max_core_ranges + return ttnn.CoreRangeSet([ttnn.CoreRange(core_0, core_1)]) + + def invoke( + self, # super() has invoke signature (*tensors, **options) + 
a_tensor, + b_tensor, + out_tensor, # Tensor Definitions are positional args + ): + cb_in0 = self.create_cb(a_tensor, 0) + cb_in1 = self.create_cb(b_tensor, 1) + cb_out = self.create_cb(out_tensor, 2) + start_id = 0 + + self.set_tensor_accessor_config(a_tensor) + + num_tiles = ceil( + max(map(lambda t: t.volume(), [a_tensor, b_tensor, out_tensor])) / 1024 + ) + + num_cores = self.get_core_ranges().num_cores() + num_tiles_per_core = num_tiles / num_cores + + if num_tiles_per_core % 1 != 0: + # uneven distro of work, just break down and cry. + raise Exception(":sad_hamster:") + + num_tiles_per_core = int(num_tiles_per_core) + + # Define the multicore runtime arguments + start_id_multicore = [] + + # Go row-wise + bb = self.get_core_ranges().bounding_box() + for i in range(bb.start.x, bb.end.x + 1): + start_id_multicore.append([]) + for j in range(bb.start.y, bb.end.y + 1): + # Set for each core + start_id_multicore[-1].append([start_id]) + start_id += 1 + + kernels = [ + self.create_kernel( + VecAddMulticorePyKernelOp.add_multicore, + cb_in0, + cb_in1, + cb_out, + num_tiles_per_core, + start_id_multicore, + ), + self.create_kernel( + VecAddMulticorePyKernelOp.writer_multicore, + cb_out, + out_tensor.buffer_address(), + num_tiles_per_core, + start_id_multicore, + ), + self.create_kernel( + VecAddMulticorePyKernelOp.reader_binary_interleaved, + cb_in0, + cb_in1, + a_tensor.buffer_address(), + b_tensor.buffer_address(), + num_tiles_per_core, + start_id_multicore, + ), + ] + + return self.create_program(kernels, [cb_in0, cb_in1, cb_out]) + + +class ModelNew(nn.Module): + def __init__(self): + super().__init__() + self.vecadd_op = VecAddMulticorePyKernelOp() + + def forward(self, A: ttnn.Tensor, B: ttnn.Tensor, device: ttnn.MeshDevice) -> ttnn.Tensor: + shape = list(A.shape) + + output_tensor = ttnn.allocate_tensor_on_device( + ttnn.Shape(shape), + layout=ttnn.TILE_LAYOUT, + device=device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + ) + + output = self.vecadd_op(A, B, output_tensor) + return output \ No newline at end of file diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml index 38763c2a..6b4f3c7d 100644 --- a/src/prompts/prompts.toml +++ b/src/prompts/prompts.toml @@ -34,6 +34,10 @@ few_shot_new_arch = "src/prompts/model_new_ex_add.py" language_display = "CuTe (CUTLASS) kernels" few_shot_new_arch = "src/prompts/model_new_ex_add_cute.py" +[languages.pykernel] +language_display = "Tenstorrent PyKernel kernels" +few_shot_new_arch = "src/prompts/model_new_ex_add_pykernel.py" + # ------------------------------------------------------------------------- # Options: Different prompt construction modes # ------------------------------------------------------------------------- @@ -50,9 +54,9 @@ requires_example = true [options.hardware_info] # Hardware-aware prompt -description = "Includes GPU hardware specifications and best practices" +description = "Includes hardware specifications and best practices" components = ["problem_statement", "few_shot_block", "hardware_header", "hardware_specs", "hardware_definitions", "hardware_best_practices", "arch_block", "instruction"] -requires_gpu = true +requires_hardware = true requires_example = true [options.fix_compile] @@ -143,19 +147,19 @@ Here is some information about the underlying hardware that you should keep in m """ hardware_specs = """ -The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture. +The {hardware_type} that will run the kernel is {hardware_name}, {hardware_architecture} architecture. 
-{gpu_specs_bullets} +{hardware_specs_bullets} """ hardware_definitions = """ -Here are some concepts about the GPU architecture that could be helpful: +Here are some concepts about the hardware architecture that could be helpful: -{gpu_definitions_bullets} +{hardware_definitions_bullets} """ hardware_best_practices = """ -Here are some best practices for writing kernels on GPU: +Here are some best practices for writing kernels on this hardware: -{gpu_best_practices_bullets} +{hardware_best_practices_bullets} """ From 59ba4239300fbbf4e30629d1d0fb828e19094234 Mon Sep 17 00:00:00 2001 From: chilinghan Date: Mon, 10 Nov 2025 17:03:33 -0800 Subject: [PATCH 4/5] Clean up --- scripts/generate_and_eval_single_sample.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index c7e637cb..ffaeecbd 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -9,7 +9,6 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref -from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template from src.prompt_constructor_multilang import get_prompt_for_language from src.utils import ( create_inference_server_from_presets, @@ -146,12 +145,7 @@ def main(config: EvalConfig): ) # Use appropriate prompt constructor based on backend - if config.backend in ["cuda", "triton", "cute"]: - custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") - else: - raise ValueError( - f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'." - ) + custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") if config.log_prompt: with open( From 7564b26c4b33f7914fd2c0a5ea01a45138f3366b Mon Sep 17 00:00:00 2001 From: chilinghan Date: Tue, 11 Nov 2025 15:58:21 -0800 Subject: [PATCH 5/5] Let generate_and_eval_single_sample script to have option to change type of prompt generation --- scripts/generate_and_eval_single_sample.py | 16 ++- src/loader.py | 107 ++++++++++++++------- src/prompt_constructor_multilang.py | 34 ++----- src/prompts/hardware/hardware_specs.toml | 4 - 4 files changed, 90 insertions(+), 71 deletions(-) diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index ffaeecbd..15519d91 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -10,6 +10,7 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor_multilang import get_prompt_for_language +from src.loader import get_hardware_architecture from src.utils import ( create_inference_server_from_presets, extract_first_code, @@ -44,9 +45,14 @@ def __init__(self): # Evaluation # local (requires a GPU), modal (cloud GPU) coming soon self.eval_mode = "local" + + self.option = "few_shot" + self.hardware_type = "GPU" + self.hardware_name = "L40S" + # Construct this from mapping from architecture name to torch cuda arch list in the future # you can either specify SM version or just use the name - self.gpu_arch = ["Ada"] + self.arch = get_hardware_architecture(hardware_type=self.hardware_type, hardware_name=self.hardware_name) # Inference config self.server_type = "deepseek" @@ -90,8 +96,8 @@ def main(config: EvalConfig): elif config.dataset_src == "local": curr_level_dataset = 
construct_kernelbench_dataset(config.level) - if config.gpu_arch: - set_gpu_arch(config.gpu_arch) # otherwise build for all architectures + if config.hardware_type == "GPU" and config.arch: + set_gpu_arch(config.arch) # otherwise build for all architectures if config.log: os.makedirs(config.logdir, exist_ok=True) @@ -145,7 +151,9 @@ def main(config: EvalConfig): ) # Use appropriate prompt constructor based on backend - custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, option="few_shot") + custom_prompt = get_prompt_for_language(ref_arch_src, language=config.backend, + option=config.option, + hardware_name=config.hardware_name, hardware_type=config.hardware_type) if config.log_prompt: with open( diff --git a/src/loader.py b/src/loader.py index 68ad1c04..28f7ace4 100644 --- a/src/loader.py +++ b/src/loader.py @@ -14,6 +14,9 @@ def _abs_path(rel: str) -> str: return rel return os.path.join(REPO_TOP_PATH, rel) +PROMPTS_TOML = _abs_path("src/prompts/prompts.toml") +HARDWARE_SPECS_TOML = _abs_path("src/prompts/hardware/hardware_specs.toml") + @dataclass class TomlConfig: data: Dict[str, Any] @@ -38,7 +41,10 @@ def compose_blocks(self, keys: List[str]) -> str: text_parts.append(node.strip() + "\n") return "\n".join(text_parts).strip() + "\n" -def _hardware_context_from_path(specs_path: str, hardware_type: str, hardware_name: str) -> Dict[str, str]: +prompt_cfg = TomlConfig.from_toml(PROMPTS_TOML) +hardware_cfg = TomlConfig.from_toml(HARDWARE_SPECS_TOML) + +def _hardware_context_from_path(hardware_type: str, hardware_name: str) -> Dict[str, str]: """ Load hardware spec dicts from a TOML file (.toml). - HARDWARE_SPEC_INFO = { type: { name: { ... } } } @@ -46,14 +52,7 @@ def _hardware_context_from_path(specs_path: str, hardware_type: str, hardware_na For TOML files we expect structure like [hardware_type.hardware_name], plus optional [hardware_type.definitions], [hardware_type.best_practices], etc. 
""" - cfg = TomlConfig.from_toml(specs_path) - data = cfg.data - - # Resolve hardware_type default from meta if not provided - if not hardware_type: - hardware_type = data.get("meta", {}).get("default_hardware_type") - - hw_section = data.get(hardware_type) + hw_section = hardware_cfg.data.get(hardware_type) if not isinstance(hw_section, dict): raise KeyError(f"Hardware type '{hardware_type}' not found in specs TOML") @@ -61,7 +60,9 @@ def _hardware_context_from_path(specs_path: str, hardware_type: str, hardware_na if isinstance(hw_section.get(hardware_name), dict): curr = hw_section[hardware_name] if curr is None: - raise KeyError(f"Hardware '{hardware_name}' not found under type '{hardware_type}' in {specs_path}") + raise KeyError( + f"Hardware '{hardware_name}' not found under type '{hardware_type}' in {HARDWARE_SPECS_TOML}" + ) # definitions definitions = {} @@ -77,7 +78,9 @@ def _hardware_context_from_path(specs_path: str, hardware_type: str, hardware_na hardware_architecture = curr.get("architecture") or "Unknown" # Build human-readable bullets for specs/definitions/best practices - specs_bullets = "\n".join([f"- {k}: {v}" for k, v in curr.items() if k != "architecture"]) + specs_bullets = "\n".join( + [f"- {k}: {v}" for k, v in curr.items() if k != "architecture"] + ) defs_bullets = "\n".join([f"- {k}: {v}" for k, v in definitions.items()]) best_bullets = "\n".join([f"- {x}" for x in best_list]) @@ -90,13 +93,36 @@ def _hardware_context_from_path(specs_path: str, hardware_type: str, hardware_na "hardware_best_practices_bullets": best_bullets, } +def get_hardware_architecture( + hardware_type: str, hardware_name: str +) -> str: + """ + Convenience helper: return the architecture string for a given hardware. + + Args: + hardware_type: Hardware type (e.g., 'GPU', 'TT'). + hardware_name: Hardware name (e.g., 'A100', 'L4', 'Grayskull'). + + Returns: + The architecture name as a string (e.g., 'Ampere', 'Ada', 'Grayskull'). + + Raises: + KeyError / ValueError propagated from the underlying loader if the + hardware entry isn't found or the file is invalid. This keeps the + behavior explicit for callers. 
+ """ + + # Use the existing loader to build the context and extract architecture + ctx = _hardware_context_from_path(hardware_type, hardware_name) + arch = ctx.get("hardware_architecture") + return arch if arch != "Unknown" else None + + def render_prompt_by_option( *, - prompts_toml: str, language: str, option: str, context: Dict[str, str], - hardware_specs_toml: Optional[str] = None, hardware_type: Optional[str] = None, hardware_name: Optional[str] = None, ) -> str: @@ -112,42 +138,47 @@ def render_prompt_by_option( hardware_type: Hardware type (e.g., "GPU", "Tenstorrent") hardware_name: Hardware name (e.g., "A100", "H100") """ - cfg = TomlConfig.from_toml(prompts_toml) - # Get language-specific content try: - lang_data = cfg.data["languages"][language] + lang_data = prompt_cfg.data["languages"][language] except KeyError: raise KeyError(f"Unknown language: {language}") - + # Get option configuration try: - option_data = cfg.data["options"][option] + option_data = prompt_cfg.data["options"][option] except KeyError: raise KeyError(f"Unknown option: {option}") - + # Get shared templates - shared = cfg.data.get("shared", {}) + shared = prompt_cfg.data.get("shared", {}) language_display = lang_data.get("language_display", language.upper()) - + # Fill in shared templates with language-specific terms - problem_statement = shared.get("problem_statement", "").format(language_display=language_display) - instruction = shared.get("instruction", "").format(language_display=language_display) - + problem_statement = shared.get("problem_statement", "").format( + language_display=language_display + ) + instruction = shared.get("instruction", "").format( + language_display=language_display + ) + # Add language-specific content to context context = { **context, - "language": language.upper() if language in ["cuda", "cute"] else language.capitalize(), + "language": ( + language.upper() if language in ["cuda", "cute"] else language.capitalize() + ), "language_display": language_display, "problem_statement": problem_statement, "instruction": instruction, } - + # Load example files if requested if option_data.get("requires_example"): # Use language-specific example arch, or fall back to shared one ex_arch_path = _abs_path( - lang_data.get("few_shot_example_arch") or shared.get("few_shot_example_arch") + lang_data.get("few_shot_example_arch") + or shared.get("few_shot_example_arch") ) ex_new_path = _abs_path(lang_data["few_shot_new_arch"]) context.update( @@ -159,9 +190,13 @@ def render_prompt_by_option( # Load hardware details if requested if option_data.get("requires_hardware"): - if not (hardware_specs_toml and hardware_type and hardware_name): - raise ValueError(f"Option '{option}' requires hardware info; provide hardware_specs_toml, hardware_type, and hardware_name") - context.update(**_hardware_context_from_path(_abs_path(hardware_specs_toml), hardware_type, hardware_name),) + if not (hardware_type and hardware_name): + raise ValueError( + f"Option '{option}' requires hardware info; provide hardware_type, and hardware_name" + ) + context.update( + **_hardware_context_from_path(hardware_type, hardware_name), + ) # Build the prompt from components prompt_parts = [] @@ -175,15 +210,17 @@ def render_prompt_by_option( elif component.startswith("hardware_"): # Hardware components from templates.hardware template_key = f"templates.hardware.{component}" - prompt_parts.append(cfg.compose_blocks([template_key])) + prompt_parts.append(prompt_cfg.compose_blocks([template_key])) else: # Other components from 
templates.common template_key = f"templates.common.{component}" - prompt_parts.append(cfg.compose_blocks([template_key])) - + prompt_parts.append(prompt_cfg.compose_blocks([template_key])) + prompt_text = "\n".join(prompt_parts).strip() + "\n" - + try: return prompt_text.format(**context).strip() + "\n" except KeyError as e: - raise KeyError(f"Missing placeholder in context: {e.args[0]}. Available: {list(context.keys())}") from e + raise KeyError( + f"Missing placeholder in context: {e.args[0]}. Available: {list(context.keys())}" + ) from e diff --git a/src/prompt_constructor_multilang.py b/src/prompt_constructor_multilang.py index 0c210ec4..6b3ed136 100644 --- a/src/prompt_constructor_multilang.py +++ b/src/prompt_constructor_multilang.py @@ -1,13 +1,12 @@ # src/prompt_constructor_multilang.py (new option-based prompt constructor) import os -from .loader import render_prompt_by_option, _abs_path +from .loader import render_prompt_by_option -REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -PROMPTS_TOML = _abs_path("src/prompts/prompts.toml") -# Default hardware specs TOML file (contains HARDWARE_SPEC_INFO) -HARDWARE_SPECS_TOML = _abs_path("src/prompts/hardware/hardware_specs.toml") - -def get_prompt_for_language(ref_arch_src: str, language: str = "triton", option: str = "few_shot") -> str: +def get_prompt_for_language(ref_arch_src: str, + language: str = "triton", + option: str = "few_shot", + hardware_name: str = None, + hardware_type: str = None) -> str: """ Generate a prompt for a specific language and option. @@ -17,28 +16,9 @@ def get_prompt_for_language(ref_arch_src: str, language: str = "triton", option: option: The prompt option (basic, few_shot, hardware_info) """ return render_prompt_by_option( - prompts_toml=PROMPTS_TOML, language=language.lower(), option=option, context={"ref_arch_src": ref_arch_src}, - ) - -def get_prompt_with_hardware(ref_arch_src: str, language: str, hardware_name: str, hardware_type: str = "GPU") -> str: - """ - Generate a hardware-aware prompt for a specific language. - - Args: - ref_arch_src: The reference architecture source code - language: The kernel language (triton, cuda, cute) - hardware_name: The name of the hardware (e.g., "A100", "H100", "TENSILICA") - hardware_type: The type of the hardware (e.g., "GPU", "Tenstorrent") - """ - return render_prompt_by_option( - prompts_toml=PROMPTS_TOML, - language=language.lower(), - option="hardware_info", - context={"ref_arch_src": ref_arch_src}, - hardware_specs_py=HARDWARE_SPECS_TOML, hardware_type=hardware_type, hardware_name=hardware_name, ) @@ -54,7 +34,6 @@ def prompt_fix_compile(language: str, ref_arch_src: str, custom_kernel: str, met metadata: Compilation error metadata """ return render_prompt_by_option( - prompts_toml=PROMPTS_TOML, language=language.lower(), option="fix_compile", context={ @@ -76,7 +55,6 @@ def prompt_fix_correctness(language: str, ref_arch_src: str, custom_kernel: str, metadata: Correctness error metadata """ return render_prompt_by_option( - prompts_toml=PROMPTS_TOML, language=language.lower(), option="fix_correctness", context={ diff --git a/src/prompts/hardware/hardware_specs.toml b/src/prompts/hardware/hardware_specs.toml index 982f2d7b..fe43c6c4 100644 --- a/src/prompts/hardware/hardware_specs.toml +++ b/src/prompts/hardware/hardware_specs.toml @@ -1,7 +1,3 @@ -[meta] -version = "1.0" -default_hardware_type = "GPU" - # ------------------------------------------------------------------------- # Hardware Types: GPU, Tenstorrent, etc. 
# -------------------------------------------------------------------------
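
Usage sketch (illustrative; not part of the patches above). Assuming the series applies cleanly, the option-based constructor is driven roughly as below. The problem-file path is a placeholder, while the language/option/hardware values all come from prompts.toml and hardware_specs.toml as added in this series.

from src.utils import read_file
from src.loader import get_hardware_architecture
from src.prompt_constructor_multilang import get_prompt_for_language

# Placeholder problem file; any KernelBench reference architecture works here.
ref_arch_src = read_file("KernelBench/level1/1_Square_matrix_multiplication_.py")

# Default flow: few-shot prompt asking for a Triton rewrite of Model -> ModelNew.
prompt = get_prompt_for_language(ref_arch_src, language="triton", option="few_shot")

# Hardware-aware flow: pulls the [TT.Wormhole] table, [TT.definitions], and
# [TT.best_practices] out of src/prompts/hardware/hardware_specs.toml.
tt_prompt = get_prompt_for_language(
    ref_arch_src,
    language="pykernel",
    option="hardware_info",
    hardware_type="TT",
    hardware_name="Wormhole",
)

# Architecture lookup used by generate_and_eval_single_sample.py to pick the
# build target; returns "Ada" for the default L40S config.
arch = get_hardware_architecture(hardware_type="GPU", hardware_name="L40S")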
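
The fix prompts feed the same renderer. A minimal repair-loop sketch follows, where query_llm and try_compile stand in for whatever inference server and eval harness the caller already wires up; they are placeholders, not functions in this repo.

from src.prompt_constructor_multilang import prompt_fix_compile

# `prompt` and `ref_arch_src` as in the sketch above; `query_llm` / `try_compile`
# are hypothetical stand-ins for the caller's own inference and build steps.
custom_kernel = query_llm(prompt)
ok, compile_log = try_compile(custom_kernel)
if not ok:
    retry_prompt = prompt_fix_compile(
        language="triton",
        ref_arch_src=ref_arch_src,
        custom_kernel=custom_kernel,
        metadata=compile_log,
    )
    custom_kernel = query_llm(retry_prompt)

prompt_fix_correctness takes the same arguments and swaps the compile_metadata block for the correctness_metadata one, per the fix_correctness option in prompts.toml.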
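
For debugging prompt composition, the loader pieces can also be exercised directly; a sketch using only what src/loader.py exposes after the last patch (ref_arch_src as in the first sketch).

from src.loader import (
    PROMPTS_TOML,
    TomlConfig,
    render_prompt_by_option,
    _hardware_context_from_path,
)

# Options and languages are plain TOML tables, so a new combination can be
# inspected before rendering anything.
cfg = TomlConfig.from_toml(PROMPTS_TOML)
print(cfg.data["options"]["hardware_info"]["components"])
print(cfg.data["languages"]["pykernel"]["language_display"])  # "Tenstorrent PyKernel kernels"

# Hardware context is a flat dict of strings that the hardware_* templates format in.
ctx = _hardware_context_from_path("TT", "Blackhole")
print(ctx["hardware_architecture"])   # "Blackhole"
print(ctx["hardware_specs_bullets"])  # one "- key: value" bullet per spec entry

# Direct render; equivalent to get_prompt_for_language(ref_arch_src, "cuda", option="basic").
basic_prompt = render_prompt_by_option(
    language="cuda",
    option="basic",
    context={"ref_arch_src": ref_arch_src},
)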